summaryrefslogtreecommitdiffhomepage
path: root/runsc
diff options
context:
space:
mode:
Diffstat (limited to 'runsc')
-rw-r--r--runsc/boot/BUILD6
-rw-r--r--runsc/boot/controller.go45
-rw-r--r--runsc/boot/filter/config.go21
-rw-r--r--runsc/boot/fs.go14
-rw-r--r--runsc/boot/loader.go70
-rw-r--r--runsc/boot/network.go22
-rw-r--r--runsc/boot/profile.go95
-rw-r--r--runsc/boot/strace.go9
-rw-r--r--runsc/boot/vfs.go6
-rw-r--r--runsc/cli/main.go2
-rw-r--r--runsc/cmd/BUILD1
-rw-r--r--runsc/cmd/boot.go50
-rw-r--r--runsc/cmd/debug.go95
-rw-r--r--runsc/cmd/do.go1
-rw-r--r--runsc/cmd/events.go13
-rw-r--r--runsc/cmd/gofer.go120
-rw-r--r--runsc/cmd/run.go9
-rw-r--r--runsc/cmd/spec.go1
-rw-r--r--runsc/cmd/usage.go93
-rw-r--r--runsc/cmd/verity_prepare.go7
-rw-r--r--runsc/config/BUILD6
-rw-r--r--runsc/config/config.go140
-rw-r--r--runsc/config/config_test.go23
-rw-r--r--runsc/config/flags.go9
-rw-r--r--runsc/container/BUILD2
-rw-r--r--runsc/container/container.go28
-rw-r--r--runsc/container/container_test.go217
-rw-r--r--runsc/fsgofer/BUILD16
-rw-r--r--runsc/fsgofer/fsgofer.go16
-rw-r--r--runsc/fsgofer/fsgofer_test.go50
-rw-r--r--runsc/fsgofer/fsgofer_unsafe.go30
-rw-r--r--runsc/fsgofer/lisafs.go1084
-rw-r--r--runsc/fsgofer/lisafs_test.go56
-rw-r--r--runsc/sandbox/BUILD2
-rw-r--r--runsc/sandbox/sandbox.go141
-rw-r--r--runsc/specutils/specutils.go4
36 files changed, 2327 insertions, 177 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index c9d2b3eff..36806b740 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -15,6 +15,7 @@ go_library(
"limits.go",
"loader.go",
"network.go",
+ "profile.go",
"strace.go",
"vfs.go",
],
@@ -45,6 +46,7 @@ go_library(
"//pkg/sentry/arch",
"//pkg/sentry/arch:registers_go_proto",
"//pkg/sentry/control",
+ "//pkg/sentry/control:control_go_proto",
"//pkg/sentry/devices/memdev",
"//pkg/sentry/devices/ttydev",
"//pkg/sentry/devices/tundev",
@@ -78,7 +80,6 @@ go_library(
"//pkg/sentry/loader",
"//pkg/sentry/pgalloc",
"//pkg/sentry/platform",
- "//pkg/sentry/sighandling",
"//pkg/sentry/socket/hostinet",
"//pkg/sentry/socket/netfilter",
"//pkg/sentry/socket/netlink",
@@ -94,11 +95,12 @@ go_library(
"//pkg/sentry/usage",
"//pkg/sentry/vfs",
"//pkg/sentry/watchdog",
+ "//pkg/sighandling",
"//pkg/sync",
"//pkg/tcpip",
+ "//pkg/tcpip/link/ethernet",
"//pkg/tcpip/link/fdbased",
"//pkg/tcpip/link/loopback",
- "//pkg/tcpip/link/packetsocket",
"//pkg/tcpip/link/qdisc/fifo",
"//pkg/tcpip/link/sniffer",
"//pkg/tcpip/network/arp",
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 60b532798..76e1f596b 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -26,6 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/control"
+ controlpb "gvisor.dev/gvisor/pkg/sentry/control/control_go_proto"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
@@ -114,6 +115,18 @@ const (
FsCat = "Fs.Cat"
)
+// Usage related commands (see usage.go for more details).
+const (
+ UsageCollect = "Usage.Collect"
+ UsageUsageFD = "Usage.UsageFD"
+ UsageReduce = "Usage.Reduce"
+)
+
+// Events related commands (see events.go for more details).
+const (
+ EventsAttachDebugEmitter = "Events.AttachDebugEmitter"
+)
+
// ControlSocketAddr generates an abstract unix socket name for the given ID.
func ControlSocketAddr(id string) string {
return fmt.Sprintf("\x00runsc-sandbox.%s", id)
@@ -153,13 +166,31 @@ func newController(fd int, l *Loader) (*controller, error) {
ctrl.srv.Register(net)
}
- ctrl.srv.Register(&debug{})
- ctrl.srv.Register(&control.Logging{})
- ctrl.srv.Register(&control.Lifecycle{l.k})
- ctrl.srv.Register(&control.Fs{l.k})
-
- if l.root.conf.ProfileEnable {
- ctrl.srv.Register(control.NewProfile(l.k))
+ if l.root.conf.Controls.Controls != nil {
+ for _, c := range l.root.conf.Controls.Controls.AllowedControls {
+ switch c {
+ case controlpb.ControlConfig_EVENTS:
+ ctrl.srv.Register(&control.Events{})
+ case controlpb.ControlConfig_FS:
+ ctrl.srv.Register(&control.Fs{Kernel: l.k})
+ case controlpb.ControlConfig_LIFECYCLE:
+ ctrl.srv.Register(&control.Lifecycle{Kernel: l.k})
+ case controlpb.ControlConfig_LOGGING:
+ ctrl.srv.Register(&control.Logging{})
+ case controlpb.ControlConfig_PROFILE:
+ if l.root.conf.ProfileEnable {
+ ctrl.srv.Register(control.NewProfile(l.k))
+ }
+ case controlpb.ControlConfig_USAGE:
+ ctrl.srv.Register(&control.Usage{Kernel: l.k})
+ case controlpb.ControlConfig_PROC:
+ ctrl.srv.Register(&control.Proc{Kernel: l.k})
+ case controlpb.ControlConfig_STATE:
+ ctrl.srv.Register(&control.State{Kernel: l.k})
+ case controlpb.ControlConfig_DEBUG:
+ ctrl.srv.Register(&debug{})
+ }
+ }
}
return ctrl, nil
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 703f34827..db363435b 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -304,6 +304,22 @@ var allowedSyscalls = seccomp.SyscallRules{
seccomp.EqualTo(unix.SPLICE_F_NONBLOCK), /* flags */
},
},
+ unix.SYS_TIMER_CREATE: []seccomp.Rule{
+ {
+ seccomp.EqualTo(unix.CLOCK_THREAD_CPUTIME_ID), /* which */
+ seccomp.MatchAny{}, /* sevp */
+ seccomp.MatchAny{}, /* timerid */
+ },
+ },
+ unix.SYS_TIMER_DELETE: []seccomp.Rule{},
+ unix.SYS_TIMER_SETTIME: []seccomp.Rule{
+ {
+ seccomp.MatchAny{}, /* timerid */
+ seccomp.EqualTo(0), /* flags */
+ seccomp.MatchAny{}, /* new_value */
+ seccomp.EqualTo(0), /* old_value */
+ },
+ },
unix.SYS_TGKILL: []seccomp.Rule{
{
seccomp.EqualTo(uint64(os.Getpid())),
@@ -630,6 +646,11 @@ func hostInetFilters() seccomp.SyscallRules {
func controlServerFilters(fd int) seccomp.SyscallRules {
return seccomp.SyscallRules{
+ unix.SYS_ACCEPT4: []seccomp.Rule{
+ {
+ seccomp.EqualTo(fd),
+ },
+ },
unix.SYS_ACCEPT: []seccomp.Rule{
{
seccomp.EqualTo(fd),
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 40cf2a3df..ba5460f1c 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -188,8 +188,8 @@ func compileMounts(spec *specs.Spec, conf *config.Config, vfs2Enabled bool) []sp
return mounts
}
-// p9MountData creates a slice of p9 mount data.
-func p9MountData(fd int, fa config.FileAccessType, vfs2 bool) []string {
+// goferMountData creates a slice of gofer mount data.
+func goferMountData(fd int, fa config.FileAccessType, attachPath string, vfs2 bool, lisafs bool) []string {
opts := []string{
"trans=fd",
"rfdno=" + strconv.Itoa(fd),
@@ -203,6 +203,10 @@ func p9MountData(fd int, fa config.FileAccessType, vfs2 bool) []string {
if fa == config.FileAccessShared {
opts = append(opts, "cache=remote_revalidating")
}
+ if vfs2 && lisafs {
+ opts = append(opts, "lisafs=true")
+ opts = append(opts, "aname="+attachPath)
+ }
return opts
}
@@ -761,7 +765,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Con
fd := c.fds.remove()
log.Infof("Mounting root over 9P, ioFD: %d", fd)
p9FS := mustFindFilesystem("9p")
- opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
+ opts := goferMountData(fd, conf.FileAccess, "/", false /* vfs2 */, false /* lisafs */)
// We can't check for overlayfs here because sandbox is chroot'ed and gofer
// can only send mount options for specs.Mounts (specs.Root is missing
@@ -822,7 +826,7 @@ func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m *specs.
case bind:
fd := c.fds.remove()
fsName = gofervfs2.Name
- opts = p9MountData(fd, c.getMountAccessType(conf, m), conf.VFS2)
+ opts = goferMountData(fd, c.getMountAccessType(conf, m), m.Destination, conf.VFS2, conf.Lisafs)
// If configured, add overlay to all writable mounts.
useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
case cgroupfs.Name:
@@ -980,7 +984,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.Re
// Add root mount.
fd := c.fds.remove()
- opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
+ opts := goferMountData(fd, conf.FileAccess, "/", false /* vfs2 */, false /* lisafs */)
mf := fs.MountSourceFlags{}
if c.root.Readonly || conf.Overlay {
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index ec9188021..2f2d4df5e 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -49,15 +49,16 @@ import (
"gvisor.dev/gvisor/pkg/sentry/loader"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/platform"
- "gvisor.dev/gvisor/pkg/sentry/sighandling"
"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
"gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2"
"gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sentry/watchdog"
+ "gvisor.dev/gvisor/pkg/sighandling"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/link/ethernet"
"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
"gvisor.dev/gvisor/pkg/tcpip/network/arp"
@@ -119,6 +120,10 @@ type Loader struct {
// container. It should be called when a sandbox is destroyed.
stopSignalForwarding func()
+ // stopProfiling stops profiling started at container creation. It
+ // should be called when a sandbox is destroyed.
+ stopProfiling func()
+
// restore is set to true if we are restoring a container.
restore bool
@@ -198,6 +203,21 @@ type Args struct {
TotalMem uint64
// UserLogFD is the file descriptor to write user logs to.
UserLogFD int
+ // ProfileBlockFD is the file descriptor to write a block profile to.
+ // Valid if >=0.
+ ProfileBlockFD int
+ // ProfileCPUFD is the file descriptor to write a CPU profile to.
+ // Valid if >=0.
+ ProfileCPUFD int
+ // ProfileHeapFD is the file descriptor to write a heap profile to.
+ // Valid if >=0.
+ ProfileHeapFD int
+ // ProfileMutexFD is the file descriptor to write a mutex profile to.
+ // Valid if >=0.
+ ProfileMutexFD int
+ // TraceFD is the file descriptor to write a Go execution trace to.
+ // Valid if >=0.
+ TraceFD int
}
// make sure stdioFDs are always the same on initial start and on restore
@@ -206,6 +226,8 @@ const startingStdioFD = 256
// New initializes a new kernel loader configured by spec.
// New also handles setting up a kernel for restoring a container.
func New(args Args) (*Loader, error) {
+ stopProfiling := startProfiling(args)
+
// We initialize the rand package now to make sure /dev/urandom is pre-opened
// on kernels that do not support getrandom(2).
if err := rand.Init(); err != nil {
@@ -219,10 +241,8 @@ func New(args Args) (*Loader, error) {
// Is this a VFSv2 kernel?
if args.Conf.VFS2 {
kernel.VFS2Enabled = true
- if args.Conf.FUSE {
- kernel.FUSEEnabled = true
- }
-
+ kernel.FUSEEnabled = args.Conf.FUSE
+ kernel.LISAFSEnabled = args.Conf.Lisafs
vfs2.Override()
}
@@ -399,12 +419,13 @@ func New(args Args) (*Loader, error) {
eid := execID{cid: args.ID}
l := &Loader{
- k: k,
- watchdog: dog,
- sandboxID: args.ID,
- processes: map[execID]*execProcess{eid: {}},
- mountHints: mountHints,
- root: info,
+ k: k,
+ watchdog: dog,
+ sandboxID: args.ID,
+ processes: map[execID]*execProcess{eid: {}},
+ mountHints: mountHints,
+ root: info,
+ stopProfiling: stopProfiling,
}
// We don't care about child signals; some platforms can generate a
@@ -497,6 +518,8 @@ func (l *Loader) Destroy() {
for _, f := range l.root.goferFDs {
_ = f.Close()
}
+
+ l.stopProfiling()
}
func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) {
@@ -1088,13 +1111,14 @@ func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID st
return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
case config.NetworkNone, config.NetworkSandbox:
- s, err := newEmptySandboxNetworkStack(clock, uniqueID)
+ s, err := newEmptySandboxNetworkStack(clock, uniqueID, conf.AllowPacketEndpointWrite)
if err != nil {
return nil, err
}
creator := &sandboxNetstackCreator{
- clock: clock,
- uniqueID: uniqueID,
+ clock: clock,
+ uniqueID: uniqueID,
+ allowPacketEndpointWrite: conf.AllowPacketEndpointWrite,
}
return inet.NewRootNamespace(s, creator), nil
@@ -1104,7 +1128,7 @@ func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID st
}
-func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
+func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID, allowPacketEndpointWrite bool) (inet.Stack, error) {
netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
transProtos := []stack.TransportProtocolFactory{
tcp.NewProtocol,
@@ -1120,9 +1144,10 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
HandleLocal: true,
// Enable raw sockets for users with sufficient
// privileges.
- RawFactory: raw.EndpointFactory{},
- UniqueID: uniqueID,
- DefaultIPTables: netfilter.DefaultLinuxTables,
+ RawFactory: raw.EndpointFactory{},
+ AllowPacketEndpointWrite: allowPacketEndpointWrite,
+ UniqueID: uniqueID,
+ DefaultIPTables: netfilter.DefaultLinuxTables,
})}
// Enable SACK Recovery.
@@ -1159,13 +1184,14 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
//
// +stateify savable
type sandboxNetstackCreator struct {
- clock tcpip.Clock
- uniqueID stack.UniqueID
+ clock tcpip.Clock
+ uniqueID stack.UniqueID
+ allowPacketEndpointWrite bool
}
// CreateStack implements kernel.NetworkStackCreator.CreateStack.
func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
- s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID)
+ s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID, f.allowPacketEndpointWrite)
if err != nil {
return nil, err
}
@@ -1174,7 +1200,7 @@ func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
n := &Network{Stack: s.(*netstack.Stack).Stack}
nicID := tcpip.NICID(f.uniqueID.UniqueID())
link := DefaultLoopbackLink
- linkEP := loopback.New()
+ linkEP := ethernet.New(loopback.New())
if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
return nil, err
}
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 7e627e4c6..9fb3ebd95 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -23,9 +23,9 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/link/ethernet"
"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
- "gvisor.dev/gvisor/pkg/tcpip/link/packetsocket"
"gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo"
"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
@@ -169,7 +169,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
nicID++
nicids[link.Name] = nicID
- linkEP := loopback.New()
+ linkEP := ethernet.New(loopback.New())
log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses)
if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
@@ -209,7 +209,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
linkEP, err := fdbased.New(&fdbased.Options{
FDs: FDs,
MTU: uint32(link.MTU),
- EthernetHeader: true,
+ EthernetHeader: mac != "",
Address: mac,
PacketDispatchMode: fdbased.RecvMMsg,
GSOMaxSize: link.GSOMaxSize,
@@ -228,9 +228,6 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
}
- // Enable support for AF_PACKET sockets to receive outgoing packets.
- linkEP = packetsocket.New(linkEP)
-
log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
return err
@@ -285,12 +282,15 @@ func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkE
for _, addr := range addrs {
proto, tcpipAddr := ipToAddressAndProto(addr.Address)
- ap := tcpip.AddressWithPrefix{
- Address: tcpipAddr,
- PrefixLen: addr.PrefixLen,
+ protocolAddr := tcpip.ProtocolAddress{
+ Protocol: proto,
+ AddressWithPrefix: tcpip.AddressWithPrefix{
+ Address: tcpipAddr,
+ PrefixLen: addr.PrefixLen,
+ },
}
- if err := n.Stack.AddAddressWithPrefix(id, proto, ap); err != nil {
- return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, proto, tcpipAddr, err)
+ if err := n.Stack.AddProtocolAddress(id, protocolAddr, stack.AddressProperties{}); err != nil {
+ return fmt.Errorf("AddProtocolAddress(%d, %+v, {}) failed: %s", id, protocolAddr, err)
}
}
return nil
diff --git a/runsc/boot/profile.go b/runsc/boot/profile.go
new file mode 100644
index 000000000..3ecd3e532
--- /dev/null
+++ b/runsc/boot/profile.go
@@ -0,0 +1,95 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "os"
+ "runtime"
+ "runtime/pprof"
+ "runtime/trace"
+
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/control"
+)
+
+// startProfiling initiates profiling as defined by the ProfileConfig, and
+// returns a function that should be called to stop profiling.
+func startProfiling(args Args) func() {
+ var onStopProfiling []func()
+ stopProfiling := func() {
+ for _, f := range onStopProfiling {
+ f()
+ }
+ }
+
+ if args.ProfileBlockFD >= 0 {
+ file := os.NewFile(uintptr(args.ProfileBlockFD), "profile-block")
+
+ runtime.SetBlockProfileRate(control.DefaultBlockProfileRate)
+ onStopProfiling = append(onStopProfiling, func() {
+ if err := pprof.Lookup("block").WriteTo(file, 0); err != nil {
+ log.Warningf("Error writing block profile: %v", err)
+ }
+ file.Close()
+ runtime.SetBlockProfileRate(0)
+ })
+ }
+
+ if args.ProfileCPUFD >= 0 {
+ file := os.NewFile(uintptr(args.ProfileCPUFD), "profile-cpu")
+
+ pprof.StartCPUProfile(file)
+ onStopProfiling = append(onStopProfiling, func() {
+ pprof.StopCPUProfile()
+ file.Close()
+ })
+ }
+
+ if args.ProfileHeapFD >= 0 {
+ file := os.NewFile(uintptr(args.ProfileHeapFD), "profile-heap")
+
+ onStopProfiling = append(onStopProfiling, func() {
+ if err := pprof.Lookup("heap").WriteTo(file, 0); err != nil {
+ log.Warningf("Error writing heap profile: %v", err)
+ }
+ file.Close()
+ })
+ }
+
+ if args.ProfileMutexFD >= 0 {
+ file := os.NewFile(uintptr(args.ProfileMutexFD), "profile-mutex")
+
+ prev := runtime.SetMutexProfileFraction(control.DefaultMutexProfileRate)
+ onStopProfiling = append(onStopProfiling, func() {
+ if err := pprof.Lookup("mutex").WriteTo(file, 0); err != nil {
+ log.Warningf("Error writing mutex profile: %v", err)
+ }
+ file.Close()
+ runtime.SetMutexProfileFraction(prev)
+ })
+ }
+
+ if args.TraceFD >= 0 {
+ file := os.NewFile(uintptr(args.TraceFD), "trace")
+
+ trace.Start(file)
+ onStopProfiling = append(onStopProfiling, func() {
+ trace.Stop()
+ file.Close()
+ })
+ }
+
+ return stopProfiling
+}
diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go
index c21648a32..cf5be34cd 100644
--- a/runsc/boot/strace.go
+++ b/runsc/boot/strace.go
@@ -35,9 +35,14 @@ func enableStrace(conf *config.Config) error {
}
strace.LogMaximumSize = max
+ sink := strace.SinkTypeLog
+ if conf.StraceEvent {
+ sink = strace.SinkTypeEvent
+ }
+
if len(conf.StraceSyscalls) == 0 {
- strace.EnableAll(strace.SinkTypeLog)
+ strace.EnableAll(sink)
return nil
}
- return strace.Enable(strings.Split(conf.StraceSyscalls, ","), strace.SinkTypeLog)
+ return strace.Enable(strings.Split(conf.StraceSyscalls, ","), sink)
}
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 346796d9c..ac1e5ac37 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -173,7 +173,7 @@ func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.Create
rootProcArgs.Credentials = rootCreds
rootProcArgs.Umask = 0022
rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
- rootCtx := procArgs.NewContext(c.k)
+ rootCtx := rootProcArgs.NewContext(c.k)
mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds)
if err != nil {
@@ -208,7 +208,7 @@ func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.Create
// createMountNamespaceVFS2 creates the container's root mount and namespace.
func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
fd := c.fds.remove()
- data := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
+ data := goferMountData(fd, conf.FileAccess, "/", true /* vfs2 */, conf.Lisafs)
// We can't check for overlayfs here because sandbox is chroot'ed and gofer
// can only send mount options for specs.Mounts (specs.Root is missing
@@ -515,7 +515,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo
// but unlikely to be correct in this context.
return "", nil, false, fmt.Errorf("9P mount requires a connection FD")
}
- data = p9MountData(m.fd, c.getMountAccessType(conf, m.mount), true /* vfs2 */)
+ data = goferMountData(m.fd, c.getMountAccessType(conf, m.mount), m.mount.Destination, true /* vfs2 */, conf.Lisafs)
internalData = gofer.InternalFilesystemOptions{
UniqueID: m.mount.Destination,
}
diff --git a/runsc/cli/main.go b/runsc/cli/main.go
index 3556d7665..058ab8232 100644
--- a/runsc/cli/main.go
+++ b/runsc/cli/main.go
@@ -228,7 +228,7 @@ func Main(version string) {
log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay)
log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets)
log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls)
- log.Infof("\t\tVFS2 enabled: %v", conf.VFS2)
+ log.Infof("\t\tVFS2 enabled: %t, LISAFS: %t", conf.VFS2, conf.Lisafs)
log.Infof("***************************")
if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index 031ddd57e..c5e32807d 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -36,6 +36,7 @@ go_library(
"statefile.go",
"symbolize.go",
"syscalls.go",
+ "usage.go",
"verity_prepare.go",
"wait.go",
],
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
index f5c9821b2..e33a7f3cb 100644
--- a/runsc/cmd/boot.go
+++ b/runsc/cmd/boot.go
@@ -79,6 +79,26 @@ type Boot struct {
// sandbox (e.g. gofer) and sent through this FD.
mountsFD int
+ // profileBlockFD is the file descriptor to write a block profile to.
+ // Valid if >= 0.
+ profileBlockFD int
+
+ // profileCPUFD is the file descriptor to write a CPU profile to.
+ // Valid if >= 0.
+ profileCPUFD int
+
+ // profileHeapFD is the file descriptor to write a heap profile to.
+ // Valid if >= 0.
+ profileHeapFD int
+
+ // profileMutexFD is the file descriptor to write a mutex profile to.
+ // Valid if >= 0.
+ profileMutexFD int
+
+ // traceFD is the file descriptor to write a Go execution trace to.
+ // Valid if >= 0.
+ traceFD int
+
// pidns is set if the sandbox is in its own pid namespace.
pidns bool
@@ -119,6 +139,11 @@ func (b *Boot) SetFlags(f *flag.FlagSet) {
f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.")
f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup")
f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).")
+ f.IntVar(&b.profileBlockFD, "profile-block-fd", -1, "file descriptor to write block profile to. -1 disables profiling.")
+ f.IntVar(&b.profileCPUFD, "profile-cpu-fd", -1, "file descriptor to write CPU profile to. -1 disables profiling.")
+ f.IntVar(&b.profileHeapFD, "profile-heap-fd", -1, "file descriptor to write heap profile to. -1 disables profiling.")
+ f.IntVar(&b.profileMutexFD, "profile-mutex-fd", -1, "file descriptor to write mutex profile to. -1 disables profiling.")
+ f.IntVar(&b.traceFD, "trace-fd", -1, "file descriptor to write Go execution trace to. -1 disables tracing.")
f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates")
}
@@ -213,16 +238,21 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
// Create the loader.
bootArgs := boot.Args{
- ID: f.Arg(0),
- Spec: spec,
- Conf: conf,
- ControllerFD: b.controllerFD,
- Device: os.NewFile(uintptr(b.deviceFD), "platform device"),
- GoferFDs: b.ioFDs.GetArray(),
- StdioFDs: b.stdioFDs.GetArray(),
- NumCPU: b.cpuNum,
- TotalMem: b.totalMem,
- UserLogFD: b.userLogFD,
+ ID: f.Arg(0),
+ Spec: spec,
+ Conf: conf,
+ ControllerFD: b.controllerFD,
+ Device: os.NewFile(uintptr(b.deviceFD), "platform device"),
+ GoferFDs: b.ioFDs.GetArray(),
+ StdioFDs: b.stdioFDs.GetArray(),
+ NumCPU: b.cpuNum,
+ TotalMem: b.totalMem,
+ UserLogFD: b.userLogFD,
+ ProfileBlockFD: b.profileBlockFD,
+ ProfileCPUFD: b.profileCPUFD,
+ ProfileHeapFD: b.profileHeapFD,
+ ProfileMutexFD: b.profileMutexFD,
+ TraceFD: b.traceFD,
}
l, err := boot.New(bootArgs)
if err != nil {
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index f773ccca0..318753728 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -37,9 +37,9 @@ type Debug struct {
pid int
stacks bool
signal int
- profileHeap string
- profileCPU string
profileBlock string
+ profileCPU string
+ profileHeap string
profileMutex string
trace string
strace string
@@ -70,9 +70,9 @@ func (*Debug) Usage() string {
func (d *Debug) SetFlags(f *flag.FlagSet) {
f.IntVar(&d.pid, "pid", 0, "sandbox process ID. Container ID is not necessary if this is set")
f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log")
- f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.")
- f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.")
f.StringVar(&d.profileBlock, "profile-block", "", "writes block profile to the given file.")
+ f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.")
+ f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.")
f.StringVar(&d.profileMutex, "profile-mutex", "", "writes mutex profile to the given file.")
f.DurationVar(&d.delay, "delay", time.Hour, "amount of time to delay for collecting heap and goroutine profiles.")
f.DurationVar(&d.duration, "duration", time.Hour, "amount of time to wait for CPU and trace profiles.")
@@ -90,6 +90,13 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
var c *container.Container
conf := args[0].(*config.Config)
+ if conf.ProfileBlock != "" || conf.ProfileCPU != "" || conf.ProfileHeap != "" || conf.ProfileMutex != "" {
+ return Errorf("global -profile-{block,cpu,heap,mutex} flags have no effect on runsc debug. Pass runsc debug -profile-{block,cpu,heap,mutex} instead")
+ }
+ if conf.TraceFile != "" {
+ return Errorf("global -trace flag has no effect on runsc debug. Pass runsc debug -trace instead")
+ }
+
if d.pid == 0 {
// No pid, container ID must have been provided.
if f.NArg() != 1 {
@@ -219,19 +226,19 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
// Open profiling files.
var (
- heapFile *os.File
- cpuFile *os.File
- traceFile *os.File
blockFile *os.File
+ cpuFile *os.File
+ heapFile *os.File
mutexFile *os.File
+ traceFile *os.File
)
- if d.profileHeap != "" {
- f, err := os.OpenFile(d.profileHeap, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
+ if d.profileBlock != "" {
+ f, err := os.OpenFile(d.profileBlock, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
if err != nil {
- return Errorf("error opening heap profile output: %v", err)
+ return Errorf("error opening blocking profile output: %v", err)
}
defer f.Close()
- heapFile = f
+ blockFile = f
}
if d.profileCPU != "" {
f, err := os.OpenFile(d.profileCPU, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
@@ -241,20 +248,13 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
defer f.Close()
cpuFile = f
}
- if d.trace != "" {
- f, err := os.OpenFile(d.trace, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
- if err != nil {
- return Errorf("error opening trace profile output: %v", err)
- }
- traceFile = f
- }
- if d.profileBlock != "" {
- f, err := os.OpenFile(d.profileBlock, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
+ if d.profileHeap != "" {
+ f, err := os.OpenFile(d.profileHeap, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
if err != nil {
- return Errorf("error opening blocking profile output: %v", err)
+ return Errorf("error opening heap profile output: %v", err)
}
defer f.Close()
- blockFile = f
+ heapFile = f
}
if d.profileMutex != "" {
f, err := os.OpenFile(d.profileMutex, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
@@ -264,21 +264,28 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
defer f.Close()
mutexFile = f
}
+ if d.trace != "" {
+ f, err := os.OpenFile(d.trace, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
+ if err != nil {
+ return Errorf("error opening trace profile output: %v", err)
+ }
+ traceFile = f
+ }
// Collect profiles.
var (
wg sync.WaitGroup
- heapErr error
- cpuErr error
- traceErr error
blockErr error
+ cpuErr error
+ heapErr error
mutexErr error
+ traceErr error
)
- if heapFile != nil {
+ if blockFile != nil {
wg.Add(1)
go func() {
defer wg.Done()
- heapErr = c.Sandbox.HeapProfile(heapFile, d.delay)
+ blockErr = c.Sandbox.BlockProfile(blockFile, d.duration)
}()
}
if cpuFile != nil {
@@ -288,25 +295,25 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
cpuErr = c.Sandbox.CPUProfile(cpuFile, d.duration)
}()
}
- if traceFile != nil {
+ if heapFile != nil {
wg.Add(1)
go func() {
defer wg.Done()
- traceErr = c.Sandbox.Trace(traceFile, d.duration)
+ heapErr = c.Sandbox.HeapProfile(heapFile, d.delay)
}()
}
- if blockFile != nil {
+ if mutexFile != nil {
wg.Add(1)
go func() {
defer wg.Done()
- blockErr = c.Sandbox.BlockProfile(blockFile, d.duration)
+ mutexErr = c.Sandbox.MutexProfile(mutexFile, d.duration)
}()
}
- if mutexFile != nil {
+ if traceFile != nil {
wg.Add(1)
go func() {
defer wg.Done()
- mutexErr = c.Sandbox.MutexProfile(mutexFile, d.duration)
+ traceErr = c.Sandbox.Trace(traceFile, d.duration)
}()
}
@@ -339,31 +346,31 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
// Collect all errors.
errorCount := 0
- if heapErr != nil {
+ if blockErr != nil {
errorCount++
- log.Infof("error collecting heap profile: %v", heapErr)
- os.Remove(heapFile.Name())
+ log.Infof("error collecting block profile: %v", blockErr)
+ os.Remove(blockFile.Name())
}
if cpuErr != nil {
errorCount++
log.Infof("error collecting cpu profile: %v", cpuErr)
os.Remove(cpuFile.Name())
}
- if traceErr != nil {
- errorCount++
- log.Infof("error collecting trace profile: %v", traceErr)
- os.Remove(traceFile.Name())
- }
- if blockErr != nil {
+ if heapErr != nil {
errorCount++
- log.Infof("error collecting block profile: %v", blockErr)
- os.Remove(blockFile.Name())
+ log.Infof("error collecting heap profile: %v", heapErr)
+ os.Remove(heapFile.Name())
}
if mutexErr != nil {
errorCount++
log.Infof("error collecting mutex profile: %v", mutexErr)
os.Remove(mutexFile.Name())
}
+ if traceErr != nil {
+ errorCount++
+ log.Infof("error collecting trace profile: %v", traceErr)
+ os.Remove(traceFile.Name())
+ }
if errorCount > 0 {
return subcommands.ExitFailure
diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go
index 6cf76f644..4eb5a96f1 100644
--- a/runsc/cmd/do.go
+++ b/runsc/cmd/do.go
@@ -130,7 +130,6 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
if conf.Network == config.NetworkNone {
addNamespace(spec, specs.LinuxNamespace{Type: specs.NetworkNamespace})
-
} else if conf.Rootless {
if conf.Network == config.NetworkSandbox {
c.notifyUser("*** Warning: sandbox network isn't supported with --rootless, switching to host ***")
diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go
index c1d029d7f..08246e543 100644
--- a/runsc/cmd/events.go
+++ b/runsc/cmd/events.go
@@ -33,6 +33,10 @@ type Events struct {
intervalSec int
// If true, events will print a single group of stats and exit.
stats bool
+ // If true, events will dump all filtered events to stdout.
+ stream bool
+ // filters for streamed events.
+ filters stringSlice
}
// Name implements subcommands.Command.Name.
@@ -62,6 +66,8 @@ OPTIONS:
func (evs *Events) SetFlags(f *flag.FlagSet) {
f.IntVar(&evs.intervalSec, "interval", 5, "set the stats collection interval, in seconds")
f.BoolVar(&evs.stats, "stats", false, "display the container's stats then exit")
+ f.BoolVar(&evs.stream, "stream", false, "dump all filtered events to stdout")
+ f.Var(&evs.filters, "filters", "only display matching events")
}
// Execute implements subcommands.Command.Execute.
@@ -79,6 +85,13 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa
Fatalf("loading sandbox: %v", err)
}
+ if evs.stream {
+ if err := c.Stream(evs.filters, os.Stdout); err != nil {
+ Fatalf("Stream failed: %v", err)
+ }
+ return subcommands.ExitSuccess
+ }
+
// Repeatedly get stats from the container.
for {
// Get the event and print it as JSON.
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 2193e9040..c65e0267a 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -71,8 +71,8 @@ func (*Gofer) Name() string {
}
// Synopsis implements subcommands.Command.
-func (*Gofer) Synopsis() string {
- return "launch a gofer process that serves files over 9P protocol (internal use only)"
+func (g *Gofer) Synopsis() string {
+ return fmt.Sprintf("launch a gofer process that serves files over the protocol (9P or lisafs) defined in the config (internal use only)")
}
// Usage implements subcommands.Command.
@@ -83,7 +83,7 @@ func (*Gofer) Usage() string {
// SetFlags implements subcommands.Command.
func (g *Gofer) SetFlags(f *flag.FlagSet) {
f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
- f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec")
+ f.Var(&g.ioFDs, "io-fds", "list of FDs to connect gofer servers. They must follow this order: root first, then mounts as defined in the spec")
f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do")
f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process")
f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec")
@@ -160,10 +160,98 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
}
log.Infof("Process chroot'd to %q", root)
+ // Initialize filters.
+ if conf.FSGoferHostUDS {
+ filter.InstallUDSFilters()
+ }
+
+ if conf.Verity {
+ filter.InstallXattrFilters()
+ }
+
+ if err := filter.Install(); err != nil {
+ Fatalf("installing seccomp filters: %v", err)
+ }
+
+ if conf.Lisafs {
+ return g.serveLisafs(spec, conf, root)
+ }
+ return g.serve9P(spec, conf, root)
+}
+
+func newSocket(ioFD int) *unet.Socket {
+ socket, err := unet.NewSocket(ioFD)
+ if err != nil {
+ Fatalf("creating server on FD %d: %v", ioFD, err)
+ }
+ return socket
+}
+
+func (g *Gofer) serveLisafs(spec *specs.Spec, conf *config.Config, root string) subcommands.ExitStatus {
+ type connectionConfig struct {
+ sock *unet.Socket
+ readonly bool
+ }
+ cfgs := make([]connectionConfig, 0, len(spec.Mounts)+1)
+ server := fsgofer.NewLisafsServer(fsgofer.Config{
+ // These are global options. Ignore readonly configuration, that is set on
+ // a per connection basis.
+ HostUDS: conf.FSGoferHostUDS,
+ EnableVerityXattr: conf.Verity,
+ })
+
+ // Start with root mount, then add any other additional mount as needed.
+ cfgs = append(cfgs, connectionConfig{
+ sock: newSocket(g.ioFDs[0]),
+ readonly: spec.Root.Readonly || conf.Overlay,
+ })
+ log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, g.ioFDs[0], cfgs[0].readonly)
+
+ mountIdx := 1 // first one is the root
+ for _, m := range spec.Mounts {
+ if !specutils.IsGoferMount(m, conf.VFS2) {
+ continue
+ }
+
+ if !filepath.IsAbs(m.Destination) {
+ Fatalf("mount destination must be absolute: %q", m.Destination)
+ }
+ if mountIdx >= len(g.ioFDs) {
+ Fatalf("no FD found for mount. Did you forget --io-fd? FDs: %d, Mount: %+v", len(g.ioFDs), m)
+ }
+
+ cfgs = append(cfgs, connectionConfig{
+ sock: newSocket(g.ioFDs[mountIdx]),
+ readonly: isReadonlyMount(m.Options) || conf.Overlay,
+ })
+
+ log.Infof("Serving %q mapped on FD %d (ro: %t)", m.Destination, g.ioFDs[mountIdx], cfgs[mountIdx].readonly)
+ mountIdx++
+ }
+
+ if mountIdx != len(g.ioFDs) {
+ Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs))
+ }
+ cfgs = cfgs[:mountIdx]
+
+ for _, cfg := range cfgs {
+ conn, err := server.CreateConnection(cfg.sock, cfg.readonly)
+ if err != nil {
+ Fatalf("starting connection on FD %d for gofer mount failed: %v", cfg.sock.FD(), err)
+ }
+ server.StartConnection(conn)
+ }
+ server.Wait()
+ log.Infof("All lisafs servers exited.")
+ return subcommands.ExitSuccess
+}
+
+func (g *Gofer) serve9P(spec *specs.Spec, conf *config.Config, root string) subcommands.ExitStatus {
// Start with root mount, then add any other additional mount as needed.
ats := make([]p9.Attacher, 0, len(spec.Mounts)+1)
ap, err := fsgofer.NewAttachPoint("/", fsgofer.Config{
ROMount: spec.Root.Readonly || conf.Overlay,
+ HostUDS: conf.FSGoferHostUDS,
EnableVerityXattr: conf.Verity,
})
if err != nil {
@@ -174,7 +262,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
mountIdx := 1 // first one is the root
for _, m := range spec.Mounts {
- if specutils.Is9PMount(m, conf.VFS2) {
+ if specutils.IsGoferMount(m, conf.VFS2) {
cfg := fsgofer.Config{
ROMount: isReadonlyMount(m.Options) || conf.Overlay,
HostUDS: conf.FSGoferHostUDS,
@@ -197,26 +285,9 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs))
}
- if conf.FSGoferHostUDS {
- filter.InstallUDSFilters()
- }
-
- if conf.Verity {
- filter.InstallXattrFilters()
- }
-
- if err := filter.Install(); err != nil {
- Fatalf("installing seccomp filters: %v", err)
- }
-
- runServers(ats, g.ioFDs)
- return subcommands.ExitSuccess
-}
-
-func runServers(ats []p9.Attacher, ioFDs []int) {
// Run the loops and wait for all to exit.
var wg sync.WaitGroup
- for i, ioFD := range ioFDs {
+ for i, ioFD := range g.ioFDs {
wg.Add(1)
go func(ioFD int, at p9.Attacher) {
socket, err := unet.NewSocket(ioFD)
@@ -232,6 +303,7 @@ func runServers(ats []p9.Attacher, ioFDs []int) {
}
wg.Wait()
log.Infof("All 9P servers exited.")
+ return subcommands.ExitSuccess
}
func (g *Gofer) writeMounts(mounts []specs.Mount) error {
@@ -362,7 +434,7 @@ func setupRootFS(spec *specs.Spec, conf *config.Config) error {
// creates directories as needed.
func setupMounts(conf *config.Config, mounts []specs.Mount, root, procPath string) error {
for _, m := range mounts {
- if !specutils.Is9PMount(m, conf.VFS2) {
+ if !specutils.IsGoferMount(m, conf.VFS2) {
continue
}
@@ -402,7 +474,7 @@ func setupMounts(conf *config.Config, mounts []specs.Mount, root, procPath strin
func resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
cleanMounts := make([]specs.Mount, 0, len(mounts))
for _, m := range mounts {
- if !specutils.Is9PMount(m, conf.VFS2) {
+ if !specutils.IsGoferMount(m, conf.VFS2) {
cleanMounts = append(cleanMounts, m)
continue
}
diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go
index 722181aff..da11c9d06 100644
--- a/runsc/cmd/run.go
+++ b/runsc/cmd/run.go
@@ -68,7 +68,14 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
waitStatus := args[1].(*unix.WaitStatus)
if conf.Rootless {
- return Errorf("Rootless mode not supported with %q", r.Name())
+ if conf.Network == config.NetworkSandbox {
+ return Errorf("sandbox network isn't supported with --rootless, use --network=none or --network=host")
+ }
+
+ if err := specutils.MaybeRunAsRoot(); err != nil {
+ return Errorf("Error executing inside namespace: %v", err)
+ }
+ // Execution will continue here if no more capabilities are needed...
}
bundleDir := r.bundleDir
diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go
index 55194e641..3a7f2a2c4 100644
--- a/runsc/cmd/spec.go
+++ b/runsc/cmd/spec.go
@@ -30,7 +30,6 @@ func writeSpec(w io.Writer, cwd string, netns string, args []string) error {
spec := &specs.Spec{
Version: "1.0.0",
Process: &specs.Process{
- Terminal: true,
User: specs.User{
UID: 0,
GID: 0,
diff --git a/runsc/cmd/usage.go b/runsc/cmd/usage.go
new file mode 100644
index 000000000..d2aeafa28
--- /dev/null
+++ b/runsc/cmd/usage.go
@@ -0,0 +1,93 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "os"
+
+ "github.com/google/subcommands"
+ "gvisor.dev/gvisor/runsc/config"
+ "gvisor.dev/gvisor/runsc/container"
+ "gvisor.dev/gvisor/runsc/flag"
+)
+
+// Usage implements subcommands.Command for the "usage" command.
+type Usage struct {
+ full bool
+ fd bool
+}
+
+// Name implements subcommands.Command.Name.
+func (*Usage) Name() string {
+ return "usage"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Usage) Synopsis() string {
+ return "Usage shows application memory usage across various categories in bytes."
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Usage) Usage() string {
+ return `usage [flags] <container id> - print memory usages to standard output.`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (u *Usage) SetFlags(f *flag.FlagSet) {
+ f.BoolVar(&u.full, "full", false, "enumerate all usage by categories")
+ f.BoolVar(&u.fd, "fd", false, "retrieves a subset of usage through the established usage FD")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (u *Usage) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ if f.NArg() < 1 {
+ f.Usage()
+ return subcommands.ExitUsageError
+ }
+
+ id := f.Arg(0)
+ conf := args[0].(*config.Config)
+
+ cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
+ if err != nil {
+ Fatalf("loading container: %v", err)
+ }
+
+ if !u.fd {
+ m, err := cont.Usage(u.full)
+ if err != nil {
+ Fatalf("usage failed: %v", err)
+ }
+ if err := json.NewEncoder(os.Stdout).Encode(m); err != nil {
+ Fatalf("Encode MemoryUsage failed: %v", err)
+ }
+ } else {
+ m, err := cont.UsageFD()
+ if err != nil {
+ Fatalf("usagefd failed: %v", err)
+ }
+
+ mapped, unknown, total, err := m.Fetch()
+ if err != nil {
+ Fatalf("Fetch memory usage failed: %v", err)
+ }
+
+ fmt.Printf("Mapped %v, Unknown %v, Total %v\n", mapped, unknown, total)
+ }
+ return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/verity_prepare.go b/runsc/cmd/verity_prepare.go
index 85d762a51..44c1d05db 100644
--- a/runsc/cmd/verity_prepare.go
+++ b/runsc/cmd/verity_prepare.go
@@ -82,7 +82,7 @@ func (c *VerityPrepare) Execute(_ context.Context, f *flag.FlagSet, args ...inte
},
Process: &specs.Process{
Cwd: absRoot,
- Args: []string{c.tool, "--path", "/verityroot"},
+ Args: []string{c.tool, "--path", "/verityroot", "--rawpath", "/rawroot"},
Env: os.Environ(),
Capabilities: specutils.AllCapabilities(),
},
@@ -94,6 +94,11 @@ func (c *VerityPrepare) Execute(_ context.Context, f *flag.FlagSet, args ...inte
Type: "bind",
Options: []string{"verity.roothash="},
},
+ {
+ Source: c.dir,
+ Destination: "/rawroot",
+ Type: "bind",
+ },
},
}
diff --git a/runsc/config/BUILD b/runsc/config/BUILD
index b1672bb9d..64295d283 100644
--- a/runsc/config/BUILD
+++ b/runsc/config/BUILD
@@ -11,6 +11,7 @@ go_library(
visibility = ["//:sandbox"],
deps = [
"//pkg/refs",
+ "//pkg/sentry/control:control_go_proto",
"//pkg/sentry/watchdog",
"//pkg/sync",
"//runsc/flag",
@@ -24,5 +25,8 @@ go_test(
"config_test.go",
],
library = ":config",
- deps = ["//runsc/flag"],
+ deps = [
+ "//pkg/sentry/control:control_go_proto",
+ "//runsc/flag",
+ ],
)
diff --git a/runsc/config/config.go b/runsc/config/config.go
index cc4650180..91142888f 100644
--- a/runsc/config/config.go
+++ b/runsc/config/config.go
@@ -19,8 +19,10 @@ package config
import (
"fmt"
+ "strings"
"gvisor.dev/gvisor/pkg/refs"
+ controlpb "gvisor.dev/gvisor/pkg/sentry/control/control_go_proto"
"gvisor.dev/gvisor/pkg/sentry/watchdog"
)
@@ -84,6 +86,9 @@ type Config struct {
// capabilities.
EnableRaw bool `flag:"net-raw"`
+ // AllowPacketEndpointWrite enables write operations on packet endpoints.
+ AllowPacketEndpointWrite bool `flag:"TESTONLY-allow-packet-endpoint-write"`
+
// HardwareGSO indicates that hardware segmentation offload is enabled.
HardwareGSO bool `flag:"gso"`
@@ -117,6 +122,10 @@ type Config struct {
// StraceLogSize is the max size of data blobs to display.
StraceLogSize uint `flag:"strace-log-size"`
+ // StraceEvent indicates sending strace to events if true. Strace is
+ // sent to log if false.
+ StraceEvent bool `flag:"strace-event"`
+
// DisableSeccomp indicates whether seccomp syscall filters should be
// disabled. Pardon the double negation, but default to enabled is important.
DisableSeccomp bool
@@ -131,6 +140,29 @@ type Config struct {
// ProfileEnable is set to prepare the sandbox to be profiled.
ProfileEnable bool `flag:"profile"`
+ // ProfileBlock collects a block profile to the passed file for the
+ // duration of the container execution. Requires ProfileEnabled.
+ ProfileBlock string `flag:"profile-block"`
+
+ // ProfileCPU collects a CPU profile to the passed file for the
+ // duration of the container execution. Requires ProfileEnabled.
+ ProfileCPU string `flag:"profile-cpu"`
+
+ // ProfileHeap collects a heap profile to the passed file for the
+ // duration of the container execution. Requires ProfileEnabled.
+ ProfileHeap string `flag:"profile-heap"`
+
+ // ProfileMutex collects a mutex profile to the passed file for the
+ // duration of the container execution. Requires ProfileEnabled.
+ ProfileMutex string `flag:"profile-mutex"`
+
+ // TraceFile collects a Go runtime execution trace to the passed file
+ // for the duration of the container execution.
+ TraceFile string `flag:"trace"`
+
+ // Controls defines the controls that may be enabled.
+ Controls controlConfig `flag:"controls"`
+
// RestoreFile is the path to the saved container image
RestoreFile string
@@ -161,6 +193,9 @@ type Config struct {
// Enables VFS2.
VFS2 bool `flag:"vfs2"`
+ // Enable lisafs.
+ Lisafs bool `flag:"lisafs"`
+
// Enables FUSE usage.
FUSE bool `flag:"fuse"`
@@ -195,6 +230,21 @@ func (c *Config) validate() error {
if c.NumNetworkChannels <= 0 {
return fmt.Errorf("num_network_channels must be > 0, got: %d", c.NumNetworkChannels)
}
+ // Require profile flags to explicitly opt-in to profiling with
+ // -profile rather than implying it since these options have security
+ // implications.
+ if c.ProfileBlock != "" && !c.ProfileEnable {
+ return fmt.Errorf("profile-block flag requires enabling profiling with profile flag")
+ }
+ if c.ProfileCPU != "" && !c.ProfileEnable {
+ return fmt.Errorf("profile-cpu flag requires enabling profiling with profile flag")
+ }
+ if c.ProfileHeap != "" && !c.ProfileEnable {
+ return fmt.Errorf("profile-heap flag requires enabling profiling with profile flag")
+ }
+ if c.ProfileMutex != "" && !c.ProfileEnable {
+ return fmt.Errorf("profile-mutex flag requires enabling profiling with profile flag")
+ }
return nil
}
@@ -347,6 +397,96 @@ func (q QueueingDiscipline) String() string {
panic(fmt.Sprintf("Invalid qdisc %d", q))
}
+// controlConfig represents control endpoints.
+type controlConfig struct {
+ Controls *controlpb.ControlConfig
+}
+
+// Set implements flag.Value.
+func (c *controlConfig) Set(v string) error {
+ controls := strings.Split(v, ",")
+ var controlList []controlpb.ControlConfig_Endpoint
+ for _, control := range controls {
+ switch control {
+ case "EVENTS":
+ controlList = append(controlList, controlpb.ControlConfig_EVENTS)
+ case "FS":
+ controlList = append(controlList, controlpb.ControlConfig_FS)
+ case "LIFECYCLE":
+ controlList = append(controlList, controlpb.ControlConfig_LIFECYCLE)
+ case "LOGGING":
+ controlList = append(controlList, controlpb.ControlConfig_LOGGING)
+ case "PROFILE":
+ controlList = append(controlList, controlpb.ControlConfig_PROFILE)
+ case "USAGE":
+ controlList = append(controlList, controlpb.ControlConfig_USAGE)
+ case "PROC":
+ controlList = append(controlList, controlpb.ControlConfig_PROC)
+ case "STATE":
+ controlList = append(controlList, controlpb.ControlConfig_STATE)
+ case "DEBUG":
+ controlList = append(controlList, controlpb.ControlConfig_DEBUG)
+ default:
+ return fmt.Errorf("invalid control %q", control)
+ }
+ }
+ c.Controls.AllowedControls = controlList
+ return nil
+}
+
+// Get implements flag.Value.
+func (c *controlConfig) Get() interface{} {
+ return *c
+}
+
+// String implements flag.Value.
+func (c *controlConfig) String() string {
+ v := ""
+ for _, control := range c.Controls.GetAllowedControls() {
+ if len(v) > 0 {
+ v += ","
+ }
+ switch control {
+ case controlpb.ControlConfig_EVENTS:
+ v += "EVENTS"
+ case controlpb.ControlConfig_FS:
+ v += "FS"
+ case controlpb.ControlConfig_LIFECYCLE:
+ v += "LIFECYCLE"
+ case controlpb.ControlConfig_LOGGING:
+ v += "LOGGING"
+ case controlpb.ControlConfig_PROFILE:
+ v += "PROFILE"
+ case controlpb.ControlConfig_USAGE:
+ v += "USAGE"
+ case controlpb.ControlConfig_PROC:
+ v += "PROC"
+ case controlpb.ControlConfig_STATE:
+ v += "STATE"
+ case controlpb.ControlConfig_DEBUG:
+ v += "DEBUG"
+ default:
+ panic(fmt.Sprintf("Invalid control %d", control))
+ }
+ }
+ return v
+}
+
+func defaultControlConfig() *controlConfig {
+ c := controlConfig{}
+ c.Controls = &controlpb.ControlConfig{}
+ c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_EVENTS)
+ c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_FS)
+ c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_LIFECYCLE)
+ c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_LOGGING)
+ c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_PROFILE)
+ c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_USAGE)
+ c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_PROC)
+ c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_STATE)
+ c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_DEBUG)
+ return &c
+}
+
func leakModePtr(v refs.LeakMode) *refs.LeakMode {
return &v
}
diff --git a/runsc/config/config_test.go b/runsc/config/config_test.go
index 80ff2c0a6..57c241c86 100644
--- a/runsc/config/config_test.go
+++ b/runsc/config/config_test.go
@@ -18,6 +18,7 @@ import (
"strings"
"testing"
+ controlpb "gvisor.dev/gvisor/pkg/sentry/control/control_go_proto"
"gvisor.dev/gvisor/runsc/flag"
)
@@ -59,6 +60,9 @@ func TestFromFlags(t *testing.T) {
if err := flag.CommandLine.Lookup("network").Value.Set("none"); err != nil {
t.Errorf("Flag set: %v", err)
}
+ if err := flag.CommandLine.Lookup("controls").Value.Set("EVENTS,FS"); err != nil {
+ t.Errorf("Flag set: %v", err)
+ }
defer func() {
if err := setDefault("root"); err != nil {
t.Errorf("Flag set: %v", err)
@@ -72,6 +76,9 @@ func TestFromFlags(t *testing.T) {
if err := setDefault("network"); err != nil {
t.Errorf("Flag set: %v", err)
}
+ if err := setDefault("controls"); err != nil {
+ t.Errorf("Flag set: %v", err)
+ }
}()
c, err := NewFromFlags()
@@ -90,6 +97,12 @@ func TestFromFlags(t *testing.T) {
if want := NetworkNone; c.Network != want {
t.Errorf("Network=%v, want: %v", c.Network, want)
}
+ wants := []controlpb.ControlConfig_Endpoint{controlpb.ControlConfig_EVENTS, controlpb.ControlConfig_FS}
+ for i, want := range wants {
+ if c.Controls.Controls.AllowedControls[i] != want {
+ t.Errorf("Controls.Controls.AllowedControls[%d]=%v, want: %v", i, c.Controls.Controls.AllowedControls[i], want)
+ }
+ }
}
func TestToFlags(t *testing.T) {
@@ -101,10 +114,15 @@ func TestToFlags(t *testing.T) {
c.Debug = true
c.NumNetworkChannels = 123
c.Network = NetworkNone
+ c.Controls = controlConfig{
+ Controls: &controlpb.ControlConfig{
+ AllowedControls: []controlpb.ControlConfig_Endpoint{controlpb.ControlConfig_EVENTS, controlpb.ControlConfig_FS},
+ },
+ }
flags := c.ToFlags()
- if len(flags) != 4 {
- t.Errorf("wrong number of flags set, want: 4, got: %d: %s", len(flags), flags)
+ if len(flags) != 5 {
+ t.Errorf("wrong number of flags set, want: 5, got: %d: %s", len(flags), flags)
}
t.Logf("Flags: %s", flags)
fm := map[string]string{}
@@ -117,6 +135,7 @@ func TestToFlags(t *testing.T) {
"--debug": "true",
"--num-network-channels": "123",
"--network": "none",
+ "--controls": "EVENTS,FS",
} {
if got, ok := fm[name]; ok {
if got != want {
diff --git a/runsc/config/flags.go b/runsc/config/flags.go
index 6f1b5927a..11cea71b8 100644
--- a/runsc/config/flags.go
+++ b/runsc/config/flags.go
@@ -56,16 +56,23 @@ func RegisterFlags() {
flag.Bool("strace", false, "enable strace.")
flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.")
flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.")
+ flag.Bool("strace-event", false, "send strace to event.")
// Flags that control sandbox runtime behavior.
flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.")
flag.Var(watchdogActionPtr(watchdog.LogWarning), "watchdog-action", "sets what action the watchdog takes when triggered: log (default), panic.")
flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
+ flag.String("profile-block", "", "collects a block profile to this file path for the duration of the container execution. Requires -profile=true.")
+ flag.String("profile-cpu", "", "collects a CPU profile to this file path for the duration of the container execution. Requires -profile=true.")
+ flag.String("profile-heap", "", "collects a heap profile to this file path for the duration of the container execution. Requires -profile=true.")
+ flag.String("profile-mutex", "", "collects a mutex profile to this file path for the duration of the container execution. Requires -profile=true.")
+ flag.String("trace", "", "collects a Go runtime execution trace to this file path for the duration of the container execution.")
flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
flag.Var(leakModePtr(refs.NoLeakChecking), "ref-leak-mode", "sets reference leak check mode: disabled (default), log-names, log-traces.")
flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
flag.Bool("oci-seccomp", false, "Enables loading OCI seccomp filters inside the sandbox.")
+ flag.Var(defaultControlConfig(), "controls", "Sentry control endpoints.")
// Flags that control sandbox runtime behavior: FS related.
flag.Var(fileAccessTypePtr(FileAccessExclusive), "file-access", "specifies which filesystem validation to use for the root mount: exclusive (default), shared.")
@@ -75,6 +82,7 @@ func RegisterFlags() {
flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.")
flag.Bool("vfs2", false, "enables VFSv2. This uses the new VFS layer that is faster than the previous one.")
flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.")
+ flag.Bool("lisafs", false, "Enables lisafs protocol instead of 9P. This is only effective with VFS2.")
flag.Bool("cgroupfs", false, "Automatically mount cgroupfs.")
// Flags that control sandbox runtime behavior: network related.
@@ -90,6 +98,7 @@ func RegisterFlags() {
// Test flags, not to be used outside tests, ever.
flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.")
+ flag.Bool("TESTONLY-allow-packet-endpoint-write", false, "TEST ONLY; do not ever use! Used for tests to allow writes on packet sockets.")
})
}
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index 5314549d6..4e744e604 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -19,7 +19,7 @@ go_library(
"//pkg/cleanup",
"//pkg/log",
"//pkg/sentry/control",
- "//pkg/sentry/sighandling",
+ "//pkg/sighandling",
"//pkg/sync",
"//runsc/boot",
"//runsc/cgroup",
diff --git a/runsc/container/container.go b/runsc/container/container.go
index d1f979eb2..7f991444e 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -35,7 +35,7 @@ import (
"gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/control"
- "gvisor.dev/gvisor/pkg/sentry/sighandling"
+ "gvisor.dev/gvisor/pkg/sighandling"
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/cgroup"
"gvisor.dev/gvisor/runsc/config"
@@ -652,6 +652,30 @@ func (c *Container) Cat(files []string, out *os.File) error {
return c.Sandbox.Cat(c.ID, files, out)
}
+// Usage displays memory used by the application.
+func (c *Container) Usage(full bool) (control.MemoryUsage, error) {
+ log.Debugf("Usage in container, cid: %s, full: %v", c.ID, full)
+ return c.Sandbox.Usage(c.ID, full)
+}
+
+// UsageFD shows application memory usage using two donated FDs.
+func (c *Container) UsageFD() (*control.MemoryUsageRecord, error) {
+ log.Debugf("UsageFD in container, cid: %s", c.ID)
+ return c.Sandbox.UsageFD(c.ID)
+}
+
+// Reduce requests that the sentry attempt to reduce its memory usage.
+func (c *Container) Reduce(wait bool) error {
+ log.Debugf("Reduce in container, cid: %s", c.ID)
+ return c.Sandbox.Reduce(c.ID, wait)
+}
+
+// Stream dumps all events to out.
+func (c *Container) Stream(filters []string, out *os.File) error {
+ log.Debugf("Stream in container, cid: %s", c.ID)
+ return c.Sandbox.Stream(c.ID, filters, out)
+}
+
// State returns the metadata of the container.
func (c *Container) State() specs.State {
return specs.State{
@@ -893,7 +917,7 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bu
// Add root mount and then add any other additional mounts.
mountCount := 1
for _, m := range spec.Mounts {
- if specutils.Is9PMount(m, conf.VFS2) {
+ if specutils.IsGoferMount(m, conf.VFS2) {
mountCount++
}
}
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 960c36946..69dcf3f03 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -2655,3 +2655,220 @@ func TestCat(t *testing.T) {
t.Errorf("out got %s, want include %s", buf, want)
}
}
+
+// TestUsage checks that usage generates the expected memory usage.
+func TestUsage(t *testing.T) {
+ spec, conf := sleepSpecConf(t)
+ _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+ if err != nil {
+ t.Fatalf("error setting up container: %v", err)
+ }
+ defer cleanup()
+
+ args := Args{
+ ID: testutil.RandomContainerID(),
+ Spec: spec,
+ BundleDir: bundleDir,
+ }
+
+ cont, err := New(conf, args)
+ if err != nil {
+ t.Fatalf("Creating container: %v", err)
+ }
+ defer cont.Destroy()
+
+ if err := cont.Start(conf); err != nil {
+ t.Fatalf("starting container: %v", err)
+ }
+
+ for _, full := range []bool{false, true} {
+ m, err := cont.Usage(full)
+ if err != nil {
+ t.Fatalf("error usage from container: %v", err)
+ }
+ if m.Mapped == 0 {
+ t.Errorf("Usage mapped got zero")
+ }
+ if m.Total == 0 {
+ t.Errorf("Usage total got zero")
+ }
+ if full {
+ if m.System == 0 {
+ t.Errorf("Usage system got zero")
+ }
+ if m.Anonymous == 0 {
+ t.Errorf("Usage anonymous got zero")
+ }
+ }
+ }
+}
+
+// TestUsageFD checks that usagefd generates the expected memory usage.
+func TestUsageFD(t *testing.T) {
+ spec, conf := sleepSpecConf(t)
+
+ _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+ if err != nil {
+ t.Fatalf("error setting up container: %v", err)
+ }
+ defer cleanup()
+
+ args := Args{
+ ID: testutil.RandomContainerID(),
+ Spec: spec,
+ BundleDir: bundleDir,
+ }
+
+ cont, err := New(conf, args)
+ if err != nil {
+ t.Fatalf("Creating container: %v", err)
+ }
+ defer cont.Destroy()
+
+ if err := cont.Start(conf); err != nil {
+ t.Fatalf("starting container: %v", err)
+ }
+
+ m, err := cont.UsageFD()
+ if err != nil {
+ t.Fatalf("error usageFD from container: %v", err)
+ }
+
+ mapped, unknown, total, err := m.Fetch()
+ if err != nil {
+ t.Fatalf("error Fetch memory usage: %v", err)
+ }
+
+ if mapped == 0 {
+ t.Errorf("UsageFD Mapped got zero")
+ }
+ if unknown == 0 {
+ t.Errorf("UsageFD unknown got zero")
+ }
+ if total == 0 {
+ t.Errorf("UsageFD total got zero")
+ }
+}
+
+// TestReduce checks that reduce call succeeds.
+func TestReduce(t *testing.T) {
+ spec, conf := sleepSpecConf(t)
+ _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+ if err != nil {
+ t.Fatalf("error setting up container: %v", err)
+ }
+ defer cleanup()
+
+ args := Args{
+ ID: testutil.RandomContainerID(),
+ Spec: spec,
+ BundleDir: bundleDir,
+ }
+
+ cont, err := New(conf, args)
+ if err != nil {
+ t.Fatalf("Creating container: %v", err)
+ }
+ defer cont.Destroy()
+
+ if err := cont.Start(conf); err != nil {
+ t.Fatalf("starting container: %v", err)
+ }
+
+ if err := cont.Reduce(false); err != nil {
+ t.Fatalf("error reduce from container: %v", err)
+ }
+}
+
+// TestStream checks that Stream dumps expected events.
+func TestStream(t *testing.T) {
+ spec, conf := sleepSpecConf(t)
+ conf.Strace = true
+ conf.StraceEvent = true
+ conf.StraceSyscalls = ""
+
+ _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+ if err != nil {
+ t.Fatalf("error setting up container: %v", err)
+ }
+ defer cleanup()
+
+ args := Args{
+ ID: testutil.RandomContainerID(),
+ Spec: spec,
+ BundleDir: bundleDir,
+ }
+
+ cont, err := New(conf, args)
+ if err != nil {
+ t.Fatalf("Creating container: %v", err)
+ }
+ defer cont.Destroy()
+
+ if err := cont.Start(conf); err != nil {
+ t.Fatalf("starting container: %v", err)
+ }
+
+ r, w, err := os.Pipe()
+ if err != nil {
+ t.Fatalf("os.Create(): %v", err)
+ }
+
+ // Spawn a new thread to Stream events as it blocks indefinitely.
+ go func() {
+ cont.Stream(nil, w)
+ }()
+
+ buf := make([]byte, 1024)
+ if _, err := r.Read(buf); err != nil {
+ t.Fatalf("Read out: %v", err)
+ }
+
+ // A syscall strace event includes "Strace".
+ if got, want := string(buf), "Strace"; !strings.Contains(got, want) {
+ t.Errorf("out got %s, want include %s", buf, want)
+ }
+}
+
+// TestProfile checks that profiling options generate profiles.
+func TestProfile(t *testing.T) {
+ // Perform a non-trivial amount of work so we actually capture
+ // something in the profiles.
+ spec := testutil.NewSpecWithArgs("/bin/bash", "-c", "true")
+ conf := testutil.TestConfig(t)
+ conf.ProfileEnable = true
+ conf.ProfileBlock = filepath.Join(t.TempDir(), "block.pprof")
+ conf.ProfileCPU = filepath.Join(t.TempDir(), "cpu.pprof")
+ conf.ProfileHeap = filepath.Join(t.TempDir(), "heap.pprof")
+ conf.ProfileMutex = filepath.Join(t.TempDir(), "mutex.pprof")
+ conf.TraceFile = filepath.Join(t.TempDir(), "trace.out")
+
+ _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+ if err != nil {
+ t.Fatalf("error setting up container: %v", err)
+ }
+ defer cleanup()
+
+ args := Args{
+ ID: testutil.RandomContainerID(),
+ Spec: spec,
+ BundleDir: bundleDir,
+ Attached: true,
+ }
+
+ _, err = Run(conf, args)
+ if err != nil {
+ t.Fatalf("Creating container: %v", err)
+ }
+
+ // Basic test; simply assert that the profiles are not empty.
+ for _, name := range []string{conf.ProfileBlock, conf.ProfileCPU, conf.ProfileHeap, conf.ProfileMutex, conf.TraceFile} {
+ fi, err := os.Stat(name)
+ if err != nil {
+ t.Fatalf("Unable to stat profile file %s: %v", name, err)
+ }
+ if fi.Size() == 0 {
+ t.Errorf("Profile file %s is empty: %+v", name, fi)
+ }
+ }
+}
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
index 3280b74fe..8d5a6d300 100644
--- a/runsc/fsgofer/BUILD
+++ b/runsc/fsgofer/BUILD
@@ -9,12 +9,16 @@ go_library(
"fsgofer_amd64_unsafe.go",
"fsgofer_arm64_unsafe.go",
"fsgofer_unsafe.go",
+ "lisafs.go",
],
visibility = ["//runsc:__subpackages__"],
deps = [
+ "//pkg/abi/linux",
"//pkg/cleanup",
"//pkg/fd",
+ "//pkg/lisafs",
"//pkg/log",
+ "//pkg/marshal/primitive",
"//pkg/p9",
"//pkg/sync",
"//pkg/syserr",
@@ -37,3 +41,15 @@ go_test(
"@org_golang_x_sys//unix:go_default_library",
],
)
+
+go_test(
+ name = "lisafs_test",
+ size = "small",
+ srcs = ["lisafs_test.go"],
+ deps = [
+ ":fsgofer",
+ "//pkg/lisafs",
+ "//pkg/lisafs/testsuite",
+ "//pkg/log",
+ ],
+)
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 07497e47b..600b21189 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -1242,13 +1242,14 @@ func (l *localFile) MultiGetAttr(names []string) ([]p9.FullStat, error) {
}
parent := l.file.FD()
- for _, name := range names {
- child, err := unix.Openat(parent, name, openFlags|unix.O_PATH, 0)
+ closeParent := func() {
if parent != l.file.FD() {
- // Parent is no longer needed.
_ = unix.Close(parent)
- parent = -1
}
+ }
+ defer closeParent()
+ for _, name := range names {
+ child, err := unix.Openat(parent, name, openFlags|unix.O_PATH, 0)
if err != nil {
if errors.Is(err, unix.ENOENT) {
// No pont in continuing any further.
@@ -1256,10 +1257,11 @@ func (l *localFile) MultiGetAttr(names []string) ([]p9.FullStat, error) {
}
return nil, err
}
+ closeParent()
+ parent = child
var stat unix.Stat_t
if err := unix.Fstat(child, &stat); err != nil {
- _ = unix.Close(child)
return nil, err
}
valid, attr := l.fillAttr(&stat)
@@ -1271,13 +1273,9 @@ func (l *localFile) MultiGetAttr(names []string) ([]p9.FullStat, error) {
if (stat.Mode & unix.S_IFMT) != unix.S_IFDIR {
// Doesn't need to continue if entry is not a dir. Including symlinks
// that cannot be followed.
- _ = unix.Close(child)
break
}
parent = child
}
- if parent != -1 && parent != l.file.FD() {
- _ = unix.Close(parent)
- }
return stats, nil
}
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
index ee6cc97df..6cdd6d695 100644
--- a/runsc/fsgofer/fsgofer_test.go
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -105,14 +105,14 @@ func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error {
return nil
}
-type state struct {
+type fileState struct {
root *localFile
file *localFile
conf Config
fileType uint32
}
-func (s state) String() string {
+func (s fileState) String() string {
return fmt.Sprintf("type(%v)", s.fileType)
}
@@ -129,11 +129,11 @@ func typeName(fileType uint32) string {
}
}
-func runAll(t *testing.T, test func(*testing.T, state)) {
+func runAll(t *testing.T, test func(*testing.T, fileState)) {
runCustom(t, allTypes, allConfs, test)
}
-func runCustom(t *testing.T, types []uint32, confs []Config, test func(*testing.T, state)) {
+func runCustom(t *testing.T, types []uint32, confs []Config, test func(*testing.T, fileState)) {
for _, c := range confs {
for _, ft := range types {
name := fmt.Sprintf("%s/%s", configTestName(&c), typeName(ft))
@@ -159,7 +159,7 @@ func runCustom(t *testing.T, types []uint32, confs []Config, test func(*testing.
t.Fatalf("root.Walk({%q}) failed, err: %v", "symlink", err)
}
- st := state{
+ st := fileState{
root: root.(*localFile),
file: file.(*localFile),
conf: c,
@@ -227,7 +227,7 @@ func createFile(dir *localFile, name string) (*localFile, error) {
}
func TestReadWrite(t *testing.T) {
- runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s fileState) {
child, err := createFile(s.file, "test")
if err != nil {
t.Fatalf("%v: createFile() failed, err: %v", s, err)
@@ -261,7 +261,7 @@ func TestReadWrite(t *testing.T) {
}
func TestCreate(t *testing.T) {
- runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s fileState) {
for i, flags := range allOpenFlags {
_, l, _, _, err := s.file.Create(fmt.Sprintf("test-%d", i), flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
if err != nil {
@@ -296,7 +296,7 @@ func TestCreateSetGID(t *testing.T) {
t.Skipf("Test requires CAP_CHOWN")
}
- runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s fileState) {
// Change group and set setgid to the parent dir.
if err := unix.Chown(s.file.hostPath, os.Getuid(), nobody); err != nil {
t.Fatalf("Chown() failed: %v", err)
@@ -364,7 +364,7 @@ func TestCreateSetGID(t *testing.T) {
// TestReadWriteDup tests that a file opened in any mode can be dup'ed and
// reopened in any other mode.
func TestReadWriteDup(t *testing.T) {
- runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s fileState) {
child, err := createFile(s.file, "test")
if err != nil {
t.Fatalf("%v: createFile() failed, err: %v", s, err)
@@ -410,7 +410,7 @@ func TestReadWriteDup(t *testing.T) {
}
func TestUnopened(t *testing.T) {
- runCustom(t, []uint32{unix.S_IFREG}, allConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFREG}, allConfs, func(t *testing.T, s fileState) {
b := []byte("foobar")
if _, err := s.file.WriteAt(b, 0); err != unix.EBADF {
t.Errorf("%v: WriteAt() should have failed, got: %v, expected: unix.EBADF", s, err)
@@ -432,7 +432,7 @@ func TestUnopened(t *testing.T) {
// was open with O_PATH, but Open() was not checking for it and allowing the
// control file to be reused.
func TestOpenOPath(t *testing.T) {
- runCustom(t, []uint32{unix.S_IFREG}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFREG}, rwConfs, func(t *testing.T, s fileState) {
// Fist remove all permissions on the file.
if err := s.file.SetAttr(p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(0)}); err != nil {
t.Fatalf("SetAttr(): %v", err)
@@ -465,7 +465,7 @@ func SetGetAttr(l *localFile, valid p9.SetAttrMask, attr p9.SetAttr) (p9.Attr, e
}
func TestSetAttrPerm(t *testing.T) {
- runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, allTypes, rwConfs, func(t *testing.T, s fileState) {
valid := p9.SetAttrMask{Permissions: true}
attr := p9.SetAttr{Permissions: 0777}
got, err := SetGetAttr(s.file, valid, attr)
@@ -485,7 +485,7 @@ func TestSetAttrPerm(t *testing.T) {
}
func TestSetAttrSize(t *testing.T) {
- runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, allTypes, rwConfs, func(t *testing.T, s fileState) {
for _, size := range []uint64{1024, 0, 1024 * 1024} {
valid := p9.SetAttrMask{Size: true}
attr := p9.SetAttr{Size: size}
@@ -508,7 +508,7 @@ func TestSetAttrSize(t *testing.T) {
}
func TestSetAttrTime(t *testing.T) {
- runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, allTypes, rwConfs, func(t *testing.T, s fileState) {
valid := p9.SetAttrMask{ATime: true, ATimeNotSystemTime: true}
attr := p9.SetAttr{ATimeSeconds: 123, ATimeNanoSeconds: 456}
got, err := SetGetAttr(s.file, valid, attr)
@@ -542,7 +542,7 @@ func TestSetAttrOwner(t *testing.T) {
t.Skipf("SetAttr(owner) test requires CAP_CHOWN, running as %d", os.Getuid())
}
- runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, allTypes, rwConfs, func(t *testing.T, s fileState) {
newUID := os.Getuid() + 1
valid := p9.SetAttrMask{UID: true}
attr := p9.SetAttr{UID: p9.UID(newUID)}
@@ -571,7 +571,7 @@ func SetGetXattr(l *localFile, name string, value string) error {
}
func TestSetGetDisabledXattr(t *testing.T) {
- runCustom(t, []uint32{unix.S_IFREG}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFREG}, rwConfs, func(t *testing.T, s fileState) {
name := "user.merkle.offset"
value := "tmp"
err := SetGetXattr(s.file, name, value)
@@ -582,7 +582,7 @@ func TestSetGetDisabledXattr(t *testing.T) {
}
func TestSetGetXattr(t *testing.T) {
- runCustom(t, []uint32{unix.S_IFREG}, []Config{{ROMount: false, EnableVerityXattr: true}}, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFREG}, []Config{{ROMount: false, EnableVerityXattr: true}}, func(t *testing.T, s fileState) {
name := "user.merkle.offset"
value := "tmp"
err := SetGetXattr(s.file, name, value)
@@ -596,7 +596,7 @@ func TestLink(t *testing.T) {
if !specutils.HasCapabilities(capability.CAP_DAC_READ_SEARCH) {
t.Skipf("Link test requires CAP_DAC_READ_SEARCH, running as %d", os.Getuid())
}
- runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, allTypes, rwConfs, func(t *testing.T, s fileState) {
const dirName = "linkdir"
const linkFile = "link"
if _, err := s.root.Mkdir(dirName, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
@@ -625,7 +625,7 @@ func TestROMountChecks(t *testing.T) {
uid := p9.UID(os.Getuid())
gid := p9.GID(os.Getgid())
- runCustom(t, allTypes, roConfs, func(t *testing.T, s state) {
+ runCustom(t, allTypes, roConfs, func(t *testing.T, s fileState) {
if s.fileType != unix.S_IFLNK {
if _, _, _, err := s.file.Open(p9.WriteOnly); err != want {
t.Errorf("Open() should have failed, got: %v, expected: %v", err, want)
@@ -676,7 +676,7 @@ func TestROMountChecks(t *testing.T) {
}
func TestWalkNotFound(t *testing.T) {
- runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s fileState) {
if _, _, err := s.file.Walk([]string{"nobody-here"}); err != unix.ENOENT {
t.Errorf("Walk(%q) should have failed, got: %v, expected: unix.ENOENT", "nobody-here", err)
}
@@ -695,7 +695,7 @@ func TestWalkNotFound(t *testing.T) {
}
func TestWalkDup(t *testing.T) {
- runAll(t, func(t *testing.T, s state) {
+ runAll(t, func(t *testing.T, s fileState) {
_, dup, err := s.file.Walk([]string{})
if err != nil {
t.Fatalf("%v: Walk(nil) failed, err: %v", s, err)
@@ -708,7 +708,7 @@ func TestWalkDup(t *testing.T) {
}
func TestWalkMultiple(t *testing.T) {
- runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s fileState) {
var names []string
var parent p9.File = s.file
for i := 0; i < 5; i++ {
@@ -729,7 +729,7 @@ func TestWalkMultiple(t *testing.T) {
}
func TestReaddir(t *testing.T) {
- runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s fileState) {
name := "dir"
if _, err := s.file.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
t.Fatalf("%v: MkDir(%s) failed, err: %v", s, name, err)
@@ -915,7 +915,7 @@ func TestDoubleAttachError(t *testing.T) {
}
func TestTruncate(t *testing.T) {
- runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s fileState) {
child, err := createFile(s.file, "test")
if err != nil {
t.Fatalf("createFile() failed: %v", err)
@@ -951,7 +951,7 @@ func TestTruncate(t *testing.T) {
}
func TestMknod(t *testing.T) {
- runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s fileState) {
_, err := s.file.Mknod("test", p9.ModeRegular|0777, 1, 2, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
if err != nil {
t.Fatalf("Mknod() failed: %v", err)
diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go
index f11fea40d..fb4fbe0d2 100644
--- a/runsc/fsgofer/fsgofer_unsafe.go
+++ b/runsc/fsgofer/fsgofer_unsafe.go
@@ -21,6 +21,8 @@ import (
"gvisor.dev/gvisor/pkg/syserr"
)
+var unixDirentMaxSize uint32 = uint32(unsafe.Sizeof(unix.Dirent{}))
+
func utimensat(dirFd int, name string, times [2]unix.Timespec, flags int) error {
// utimensat(2) doesn't accept empty name, instead name must be nil to make it
// operate directly on 'dirFd' unlike other *at syscalls.
@@ -80,3 +82,31 @@ func renameat(oldDirFD int, oldName string, newDirFD int, newName string) error
}
return nil
}
+
+func parseDirents(buf []byte, handleDirent func(ino uint64, off int64, ftype uint8, name string) bool) {
+ for len(buf) > 0 {
+ // Interpret the buf populated by unix.Getdents as unix.Dirent.
+ dirent := *(*unix.Dirent)(unsafe.Pointer(&buf[0]))
+
+ // Extracting the name is pretty tedious...
+ var nameBuf [unix.NAME_MAX]byte
+ var nameLen int
+ for i := 0; i < len(dirent.Name); i++ {
+ // The name is null terminated.
+ if dirent.Name[i] == 0 {
+ nameLen = i
+ break
+ }
+ nameBuf[i] = byte(dirent.Name[i])
+ }
+ name := string(nameBuf[:nameLen])
+
+ // Deliver results to caller.
+ if !handleDirent(dirent.Ino, dirent.Off, dirent.Type, name) {
+ return
+ }
+
+ // Advance buf for the next dirent.
+ buf = buf[dirent.Reclen:]
+ }
+}
diff --git a/runsc/fsgofer/lisafs.go b/runsc/fsgofer/lisafs.go
new file mode 100644
index 000000000..0db44ff6a
--- /dev/null
+++ b/runsc/fsgofer/lisafs.go
@@ -0,0 +1,1084 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsgofer
+
+import (
+ "io"
+ "math"
+ "path"
+ "strconv"
+ "sync/atomic"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/cleanup"
+ rwfd "gvisor.dev/gvisor/pkg/fd"
+ "gvisor.dev/gvisor/pkg/lisafs"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
+ "gvisor.dev/gvisor/pkg/p9"
+)
+
+// LisafsServer implements lisafs.ServerImpl for fsgofer.
+type LisafsServer struct {
+ lisafs.Server
+ config Config
+}
+
+var _ lisafs.ServerImpl = (*LisafsServer)(nil)
+
+// NewLisafsServer initializes a new lisafs server for fsgofer.
+func NewLisafsServer(config Config) *LisafsServer {
+ s := &LisafsServer{config: config}
+ s.Server.Init(s)
+ return s
+}
+
+// Mount implements lisafs.ServerImpl.Mount.
+func (s *LisafsServer) Mount(c *lisafs.Connection, mountPath string) (lisafs.ControlFDImpl, lisafs.Inode, error) {
+ s.RenameMu.RLock()
+ defer s.RenameMu.RUnlock()
+
+ rootFD, rootStat, err := tryStepLocked(c, mountPath, nil, func(flags int) (int, error) {
+ return unix.Open(mountPath, flags, 0)
+ })
+ if err != nil {
+ return nil, lisafs.Inode{}, err
+ }
+
+ var rootIno lisafs.Inode
+ rootFD.initInodeWithStat(&rootIno, &rootStat)
+ return rootFD, rootIno, nil
+}
+
+// MaxMessageSize implements lisafs.ServerImpl.MaxMessageSize.
+func (s *LisafsServer) MaxMessageSize() uint32 {
+ return lisafs.MaxMessageSize()
+}
+
+// SupportedMessages implements lisafs.ServerImpl.SupportedMessages.
+func (s *LisafsServer) SupportedMessages() []lisafs.MID {
+ // Note that Flush, FListXattr and FRemoveXattr are not supported.
+ return []lisafs.MID{
+ lisafs.Mount,
+ lisafs.Channel,
+ lisafs.FStat,
+ lisafs.SetStat,
+ lisafs.Walk,
+ lisafs.WalkStat,
+ lisafs.OpenAt,
+ lisafs.OpenCreateAt,
+ lisafs.Close,
+ lisafs.FSync,
+ lisafs.PWrite,
+ lisafs.PRead,
+ lisafs.MkdirAt,
+ lisafs.MknodAt,
+ lisafs.SymlinkAt,
+ lisafs.LinkAt,
+ lisafs.FStatFS,
+ lisafs.FAllocate,
+ lisafs.ReadLinkAt,
+ lisafs.Connect,
+ lisafs.UnlinkAt,
+ lisafs.RenameAt,
+ lisafs.Getdents64,
+ lisafs.FGetXattr,
+ lisafs.FSetXattr,
+ }
+}
+
+// controlFDLisa implements lisafs.ControlFDImpl.
+type controlFDLisa struct {
+ lisafs.ControlFD
+
+ // hostFD is the file descriptor which can be used to make host syscalls.
+ hostFD int
+
+ // writableHostFD is the file descriptor number for a writable FD opened on the
+ // same FD as `hostFD`. writableHostFD must only be accessed using atomic
+ // operations. It is initialized to -1, and can change in value exactly once.
+ writableHostFD int32
+}
+
+var _ lisafs.ControlFDImpl = (*controlFDLisa)(nil)
+
+// Precondition: server's rename mutex must be at least read locked.
+func newControlFDLisaLocked(c *lisafs.Connection, hostFD int, parent *controlFDLisa, name string, mode linux.FileMode) *controlFDLisa {
+ fd := &controlFDLisa{
+ hostFD: hostFD,
+ writableHostFD: -1,
+ }
+ fd.ControlFD.Init(c, parent.FD(), name, mode, fd)
+ return fd
+}
+
+func (fd *controlFDLisa) initInode(inode *lisafs.Inode) error {
+ inode.ControlFD = fd.ID()
+ return fstatTo(fd.hostFD, &inode.Stat)
+}
+
+func (fd *controlFDLisa) initInodeWithStat(inode *lisafs.Inode, unixStat *unix.Stat_t) {
+ inode.ControlFD = fd.ID()
+ unixToLinuxStat(unixStat, &inode.Stat)
+}
+
+func (fd *controlFDLisa) getWritableFD() (int, error) {
+ if writableFD := atomic.LoadInt32(&fd.writableHostFD); writableFD != -1 {
+ return int(writableFD), nil
+ }
+
+ writableFD, err := unix.Openat(int(procSelfFD.FD()), strconv.Itoa(fd.hostFD), (unix.O_WRONLY|openFlags)&^unix.O_NOFOLLOW, 0)
+ if err != nil {
+ return -1, err
+ }
+ if !atomic.CompareAndSwapInt32(&fd.writableHostFD, -1, int32(writableFD)) {
+ // Race detected, use the new value and clean this up.
+ unix.Close(writableFD)
+ return int(atomic.LoadInt32(&fd.writableHostFD)), nil
+ }
+ return writableFD, nil
+}
+
+// FD implements lisafs.ControlFDImpl.FD.
+func (fd *controlFDLisa) FD() *lisafs.ControlFD {
+ if fd == nil {
+ return nil
+ }
+ return &fd.ControlFD
+}
+
+// Close implements lisafs.ControlFDImpl.Close.
+func (fd *controlFDLisa) Close(c *lisafs.Connection) {
+ if fd.hostFD >= 0 {
+ _ = unix.Close(fd.hostFD)
+ fd.hostFD = -1
+ }
+ // No concurrent access is possible so no need to use atomics.
+ if fd.writableHostFD >= 0 {
+ _ = unix.Close(int(fd.writableHostFD))
+ fd.writableHostFD = -1
+ }
+}
+
+// Stat implements lisafs.ControlFDImpl.Stat.
+func (fd *controlFDLisa) Stat(c *lisafs.Connection, comm lisafs.Communicator) (uint32, error) {
+ var resp linux.Statx
+ if err := fstatTo(fd.hostFD, &resp); err != nil {
+ return 0, err
+ }
+
+ respLen := uint32(resp.SizeBytes())
+ resp.MarshalUnsafe(comm.PayloadBuf(respLen))
+ return respLen, nil
+}
+
+// SetStat implements lisafs.ControlFDImpl.SetStat.
+func (fd *controlFDLisa) SetStat(c *lisafs.Connection, comm lisafs.Communicator, stat lisafs.SetStatReq) (uint32, error) {
+ var resp lisafs.SetStatResp
+ if stat.Mask&unix.STATX_MODE != 0 {
+ if err := unix.Fchmod(fd.hostFD, stat.Mode&^unix.S_IFMT); err != nil {
+ log.Debugf("SetStat fchmod failed %q, err: %v", fd.FilePath(), err)
+ resp.FailureMask |= unix.STATX_MODE
+ resp.FailureErrNo = uint32(p9.ExtractErrno(err))
+ }
+ }
+
+ if stat.Mask&unix.STATX_SIZE != 0 {
+ // ftruncate(2) requires the FD to be open for writing.
+ writableFD, err := fd.getWritableFD()
+ if err == nil {
+ err = unix.Ftruncate(writableFD, int64(stat.Size))
+ }
+ if err != nil {
+ log.Debugf("SetStat ftruncate failed %q, err: %v", fd.FilePath(), err)
+ resp.FailureMask |= unix.STATX_SIZE
+ resp.FailureErrNo = uint32(p9.ExtractErrno(err))
+ }
+ }
+
+ if stat.Mask&(unix.STATX_ATIME|unix.STATX_MTIME) != 0 {
+ utimes := [2]unix.Timespec{
+ {Sec: 0, Nsec: unix.UTIME_OMIT},
+ {Sec: 0, Nsec: unix.UTIME_OMIT},
+ }
+ if stat.Mask&unix.STATX_ATIME != 0 {
+ utimes[0].Sec = stat.Atime.Sec
+ utimes[0].Nsec = stat.Atime.Nsec
+ }
+ if stat.Mask&unix.STATX_MTIME != 0 {
+ utimes[1].Sec = stat.Mtime.Sec
+ utimes[1].Nsec = stat.Mtime.Nsec
+ }
+
+ if fd.IsSymlink() {
+ // utimensat operates different that other syscalls. To operate on a
+ // symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty
+ // name.
+ c.Server().WithRenameReadLock(func() error {
+ if err := utimensat(fd.ParentLocked().(*controlFDLisa).hostFD, fd.NameLocked(), utimes, unix.AT_SYMLINK_NOFOLLOW); err != nil {
+ log.Debugf("SetStat utimens failed %q, err: %v", fd.FilePathLocked(), err)
+ resp.FailureMask |= (stat.Mask & (unix.STATX_ATIME | unix.STATX_MTIME))
+ resp.FailureErrNo = uint32(p9.ExtractErrno(err))
+ }
+ return nil
+ })
+ } else {
+ hostFD := fd.hostFD
+ if fd.IsRegular() {
+ // For regular files, utimensat(2) requires the FD to be open for
+ // writing, see BUGS section.
+ writableFD, err := fd.getWritableFD()
+ if err != nil {
+ return 0, err
+ }
+ hostFD = writableFD
+ }
+ // Directories and regular files can operate directly on the fd
+ // using empty name.
+ err := utimensat(hostFD, "", utimes, 0)
+ if err != nil {
+ log.Debugf("SetStat utimens failed %q, err: %v", fd.FilePath(), err)
+ resp.FailureMask |= (stat.Mask & (unix.STATX_ATIME | unix.STATX_MTIME))
+ resp.FailureErrNo = uint32(p9.ExtractErrno(err))
+ }
+ }
+ }
+
+ if stat.Mask&(unix.STATX_UID|unix.STATX_GID) != 0 {
+ // "If the owner or group is specified as -1, then that ID is not changed"
+ // - chown(2)
+ uid := -1
+ if stat.Mask&unix.STATX_UID != 0 {
+ uid = int(stat.UID)
+ }
+ gid := -1
+ if stat.Mask&unix.STATX_GID != 0 {
+ gid = int(stat.GID)
+ }
+ if err := unix.Fchownat(fd.hostFD, "", uid, gid, unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW); err != nil {
+ log.Debugf("SetStat fchown failed %q, err: %v", fd.FilePath(), err)
+ resp.FailureMask |= stat.Mask & (unix.STATX_UID | unix.STATX_GID)
+ resp.FailureErrNo = uint32(p9.ExtractErrno(err))
+ }
+ }
+
+ respLen := uint32(resp.SizeBytes())
+ resp.MarshalUnsafe(comm.PayloadBuf(respLen))
+ return respLen, nil
+}
+
+// Walk implements lisafs.ControlFDImpl.Walk.
+func (fd *controlFDLisa) Walk(c *lisafs.Connection, comm lisafs.Communicator, path lisafs.StringArray) (uint32, error) {
+ // We need to generate inodes for each component walked. We will manually
+ // marshal the inodes into the payload buffer as they are generated to avoid
+ // the slice allocation. The memory format should be lisafs.WalkResp's.
+ var numInodes primitive.Uint32
+ var status lisafs.WalkStatus
+ maxPayloadSize := status.SizeBytes() + numInodes.SizeBytes() + (len(path) * (*lisafs.Inode)(nil).SizeBytes())
+ if maxPayloadSize > math.MaxUint32 {
+ // Too much to walk, can't do.
+ return 0, unix.EIO
+ }
+ payloadBuf := comm.PayloadBuf(uint32(maxPayloadSize))
+ payloadPos := status.SizeBytes() + numInodes.SizeBytes()
+
+ s := c.Server()
+ s.RenameMu.RLock()
+ defer s.RenameMu.RUnlock()
+
+ curDirFD := fd
+ cu := cleanup.Make(func() {
+ // Destroy all newly created FDs until now. Walk upward from curDirFD to
+ // fd. Do not destroy fd as the client still owns that.
+ for curDirFD != fd {
+ c.RemoveControlFDLocked(curDirFD.ID())
+ curDirFD = curDirFD.ParentLocked().(*controlFDLisa)
+ }
+ })
+ defer cu.Clean()
+
+ for _, name := range path {
+ // Symlinks terminate walk. This client gets the symlink inode, but will
+ // have to invoke Walk again with the resolved path.
+ if curDirFD.IsSymlink() {
+ status = lisafs.WalkComponentSymlink
+ break
+ }
+
+ child, childStat, err := tryStepLocked(c, name, curDirFD, func(flags int) (int, error) {
+ return unix.Openat(curDirFD.hostFD, name, flags, 0)
+ })
+ if err == unix.ENOENT {
+ status = lisafs.WalkComponentDoesNotExist
+ break
+ }
+ if err != nil {
+ return 0, err
+ }
+
+ // Write inode to payloadBuf and update state.
+ var childInode lisafs.Inode
+ child.initInodeWithStat(&childInode, &childStat)
+ childInode.MarshalUnsafe(payloadBuf[payloadPos:])
+ payloadPos += childInode.SizeBytes()
+ numInodes++
+ curDirFD = child
+ }
+ cu.Release()
+
+ // lisafs.WalkResp writes the walk status followed by the number of inodes in
+ // the beginning.
+ status.MarshalUnsafe(payloadBuf)
+ numInodes.MarshalUnsafe(payloadBuf[status.SizeBytes():])
+ return uint32(payloadPos), nil
+}
+
+// WalkStat implements lisafs.ControlFDImpl.WalkStat.
+func (fd *controlFDLisa) WalkStat(c *lisafs.Connection, comm lisafs.Communicator, path lisafs.StringArray) (uint32, error) {
+ // We may need to generate statx for dirFD + each component walked. We will
+ // manually marshal the statx results into the payload buffer as they are
+ // generated to avoid the slice allocation. The memory format should be the
+ // same as lisafs.WalkStatResp's.
+ var numStats primitive.Uint32
+ maxPayloadSize := numStats.SizeBytes() + (len(path) * linux.SizeOfStatx)
+ if maxPayloadSize > math.MaxUint32 {
+ // Too much to walk, can't do.
+ return 0, unix.EIO
+ }
+ payloadBuf := comm.PayloadBuf(uint32(maxPayloadSize))
+ payloadPos := numStats.SizeBytes()
+
+ s := c.Server()
+ s.RenameMu.RLock()
+ defer s.RenameMu.RUnlock()
+
+ curDirFD := fd.hostFD
+ closeCurDirFD := func() {
+ if curDirFD != fd.hostFD {
+ unix.Close(curDirFD)
+ }
+ }
+ defer closeCurDirFD()
+ var (
+ stat linux.Statx
+ unixStat unix.Stat_t
+ )
+ if len(path) > 0 && len(path[0]) == 0 {
+ // Write stat results for dirFD if the first path component is "".
+ if err := unix.Fstat(fd.hostFD, &unixStat); err != nil {
+ return 0, err
+ }
+ unixToLinuxStat(&unixStat, &stat)
+ stat.MarshalUnsafe(payloadBuf[payloadPos:])
+ payloadPos += stat.SizeBytes()
+ path = path[1:]
+ numStats++
+ }
+
+ // Don't attempt walking if parent is a symlink.
+ if fd.IsSymlink() {
+ return 0, nil
+ }
+ for _, name := range path {
+ curFD, err := unix.Openat(curDirFD, name, unix.O_PATH|openFlags, 0)
+ if err == unix.ENOENT {
+ // No more path components exist on the filesystem. Return the partial
+ // walk to the client.
+ break
+ }
+ if err != nil {
+ return 0, err
+ }
+ closeCurDirFD()
+ curDirFD = curFD
+
+ // Write stat results for curFD.
+ if err := unix.Fstat(curFD, &unixStat); err != nil {
+ return 0, err
+ }
+ unixToLinuxStat(&unixStat, &stat)
+ stat.MarshalUnsafe(payloadBuf[payloadPos:])
+ payloadPos += stat.SizeBytes()
+ numStats++
+
+ // Symlinks terminate walk. This client gets the symlink stat result, but
+ // will have to invoke Walk again with the resolved path.
+ if unixStat.Mode&unix.S_IFMT == unix.S_IFLNK {
+ break
+ }
+ }
+
+ // lisafs.WalkStatResp writes the number of stats in the beginning.
+ numStats.MarshalUnsafe(payloadBuf)
+ return uint32(payloadPos), nil
+}
+
+// Open implements lisafs.ControlFDImpl.Open.
+func (fd *controlFDLisa) Open(c *lisafs.Connection, comm lisafs.Communicator, flags uint32) (uint32, error) {
+ flags |= openFlags
+ newHostFD, err := unix.Openat(int(procSelfFD.FD()), strconv.Itoa(fd.hostFD), int(flags)&^unix.O_NOFOLLOW, 0)
+ if err != nil {
+ return 0, err
+ }
+ newFD := fd.newOpenFDLisa(newHostFD, flags)
+
+ if fd.IsRegular() {
+ // Donate FD for regular files only. Since FD donation is a destructive
+ // operation, we should duplicate the to-be-donated FD. Eat the error if
+ // one occurs, it is better to have an FD without a host FD, than failing
+ // the Open attempt.
+ if dupFD, err := unix.Dup(newFD.hostFD); err == nil {
+ _ = comm.DonateFD(dupFD)
+ }
+ }
+
+ resp := lisafs.OpenAtResp{NewFD: newFD.ID()}
+ respLen := uint32(resp.SizeBytes())
+ resp.MarshalUnsafe(comm.PayloadBuf(respLen))
+ return respLen, nil
+}
+
+// OpenCreate implements lisafs.ControlFDImpl.OpenCreate.
+func (fd *controlFDLisa) OpenCreate(c *lisafs.Connection, comm lisafs.Communicator, mode linux.FileMode, uid lisafs.UID, gid lisafs.GID, name string, flags uint32) (uint32, error) {
+ // Need to hold rename mutex for reading while performing the walk. Also keep
+ // holding it while the cleanup is still possible.
+ var resp lisafs.OpenCreateAtResp
+ var newFD *openFDLisa
+ if err := c.Server().WithRenameReadLock(func() error {
+ createFlags := unix.O_CREAT | unix.O_EXCL | unix.O_RDONLY | unix.O_NONBLOCK | openFlags
+ childHostFD, err := unix.Openat(fd.hostFD, name, createFlags, uint32(mode&^linux.FileTypeMask))
+ if err != nil {
+ return err
+ }
+
+ childFD := newControlFDLisaLocked(c, childHostFD, fd, name, linux.ModeRegular)
+ cu := cleanup.Make(func() {
+ // Best effort attempt to remove the file in case of failure.
+ if err := unix.Unlinkat(fd.hostFD, name, 0); err != nil {
+ log.Warningf("error unlinking file %q after failure: %v", path.Join(fd.FilePathLocked(), name), err)
+ }
+ c.RemoveControlFDLocked(childFD.ID())
+ })
+ defer cu.Clean()
+
+ // Set the owners as requested by the client.
+ if err := unix.Fchownat(childFD.hostFD, "", int(uid), int(gid), unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW); err != nil {
+ log.Infof("ayush: Fchownat %v", err)
+ return err
+ }
+
+ // Do not use the stat result from tryOpen because the owners might have
+ // changed. initInode() will stat the FD again and use fresh results.
+ if err := childFD.initInode(&resp.Child); err != nil {
+ log.Infof("ayush: initInode %v", err)
+ return err
+ }
+
+ // Now open an FD to the newly created file with the flags requested by the client.
+ flags |= openFlags
+ newHostFD, err := unix.Openat(int(procSelfFD.FD()), strconv.Itoa(childFD.hostFD), int(flags)&^unix.O_NOFOLLOW, 0)
+ if err != nil {
+ log.Infof("ayush: Openat %v", err)
+ return err
+ }
+ cu.Release()
+
+ newFD = childFD.newOpenFDLisa(newHostFD, uint32(flags))
+ resp.NewFD = newFD.ID()
+ return nil
+ }); err != nil {
+ return 0, err
+ }
+
+ // Donate FD because open(O_CREAT|O_EXCL) always creates a regular file.
+ // Since FD donation is a destructive operation, we should duplicate the
+ // to-be-donated FD. Eat the error if one occurs, it is better to have an FD
+ // without a host FD, than failing the Open attempt.
+ if dupFD, err := unix.Dup(newFD.hostFD); err == nil {
+ _ = comm.DonateFD(dupFD)
+ }
+
+ respLen := uint32(resp.SizeBytes())
+ resp.MarshalUnsafe(comm.PayloadBuf(respLen))
+ return respLen, nil
+}
+
+// Mkdir implements lisafs.ControlFDImpl.Mkdir.
+func (fd *controlFDLisa) Mkdir(c *lisafs.Connection, comm lisafs.Communicator, mode linux.FileMode, uid lisafs.UID, gid lisafs.GID, name string) (uint32, error) {
+ var resp lisafs.MkdirAtResp
+ if err := c.Server().WithRenameReadLock(func() error {
+ if err := unix.Mkdirat(fd.hostFD, name, uint32(mode&^linux.FileTypeMask)); err != nil {
+ return err
+ }
+ cu := cleanup.Make(func() {
+ // Best effort attempt to remove the dir in case of failure.
+ if err := unix.Unlinkat(fd.hostFD, name, unix.AT_REMOVEDIR); err != nil {
+ log.Warningf("error unlinking dir %q after failure: %v", path.Join(fd.FilePathLocked(), name), err)
+ }
+ })
+ defer cu.Clean()
+
+ // Open directory to change ownership.
+ childDirFd, err := unix.Openat(fd.hostFD, name, unix.O_DIRECTORY|unix.O_RDONLY|openFlags, 0)
+ if err != nil {
+ return err
+ }
+ if err := unix.Fchownat(childDirFd, "", int(uid), int(gid), unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW); err != nil {
+ unix.Close(childDirFd)
+ return err
+ }
+
+ childDir := newControlFDLisaLocked(c, childDirFd, fd, name, linux.ModeDirectory)
+ if err := childDir.initInode(&resp.ChildDir); err != nil {
+ c.RemoveControlFDLocked(childDir.ID())
+ return err
+ }
+ cu.Release()
+
+ return nil
+ }); err != nil {
+ return 0, err
+ }
+
+ respLen := uint32(resp.SizeBytes())
+ resp.MarshalUnsafe(comm.PayloadBuf(respLen))
+ return respLen, nil
+}
+
+// Mknod implements lisafs.ControlFDImpl.Mknod.
+func (fd *controlFDLisa) Mknod(c *lisafs.Connection, comm lisafs.Communicator, mode linux.FileMode, uid lisafs.UID, gid lisafs.GID, name string, minor uint32, major uint32) (uint32, error) {
+ // From mknod(2) man page:
+ // "EPERM: [...] if the filesystem containing pathname does not support
+ // the type of node requested."
+ if mode.FileType() != linux.ModeRegular {
+ return 0, unix.EPERM
+ }
+
+ var resp lisafs.MknodAtResp
+ if err := c.Server().WithRenameReadLock(func() error {
+ if err := unix.Mknodat(fd.hostFD, name, uint32(mode), 0); err != nil {
+ return err
+ }
+ cu := cleanup.Make(func() {
+ // Best effort attempt to remove the file in case of failure.
+ if err := unix.Unlinkat(fd.hostFD, name, 0); err != nil {
+ log.Warningf("error unlinking file %q after failure: %v", path.Join(fd.FilePathLocked(), name), err)
+ }
+ })
+ defer cu.Clean()
+
+ // Open file to change ownership.
+ childFD, err := unix.Openat(fd.hostFD, name, unix.O_PATH|openFlags, 0)
+ if err != nil {
+ return err
+ }
+ if err := unix.Fchownat(childFD, "", int(uid), int(gid), unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW); err != nil {
+ unix.Close(childFD)
+ return err
+ }
+
+ child := newControlFDLisaLocked(c, childFD, fd, name, mode)
+ if err := child.initInode(&resp.Child); err != nil {
+ c.RemoveControlFDLocked(child.ID())
+ return err
+ }
+ cu.Release()
+ return nil
+ }); err != nil {
+ return 0, err
+ }
+
+ respLen := uint32(resp.SizeBytes())
+ resp.MarshalUnsafe(comm.PayloadBuf(respLen))
+ return respLen, nil
+}
+
+// Symlink implements lisafs.ControlFDImpl.Symlink.
+func (fd *controlFDLisa) Symlink(c *lisafs.Connection, comm lisafs.Communicator, name string, target string, uid lisafs.UID, gid lisafs.GID) (uint32, error) {
+ var resp lisafs.SymlinkAtResp
+ if err := c.Server().WithRenameReadLock(func() error {
+ if err := unix.Symlinkat(target, fd.hostFD, name); err != nil {
+ return err
+ }
+ cu := cleanup.Make(func() {
+ // Best effort attempt to remove the symlink in case of failure.
+ if err := unix.Unlinkat(fd.hostFD, name, 0); err != nil {
+ log.Warningf("error unlinking file %q after failure: %v", path.Join(fd.FilePathLocked(), name), err)
+ }
+ })
+ defer cu.Clean()
+
+ // Open symlink to change ownership.
+ symlinkFD, err := unix.Openat(fd.hostFD, name, unix.O_PATH|openFlags, 0)
+ if err != nil {
+ return err
+ }
+ if err := unix.Fchownat(symlinkFD, "", int(uid), int(gid), unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW); err != nil {
+ unix.Close(symlinkFD)
+ return err
+ }
+
+ symlink := newControlFDLisaLocked(c, symlinkFD, fd, name, linux.ModeSymlink)
+ if err := symlink.initInode(&resp.Symlink); err != nil {
+ c.RemoveControlFDLocked(symlink.ID())
+ return err
+ }
+ cu.Release()
+ return nil
+ }); err != nil {
+ return 0, err
+ }
+
+ respLen := uint32(resp.SizeBytes())
+ resp.MarshalUnsafe(comm.PayloadBuf(respLen))
+ return respLen, nil
+}
+
+// Link implements lisafs.ControlFDImpl.Link.
+func (fd *controlFDLisa) Link(c *lisafs.Connection, comm lisafs.Communicator, dir lisafs.ControlFDImpl, name string) (uint32, error) {
+ var resp lisafs.LinkAtResp
+ if err := c.Server().WithRenameReadLock(func() error {
+ dirFD := dir.(*controlFDLisa)
+ if err := unix.Linkat(fd.hostFD, "", dirFD.hostFD, name, unix.AT_EMPTY_PATH); err != nil {
+ return err
+ }
+ cu := cleanup.Make(func() {
+ // Best effort attempt to remove the hard link in case of failure.
+ if err := unix.Unlinkat(dirFD.hostFD, name, 0); err != nil {
+ log.Warningf("error unlinking file %q after failure: %v", path.Join(dirFD.FilePathLocked(), name), err)
+ }
+ })
+ defer cu.Clean()
+
+ linkFD, linkStat, err := tryStepLocked(c, name, dirFD, func(flags int) (int, error) {
+ return unix.Openat(dirFD.hostFD, name, flags, 0)
+ })
+ if err != nil {
+ return err
+ }
+ cu.Release()
+
+ linkFD.initInodeWithStat(&resp.Link, &linkStat)
+ return nil
+ }); err != nil {
+ return 0, err
+ }
+
+ respLen := uint32(resp.SizeBytes())
+ resp.MarshalUnsafe(comm.PayloadBuf(respLen))
+ return respLen, nil
+}
+
+// StatFS implements lisafs.ControlFDImpl.StatFS.
+func (fd *controlFDLisa) StatFS(c *lisafs.Connection, comm lisafs.Communicator) (uint32, error) {
+ var s unix.Statfs_t
+ if err := unix.Fstatfs(fd.hostFD, &s); err != nil {
+ return 0, err
+ }
+
+ resp := lisafs.StatFS{
+ Type: uint64(s.Type),
+ BlockSize: s.Bsize,
+ Blocks: s.Blocks,
+ BlocksFree: s.Bfree,
+ BlocksAvailable: s.Bavail,
+ Files: s.Files,
+ FilesFree: s.Ffree,
+ NameLength: uint64(s.Namelen),
+ }
+ respLen := uint32(resp.SizeBytes())
+ resp.MarshalUnsafe(comm.PayloadBuf(respLen))
+ return respLen, nil
+}
+
+// Readlink implements lisafs.ControlFDImpl.Readlink.
+func (fd *controlFDLisa) Readlink(c *lisafs.Connection, comm lisafs.Communicator) (uint32, error) {
+ // We will manually marshal lisafs.ReadLinkAtResp, which just contains a
+ // lisafs.SizedString. Let unix.Readlinkat directly write into the payload
+ // buffer and manually write the string size before it.
+
+ // This is similar to what os.Readlink does.
+ const limit = primitive.Uint32(1024 * 1024)
+ for linkLen := primitive.Uint32(128); linkLen < limit; linkLen *= 2 {
+ b := comm.PayloadBuf(uint32(linkLen) + uint32(linkLen.SizeBytes()))
+ n, err := unix.Readlinkat(fd.hostFD, "", b[linkLen.SizeBytes():])
+ if err != nil {
+ return 0, err
+ }
+ if n < int(linkLen) {
+ linkLen = primitive.Uint32(n)
+ linkLen.MarshalUnsafe(b[:linkLen.SizeBytes()])
+ return uint32(linkLen) + uint32(linkLen.SizeBytes()), nil
+ }
+ }
+ return 0, unix.ENOMEM
+}
+
+// Connect implements lisafs.ControlFDImpl.Connect.
+func (fd *controlFDLisa) Connect(c *lisafs.Connection, comm lisafs.Communicator, sockType uint32) error {
+ s := c.ServerImpl().(*LisafsServer)
+ if !s.config.HostUDS {
+ return unix.ECONNREFUSED
+ }
+
+ // Lock RenameMu so that the hostPath read stays valid and is not tampered
+ // with until it is actually connected to.
+ s.RenameMu.RLock()
+ defer s.RenameMu.RUnlock()
+
+ // TODO(gvisor.dev/issue/1003): Due to different app vs replacement
+ // mappings, the app path may have fit in the sockaddr, but we can't fit
+ // hostPath in our sockaddr. We'd need to redirect through a shorter path
+ // in order to actually connect to this socket.
+ hostPath := fd.FilePathLocked()
+ if len(hostPath) > 108 { // UNIX_PATH_MAX = 108 is defined in afunix.h.
+ return unix.ECONNREFUSED
+ }
+
+ // Only the following types are supported.
+ switch sockType {
+ case unix.SOCK_STREAM, unix.SOCK_DGRAM, unix.SOCK_SEQPACKET:
+ default:
+ return unix.ENXIO
+ }
+
+ sock, err := unix.Socket(unix.AF_UNIX, int(sockType), 0)
+ if err != nil {
+ return err
+ }
+ if err := comm.DonateFD(sock); err != nil {
+ return err
+ }
+
+ sa := unix.SockaddrUnix{Name: hostPath}
+ if err := unix.Connect(sock, &sa); err != nil {
+ return err
+ }
+ return nil
+}
+
+// Unlink implements lisafs.ControlFDImpl.Unlink.
+func (fd *controlFDLisa) Unlink(c *lisafs.Connection, name string, flags uint32) error {
+ return c.Server().WithRenameReadLock(func() error {
+ return unix.Unlinkat(fd.hostFD, name, int(flags))
+ })
+}
+
+// RenameLocked implements lisafs.ControlFDImpl.RenameLocked.
+func (fd *controlFDLisa) RenameLocked(c *lisafs.Connection, newDir lisafs.ControlFDImpl, newName string) (func(lisafs.ControlFDImpl), func(), error) {
+ // Note that there is no controlFDLisa specific update needed on rename.
+ return nil, nil, renameat(fd.ParentLocked().(*controlFDLisa).hostFD, fd.NameLocked(), newDir.(*controlFDLisa).hostFD, newName)
+}
+
+// GetXattr implements lisafs.ControlFDImpl.GetXattr.
+func (fd *controlFDLisa) GetXattr(c *lisafs.Connection, comm lisafs.Communicator, name string, size uint32) (uint32, error) {
+ if !c.ServerImpl().(*LisafsServer).config.EnableVerityXattr {
+ return 0, unix.EOPNOTSUPP
+ }
+ if _, ok := verityXattrs[name]; !ok {
+ return 0, unix.EOPNOTSUPP
+ }
+
+ // Manually marshal lisafs.FGetXattrResp to avoid allocations and copying.
+ var valueLen primitive.Uint32
+ buf := comm.PayloadBuf(uint32(valueLen.SizeBytes()) + size)
+ n, err := unix.Fgetxattr(fd.hostFD, name, buf[valueLen.SizeBytes():])
+ if err != nil {
+ return 0, err
+ }
+ valueLen = primitive.Uint32(n)
+ valueLen.MarshalBytes(buf[:valueLen.SizeBytes()])
+
+ return uint32(valueLen.SizeBytes() + n), nil
+}
+
+// SetXattr implements lisafs.ControlFDImpl.SetXattr.
+func (fd *controlFDLisa) SetXattr(c *lisafs.Connection, name string, value string, flags uint32) error {
+ if !c.ServerImpl().(*LisafsServer).config.EnableVerityXattr {
+ return unix.EOPNOTSUPP
+ }
+ if _, ok := verityXattrs[name]; !ok {
+ return unix.EOPNOTSUPP
+ }
+ return unix.Fsetxattr(fd.hostFD, name, []byte(value) /* sigh */, int(flags))
+}
+
+// ListXattr implements lisafs.ControlFDImpl.ListXattr.
+func (fd *controlFDLisa) ListXattr(c *lisafs.Connection, comm lisafs.Communicator, size uint64) (uint32, error) {
+ return 0, unix.EOPNOTSUPP
+}
+
+// RemoveXattr implements lisafs.ControlFDImpl.RemoveXattr.
+func (fd *controlFDLisa) RemoveXattr(c *lisafs.Connection, comm lisafs.Communicator, name string) error {
+ return unix.EOPNOTSUPP
+}
+
+// openFDLisa implements lisafs.OpenFDImpl.
+type openFDLisa struct {
+ lisafs.OpenFD
+
+ // hostFD is the host file descriptor which can be used to make syscalls.
+ hostFD int
+}
+
+var _ lisafs.OpenFDImpl = (*openFDLisa)(nil)
+
+func (fd *controlFDLisa) newOpenFDLisa(hostFD int, flags uint32) *openFDLisa {
+ newFD := &openFDLisa{
+ hostFD: hostFD,
+ }
+ newFD.OpenFD.Init(fd.FD(), flags, newFD)
+ return newFD
+}
+
+// FD implements lisafs.OpenFDImpl.FD.
+func (fd *openFDLisa) FD() *lisafs.OpenFD {
+ if fd == nil {
+ return nil
+ }
+ return &fd.OpenFD
+}
+
+// Close implements lisafs.OpenFDImpl.Close.
+func (fd *openFDLisa) Close(c *lisafs.Connection) {
+ if fd.hostFD >= 0 {
+ _ = unix.Close(fd.hostFD)
+ fd.hostFD = -1
+ }
+}
+
+// Stat implements lisafs.OpenFDImpl.Stat.
+func (fd *openFDLisa) Stat(c *lisafs.Connection, comm lisafs.Communicator) (uint32, error) {
+ var resp linux.Statx
+ if err := fstatTo(fd.hostFD, &resp); err != nil {
+ return 0, err
+ }
+
+ respLen := uint32(resp.SizeBytes())
+ resp.MarshalUnsafe(comm.PayloadBuf(respLen))
+ return respLen, nil
+}
+
+// Sync implements lisafs.OpenFDImpl.Sync.
+func (fd *openFDLisa) Sync(c *lisafs.Connection) error {
+ return unix.Fsync(fd.hostFD)
+}
+
+// Write implements lisafs.OpenFDImpl.Write.
+func (fd *openFDLisa) Write(c *lisafs.Connection, comm lisafs.Communicator, buf []byte, off uint64) (uint32, error) {
+ rw := rwfd.NewReadWriter(fd.hostFD)
+ n, err := rw.WriteAt(buf, int64(off))
+ if err != nil {
+ return 0, err
+ }
+
+ resp := &lisafs.PWriteResp{Count: uint64(n)}
+ respLen := uint32(resp.SizeBytes())
+ resp.MarshalUnsafe(comm.PayloadBuf(respLen))
+ return respLen, nil
+}
+
+// Read implements lisafs.OpenFDImpl.Read.
+func (fd *openFDLisa) Read(c *lisafs.Connection, comm lisafs.Communicator, off uint64, count uint32) (uint32, error) {
+ // To save an allocation and a copy, we directly read into the payload
+ // buffer. The rest of the response message is manually marshalled.
+ var resp lisafs.PReadResp
+ respMetaSize := uint32(resp.NumBytes.SizeBytes())
+ maxRespLen := respMetaSize + count
+
+ payloadBuf := comm.PayloadBuf(maxRespLen)
+ rw := rwfd.NewReadWriter(fd.hostFD)
+ n, err := rw.ReadAt(payloadBuf[respMetaSize:], int64(off))
+ if err != nil && err != io.EOF {
+ return 0, err
+ }
+
+ // Write the response metadata onto the payload buffer. The response contents
+ // already have been written immediately after it.
+ resp.NumBytes = primitive.Uint32(n)
+ resp.NumBytes.MarshalUnsafe(payloadBuf[:respMetaSize])
+ return respMetaSize + uint32(n), nil
+}
+
+// Allocate implements lisafs.OpenFDImpl.Allocate.
+func (fd *openFDLisa) Allocate(c *lisafs.Connection, mode, off, length uint64) error {
+ return unix.Fallocate(fd.hostFD, uint32(mode), int64(off), int64(length))
+}
+
+// Flush implements lisafs.OpenFDImpl.Flush.
+func (fd *openFDLisa) Flush(c *lisafs.Connection) error {
+ return nil
+}
+
+// Getdent64 implements lisafs.OpenFDImpl.Getdent64.
+func (fd *openFDLisa) Getdent64(c *lisafs.Connection, comm lisafs.Communicator, count uint32, seek0 bool) (uint32, error) {
+ if seek0 {
+ if _, err := unix.Seek(fd.hostFD, 0, 0); err != nil {
+ return 0, err
+ }
+ }
+
+ // We will manually marshal the response lisafs.Getdents64Resp.
+
+ // numDirents is the number of dirents marshalled into the payload.
+ var numDirents primitive.Uint32
+ // The payload starts with numDirents, dirents go right after that.
+ // payloadBufPos represents the position at which to write the next dirent.
+ payloadBufPos := uint32(numDirents.SizeBytes())
+ // Request enough payloadBuf for 10 dirents, we will extend when needed.
+ payloadBuf := comm.PayloadBuf(payloadBufPos + 10*unixDirentMaxSize)
+
+ var direntsBuf [8192]byte
+ var bytesRead int
+ for bytesRead < int(count) {
+ bufEnd := len(direntsBuf)
+ if remaining := int(count) - bytesRead; remaining < bufEnd {
+ bufEnd = remaining
+ }
+ n, err := unix.Getdents(fd.hostFD, direntsBuf[:bufEnd])
+ if err != nil {
+ if err == unix.EINVAL && bufEnd < 268 {
+ // getdents64(2) returns EINVAL is returned when the result
+ // buffer is too small. If bufEnd is smaller than the max
+ // size of unix.Dirent, then just break here to return all
+ // dirents collected till now.
+ break
+ }
+ return 0, err
+ }
+ if n <= 0 {
+ break
+ }
+ bytesRead += n
+
+ var statErr error
+ parseDirents(direntsBuf[:n], func(ino uint64, off int64, ftype uint8, name string) bool {
+ dirent := lisafs.Dirent64{
+ Ino: primitive.Uint64(ino),
+ Off: primitive.Uint64(off),
+ Type: primitive.Uint8(ftype),
+ Name: lisafs.SizedString(name),
+ }
+
+ // The client also wants the device ID, which annoyingly incurs an
+ // additional syscall per dirent. Live with it.
+ stat, err := statAt(fd.hostFD, name)
+ if err != nil {
+ statErr = err
+ return false
+ }
+ dirent.DevMinor = primitive.Uint32(unix.Minor(stat.Dev))
+ dirent.DevMajor = primitive.Uint32(unix.Major(stat.Dev))
+
+ // Paste the dirent into the payload buffer without having the dirent
+ // escape. Request a larger buffer if needed.
+ if int(payloadBufPos)+dirent.SizeBytes() > len(payloadBuf) {
+ // Ask for 10 large dirents worth of more space.
+ payloadBuf = comm.PayloadBuf(payloadBufPos + 10*unixDirentMaxSize)
+ }
+ dirent.MarshalBytes(payloadBuf[payloadBufPos:])
+ payloadBufPos += uint32(dirent.SizeBytes())
+ numDirents++
+ return true
+ })
+ if statErr != nil {
+ return 0, statErr
+ }
+ }
+
+ // The number of dirents goes at the beginning of the payload.
+ numDirents.MarshalUnsafe(payloadBuf)
+ return payloadBufPos, nil
+}
+
+// tryStepLocked tries to walk via open() with different modes as documented.
+// It then initializes and returns the control FD.
+//
+// Precondition: server's rename mutex must at least be read locked.
+func tryStepLocked(c *lisafs.Connection, name string, parent *controlFDLisa, open func(flags int) (int, error)) (*controlFDLisa, unix.Stat_t, error) {
+ // Attempt to open file in the following in order:
+ // 1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs.
+ // Use non-blocking to prevent getting stuck inside open(2) for
+ // FIFOs. This option has no effect on regular files.
+ // 2. PATH: for symlinks, sockets.
+ options := []struct {
+ flag int
+ readable bool
+ }{
+ {
+ flag: unix.O_RDONLY | unix.O_NONBLOCK,
+ readable: true,
+ },
+ {
+ flag: unix.O_PATH,
+ readable: false,
+ },
+ }
+
+ for i, option := range options {
+ hostFD, err := open(option.flag | openFlags)
+ if err == nil {
+ var stat unix.Stat_t
+ if err = unix.Fstat(hostFD, &stat); err == nil {
+ return newControlFDLisaLocked(c, hostFD, parent, name, linux.FileMode(stat.Mode)), stat, nil
+ }
+ unix.Close(hostFD)
+ }
+
+ e := extractErrno(err)
+ if e == unix.ENOENT {
+ // File doesn't exist, no point in retrying.
+ return nil, unix.Stat_t{}, e
+ }
+ if i < len(options)-1 {
+ continue
+ }
+ return nil, unix.Stat_t{}, e
+ }
+ panic("unreachable")
+}
+
+func fstatTo(hostFD int, stat *linux.Statx) error {
+ var unixStat unix.Stat_t
+ if err := unix.Fstat(hostFD, &unixStat); err != nil {
+ return err
+ }
+
+ unixToLinuxStat(&unixStat, stat)
+ return nil
+}
+
+func unixToLinuxStat(from *unix.Stat_t, to *linux.Statx) {
+ to.Mask = unix.STATX_TYPE | unix.STATX_MODE | unix.STATX_INO | unix.STATX_NLINK | unix.STATX_UID | unix.STATX_GID | unix.STATX_SIZE | unix.STATX_BLOCKS | unix.STATX_ATIME | unix.STATX_MTIME | unix.STATX_CTIME
+ to.Mode = uint16(from.Mode)
+ to.DevMinor = unix.Minor(from.Dev)
+ to.DevMajor = unix.Major(from.Dev)
+ to.Ino = from.Ino
+ to.Nlink = uint32(from.Nlink)
+ to.UID = from.Uid
+ to.GID = from.Gid
+ to.RdevMinor = unix.Minor(from.Rdev)
+ to.RdevMajor = unix.Major(from.Rdev)
+ to.Size = uint64(from.Size)
+ to.Blksize = uint32(from.Blksize)
+ to.Blocks = uint64(from.Blocks)
+ to.Atime.Sec = from.Atim.Sec
+ to.Atime.Nsec = uint32(from.Atim.Nsec)
+ to.Mtime.Sec = from.Mtim.Sec
+ to.Mtime.Nsec = uint32(from.Mtim.Nsec)
+ to.Ctime.Sec = from.Ctim.Sec
+ to.Ctime.Nsec = uint32(from.Ctim.Nsec)
+}
diff --git a/runsc/fsgofer/lisafs_test.go b/runsc/fsgofer/lisafs_test.go
new file mode 100644
index 000000000..4653f9955
--- /dev/null
+++ b/runsc/fsgofer/lisafs_test.go
@@ -0,0 +1,56 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package lisafs_test
+
+import (
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/lisafs"
+ "gvisor.dev/gvisor/pkg/lisafs/testsuite"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/runsc/fsgofer"
+)
+
+// Note that these are not supposed to be extensive or robust tests. These unit
+// tests provide a sanity check that all RPCs at least work in obvious ways.
+
+func init() {
+ log.SetLevel(log.Debug)
+ if err := fsgofer.OpenProcSelfFD(); err != nil {
+ panic(err)
+ }
+}
+
+// tester implements testsuite.Tester.
+type tester struct{}
+
+// NewServer implements testsuite.Tester.NewServer.
+func (tester) NewServer(t *testing.T) *lisafs.Server {
+ return &fsgofer.NewLisafsServer(fsgofer.Config{HostUDS: true, EnableVerityXattr: true}).Server
+}
+
+// LinkSupported implements testsuite.Tester.LinkSupported.
+func (tester) LinkSupported() bool {
+ return true
+}
+
+// SetUserGroupIDSupported implements testsuite.Tester.SetUserGroupIDSupported.
+func (tester) SetUserGroupIDSupported() bool {
+ return true
+}
+
+func TestFSGofer(t *testing.T) {
+ testsuite.RunAllLocalFSTests(t, tester{})
+}
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index bc4a3fa32..d625230dd 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -17,12 +17,14 @@ go_library(
"//pkg/control/client",
"//pkg/control/server",
"//pkg/coverage",
+ "//pkg/eventchannel",
"//pkg/log",
"//pkg/sentry/control",
"//pkg/sentry/platform",
"//pkg/sync",
"//pkg/tcpip/header",
"//pkg/tcpip/stack",
+ "//pkg/unet",
"//pkg/urpc",
"//runsc/boot",
"//runsc/boot/platforms",
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index b15572a98..f4a37cedc 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -35,10 +35,12 @@ import (
"gvisor.dev/gvisor/pkg/control/client"
"gvisor.dev/gvisor/pkg/control/server"
"gvisor.dev/gvisor/pkg/coverage"
+ "gvisor.dev/gvisor/pkg/eventchannel"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/unet"
"gvisor.dev/gvisor/pkg/urpc"
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/boot/platforms"
@@ -488,6 +490,61 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn
cmd.Args = append(cmd.Args, "--start-sync-fd="+strconv.Itoa(nextFD))
nextFD++
+ if conf.ProfileBlock != "" {
+ blockFile, err := os.OpenFile(conf.ProfileBlock, os.O_CREATE|os.O_WRONLY, 0644)
+ if err != nil {
+ return fmt.Errorf("opening block profiling file %q: %v", conf.ProfileBlock, err)
+ }
+ defer blockFile.Close()
+ cmd.ExtraFiles = append(cmd.ExtraFiles, blockFile)
+ cmd.Args = append(cmd.Args, "--profile-block-fd="+strconv.Itoa(nextFD))
+ nextFD++
+ }
+
+ if conf.ProfileCPU != "" {
+ cpuFile, err := os.OpenFile(conf.ProfileCPU, os.O_CREATE|os.O_WRONLY, 0644)
+ if err != nil {
+ return fmt.Errorf("opening cpu profiling file %q: %v", conf.ProfileCPU, err)
+ }
+ defer cpuFile.Close()
+ cmd.ExtraFiles = append(cmd.ExtraFiles, cpuFile)
+ cmd.Args = append(cmd.Args, "--profile-cpu-fd="+strconv.Itoa(nextFD))
+ nextFD++
+ }
+
+ if conf.ProfileHeap != "" {
+ heapFile, err := os.OpenFile(conf.ProfileHeap, os.O_CREATE|os.O_WRONLY, 0644)
+ if err != nil {
+ return fmt.Errorf("opening heap profiling file %q: %v", conf.ProfileHeap, err)
+ }
+ defer heapFile.Close()
+ cmd.ExtraFiles = append(cmd.ExtraFiles, heapFile)
+ cmd.Args = append(cmd.Args, "--profile-heap-fd="+strconv.Itoa(nextFD))
+ nextFD++
+ }
+
+ if conf.ProfileMutex != "" {
+ mutexFile, err := os.OpenFile(conf.ProfileMutex, os.O_CREATE|os.O_WRONLY, 0644)
+ if err != nil {
+ return fmt.Errorf("opening mutex profiling file %q: %v", conf.ProfileMutex, err)
+ }
+ defer mutexFile.Close()
+ cmd.ExtraFiles = append(cmd.ExtraFiles, mutexFile)
+ cmd.Args = append(cmd.Args, "--profile-mutex-fd="+strconv.Itoa(nextFD))
+ nextFD++
+ }
+
+ if conf.TraceFile != "" {
+ traceFile, err := os.OpenFile(conf.TraceFile, os.O_CREATE|os.O_WRONLY, 0644)
+ if err != nil {
+ return fmt.Errorf("opening trace file %q: %v", conf.TraceFile, err)
+ }
+ defer traceFile.Close()
+ cmd.ExtraFiles = append(cmd.ExtraFiles, traceFile)
+ cmd.Args = append(cmd.Args, "--trace-fd="+strconv.Itoa(nextFD))
+ nextFD++
+ }
+
// If there is a gofer, sends all socket ends to the sandbox.
for _, f := range args.IOFiles {
defer f.Close()
@@ -1020,6 +1077,90 @@ func (s *Sandbox) Cat(cid string, files []string, out *os.File) error {
return nil
}
+// Usage sends the collect call for a container in the sandbox.
+func (s *Sandbox) Usage(cid string, Full bool) (control.MemoryUsage, error) {
+ log.Debugf("Usage sandbox %q", s.ID)
+ conn, err := s.sandboxConnect()
+ if err != nil {
+ return control.MemoryUsage{}, err
+ }
+ defer conn.Close()
+
+ var m control.MemoryUsage
+ err = conn.Call(boot.UsageCollect, &control.MemoryUsageOpts{
+ Full: Full,
+ }, &m)
+ return m, err
+}
+
+// UsageFD sends the usagefd call for a container in the sandbox.
+func (s *Sandbox) UsageFD(cid string) (*control.MemoryUsageRecord, error) {
+ log.Debugf("Usage sandbox %q", s.ID)
+ conn, err := s.sandboxConnect()
+ if err != nil {
+ return nil, err
+ }
+ defer conn.Close()
+
+ var m control.MemoryUsageFile
+ if err := conn.Call(boot.UsageUsageFD, &control.MemoryUsageFileOpts{
+ Version: 1,
+ }, &m); err != nil {
+ return nil, fmt.Errorf("UsageFD failed: %v", err)
+ }
+
+ if len(m.FilePayload.Files) != 2 {
+ return nil, fmt.Errorf("wants exactly two fds")
+ }
+
+ return control.NewMemoryUsageRecord(*m.FilePayload.Files[0], *m.FilePayload.Files[1])
+}
+
+// Reduce sends the reduce call for a container in the sandbox.
+func (s *Sandbox) Reduce(cid string, wait bool) error {
+ log.Debugf("Reduce sandbox %q", s.ID)
+ conn, err := s.sandboxConnect()
+ if err != nil {
+ return err
+ }
+ defer conn.Close()
+
+ return conn.Call(boot.UsageReduce, &control.UsageReduceOpts{
+ Wait: wait,
+ }, nil)
+}
+
+// Stream sends the AttachDebugEmitter call for a container in the sandbox, and
+// dumps filtered events to out.
+func (s *Sandbox) Stream(cid string, filters []string, out *os.File) error {
+ log.Debugf("Stream sandbox %q", s.ID)
+ conn, err := s.sandboxConnect()
+ if err != nil {
+ return err
+ }
+ defer conn.Close()
+
+ r, w, err := unet.SocketPair(false)
+ if err != nil {
+ return err
+ }
+
+ wfd, err := w.Release()
+ if err != nil {
+ return fmt.Errorf("failed to release write socket FD: %v", err)
+ }
+
+ if err := conn.Call(boot.EventsAttachDebugEmitter, &control.EventsOpts{
+ FilePayload: urpc.FilePayload{Files: []*os.File{
+ os.NewFile(uintptr(wfd), "event sink"),
+ }},
+ }, nil); err != nil {
+ return fmt.Errorf("AttachDebugEmitter failed: %v", err)
+ }
+
+ return eventchannel.ProcessAll(r, filters, out)
+}
+
// IsRunning returns true if the sandbox or gofer process is running.
func (s *Sandbox) IsRunning() bool {
if s.Pid != 0 {
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index 5365b5b1b..9d3b97277 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -332,9 +332,9 @@ func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.
return auth.CapabilitySetOfMany(caps), nil
}
-// Is9PMount returns true if the given mount can be mounted as an external
+// IsGoferMount returns true if the given mount can be mounted as an external
// gofer.
-func Is9PMount(m specs.Mount, vfs2Enabled bool) bool {
+func IsGoferMount(m specs.Mount, vfs2Enabled bool) bool {
MaybeConvertToBindMount(&m)
return m.Type == "bind" && m.Source != "" && IsSupportedDevMount(m, vfs2Enabled)
}