diff options
Diffstat (limited to 'runsc')
36 files changed, 2327 insertions, 177 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index c9d2b3eff..36806b740 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -15,6 +15,7 @@ go_library( "limits.go", "loader.go", "network.go", + "profile.go", "strace.go", "vfs.go", ], @@ -45,6 +46,7 @@ go_library( "//pkg/sentry/arch", "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/control", + "//pkg/sentry/control:control_go_proto", "//pkg/sentry/devices/memdev", "//pkg/sentry/devices/ttydev", "//pkg/sentry/devices/tundev", @@ -78,7 +80,6 @@ go_library( "//pkg/sentry/loader", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", - "//pkg/sentry/sighandling", "//pkg/sentry/socket/hostinet", "//pkg/sentry/socket/netfilter", "//pkg/sentry/socket/netlink", @@ -94,11 +95,12 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/sentry/watchdog", + "//pkg/sighandling", "//pkg/sync", "//pkg/tcpip", + "//pkg/tcpip/link/ethernet", "//pkg/tcpip/link/fdbased", "//pkg/tcpip/link/loopback", - "//pkg/tcpip/link/packetsocket", "//pkg/tcpip/link/qdisc/fifo", "//pkg/tcpip/link/sniffer", "//pkg/tcpip/network/arp", diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 60b532798..76e1f596b 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" + controlpb "gvisor.dev/gvisor/pkg/sentry/control/control_go_proto" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" @@ -114,6 +115,18 @@ const ( FsCat = "Fs.Cat" ) +// Usage related commands (see usage.go for more details). +const ( + UsageCollect = "Usage.Collect" + UsageUsageFD = "Usage.UsageFD" + UsageReduce = "Usage.Reduce" +) + +// Events related commands (see events.go for more details). +const ( + EventsAttachDebugEmitter = "Events.AttachDebugEmitter" +) + // ControlSocketAddr generates an abstract unix socket name for the given ID. func ControlSocketAddr(id string) string { return fmt.Sprintf("\x00runsc-sandbox.%s", id) @@ -153,13 +166,31 @@ func newController(fd int, l *Loader) (*controller, error) { ctrl.srv.Register(net) } - ctrl.srv.Register(&debug{}) - ctrl.srv.Register(&control.Logging{}) - ctrl.srv.Register(&control.Lifecycle{l.k}) - ctrl.srv.Register(&control.Fs{l.k}) - - if l.root.conf.ProfileEnable { - ctrl.srv.Register(control.NewProfile(l.k)) + if l.root.conf.Controls.Controls != nil { + for _, c := range l.root.conf.Controls.Controls.AllowedControls { + switch c { + case controlpb.ControlConfig_EVENTS: + ctrl.srv.Register(&control.Events{}) + case controlpb.ControlConfig_FS: + ctrl.srv.Register(&control.Fs{Kernel: l.k}) + case controlpb.ControlConfig_LIFECYCLE: + ctrl.srv.Register(&control.Lifecycle{Kernel: l.k}) + case controlpb.ControlConfig_LOGGING: + ctrl.srv.Register(&control.Logging{}) + case controlpb.ControlConfig_PROFILE: + if l.root.conf.ProfileEnable { + ctrl.srv.Register(control.NewProfile(l.k)) + } + case controlpb.ControlConfig_USAGE: + ctrl.srv.Register(&control.Usage{Kernel: l.k}) + case controlpb.ControlConfig_PROC: + ctrl.srv.Register(&control.Proc{Kernel: l.k}) + case controlpb.ControlConfig_STATE: + ctrl.srv.Register(&control.State{Kernel: l.k}) + case controlpb.ControlConfig_DEBUG: + ctrl.srv.Register(&debug{}) + } + } } return ctrl, nil diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 703f34827..db363435b 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -304,6 +304,22 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.EqualTo(unix.SPLICE_F_NONBLOCK), /* flags */ }, }, + unix.SYS_TIMER_CREATE: []seccomp.Rule{ + { + seccomp.EqualTo(unix.CLOCK_THREAD_CPUTIME_ID), /* which */ + seccomp.MatchAny{}, /* sevp */ + seccomp.MatchAny{}, /* timerid */ + }, + }, + unix.SYS_TIMER_DELETE: []seccomp.Rule{}, + unix.SYS_TIMER_SETTIME: []seccomp.Rule{ + { + seccomp.MatchAny{}, /* timerid */ + seccomp.EqualTo(0), /* flags */ + seccomp.MatchAny{}, /* new_value */ + seccomp.EqualTo(0), /* old_value */ + }, + }, unix.SYS_TGKILL: []seccomp.Rule{ { seccomp.EqualTo(uint64(os.Getpid())), @@ -630,6 +646,11 @@ func hostInetFilters() seccomp.SyscallRules { func controlServerFilters(fd int) seccomp.SyscallRules { return seccomp.SyscallRules{ + unix.SYS_ACCEPT4: []seccomp.Rule{ + { + seccomp.EqualTo(fd), + }, + }, unix.SYS_ACCEPT: []seccomp.Rule{ { seccomp.EqualTo(fd), diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 40cf2a3df..ba5460f1c 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -188,8 +188,8 @@ func compileMounts(spec *specs.Spec, conf *config.Config, vfs2Enabled bool) []sp return mounts } -// p9MountData creates a slice of p9 mount data. -func p9MountData(fd int, fa config.FileAccessType, vfs2 bool) []string { +// goferMountData creates a slice of gofer mount data. +func goferMountData(fd int, fa config.FileAccessType, attachPath string, vfs2 bool, lisafs bool) []string { opts := []string{ "trans=fd", "rfdno=" + strconv.Itoa(fd), @@ -203,6 +203,10 @@ func p9MountData(fd int, fa config.FileAccessType, vfs2 bool) []string { if fa == config.FileAccessShared { opts = append(opts, "cache=remote_revalidating") } + if vfs2 && lisafs { + opts = append(opts, "lisafs=true") + opts = append(opts, "aname="+attachPath) + } return opts } @@ -761,7 +765,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Con fd := c.fds.remove() log.Infof("Mounting root over 9P, ioFD: %d", fd) p9FS := mustFindFilesystem("9p") - opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */) + opts := goferMountData(fd, conf.FileAccess, "/", false /* vfs2 */, false /* lisafs */) // We can't check for overlayfs here because sandbox is chroot'ed and gofer // can only send mount options for specs.Mounts (specs.Root is missing @@ -822,7 +826,7 @@ func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m *specs. case bind: fd := c.fds.remove() fsName = gofervfs2.Name - opts = p9MountData(fd, c.getMountAccessType(conf, m), conf.VFS2) + opts = goferMountData(fd, c.getMountAccessType(conf, m), m.Destination, conf.VFS2, conf.Lisafs) // If configured, add overlay to all writable mounts. useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly case cgroupfs.Name: @@ -980,7 +984,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.Re // Add root mount. fd := c.fds.remove() - opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */) + opts := goferMountData(fd, conf.FileAccess, "/", false /* vfs2 */, false /* lisafs */) mf := fs.MountSourceFlags{} if c.root.Readonly || conf.Overlay { diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index ec9188021..2f2d4df5e 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -49,15 +49,16 @@ import ( "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/sighandling" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2" "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/watchdog" + "gvisor.dev/gvisor/pkg/sighandling" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/ethernet" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/arp" @@ -119,6 +120,10 @@ type Loader struct { // container. It should be called when a sandbox is destroyed. stopSignalForwarding func() + // stopProfiling stops profiling started at container creation. It + // should be called when a sandbox is destroyed. + stopProfiling func() + // restore is set to true if we are restoring a container. restore bool @@ -198,6 +203,21 @@ type Args struct { TotalMem uint64 // UserLogFD is the file descriptor to write user logs to. UserLogFD int + // ProfileBlockFD is the file descriptor to write a block profile to. + // Valid if >=0. + ProfileBlockFD int + // ProfileCPUFD is the file descriptor to write a CPU profile to. + // Valid if >=0. + ProfileCPUFD int + // ProfileHeapFD is the file descriptor to write a heap profile to. + // Valid if >=0. + ProfileHeapFD int + // ProfileMutexFD is the file descriptor to write a mutex profile to. + // Valid if >=0. + ProfileMutexFD int + // TraceFD is the file descriptor to write a Go execution trace to. + // Valid if >=0. + TraceFD int } // make sure stdioFDs are always the same on initial start and on restore @@ -206,6 +226,8 @@ const startingStdioFD = 256 // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. func New(args Args) (*Loader, error) { + stopProfiling := startProfiling(args) + // We initialize the rand package now to make sure /dev/urandom is pre-opened // on kernels that do not support getrandom(2). if err := rand.Init(); err != nil { @@ -219,10 +241,8 @@ func New(args Args) (*Loader, error) { // Is this a VFSv2 kernel? if args.Conf.VFS2 { kernel.VFS2Enabled = true - if args.Conf.FUSE { - kernel.FUSEEnabled = true - } - + kernel.FUSEEnabled = args.Conf.FUSE + kernel.LISAFSEnabled = args.Conf.Lisafs vfs2.Override() } @@ -399,12 +419,13 @@ func New(args Args) (*Loader, error) { eid := execID{cid: args.ID} l := &Loader{ - k: k, - watchdog: dog, - sandboxID: args.ID, - processes: map[execID]*execProcess{eid: {}}, - mountHints: mountHints, - root: info, + k: k, + watchdog: dog, + sandboxID: args.ID, + processes: map[execID]*execProcess{eid: {}}, + mountHints: mountHints, + root: info, + stopProfiling: stopProfiling, } // We don't care about child signals; some platforms can generate a @@ -497,6 +518,8 @@ func (l *Loader) Destroy() { for _, f := range l.root.goferFDs { _ = f.Close() } + + l.stopProfiling() } func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) { @@ -1088,13 +1111,14 @@ func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID st return inet.NewRootNamespace(hostinet.NewStack(), nil), nil case config.NetworkNone, config.NetworkSandbox: - s, err := newEmptySandboxNetworkStack(clock, uniqueID) + s, err := newEmptySandboxNetworkStack(clock, uniqueID, conf.AllowPacketEndpointWrite) if err != nil { return nil, err } creator := &sandboxNetstackCreator{ - clock: clock, - uniqueID: uniqueID, + clock: clock, + uniqueID: uniqueID, + allowPacketEndpointWrite: conf.AllowPacketEndpointWrite, } return inet.NewRootNamespace(s, creator), nil @@ -1104,7 +1128,7 @@ func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID st } -func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { +func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID, allowPacketEndpointWrite bool) (inet.Stack, error) { netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol} transProtos := []stack.TransportProtocolFactory{ tcp.NewProtocol, @@ -1120,9 +1144,10 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in HandleLocal: true, // Enable raw sockets for users with sufficient // privileges. - RawFactory: raw.EndpointFactory{}, - UniqueID: uniqueID, - DefaultIPTables: netfilter.DefaultLinuxTables, + RawFactory: raw.EndpointFactory{}, + AllowPacketEndpointWrite: allowPacketEndpointWrite, + UniqueID: uniqueID, + DefaultIPTables: netfilter.DefaultLinuxTables, })} // Enable SACK Recovery. @@ -1159,13 +1184,14 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in // // +stateify savable type sandboxNetstackCreator struct { - clock tcpip.Clock - uniqueID stack.UniqueID + clock tcpip.Clock + uniqueID stack.UniqueID + allowPacketEndpointWrite bool } // CreateStack implements kernel.NetworkStackCreator.CreateStack. func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) { - s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID) + s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID, f.allowPacketEndpointWrite) if err != nil { return nil, err } @@ -1174,7 +1200,7 @@ func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) { n := &Network{Stack: s.(*netstack.Stack).Stack} nicID := tcpip.NICID(f.uniqueID.UniqueID()) link := DefaultLoopbackLink - linkEP := loopback.New() + linkEP := ethernet.New(loopback.New()) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { return nil, err } diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 7e627e4c6..9fb3ebd95 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -23,9 +23,9 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/ethernet" "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" - "gvisor.dev/gvisor/pkg/tcpip/link/packetsocket" "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" @@ -169,7 +169,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct nicID++ nicids[link.Name] = nicID - linkEP := loopback.New() + linkEP := ethernet.New(loopback.New()) log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { @@ -209,7 +209,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct linkEP, err := fdbased.New(&fdbased.Options{ FDs: FDs, MTU: uint32(link.MTU), - EthernetHeader: true, + EthernetHeader: mac != "", Address: mac, PacketDispatchMode: fdbased.RecvMMsg, GSOMaxSize: link.GSOMaxSize, @@ -228,9 +228,6 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000) } - // Enable support for AF_PACKET sockets to receive outgoing packets. - linkEP = packetsocket.New(linkEP) - log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { return err @@ -285,12 +282,15 @@ func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkE for _, addr := range addrs { proto, tcpipAddr := ipToAddressAndProto(addr.Address) - ap := tcpip.AddressWithPrefix{ - Address: tcpipAddr, - PrefixLen: addr.PrefixLen, + protocolAddr := tcpip.ProtocolAddress{ + Protocol: proto, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: tcpipAddr, + PrefixLen: addr.PrefixLen, + }, } - if err := n.Stack.AddAddressWithPrefix(id, proto, ap); err != nil { - return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, proto, tcpipAddr, err) + if err := n.Stack.AddProtocolAddress(id, protocolAddr, stack.AddressProperties{}); err != nil { + return fmt.Errorf("AddProtocolAddress(%d, %+v, {}) failed: %s", id, protocolAddr, err) } } return nil diff --git a/runsc/boot/profile.go b/runsc/boot/profile.go new file mode 100644 index 000000000..3ecd3e532 --- /dev/null +++ b/runsc/boot/profile.go @@ -0,0 +1,95 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "os" + "runtime" + "runtime/pprof" + "runtime/trace" + + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/control" +) + +// startProfiling initiates profiling as defined by the ProfileConfig, and +// returns a function that should be called to stop profiling. +func startProfiling(args Args) func() { + var onStopProfiling []func() + stopProfiling := func() { + for _, f := range onStopProfiling { + f() + } + } + + if args.ProfileBlockFD >= 0 { + file := os.NewFile(uintptr(args.ProfileBlockFD), "profile-block") + + runtime.SetBlockProfileRate(control.DefaultBlockProfileRate) + onStopProfiling = append(onStopProfiling, func() { + if err := pprof.Lookup("block").WriteTo(file, 0); err != nil { + log.Warningf("Error writing block profile: %v", err) + } + file.Close() + runtime.SetBlockProfileRate(0) + }) + } + + if args.ProfileCPUFD >= 0 { + file := os.NewFile(uintptr(args.ProfileCPUFD), "profile-cpu") + + pprof.StartCPUProfile(file) + onStopProfiling = append(onStopProfiling, func() { + pprof.StopCPUProfile() + file.Close() + }) + } + + if args.ProfileHeapFD >= 0 { + file := os.NewFile(uintptr(args.ProfileHeapFD), "profile-heap") + + onStopProfiling = append(onStopProfiling, func() { + if err := pprof.Lookup("heap").WriteTo(file, 0); err != nil { + log.Warningf("Error writing heap profile: %v", err) + } + file.Close() + }) + } + + if args.ProfileMutexFD >= 0 { + file := os.NewFile(uintptr(args.ProfileMutexFD), "profile-mutex") + + prev := runtime.SetMutexProfileFraction(control.DefaultMutexProfileRate) + onStopProfiling = append(onStopProfiling, func() { + if err := pprof.Lookup("mutex").WriteTo(file, 0); err != nil { + log.Warningf("Error writing mutex profile: %v", err) + } + file.Close() + runtime.SetMutexProfileFraction(prev) + }) + } + + if args.TraceFD >= 0 { + file := os.NewFile(uintptr(args.TraceFD), "trace") + + trace.Start(file) + onStopProfiling = append(onStopProfiling, func() { + trace.Stop() + file.Close() + }) + } + + return stopProfiling +} diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go index c21648a32..cf5be34cd 100644 --- a/runsc/boot/strace.go +++ b/runsc/boot/strace.go @@ -35,9 +35,14 @@ func enableStrace(conf *config.Config) error { } strace.LogMaximumSize = max + sink := strace.SinkTypeLog + if conf.StraceEvent { + sink = strace.SinkTypeEvent + } + if len(conf.StraceSyscalls) == 0 { - strace.EnableAll(strace.SinkTypeLog) + strace.EnableAll(sink) return nil } - return strace.Enable(strings.Split(conf.StraceSyscalls, ","), strace.SinkTypeLog) + return strace.Enable(strings.Split(conf.StraceSyscalls, ","), sink) } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 346796d9c..ac1e5ac37 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -173,7 +173,7 @@ func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.Create rootProcArgs.Credentials = rootCreds rootProcArgs.Umask = 0022 rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals - rootCtx := procArgs.NewContext(c.k) + rootCtx := rootProcArgs.NewContext(c.k) mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds) if err != nil { @@ -208,7 +208,7 @@ func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.Create // createMountNamespaceVFS2 creates the container's root mount and namespace. func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { fd := c.fds.remove() - data := p9MountData(fd, conf.FileAccess, true /* vfs2 */) + data := goferMountData(fd, conf.FileAccess, "/", true /* vfs2 */, conf.Lisafs) // We can't check for overlayfs here because sandbox is chroot'ed and gofer // can only send mount options for specs.Mounts (specs.Root is missing @@ -515,7 +515,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo // but unlikely to be correct in this context. return "", nil, false, fmt.Errorf("9P mount requires a connection FD") } - data = p9MountData(m.fd, c.getMountAccessType(conf, m.mount), true /* vfs2 */) + data = goferMountData(m.fd, c.getMountAccessType(conf, m.mount), m.mount.Destination, true /* vfs2 */, conf.Lisafs) internalData = gofer.InternalFilesystemOptions{ UniqueID: m.mount.Destination, } diff --git a/runsc/cli/main.go b/runsc/cli/main.go index 3556d7665..058ab8232 100644 --- a/runsc/cli/main.go +++ b/runsc/cli/main.go @@ -228,7 +228,7 @@ func Main(version string) { log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay) log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets) log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls) - log.Infof("\t\tVFS2 enabled: %v", conf.VFS2) + log.Infof("\t\tVFS2 enabled: %t, LISAFS: %t", conf.VFS2, conf.Lisafs) log.Infof("***************************") if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 031ddd57e..c5e32807d 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -36,6 +36,7 @@ go_library( "statefile.go", "symbolize.go", "syscalls.go", + "usage.go", "verity_prepare.go", "wait.go", ], diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index f5c9821b2..e33a7f3cb 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -79,6 +79,26 @@ type Boot struct { // sandbox (e.g. gofer) and sent through this FD. mountsFD int + // profileBlockFD is the file descriptor to write a block profile to. + // Valid if >= 0. + profileBlockFD int + + // profileCPUFD is the file descriptor to write a CPU profile to. + // Valid if >= 0. + profileCPUFD int + + // profileHeapFD is the file descriptor to write a heap profile to. + // Valid if >= 0. + profileHeapFD int + + // profileMutexFD is the file descriptor to write a mutex profile to. + // Valid if >= 0. + profileMutexFD int + + // traceFD is the file descriptor to write a Go execution trace to. + // Valid if >= 0. + traceFD int + // pidns is set if the sandbox is in its own pid namespace. pidns bool @@ -119,6 +139,11 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.") f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup") f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).") + f.IntVar(&b.profileBlockFD, "profile-block-fd", -1, "file descriptor to write block profile to. -1 disables profiling.") + f.IntVar(&b.profileCPUFD, "profile-cpu-fd", -1, "file descriptor to write CPU profile to. -1 disables profiling.") + f.IntVar(&b.profileHeapFD, "profile-heap-fd", -1, "file descriptor to write heap profile to. -1 disables profiling.") + f.IntVar(&b.profileMutexFD, "profile-mutex-fd", -1, "file descriptor to write mutex profile to. -1 disables profiling.") + f.IntVar(&b.traceFD, "trace-fd", -1, "file descriptor to write Go execution trace to. -1 disables tracing.") f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates") } @@ -213,16 +238,21 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Create the loader. bootArgs := boot.Args{ - ID: f.Arg(0), - Spec: spec, - Conf: conf, - ControllerFD: b.controllerFD, - Device: os.NewFile(uintptr(b.deviceFD), "platform device"), - GoferFDs: b.ioFDs.GetArray(), - StdioFDs: b.stdioFDs.GetArray(), - NumCPU: b.cpuNum, - TotalMem: b.totalMem, - UserLogFD: b.userLogFD, + ID: f.Arg(0), + Spec: spec, + Conf: conf, + ControllerFD: b.controllerFD, + Device: os.NewFile(uintptr(b.deviceFD), "platform device"), + GoferFDs: b.ioFDs.GetArray(), + StdioFDs: b.stdioFDs.GetArray(), + NumCPU: b.cpuNum, + TotalMem: b.totalMem, + UserLogFD: b.userLogFD, + ProfileBlockFD: b.profileBlockFD, + ProfileCPUFD: b.profileCPUFD, + ProfileHeapFD: b.profileHeapFD, + ProfileMutexFD: b.profileMutexFD, + TraceFD: b.traceFD, } l, err := boot.New(bootArgs) if err != nil { diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index f773ccca0..318753728 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -37,9 +37,9 @@ type Debug struct { pid int stacks bool signal int - profileHeap string - profileCPU string profileBlock string + profileCPU string + profileHeap string profileMutex string trace string strace string @@ -70,9 +70,9 @@ func (*Debug) Usage() string { func (d *Debug) SetFlags(f *flag.FlagSet) { f.IntVar(&d.pid, "pid", 0, "sandbox process ID. Container ID is not necessary if this is set") f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log") - f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.") - f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.") f.StringVar(&d.profileBlock, "profile-block", "", "writes block profile to the given file.") + f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.") + f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.") f.StringVar(&d.profileMutex, "profile-mutex", "", "writes mutex profile to the given file.") f.DurationVar(&d.delay, "delay", time.Hour, "amount of time to delay for collecting heap and goroutine profiles.") f.DurationVar(&d.duration, "duration", time.Hour, "amount of time to wait for CPU and trace profiles.") @@ -90,6 +90,13 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) var c *container.Container conf := args[0].(*config.Config) + if conf.ProfileBlock != "" || conf.ProfileCPU != "" || conf.ProfileHeap != "" || conf.ProfileMutex != "" { + return Errorf("global -profile-{block,cpu,heap,mutex} flags have no effect on runsc debug. Pass runsc debug -profile-{block,cpu,heap,mutex} instead") + } + if conf.TraceFile != "" { + return Errorf("global -trace flag has no effect on runsc debug. Pass runsc debug -trace instead") + } + if d.pid == 0 { // No pid, container ID must have been provided. if f.NArg() != 1 { @@ -219,19 +226,19 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Open profiling files. var ( - heapFile *os.File - cpuFile *os.File - traceFile *os.File blockFile *os.File + cpuFile *os.File + heapFile *os.File mutexFile *os.File + traceFile *os.File ) - if d.profileHeap != "" { - f, err := os.OpenFile(d.profileHeap, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) + if d.profileBlock != "" { + f, err := os.OpenFile(d.profileBlock, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) if err != nil { - return Errorf("error opening heap profile output: %v", err) + return Errorf("error opening blocking profile output: %v", err) } defer f.Close() - heapFile = f + blockFile = f } if d.profileCPU != "" { f, err := os.OpenFile(d.profileCPU, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) @@ -241,20 +248,13 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) defer f.Close() cpuFile = f } - if d.trace != "" { - f, err := os.OpenFile(d.trace, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) - if err != nil { - return Errorf("error opening trace profile output: %v", err) - } - traceFile = f - } - if d.profileBlock != "" { - f, err := os.OpenFile(d.profileBlock, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) + if d.profileHeap != "" { + f, err := os.OpenFile(d.profileHeap, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) if err != nil { - return Errorf("error opening blocking profile output: %v", err) + return Errorf("error opening heap profile output: %v", err) } defer f.Close() - blockFile = f + heapFile = f } if d.profileMutex != "" { f, err := os.OpenFile(d.profileMutex, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) @@ -264,21 +264,28 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) defer f.Close() mutexFile = f } + if d.trace != "" { + f, err := os.OpenFile(d.trace, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) + if err != nil { + return Errorf("error opening trace profile output: %v", err) + } + traceFile = f + } // Collect profiles. var ( wg sync.WaitGroup - heapErr error - cpuErr error - traceErr error blockErr error + cpuErr error + heapErr error mutexErr error + traceErr error ) - if heapFile != nil { + if blockFile != nil { wg.Add(1) go func() { defer wg.Done() - heapErr = c.Sandbox.HeapProfile(heapFile, d.delay) + blockErr = c.Sandbox.BlockProfile(blockFile, d.duration) }() } if cpuFile != nil { @@ -288,25 +295,25 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) cpuErr = c.Sandbox.CPUProfile(cpuFile, d.duration) }() } - if traceFile != nil { + if heapFile != nil { wg.Add(1) go func() { defer wg.Done() - traceErr = c.Sandbox.Trace(traceFile, d.duration) + heapErr = c.Sandbox.HeapProfile(heapFile, d.delay) }() } - if blockFile != nil { + if mutexFile != nil { wg.Add(1) go func() { defer wg.Done() - blockErr = c.Sandbox.BlockProfile(blockFile, d.duration) + mutexErr = c.Sandbox.MutexProfile(mutexFile, d.duration) }() } - if mutexFile != nil { + if traceFile != nil { wg.Add(1) go func() { defer wg.Done() - mutexErr = c.Sandbox.MutexProfile(mutexFile, d.duration) + traceErr = c.Sandbox.Trace(traceFile, d.duration) }() } @@ -339,31 +346,31 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Collect all errors. errorCount := 0 - if heapErr != nil { + if blockErr != nil { errorCount++ - log.Infof("error collecting heap profile: %v", heapErr) - os.Remove(heapFile.Name()) + log.Infof("error collecting block profile: %v", blockErr) + os.Remove(blockFile.Name()) } if cpuErr != nil { errorCount++ log.Infof("error collecting cpu profile: %v", cpuErr) os.Remove(cpuFile.Name()) } - if traceErr != nil { - errorCount++ - log.Infof("error collecting trace profile: %v", traceErr) - os.Remove(traceFile.Name()) - } - if blockErr != nil { + if heapErr != nil { errorCount++ - log.Infof("error collecting block profile: %v", blockErr) - os.Remove(blockFile.Name()) + log.Infof("error collecting heap profile: %v", heapErr) + os.Remove(heapFile.Name()) } if mutexErr != nil { errorCount++ log.Infof("error collecting mutex profile: %v", mutexErr) os.Remove(mutexFile.Name()) } + if traceErr != nil { + errorCount++ + log.Infof("error collecting trace profile: %v", traceErr) + os.Remove(traceFile.Name()) + } if errorCount > 0 { return subcommands.ExitFailure diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index 6cf76f644..4eb5a96f1 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -130,7 +130,6 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su if conf.Network == config.NetworkNone { addNamespace(spec, specs.LinuxNamespace{Type: specs.NetworkNamespace}) - } else if conf.Rootless { if conf.Network == config.NetworkSandbox { c.notifyUser("*** Warning: sandbox network isn't supported with --rootless, switching to host ***") diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go index c1d029d7f..08246e543 100644 --- a/runsc/cmd/events.go +++ b/runsc/cmd/events.go @@ -33,6 +33,10 @@ type Events struct { intervalSec int // If true, events will print a single group of stats and exit. stats bool + // If true, events will dump all filtered events to stdout. + stream bool + // filters for streamed events. + filters stringSlice } // Name implements subcommands.Command.Name. @@ -62,6 +66,8 @@ OPTIONS: func (evs *Events) SetFlags(f *flag.FlagSet) { f.IntVar(&evs.intervalSec, "interval", 5, "set the stats collection interval, in seconds") f.BoolVar(&evs.stats, "stats", false, "display the container's stats then exit") + f.BoolVar(&evs.stream, "stream", false, "dump all filtered events to stdout") + f.Var(&evs.filters, "filters", "only display matching events") } // Execute implements subcommands.Command.Execute. @@ -79,6 +85,13 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa Fatalf("loading sandbox: %v", err) } + if evs.stream { + if err := c.Stream(evs.filters, os.Stdout); err != nil { + Fatalf("Stream failed: %v", err) + } + return subcommands.ExitSuccess + } + // Repeatedly get stats from the container. for { // Get the event and print it as JSON. diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 2193e9040..c65e0267a 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -71,8 +71,8 @@ func (*Gofer) Name() string { } // Synopsis implements subcommands.Command. -func (*Gofer) Synopsis() string { - return "launch a gofer process that serves files over 9P protocol (internal use only)" +func (g *Gofer) Synopsis() string { + return fmt.Sprintf("launch a gofer process that serves files over the protocol (9P or lisafs) defined in the config (internal use only)") } // Usage implements subcommands.Command. @@ -83,7 +83,7 @@ func (*Gofer) Usage() string { // SetFlags implements subcommands.Command. func (g *Gofer) SetFlags(f *flag.FlagSet) { f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") - f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") + f.Var(&g.ioFDs, "io-fds", "list of FDs to connect gofer servers. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process") f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec") @@ -160,10 +160,98 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } log.Infof("Process chroot'd to %q", root) + // Initialize filters. + if conf.FSGoferHostUDS { + filter.InstallUDSFilters() + } + + if conf.Verity { + filter.InstallXattrFilters() + } + + if err := filter.Install(); err != nil { + Fatalf("installing seccomp filters: %v", err) + } + + if conf.Lisafs { + return g.serveLisafs(spec, conf, root) + } + return g.serve9P(spec, conf, root) +} + +func newSocket(ioFD int) *unet.Socket { + socket, err := unet.NewSocket(ioFD) + if err != nil { + Fatalf("creating server on FD %d: %v", ioFD, err) + } + return socket +} + +func (g *Gofer) serveLisafs(spec *specs.Spec, conf *config.Config, root string) subcommands.ExitStatus { + type connectionConfig struct { + sock *unet.Socket + readonly bool + } + cfgs := make([]connectionConfig, 0, len(spec.Mounts)+1) + server := fsgofer.NewLisafsServer(fsgofer.Config{ + // These are global options. Ignore readonly configuration, that is set on + // a per connection basis. + HostUDS: conf.FSGoferHostUDS, + EnableVerityXattr: conf.Verity, + }) + + // Start with root mount, then add any other additional mount as needed. + cfgs = append(cfgs, connectionConfig{ + sock: newSocket(g.ioFDs[0]), + readonly: spec.Root.Readonly || conf.Overlay, + }) + log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, g.ioFDs[0], cfgs[0].readonly) + + mountIdx := 1 // first one is the root + for _, m := range spec.Mounts { + if !specutils.IsGoferMount(m, conf.VFS2) { + continue + } + + if !filepath.IsAbs(m.Destination) { + Fatalf("mount destination must be absolute: %q", m.Destination) + } + if mountIdx >= len(g.ioFDs) { + Fatalf("no FD found for mount. Did you forget --io-fd? FDs: %d, Mount: %+v", len(g.ioFDs), m) + } + + cfgs = append(cfgs, connectionConfig{ + sock: newSocket(g.ioFDs[mountIdx]), + readonly: isReadonlyMount(m.Options) || conf.Overlay, + }) + + log.Infof("Serving %q mapped on FD %d (ro: %t)", m.Destination, g.ioFDs[mountIdx], cfgs[mountIdx].readonly) + mountIdx++ + } + + if mountIdx != len(g.ioFDs) { + Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) + } + cfgs = cfgs[:mountIdx] + + for _, cfg := range cfgs { + conn, err := server.CreateConnection(cfg.sock, cfg.readonly) + if err != nil { + Fatalf("starting connection on FD %d for gofer mount failed: %v", cfg.sock.FD(), err) + } + server.StartConnection(conn) + } + server.Wait() + log.Infof("All lisafs servers exited.") + return subcommands.ExitSuccess +} + +func (g *Gofer) serve9P(spec *specs.Spec, conf *config.Config, root string) subcommands.ExitStatus { // Start with root mount, then add any other additional mount as needed. ats := make([]p9.Attacher, 0, len(spec.Mounts)+1) ap, err := fsgofer.NewAttachPoint("/", fsgofer.Config{ ROMount: spec.Root.Readonly || conf.Overlay, + HostUDS: conf.FSGoferHostUDS, EnableVerityXattr: conf.Verity, }) if err != nil { @@ -174,7 +262,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) mountIdx := 1 // first one is the root for _, m := range spec.Mounts { - if specutils.Is9PMount(m, conf.VFS2) { + if specutils.IsGoferMount(m, conf.VFS2) { cfg := fsgofer.Config{ ROMount: isReadonlyMount(m.Options) || conf.Overlay, HostUDS: conf.FSGoferHostUDS, @@ -197,26 +285,9 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) } - if conf.FSGoferHostUDS { - filter.InstallUDSFilters() - } - - if conf.Verity { - filter.InstallXattrFilters() - } - - if err := filter.Install(); err != nil { - Fatalf("installing seccomp filters: %v", err) - } - - runServers(ats, g.ioFDs) - return subcommands.ExitSuccess -} - -func runServers(ats []p9.Attacher, ioFDs []int) { // Run the loops and wait for all to exit. var wg sync.WaitGroup - for i, ioFD := range ioFDs { + for i, ioFD := range g.ioFDs { wg.Add(1) go func(ioFD int, at p9.Attacher) { socket, err := unet.NewSocket(ioFD) @@ -232,6 +303,7 @@ func runServers(ats []p9.Attacher, ioFDs []int) { } wg.Wait() log.Infof("All 9P servers exited.") + return subcommands.ExitSuccess } func (g *Gofer) writeMounts(mounts []specs.Mount) error { @@ -362,7 +434,7 @@ func setupRootFS(spec *specs.Spec, conf *config.Config) error { // creates directories as needed. func setupMounts(conf *config.Config, mounts []specs.Mount, root, procPath string) error { for _, m := range mounts { - if !specutils.Is9PMount(m, conf.VFS2) { + if !specutils.IsGoferMount(m, conf.VFS2) { continue } @@ -402,7 +474,7 @@ func setupMounts(conf *config.Config, mounts []specs.Mount, root, procPath strin func resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) { cleanMounts := make([]specs.Mount, 0, len(mounts)) for _, m := range mounts { - if !specutils.Is9PMount(m, conf.VFS2) { + if !specutils.IsGoferMount(m, conf.VFS2) { cleanMounts = append(cleanMounts, m) continue } diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index 722181aff..da11c9d06 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -68,7 +68,14 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s waitStatus := args[1].(*unix.WaitStatus) if conf.Rootless { - return Errorf("Rootless mode not supported with %q", r.Name()) + if conf.Network == config.NetworkSandbox { + return Errorf("sandbox network isn't supported with --rootless, use --network=none or --network=host") + } + + if err := specutils.MaybeRunAsRoot(); err != nil { + return Errorf("Error executing inside namespace: %v", err) + } + // Execution will continue here if no more capabilities are needed... } bundleDir := r.bundleDir diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go index 55194e641..3a7f2a2c4 100644 --- a/runsc/cmd/spec.go +++ b/runsc/cmd/spec.go @@ -30,7 +30,6 @@ func writeSpec(w io.Writer, cwd string, netns string, args []string) error { spec := &specs.Spec{ Version: "1.0.0", Process: &specs.Process{ - Terminal: true, User: specs.User{ UID: 0, GID: 0, diff --git a/runsc/cmd/usage.go b/runsc/cmd/usage.go new file mode 100644 index 000000000..d2aeafa28 --- /dev/null +++ b/runsc/cmd/usage.go @@ -0,0 +1,93 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "encoding/json" + "fmt" + "os" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/runsc/config" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" +) + +// Usage implements subcommands.Command for the "usage" command. +type Usage struct { + full bool + fd bool +} + +// Name implements subcommands.Command.Name. +func (*Usage) Name() string { + return "usage" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Usage) Synopsis() string { + return "Usage shows application memory usage across various categories in bytes." +} + +// Usage implements subcommands.Command.Usage. +func (*Usage) Usage() string { + return `usage [flags] <container id> - print memory usages to standard output.` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (u *Usage) SetFlags(f *flag.FlagSet) { + f.BoolVar(&u.full, "full", false, "enumerate all usage by categories") + f.BoolVar(&u.fd, "fd", false, "retrieves a subset of usage through the established usage FD") +} + +// Execute implements subcommands.Command.Execute. +func (u *Usage) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() < 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*config.Config) + + cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) + if err != nil { + Fatalf("loading container: %v", err) + } + + if !u.fd { + m, err := cont.Usage(u.full) + if err != nil { + Fatalf("usage failed: %v", err) + } + if err := json.NewEncoder(os.Stdout).Encode(m); err != nil { + Fatalf("Encode MemoryUsage failed: %v", err) + } + } else { + m, err := cont.UsageFD() + if err != nil { + Fatalf("usagefd failed: %v", err) + } + + mapped, unknown, total, err := m.Fetch() + if err != nil { + Fatalf("Fetch memory usage failed: %v", err) + } + + fmt.Printf("Mapped %v, Unknown %v, Total %v\n", mapped, unknown, total) + } + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/verity_prepare.go b/runsc/cmd/verity_prepare.go index 85d762a51..44c1d05db 100644 --- a/runsc/cmd/verity_prepare.go +++ b/runsc/cmd/verity_prepare.go @@ -82,7 +82,7 @@ func (c *VerityPrepare) Execute(_ context.Context, f *flag.FlagSet, args ...inte }, Process: &specs.Process{ Cwd: absRoot, - Args: []string{c.tool, "--path", "/verityroot"}, + Args: []string{c.tool, "--path", "/verityroot", "--rawpath", "/rawroot"}, Env: os.Environ(), Capabilities: specutils.AllCapabilities(), }, @@ -94,6 +94,11 @@ func (c *VerityPrepare) Execute(_ context.Context, f *flag.FlagSet, args ...inte Type: "bind", Options: []string{"verity.roothash="}, }, + { + Source: c.dir, + Destination: "/rawroot", + Type: "bind", + }, }, } diff --git a/runsc/config/BUILD b/runsc/config/BUILD index b1672bb9d..64295d283 100644 --- a/runsc/config/BUILD +++ b/runsc/config/BUILD @@ -11,6 +11,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/refs", + "//pkg/sentry/control:control_go_proto", "//pkg/sentry/watchdog", "//pkg/sync", "//runsc/flag", @@ -24,5 +25,8 @@ go_test( "config_test.go", ], library = ":config", - deps = ["//runsc/flag"], + deps = [ + "//pkg/sentry/control:control_go_proto", + "//runsc/flag", + ], ) diff --git a/runsc/config/config.go b/runsc/config/config.go index cc4650180..91142888f 100644 --- a/runsc/config/config.go +++ b/runsc/config/config.go @@ -19,8 +19,10 @@ package config import ( "fmt" + "strings" "gvisor.dev/gvisor/pkg/refs" + controlpb "gvisor.dev/gvisor/pkg/sentry/control/control_go_proto" "gvisor.dev/gvisor/pkg/sentry/watchdog" ) @@ -84,6 +86,9 @@ type Config struct { // capabilities. EnableRaw bool `flag:"net-raw"` + // AllowPacketEndpointWrite enables write operations on packet endpoints. + AllowPacketEndpointWrite bool `flag:"TESTONLY-allow-packet-endpoint-write"` + // HardwareGSO indicates that hardware segmentation offload is enabled. HardwareGSO bool `flag:"gso"` @@ -117,6 +122,10 @@ type Config struct { // StraceLogSize is the max size of data blobs to display. StraceLogSize uint `flag:"strace-log-size"` + // StraceEvent indicates sending strace to events if true. Strace is + // sent to log if false. + StraceEvent bool `flag:"strace-event"` + // DisableSeccomp indicates whether seccomp syscall filters should be // disabled. Pardon the double negation, but default to enabled is important. DisableSeccomp bool @@ -131,6 +140,29 @@ type Config struct { // ProfileEnable is set to prepare the sandbox to be profiled. ProfileEnable bool `flag:"profile"` + // ProfileBlock collects a block profile to the passed file for the + // duration of the container execution. Requires ProfileEnabled. + ProfileBlock string `flag:"profile-block"` + + // ProfileCPU collects a CPU profile to the passed file for the + // duration of the container execution. Requires ProfileEnabled. + ProfileCPU string `flag:"profile-cpu"` + + // ProfileHeap collects a heap profile to the passed file for the + // duration of the container execution. Requires ProfileEnabled. + ProfileHeap string `flag:"profile-heap"` + + // ProfileMutex collects a mutex profile to the passed file for the + // duration of the container execution. Requires ProfileEnabled. + ProfileMutex string `flag:"profile-mutex"` + + // TraceFile collects a Go runtime execution trace to the passed file + // for the duration of the container execution. + TraceFile string `flag:"trace"` + + // Controls defines the controls that may be enabled. + Controls controlConfig `flag:"controls"` + // RestoreFile is the path to the saved container image RestoreFile string @@ -161,6 +193,9 @@ type Config struct { // Enables VFS2. VFS2 bool `flag:"vfs2"` + // Enable lisafs. + Lisafs bool `flag:"lisafs"` + // Enables FUSE usage. FUSE bool `flag:"fuse"` @@ -195,6 +230,21 @@ func (c *Config) validate() error { if c.NumNetworkChannels <= 0 { return fmt.Errorf("num_network_channels must be > 0, got: %d", c.NumNetworkChannels) } + // Require profile flags to explicitly opt-in to profiling with + // -profile rather than implying it since these options have security + // implications. + if c.ProfileBlock != "" && !c.ProfileEnable { + return fmt.Errorf("profile-block flag requires enabling profiling with profile flag") + } + if c.ProfileCPU != "" && !c.ProfileEnable { + return fmt.Errorf("profile-cpu flag requires enabling profiling with profile flag") + } + if c.ProfileHeap != "" && !c.ProfileEnable { + return fmt.Errorf("profile-heap flag requires enabling profiling with profile flag") + } + if c.ProfileMutex != "" && !c.ProfileEnable { + return fmt.Errorf("profile-mutex flag requires enabling profiling with profile flag") + } return nil } @@ -347,6 +397,96 @@ func (q QueueingDiscipline) String() string { panic(fmt.Sprintf("Invalid qdisc %d", q)) } +// controlConfig represents control endpoints. +type controlConfig struct { + Controls *controlpb.ControlConfig +} + +// Set implements flag.Value. +func (c *controlConfig) Set(v string) error { + controls := strings.Split(v, ",") + var controlList []controlpb.ControlConfig_Endpoint + for _, control := range controls { + switch control { + case "EVENTS": + controlList = append(controlList, controlpb.ControlConfig_EVENTS) + case "FS": + controlList = append(controlList, controlpb.ControlConfig_FS) + case "LIFECYCLE": + controlList = append(controlList, controlpb.ControlConfig_LIFECYCLE) + case "LOGGING": + controlList = append(controlList, controlpb.ControlConfig_LOGGING) + case "PROFILE": + controlList = append(controlList, controlpb.ControlConfig_PROFILE) + case "USAGE": + controlList = append(controlList, controlpb.ControlConfig_USAGE) + case "PROC": + controlList = append(controlList, controlpb.ControlConfig_PROC) + case "STATE": + controlList = append(controlList, controlpb.ControlConfig_STATE) + case "DEBUG": + controlList = append(controlList, controlpb.ControlConfig_DEBUG) + default: + return fmt.Errorf("invalid control %q", control) + } + } + c.Controls.AllowedControls = controlList + return nil +} + +// Get implements flag.Value. +func (c *controlConfig) Get() interface{} { + return *c +} + +// String implements flag.Value. +func (c *controlConfig) String() string { + v := "" + for _, control := range c.Controls.GetAllowedControls() { + if len(v) > 0 { + v += "," + } + switch control { + case controlpb.ControlConfig_EVENTS: + v += "EVENTS" + case controlpb.ControlConfig_FS: + v += "FS" + case controlpb.ControlConfig_LIFECYCLE: + v += "LIFECYCLE" + case controlpb.ControlConfig_LOGGING: + v += "LOGGING" + case controlpb.ControlConfig_PROFILE: + v += "PROFILE" + case controlpb.ControlConfig_USAGE: + v += "USAGE" + case controlpb.ControlConfig_PROC: + v += "PROC" + case controlpb.ControlConfig_STATE: + v += "STATE" + case controlpb.ControlConfig_DEBUG: + v += "DEBUG" + default: + panic(fmt.Sprintf("Invalid control %d", control)) + } + } + return v +} + +func defaultControlConfig() *controlConfig { + c := controlConfig{} + c.Controls = &controlpb.ControlConfig{} + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_EVENTS) + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_FS) + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_LIFECYCLE) + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_LOGGING) + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_PROFILE) + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_USAGE) + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_PROC) + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_STATE) + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_DEBUG) + return &c +} + func leakModePtr(v refs.LeakMode) *refs.LeakMode { return &v } diff --git a/runsc/config/config_test.go b/runsc/config/config_test.go index 80ff2c0a6..57c241c86 100644 --- a/runsc/config/config_test.go +++ b/runsc/config/config_test.go @@ -18,6 +18,7 @@ import ( "strings" "testing" + controlpb "gvisor.dev/gvisor/pkg/sentry/control/control_go_proto" "gvisor.dev/gvisor/runsc/flag" ) @@ -59,6 +60,9 @@ func TestFromFlags(t *testing.T) { if err := flag.CommandLine.Lookup("network").Value.Set("none"); err != nil { t.Errorf("Flag set: %v", err) } + if err := flag.CommandLine.Lookup("controls").Value.Set("EVENTS,FS"); err != nil { + t.Errorf("Flag set: %v", err) + } defer func() { if err := setDefault("root"); err != nil { t.Errorf("Flag set: %v", err) @@ -72,6 +76,9 @@ func TestFromFlags(t *testing.T) { if err := setDefault("network"); err != nil { t.Errorf("Flag set: %v", err) } + if err := setDefault("controls"); err != nil { + t.Errorf("Flag set: %v", err) + } }() c, err := NewFromFlags() @@ -90,6 +97,12 @@ func TestFromFlags(t *testing.T) { if want := NetworkNone; c.Network != want { t.Errorf("Network=%v, want: %v", c.Network, want) } + wants := []controlpb.ControlConfig_Endpoint{controlpb.ControlConfig_EVENTS, controlpb.ControlConfig_FS} + for i, want := range wants { + if c.Controls.Controls.AllowedControls[i] != want { + t.Errorf("Controls.Controls.AllowedControls[%d]=%v, want: %v", i, c.Controls.Controls.AllowedControls[i], want) + } + } } func TestToFlags(t *testing.T) { @@ -101,10 +114,15 @@ func TestToFlags(t *testing.T) { c.Debug = true c.NumNetworkChannels = 123 c.Network = NetworkNone + c.Controls = controlConfig{ + Controls: &controlpb.ControlConfig{ + AllowedControls: []controlpb.ControlConfig_Endpoint{controlpb.ControlConfig_EVENTS, controlpb.ControlConfig_FS}, + }, + } flags := c.ToFlags() - if len(flags) != 4 { - t.Errorf("wrong number of flags set, want: 4, got: %d: %s", len(flags), flags) + if len(flags) != 5 { + t.Errorf("wrong number of flags set, want: 5, got: %d: %s", len(flags), flags) } t.Logf("Flags: %s", flags) fm := map[string]string{} @@ -117,6 +135,7 @@ func TestToFlags(t *testing.T) { "--debug": "true", "--num-network-channels": "123", "--network": "none", + "--controls": "EVENTS,FS", } { if got, ok := fm[name]; ok { if got != want { diff --git a/runsc/config/flags.go b/runsc/config/flags.go index 6f1b5927a..11cea71b8 100644 --- a/runsc/config/flags.go +++ b/runsc/config/flags.go @@ -56,16 +56,23 @@ func RegisterFlags() { flag.Bool("strace", false, "enable strace.") flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.") flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.") + flag.Bool("strace-event", false, "send strace to event.") // Flags that control sandbox runtime behavior. flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.") flag.Var(watchdogActionPtr(watchdog.LogWarning), "watchdog-action", "sets what action the watchdog takes when triggered: log (default), panic.") flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") + flag.String("profile-block", "", "collects a block profile to this file path for the duration of the container execution. Requires -profile=true.") + flag.String("profile-cpu", "", "collects a CPU profile to this file path for the duration of the container execution. Requires -profile=true.") + flag.String("profile-heap", "", "collects a heap profile to this file path for the duration of the container execution. Requires -profile=true.") + flag.String("profile-mutex", "", "collects a mutex profile to this file path for the duration of the container execution. Requires -profile=true.") + flag.String("trace", "", "collects a Go runtime execution trace to this file path for the duration of the container execution.") flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.") flag.Var(leakModePtr(refs.NoLeakChecking), "ref-leak-mode", "sets reference leak check mode: disabled (default), log-names, log-traces.") flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)") flag.Bool("oci-seccomp", false, "Enables loading OCI seccomp filters inside the sandbox.") + flag.Var(defaultControlConfig(), "controls", "Sentry control endpoints.") // Flags that control sandbox runtime behavior: FS related. flag.Var(fileAccessTypePtr(FileAccessExclusive), "file-access", "specifies which filesystem validation to use for the root mount: exclusive (default), shared.") @@ -75,6 +82,7 @@ func RegisterFlags() { flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.") flag.Bool("vfs2", false, "enables VFSv2. This uses the new VFS layer that is faster than the previous one.") flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.") + flag.Bool("lisafs", false, "Enables lisafs protocol instead of 9P. This is only effective with VFS2.") flag.Bool("cgroupfs", false, "Automatically mount cgroupfs.") // Flags that control sandbox runtime behavior: network related. @@ -90,6 +98,7 @@ func RegisterFlags() { // Test flags, not to be used outside tests, ever. flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.") + flag.Bool("TESTONLY-allow-packet-endpoint-write", false, "TEST ONLY; do not ever use! Used for tests to allow writes on packet sockets.") }) } diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 5314549d6..4e744e604 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -19,7 +19,7 @@ go_library( "//pkg/cleanup", "//pkg/log", "//pkg/sentry/control", - "//pkg/sentry/sighandling", + "//pkg/sighandling", "//pkg/sync", "//runsc/boot", "//runsc/cgroup", diff --git a/runsc/container/container.go b/runsc/container/container.go index d1f979eb2..7f991444e 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -35,7 +35,7 @@ import ( "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" - "gvisor.dev/gvisor/pkg/sentry/sighandling" + "gvisor.dev/gvisor/pkg/sighandling" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/cgroup" "gvisor.dev/gvisor/runsc/config" @@ -652,6 +652,30 @@ func (c *Container) Cat(files []string, out *os.File) error { return c.Sandbox.Cat(c.ID, files, out) } +// Usage displays memory used by the application. +func (c *Container) Usage(full bool) (control.MemoryUsage, error) { + log.Debugf("Usage in container, cid: %s, full: %v", c.ID, full) + return c.Sandbox.Usage(c.ID, full) +} + +// UsageFD shows application memory usage using two donated FDs. +func (c *Container) UsageFD() (*control.MemoryUsageRecord, error) { + log.Debugf("UsageFD in container, cid: %s", c.ID) + return c.Sandbox.UsageFD(c.ID) +} + +// Reduce requests that the sentry attempt to reduce its memory usage. +func (c *Container) Reduce(wait bool) error { + log.Debugf("Reduce in container, cid: %s", c.ID) + return c.Sandbox.Reduce(c.ID, wait) +} + +// Stream dumps all events to out. +func (c *Container) Stream(filters []string, out *os.File) error { + log.Debugf("Stream in container, cid: %s", c.ID) + return c.Sandbox.Stream(c.ID, filters, out) +} + // State returns the metadata of the container. func (c *Container) State() specs.State { return specs.State{ @@ -893,7 +917,7 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bu // Add root mount and then add any other additional mounts. mountCount := 1 for _, m := range spec.Mounts { - if specutils.Is9PMount(m, conf.VFS2) { + if specutils.IsGoferMount(m, conf.VFS2) { mountCount++ } } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 960c36946..69dcf3f03 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -2655,3 +2655,220 @@ func TestCat(t *testing.T) { t.Errorf("out got %s, want include %s", buf, want) } } + +// TestUsage checks that usage generates the expected memory usage. +func TestUsage(t *testing.T) { + spec, conf := sleepSpecConf(t) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + + cont, err := New(conf, args) + if err != nil { + t.Fatalf("Creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("starting container: %v", err) + } + + for _, full := range []bool{false, true} { + m, err := cont.Usage(full) + if err != nil { + t.Fatalf("error usage from container: %v", err) + } + if m.Mapped == 0 { + t.Errorf("Usage mapped got zero") + } + if m.Total == 0 { + t.Errorf("Usage total got zero") + } + if full { + if m.System == 0 { + t.Errorf("Usage system got zero") + } + if m.Anonymous == 0 { + t.Errorf("Usage anonymous got zero") + } + } + } +} + +// TestUsageFD checks that usagefd generates the expected memory usage. +func TestUsageFD(t *testing.T) { + spec, conf := sleepSpecConf(t) + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + + cont, err := New(conf, args) + if err != nil { + t.Fatalf("Creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("starting container: %v", err) + } + + m, err := cont.UsageFD() + if err != nil { + t.Fatalf("error usageFD from container: %v", err) + } + + mapped, unknown, total, err := m.Fetch() + if err != nil { + t.Fatalf("error Fetch memory usage: %v", err) + } + + if mapped == 0 { + t.Errorf("UsageFD Mapped got zero") + } + if unknown == 0 { + t.Errorf("UsageFD unknown got zero") + } + if total == 0 { + t.Errorf("UsageFD total got zero") + } +} + +// TestReduce checks that reduce call succeeds. +func TestReduce(t *testing.T) { + spec, conf := sleepSpecConf(t) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + + cont, err := New(conf, args) + if err != nil { + t.Fatalf("Creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("starting container: %v", err) + } + + if err := cont.Reduce(false); err != nil { + t.Fatalf("error reduce from container: %v", err) + } +} + +// TestStream checks that Stream dumps expected events. +func TestStream(t *testing.T) { + spec, conf := sleepSpecConf(t) + conf.Strace = true + conf.StraceEvent = true + conf.StraceSyscalls = "" + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + + cont, err := New(conf, args) + if err != nil { + t.Fatalf("Creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("starting container: %v", err) + } + + r, w, err := os.Pipe() + if err != nil { + t.Fatalf("os.Create(): %v", err) + } + + // Spawn a new thread to Stream events as it blocks indefinitely. + go func() { + cont.Stream(nil, w) + }() + + buf := make([]byte, 1024) + if _, err := r.Read(buf); err != nil { + t.Fatalf("Read out: %v", err) + } + + // A syscall strace event includes "Strace". + if got, want := string(buf), "Strace"; !strings.Contains(got, want) { + t.Errorf("out got %s, want include %s", buf, want) + } +} + +// TestProfile checks that profiling options generate profiles. +func TestProfile(t *testing.T) { + // Perform a non-trivial amount of work so we actually capture + // something in the profiles. + spec := testutil.NewSpecWithArgs("/bin/bash", "-c", "true") + conf := testutil.TestConfig(t) + conf.ProfileEnable = true + conf.ProfileBlock = filepath.Join(t.TempDir(), "block.pprof") + conf.ProfileCPU = filepath.Join(t.TempDir(), "cpu.pprof") + conf.ProfileHeap = filepath.Join(t.TempDir(), "heap.pprof") + conf.ProfileMutex = filepath.Join(t.TempDir(), "mutex.pprof") + conf.TraceFile = filepath.Join(t.TempDir(), "trace.out") + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + Attached: true, + } + + _, err = Run(conf, args) + if err != nil { + t.Fatalf("Creating container: %v", err) + } + + // Basic test; simply assert that the profiles are not empty. + for _, name := range []string{conf.ProfileBlock, conf.ProfileCPU, conf.ProfileHeap, conf.ProfileMutex, conf.TraceFile} { + fi, err := os.Stat(name) + if err != nil { + t.Fatalf("Unable to stat profile file %s: %v", name, err) + } + if fi.Size() == 0 { + t.Errorf("Profile file %s is empty: %+v", name, fi) + } + } +} diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 3280b74fe..8d5a6d300 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -9,12 +9,16 @@ go_library( "fsgofer_amd64_unsafe.go", "fsgofer_arm64_unsafe.go", "fsgofer_unsafe.go", + "lisafs.go", ], visibility = ["//runsc:__subpackages__"], deps = [ + "//pkg/abi/linux", "//pkg/cleanup", "//pkg/fd", + "//pkg/lisafs", "//pkg/log", + "//pkg/marshal/primitive", "//pkg/p9", "//pkg/sync", "//pkg/syserr", @@ -37,3 +41,15 @@ go_test( "@org_golang_x_sys//unix:go_default_library", ], ) + +go_test( + name = "lisafs_test", + size = "small", + srcs = ["lisafs_test.go"], + deps = [ + ":fsgofer", + "//pkg/lisafs", + "//pkg/lisafs/testsuite", + "//pkg/log", + ], +) diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 07497e47b..600b21189 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -1242,13 +1242,14 @@ func (l *localFile) MultiGetAttr(names []string) ([]p9.FullStat, error) { } parent := l.file.FD() - for _, name := range names { - child, err := unix.Openat(parent, name, openFlags|unix.O_PATH, 0) + closeParent := func() { if parent != l.file.FD() { - // Parent is no longer needed. _ = unix.Close(parent) - parent = -1 } + } + defer closeParent() + for _, name := range names { + child, err := unix.Openat(parent, name, openFlags|unix.O_PATH, 0) if err != nil { if errors.Is(err, unix.ENOENT) { // No pont in continuing any further. @@ -1256,10 +1257,11 @@ func (l *localFile) MultiGetAttr(names []string) ([]p9.FullStat, error) { } return nil, err } + closeParent() + parent = child var stat unix.Stat_t if err := unix.Fstat(child, &stat); err != nil { - _ = unix.Close(child) return nil, err } valid, attr := l.fillAttr(&stat) @@ -1271,13 +1273,9 @@ func (l *localFile) MultiGetAttr(names []string) ([]p9.FullStat, error) { if (stat.Mode & unix.S_IFMT) != unix.S_IFDIR { // Doesn't need to continue if entry is not a dir. Including symlinks // that cannot be followed. - _ = unix.Close(child) break } parent = child } - if parent != -1 && parent != l.file.FD() { - _ = unix.Close(parent) - } return stats, nil } diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index ee6cc97df..6cdd6d695 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -105,14 +105,14 @@ func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error { return nil } -type state struct { +type fileState struct { root *localFile file *localFile conf Config fileType uint32 } -func (s state) String() string { +func (s fileState) String() string { return fmt.Sprintf("type(%v)", s.fileType) } @@ -129,11 +129,11 @@ func typeName(fileType uint32) string { } } -func runAll(t *testing.T, test func(*testing.T, state)) { +func runAll(t *testing.T, test func(*testing.T, fileState)) { runCustom(t, allTypes, allConfs, test) } -func runCustom(t *testing.T, types []uint32, confs []Config, test func(*testing.T, state)) { +func runCustom(t *testing.T, types []uint32, confs []Config, test func(*testing.T, fileState)) { for _, c := range confs { for _, ft := range types { name := fmt.Sprintf("%s/%s", configTestName(&c), typeName(ft)) @@ -159,7 +159,7 @@ func runCustom(t *testing.T, types []uint32, confs []Config, test func(*testing. t.Fatalf("root.Walk({%q}) failed, err: %v", "symlink", err) } - st := state{ + st := fileState{ root: root.(*localFile), file: file.(*localFile), conf: c, @@ -227,7 +227,7 @@ func createFile(dir *localFile, name string) (*localFile, error) { } func TestReadWrite(t *testing.T) { - runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s fileState) { child, err := createFile(s.file, "test") if err != nil { t.Fatalf("%v: createFile() failed, err: %v", s, err) @@ -261,7 +261,7 @@ func TestReadWrite(t *testing.T) { } func TestCreate(t *testing.T) { - runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s fileState) { for i, flags := range allOpenFlags { _, l, _, _, err := s.file.Create(fmt.Sprintf("test-%d", i), flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) if err != nil { @@ -296,7 +296,7 @@ func TestCreateSetGID(t *testing.T) { t.Skipf("Test requires CAP_CHOWN") } - runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s fileState) { // Change group and set setgid to the parent dir. if err := unix.Chown(s.file.hostPath, os.Getuid(), nobody); err != nil { t.Fatalf("Chown() failed: %v", err) @@ -364,7 +364,7 @@ func TestCreateSetGID(t *testing.T) { // TestReadWriteDup tests that a file opened in any mode can be dup'ed and // reopened in any other mode. func TestReadWriteDup(t *testing.T) { - runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s fileState) { child, err := createFile(s.file, "test") if err != nil { t.Fatalf("%v: createFile() failed, err: %v", s, err) @@ -410,7 +410,7 @@ func TestReadWriteDup(t *testing.T) { } func TestUnopened(t *testing.T) { - runCustom(t, []uint32{unix.S_IFREG}, allConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFREG}, allConfs, func(t *testing.T, s fileState) { b := []byte("foobar") if _, err := s.file.WriteAt(b, 0); err != unix.EBADF { t.Errorf("%v: WriteAt() should have failed, got: %v, expected: unix.EBADF", s, err) @@ -432,7 +432,7 @@ func TestUnopened(t *testing.T) { // was open with O_PATH, but Open() was not checking for it and allowing the // control file to be reused. func TestOpenOPath(t *testing.T) { - runCustom(t, []uint32{unix.S_IFREG}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFREG}, rwConfs, func(t *testing.T, s fileState) { // Fist remove all permissions on the file. if err := s.file.SetAttr(p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(0)}); err != nil { t.Fatalf("SetAttr(): %v", err) @@ -465,7 +465,7 @@ func SetGetAttr(l *localFile, valid p9.SetAttrMask, attr p9.SetAttr) (p9.Attr, e } func TestSetAttrPerm(t *testing.T) { - runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + runCustom(t, allTypes, rwConfs, func(t *testing.T, s fileState) { valid := p9.SetAttrMask{Permissions: true} attr := p9.SetAttr{Permissions: 0777} got, err := SetGetAttr(s.file, valid, attr) @@ -485,7 +485,7 @@ func TestSetAttrPerm(t *testing.T) { } func TestSetAttrSize(t *testing.T) { - runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + runCustom(t, allTypes, rwConfs, func(t *testing.T, s fileState) { for _, size := range []uint64{1024, 0, 1024 * 1024} { valid := p9.SetAttrMask{Size: true} attr := p9.SetAttr{Size: size} @@ -508,7 +508,7 @@ func TestSetAttrSize(t *testing.T) { } func TestSetAttrTime(t *testing.T) { - runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + runCustom(t, allTypes, rwConfs, func(t *testing.T, s fileState) { valid := p9.SetAttrMask{ATime: true, ATimeNotSystemTime: true} attr := p9.SetAttr{ATimeSeconds: 123, ATimeNanoSeconds: 456} got, err := SetGetAttr(s.file, valid, attr) @@ -542,7 +542,7 @@ func TestSetAttrOwner(t *testing.T) { t.Skipf("SetAttr(owner) test requires CAP_CHOWN, running as %d", os.Getuid()) } - runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + runCustom(t, allTypes, rwConfs, func(t *testing.T, s fileState) { newUID := os.Getuid() + 1 valid := p9.SetAttrMask{UID: true} attr := p9.SetAttr{UID: p9.UID(newUID)} @@ -571,7 +571,7 @@ func SetGetXattr(l *localFile, name string, value string) error { } func TestSetGetDisabledXattr(t *testing.T) { - runCustom(t, []uint32{unix.S_IFREG}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFREG}, rwConfs, func(t *testing.T, s fileState) { name := "user.merkle.offset" value := "tmp" err := SetGetXattr(s.file, name, value) @@ -582,7 +582,7 @@ func TestSetGetDisabledXattr(t *testing.T) { } func TestSetGetXattr(t *testing.T) { - runCustom(t, []uint32{unix.S_IFREG}, []Config{{ROMount: false, EnableVerityXattr: true}}, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFREG}, []Config{{ROMount: false, EnableVerityXattr: true}}, func(t *testing.T, s fileState) { name := "user.merkle.offset" value := "tmp" err := SetGetXattr(s.file, name, value) @@ -596,7 +596,7 @@ func TestLink(t *testing.T) { if !specutils.HasCapabilities(capability.CAP_DAC_READ_SEARCH) { t.Skipf("Link test requires CAP_DAC_READ_SEARCH, running as %d", os.Getuid()) } - runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + runCustom(t, allTypes, rwConfs, func(t *testing.T, s fileState) { const dirName = "linkdir" const linkFile = "link" if _, err := s.root.Mkdir(dirName, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { @@ -625,7 +625,7 @@ func TestROMountChecks(t *testing.T) { uid := p9.UID(os.Getuid()) gid := p9.GID(os.Getgid()) - runCustom(t, allTypes, roConfs, func(t *testing.T, s state) { + runCustom(t, allTypes, roConfs, func(t *testing.T, s fileState) { if s.fileType != unix.S_IFLNK { if _, _, _, err := s.file.Open(p9.WriteOnly); err != want { t.Errorf("Open() should have failed, got: %v, expected: %v", err, want) @@ -676,7 +676,7 @@ func TestROMountChecks(t *testing.T) { } func TestWalkNotFound(t *testing.T) { - runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s fileState) { if _, _, err := s.file.Walk([]string{"nobody-here"}); err != unix.ENOENT { t.Errorf("Walk(%q) should have failed, got: %v, expected: unix.ENOENT", "nobody-here", err) } @@ -695,7 +695,7 @@ func TestWalkNotFound(t *testing.T) { } func TestWalkDup(t *testing.T) { - runAll(t, func(t *testing.T, s state) { + runAll(t, func(t *testing.T, s fileState) { _, dup, err := s.file.Walk([]string{}) if err != nil { t.Fatalf("%v: Walk(nil) failed, err: %v", s, err) @@ -708,7 +708,7 @@ func TestWalkDup(t *testing.T) { } func TestWalkMultiple(t *testing.T) { - runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s fileState) { var names []string var parent p9.File = s.file for i := 0; i < 5; i++ { @@ -729,7 +729,7 @@ func TestWalkMultiple(t *testing.T) { } func TestReaddir(t *testing.T) { - runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s fileState) { name := "dir" if _, err := s.file.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { t.Fatalf("%v: MkDir(%s) failed, err: %v", s, name, err) @@ -915,7 +915,7 @@ func TestDoubleAttachError(t *testing.T) { } func TestTruncate(t *testing.T) { - runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s fileState) { child, err := createFile(s.file, "test") if err != nil { t.Fatalf("createFile() failed: %v", err) @@ -951,7 +951,7 @@ func TestTruncate(t *testing.T) { } func TestMknod(t *testing.T) { - runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s fileState) { _, err := s.file.Mknod("test", p9.ModeRegular|0777, 1, 2, p9.UID(os.Getuid()), p9.GID(os.Getgid())) if err != nil { t.Fatalf("Mknod() failed: %v", err) diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go index f11fea40d..fb4fbe0d2 100644 --- a/runsc/fsgofer/fsgofer_unsafe.go +++ b/runsc/fsgofer/fsgofer_unsafe.go @@ -21,6 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/syserr" ) +var unixDirentMaxSize uint32 = uint32(unsafe.Sizeof(unix.Dirent{})) + func utimensat(dirFd int, name string, times [2]unix.Timespec, flags int) error { // utimensat(2) doesn't accept empty name, instead name must be nil to make it // operate directly on 'dirFd' unlike other *at syscalls. @@ -80,3 +82,31 @@ func renameat(oldDirFD int, oldName string, newDirFD int, newName string) error } return nil } + +func parseDirents(buf []byte, handleDirent func(ino uint64, off int64, ftype uint8, name string) bool) { + for len(buf) > 0 { + // Interpret the buf populated by unix.Getdents as unix.Dirent. + dirent := *(*unix.Dirent)(unsafe.Pointer(&buf[0])) + + // Extracting the name is pretty tedious... + var nameBuf [unix.NAME_MAX]byte + var nameLen int + for i := 0; i < len(dirent.Name); i++ { + // The name is null terminated. + if dirent.Name[i] == 0 { + nameLen = i + break + } + nameBuf[i] = byte(dirent.Name[i]) + } + name := string(nameBuf[:nameLen]) + + // Deliver results to caller. + if !handleDirent(dirent.Ino, dirent.Off, dirent.Type, name) { + return + } + + // Advance buf for the next dirent. + buf = buf[dirent.Reclen:] + } +} diff --git a/runsc/fsgofer/lisafs.go b/runsc/fsgofer/lisafs.go new file mode 100644 index 000000000..0db44ff6a --- /dev/null +++ b/runsc/fsgofer/lisafs.go @@ -0,0 +1,1084 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsgofer + +import ( + "io" + "math" + "path" + "strconv" + "sync/atomic" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cleanup" + rwfd "gvisor.dev/gvisor/pkg/fd" + "gvisor.dev/gvisor/pkg/lisafs" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/marshal/primitive" + "gvisor.dev/gvisor/pkg/p9" +) + +// LisafsServer implements lisafs.ServerImpl for fsgofer. +type LisafsServer struct { + lisafs.Server + config Config +} + +var _ lisafs.ServerImpl = (*LisafsServer)(nil) + +// NewLisafsServer initializes a new lisafs server for fsgofer. +func NewLisafsServer(config Config) *LisafsServer { + s := &LisafsServer{config: config} + s.Server.Init(s) + return s +} + +// Mount implements lisafs.ServerImpl.Mount. +func (s *LisafsServer) Mount(c *lisafs.Connection, mountPath string) (lisafs.ControlFDImpl, lisafs.Inode, error) { + s.RenameMu.RLock() + defer s.RenameMu.RUnlock() + + rootFD, rootStat, err := tryStepLocked(c, mountPath, nil, func(flags int) (int, error) { + return unix.Open(mountPath, flags, 0) + }) + if err != nil { + return nil, lisafs.Inode{}, err + } + + var rootIno lisafs.Inode + rootFD.initInodeWithStat(&rootIno, &rootStat) + return rootFD, rootIno, nil +} + +// MaxMessageSize implements lisafs.ServerImpl.MaxMessageSize. +func (s *LisafsServer) MaxMessageSize() uint32 { + return lisafs.MaxMessageSize() +} + +// SupportedMessages implements lisafs.ServerImpl.SupportedMessages. +func (s *LisafsServer) SupportedMessages() []lisafs.MID { + // Note that Flush, FListXattr and FRemoveXattr are not supported. + return []lisafs.MID{ + lisafs.Mount, + lisafs.Channel, + lisafs.FStat, + lisafs.SetStat, + lisafs.Walk, + lisafs.WalkStat, + lisafs.OpenAt, + lisafs.OpenCreateAt, + lisafs.Close, + lisafs.FSync, + lisafs.PWrite, + lisafs.PRead, + lisafs.MkdirAt, + lisafs.MknodAt, + lisafs.SymlinkAt, + lisafs.LinkAt, + lisafs.FStatFS, + lisafs.FAllocate, + lisafs.ReadLinkAt, + lisafs.Connect, + lisafs.UnlinkAt, + lisafs.RenameAt, + lisafs.Getdents64, + lisafs.FGetXattr, + lisafs.FSetXattr, + } +} + +// controlFDLisa implements lisafs.ControlFDImpl. +type controlFDLisa struct { + lisafs.ControlFD + + // hostFD is the file descriptor which can be used to make host syscalls. + hostFD int + + // writableHostFD is the file descriptor number for a writable FD opened on the + // same FD as `hostFD`. writableHostFD must only be accessed using atomic + // operations. It is initialized to -1, and can change in value exactly once. + writableHostFD int32 +} + +var _ lisafs.ControlFDImpl = (*controlFDLisa)(nil) + +// Precondition: server's rename mutex must be at least read locked. +func newControlFDLisaLocked(c *lisafs.Connection, hostFD int, parent *controlFDLisa, name string, mode linux.FileMode) *controlFDLisa { + fd := &controlFDLisa{ + hostFD: hostFD, + writableHostFD: -1, + } + fd.ControlFD.Init(c, parent.FD(), name, mode, fd) + return fd +} + +func (fd *controlFDLisa) initInode(inode *lisafs.Inode) error { + inode.ControlFD = fd.ID() + return fstatTo(fd.hostFD, &inode.Stat) +} + +func (fd *controlFDLisa) initInodeWithStat(inode *lisafs.Inode, unixStat *unix.Stat_t) { + inode.ControlFD = fd.ID() + unixToLinuxStat(unixStat, &inode.Stat) +} + +func (fd *controlFDLisa) getWritableFD() (int, error) { + if writableFD := atomic.LoadInt32(&fd.writableHostFD); writableFD != -1 { + return int(writableFD), nil + } + + writableFD, err := unix.Openat(int(procSelfFD.FD()), strconv.Itoa(fd.hostFD), (unix.O_WRONLY|openFlags)&^unix.O_NOFOLLOW, 0) + if err != nil { + return -1, err + } + if !atomic.CompareAndSwapInt32(&fd.writableHostFD, -1, int32(writableFD)) { + // Race detected, use the new value and clean this up. + unix.Close(writableFD) + return int(atomic.LoadInt32(&fd.writableHostFD)), nil + } + return writableFD, nil +} + +// FD implements lisafs.ControlFDImpl.FD. +func (fd *controlFDLisa) FD() *lisafs.ControlFD { + if fd == nil { + return nil + } + return &fd.ControlFD +} + +// Close implements lisafs.ControlFDImpl.Close. +func (fd *controlFDLisa) Close(c *lisafs.Connection) { + if fd.hostFD >= 0 { + _ = unix.Close(fd.hostFD) + fd.hostFD = -1 + } + // No concurrent access is possible so no need to use atomics. + if fd.writableHostFD >= 0 { + _ = unix.Close(int(fd.writableHostFD)) + fd.writableHostFD = -1 + } +} + +// Stat implements lisafs.ControlFDImpl.Stat. +func (fd *controlFDLisa) Stat(c *lisafs.Connection, comm lisafs.Communicator) (uint32, error) { + var resp linux.Statx + if err := fstatTo(fd.hostFD, &resp); err != nil { + return 0, err + } + + respLen := uint32(resp.SizeBytes()) + resp.MarshalUnsafe(comm.PayloadBuf(respLen)) + return respLen, nil +} + +// SetStat implements lisafs.ControlFDImpl.SetStat. +func (fd *controlFDLisa) SetStat(c *lisafs.Connection, comm lisafs.Communicator, stat lisafs.SetStatReq) (uint32, error) { + var resp lisafs.SetStatResp + if stat.Mask&unix.STATX_MODE != 0 { + if err := unix.Fchmod(fd.hostFD, stat.Mode&^unix.S_IFMT); err != nil { + log.Debugf("SetStat fchmod failed %q, err: %v", fd.FilePath(), err) + resp.FailureMask |= unix.STATX_MODE + resp.FailureErrNo = uint32(p9.ExtractErrno(err)) + } + } + + if stat.Mask&unix.STATX_SIZE != 0 { + // ftruncate(2) requires the FD to be open for writing. + writableFD, err := fd.getWritableFD() + if err == nil { + err = unix.Ftruncate(writableFD, int64(stat.Size)) + } + if err != nil { + log.Debugf("SetStat ftruncate failed %q, err: %v", fd.FilePath(), err) + resp.FailureMask |= unix.STATX_SIZE + resp.FailureErrNo = uint32(p9.ExtractErrno(err)) + } + } + + if stat.Mask&(unix.STATX_ATIME|unix.STATX_MTIME) != 0 { + utimes := [2]unix.Timespec{ + {Sec: 0, Nsec: unix.UTIME_OMIT}, + {Sec: 0, Nsec: unix.UTIME_OMIT}, + } + if stat.Mask&unix.STATX_ATIME != 0 { + utimes[0].Sec = stat.Atime.Sec + utimes[0].Nsec = stat.Atime.Nsec + } + if stat.Mask&unix.STATX_MTIME != 0 { + utimes[1].Sec = stat.Mtime.Sec + utimes[1].Nsec = stat.Mtime.Nsec + } + + if fd.IsSymlink() { + // utimensat operates different that other syscalls. To operate on a + // symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty + // name. + c.Server().WithRenameReadLock(func() error { + if err := utimensat(fd.ParentLocked().(*controlFDLisa).hostFD, fd.NameLocked(), utimes, unix.AT_SYMLINK_NOFOLLOW); err != nil { + log.Debugf("SetStat utimens failed %q, err: %v", fd.FilePathLocked(), err) + resp.FailureMask |= (stat.Mask & (unix.STATX_ATIME | unix.STATX_MTIME)) + resp.FailureErrNo = uint32(p9.ExtractErrno(err)) + } + return nil + }) + } else { + hostFD := fd.hostFD + if fd.IsRegular() { + // For regular files, utimensat(2) requires the FD to be open for + // writing, see BUGS section. + writableFD, err := fd.getWritableFD() + if err != nil { + return 0, err + } + hostFD = writableFD + } + // Directories and regular files can operate directly on the fd + // using empty name. + err := utimensat(hostFD, "", utimes, 0) + if err != nil { + log.Debugf("SetStat utimens failed %q, err: %v", fd.FilePath(), err) + resp.FailureMask |= (stat.Mask & (unix.STATX_ATIME | unix.STATX_MTIME)) + resp.FailureErrNo = uint32(p9.ExtractErrno(err)) + } + } + } + + if stat.Mask&(unix.STATX_UID|unix.STATX_GID) != 0 { + // "If the owner or group is specified as -1, then that ID is not changed" + // - chown(2) + uid := -1 + if stat.Mask&unix.STATX_UID != 0 { + uid = int(stat.UID) + } + gid := -1 + if stat.Mask&unix.STATX_GID != 0 { + gid = int(stat.GID) + } + if err := unix.Fchownat(fd.hostFD, "", uid, gid, unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW); err != nil { + log.Debugf("SetStat fchown failed %q, err: %v", fd.FilePath(), err) + resp.FailureMask |= stat.Mask & (unix.STATX_UID | unix.STATX_GID) + resp.FailureErrNo = uint32(p9.ExtractErrno(err)) + } + } + + respLen := uint32(resp.SizeBytes()) + resp.MarshalUnsafe(comm.PayloadBuf(respLen)) + return respLen, nil +} + +// Walk implements lisafs.ControlFDImpl.Walk. +func (fd *controlFDLisa) Walk(c *lisafs.Connection, comm lisafs.Communicator, path lisafs.StringArray) (uint32, error) { + // We need to generate inodes for each component walked. We will manually + // marshal the inodes into the payload buffer as they are generated to avoid + // the slice allocation. The memory format should be lisafs.WalkResp's. + var numInodes primitive.Uint32 + var status lisafs.WalkStatus + maxPayloadSize := status.SizeBytes() + numInodes.SizeBytes() + (len(path) * (*lisafs.Inode)(nil).SizeBytes()) + if maxPayloadSize > math.MaxUint32 { + // Too much to walk, can't do. + return 0, unix.EIO + } + payloadBuf := comm.PayloadBuf(uint32(maxPayloadSize)) + payloadPos := status.SizeBytes() + numInodes.SizeBytes() + + s := c.Server() + s.RenameMu.RLock() + defer s.RenameMu.RUnlock() + + curDirFD := fd + cu := cleanup.Make(func() { + // Destroy all newly created FDs until now. Walk upward from curDirFD to + // fd. Do not destroy fd as the client still owns that. + for curDirFD != fd { + c.RemoveControlFDLocked(curDirFD.ID()) + curDirFD = curDirFD.ParentLocked().(*controlFDLisa) + } + }) + defer cu.Clean() + + for _, name := range path { + // Symlinks terminate walk. This client gets the symlink inode, but will + // have to invoke Walk again with the resolved path. + if curDirFD.IsSymlink() { + status = lisafs.WalkComponentSymlink + break + } + + child, childStat, err := tryStepLocked(c, name, curDirFD, func(flags int) (int, error) { + return unix.Openat(curDirFD.hostFD, name, flags, 0) + }) + if err == unix.ENOENT { + status = lisafs.WalkComponentDoesNotExist + break + } + if err != nil { + return 0, err + } + + // Write inode to payloadBuf and update state. + var childInode lisafs.Inode + child.initInodeWithStat(&childInode, &childStat) + childInode.MarshalUnsafe(payloadBuf[payloadPos:]) + payloadPos += childInode.SizeBytes() + numInodes++ + curDirFD = child + } + cu.Release() + + // lisafs.WalkResp writes the walk status followed by the number of inodes in + // the beginning. + status.MarshalUnsafe(payloadBuf) + numInodes.MarshalUnsafe(payloadBuf[status.SizeBytes():]) + return uint32(payloadPos), nil +} + +// WalkStat implements lisafs.ControlFDImpl.WalkStat. +func (fd *controlFDLisa) WalkStat(c *lisafs.Connection, comm lisafs.Communicator, path lisafs.StringArray) (uint32, error) { + // We may need to generate statx for dirFD + each component walked. We will + // manually marshal the statx results into the payload buffer as they are + // generated to avoid the slice allocation. The memory format should be the + // same as lisafs.WalkStatResp's. + var numStats primitive.Uint32 + maxPayloadSize := numStats.SizeBytes() + (len(path) * linux.SizeOfStatx) + if maxPayloadSize > math.MaxUint32 { + // Too much to walk, can't do. + return 0, unix.EIO + } + payloadBuf := comm.PayloadBuf(uint32(maxPayloadSize)) + payloadPos := numStats.SizeBytes() + + s := c.Server() + s.RenameMu.RLock() + defer s.RenameMu.RUnlock() + + curDirFD := fd.hostFD + closeCurDirFD := func() { + if curDirFD != fd.hostFD { + unix.Close(curDirFD) + } + } + defer closeCurDirFD() + var ( + stat linux.Statx + unixStat unix.Stat_t + ) + if len(path) > 0 && len(path[0]) == 0 { + // Write stat results for dirFD if the first path component is "". + if err := unix.Fstat(fd.hostFD, &unixStat); err != nil { + return 0, err + } + unixToLinuxStat(&unixStat, &stat) + stat.MarshalUnsafe(payloadBuf[payloadPos:]) + payloadPos += stat.SizeBytes() + path = path[1:] + numStats++ + } + + // Don't attempt walking if parent is a symlink. + if fd.IsSymlink() { + return 0, nil + } + for _, name := range path { + curFD, err := unix.Openat(curDirFD, name, unix.O_PATH|openFlags, 0) + if err == unix.ENOENT { + // No more path components exist on the filesystem. Return the partial + // walk to the client. + break + } + if err != nil { + return 0, err + } + closeCurDirFD() + curDirFD = curFD + + // Write stat results for curFD. + if err := unix.Fstat(curFD, &unixStat); err != nil { + return 0, err + } + unixToLinuxStat(&unixStat, &stat) + stat.MarshalUnsafe(payloadBuf[payloadPos:]) + payloadPos += stat.SizeBytes() + numStats++ + + // Symlinks terminate walk. This client gets the symlink stat result, but + // will have to invoke Walk again with the resolved path. + if unixStat.Mode&unix.S_IFMT == unix.S_IFLNK { + break + } + } + + // lisafs.WalkStatResp writes the number of stats in the beginning. + numStats.MarshalUnsafe(payloadBuf) + return uint32(payloadPos), nil +} + +// Open implements lisafs.ControlFDImpl.Open. +func (fd *controlFDLisa) Open(c *lisafs.Connection, comm lisafs.Communicator, flags uint32) (uint32, error) { + flags |= openFlags + newHostFD, err := unix.Openat(int(procSelfFD.FD()), strconv.Itoa(fd.hostFD), int(flags)&^unix.O_NOFOLLOW, 0) + if err != nil { + return 0, err + } + newFD := fd.newOpenFDLisa(newHostFD, flags) + + if fd.IsRegular() { + // Donate FD for regular files only. Since FD donation is a destructive + // operation, we should duplicate the to-be-donated FD. Eat the error if + // one occurs, it is better to have an FD without a host FD, than failing + // the Open attempt. + if dupFD, err := unix.Dup(newFD.hostFD); err == nil { + _ = comm.DonateFD(dupFD) + } + } + + resp := lisafs.OpenAtResp{NewFD: newFD.ID()} + respLen := uint32(resp.SizeBytes()) + resp.MarshalUnsafe(comm.PayloadBuf(respLen)) + return respLen, nil +} + +// OpenCreate implements lisafs.ControlFDImpl.OpenCreate. +func (fd *controlFDLisa) OpenCreate(c *lisafs.Connection, comm lisafs.Communicator, mode linux.FileMode, uid lisafs.UID, gid lisafs.GID, name string, flags uint32) (uint32, error) { + // Need to hold rename mutex for reading while performing the walk. Also keep + // holding it while the cleanup is still possible. + var resp lisafs.OpenCreateAtResp + var newFD *openFDLisa + if err := c.Server().WithRenameReadLock(func() error { + createFlags := unix.O_CREAT | unix.O_EXCL | unix.O_RDONLY | unix.O_NONBLOCK | openFlags + childHostFD, err := unix.Openat(fd.hostFD, name, createFlags, uint32(mode&^linux.FileTypeMask)) + if err != nil { + return err + } + + childFD := newControlFDLisaLocked(c, childHostFD, fd, name, linux.ModeRegular) + cu := cleanup.Make(func() { + // Best effort attempt to remove the file in case of failure. + if err := unix.Unlinkat(fd.hostFD, name, 0); err != nil { + log.Warningf("error unlinking file %q after failure: %v", path.Join(fd.FilePathLocked(), name), err) + } + c.RemoveControlFDLocked(childFD.ID()) + }) + defer cu.Clean() + + // Set the owners as requested by the client. + if err := unix.Fchownat(childFD.hostFD, "", int(uid), int(gid), unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW); err != nil { + log.Infof("ayush: Fchownat %v", err) + return err + } + + // Do not use the stat result from tryOpen because the owners might have + // changed. initInode() will stat the FD again and use fresh results. + if err := childFD.initInode(&resp.Child); err != nil { + log.Infof("ayush: initInode %v", err) + return err + } + + // Now open an FD to the newly created file with the flags requested by the client. + flags |= openFlags + newHostFD, err := unix.Openat(int(procSelfFD.FD()), strconv.Itoa(childFD.hostFD), int(flags)&^unix.O_NOFOLLOW, 0) + if err != nil { + log.Infof("ayush: Openat %v", err) + return err + } + cu.Release() + + newFD = childFD.newOpenFDLisa(newHostFD, uint32(flags)) + resp.NewFD = newFD.ID() + return nil + }); err != nil { + return 0, err + } + + // Donate FD because open(O_CREAT|O_EXCL) always creates a regular file. + // Since FD donation is a destructive operation, we should duplicate the + // to-be-donated FD. Eat the error if one occurs, it is better to have an FD + // without a host FD, than failing the Open attempt. + if dupFD, err := unix.Dup(newFD.hostFD); err == nil { + _ = comm.DonateFD(dupFD) + } + + respLen := uint32(resp.SizeBytes()) + resp.MarshalUnsafe(comm.PayloadBuf(respLen)) + return respLen, nil +} + +// Mkdir implements lisafs.ControlFDImpl.Mkdir. +func (fd *controlFDLisa) Mkdir(c *lisafs.Connection, comm lisafs.Communicator, mode linux.FileMode, uid lisafs.UID, gid lisafs.GID, name string) (uint32, error) { + var resp lisafs.MkdirAtResp + if err := c.Server().WithRenameReadLock(func() error { + if err := unix.Mkdirat(fd.hostFD, name, uint32(mode&^linux.FileTypeMask)); err != nil { + return err + } + cu := cleanup.Make(func() { + // Best effort attempt to remove the dir in case of failure. + if err := unix.Unlinkat(fd.hostFD, name, unix.AT_REMOVEDIR); err != nil { + log.Warningf("error unlinking dir %q after failure: %v", path.Join(fd.FilePathLocked(), name), err) + } + }) + defer cu.Clean() + + // Open directory to change ownership. + childDirFd, err := unix.Openat(fd.hostFD, name, unix.O_DIRECTORY|unix.O_RDONLY|openFlags, 0) + if err != nil { + return err + } + if err := unix.Fchownat(childDirFd, "", int(uid), int(gid), unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW); err != nil { + unix.Close(childDirFd) + return err + } + + childDir := newControlFDLisaLocked(c, childDirFd, fd, name, linux.ModeDirectory) + if err := childDir.initInode(&resp.ChildDir); err != nil { + c.RemoveControlFDLocked(childDir.ID()) + return err + } + cu.Release() + + return nil + }); err != nil { + return 0, err + } + + respLen := uint32(resp.SizeBytes()) + resp.MarshalUnsafe(comm.PayloadBuf(respLen)) + return respLen, nil +} + +// Mknod implements lisafs.ControlFDImpl.Mknod. +func (fd *controlFDLisa) Mknod(c *lisafs.Connection, comm lisafs.Communicator, mode linux.FileMode, uid lisafs.UID, gid lisafs.GID, name string, minor uint32, major uint32) (uint32, error) { + // From mknod(2) man page: + // "EPERM: [...] if the filesystem containing pathname does not support + // the type of node requested." + if mode.FileType() != linux.ModeRegular { + return 0, unix.EPERM + } + + var resp lisafs.MknodAtResp + if err := c.Server().WithRenameReadLock(func() error { + if err := unix.Mknodat(fd.hostFD, name, uint32(mode), 0); err != nil { + return err + } + cu := cleanup.Make(func() { + // Best effort attempt to remove the file in case of failure. + if err := unix.Unlinkat(fd.hostFD, name, 0); err != nil { + log.Warningf("error unlinking file %q after failure: %v", path.Join(fd.FilePathLocked(), name), err) + } + }) + defer cu.Clean() + + // Open file to change ownership. + childFD, err := unix.Openat(fd.hostFD, name, unix.O_PATH|openFlags, 0) + if err != nil { + return err + } + if err := unix.Fchownat(childFD, "", int(uid), int(gid), unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW); err != nil { + unix.Close(childFD) + return err + } + + child := newControlFDLisaLocked(c, childFD, fd, name, mode) + if err := child.initInode(&resp.Child); err != nil { + c.RemoveControlFDLocked(child.ID()) + return err + } + cu.Release() + return nil + }); err != nil { + return 0, err + } + + respLen := uint32(resp.SizeBytes()) + resp.MarshalUnsafe(comm.PayloadBuf(respLen)) + return respLen, nil +} + +// Symlink implements lisafs.ControlFDImpl.Symlink. +func (fd *controlFDLisa) Symlink(c *lisafs.Connection, comm lisafs.Communicator, name string, target string, uid lisafs.UID, gid lisafs.GID) (uint32, error) { + var resp lisafs.SymlinkAtResp + if err := c.Server().WithRenameReadLock(func() error { + if err := unix.Symlinkat(target, fd.hostFD, name); err != nil { + return err + } + cu := cleanup.Make(func() { + // Best effort attempt to remove the symlink in case of failure. + if err := unix.Unlinkat(fd.hostFD, name, 0); err != nil { + log.Warningf("error unlinking file %q after failure: %v", path.Join(fd.FilePathLocked(), name), err) + } + }) + defer cu.Clean() + + // Open symlink to change ownership. + symlinkFD, err := unix.Openat(fd.hostFD, name, unix.O_PATH|openFlags, 0) + if err != nil { + return err + } + if err := unix.Fchownat(symlinkFD, "", int(uid), int(gid), unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW); err != nil { + unix.Close(symlinkFD) + return err + } + + symlink := newControlFDLisaLocked(c, symlinkFD, fd, name, linux.ModeSymlink) + if err := symlink.initInode(&resp.Symlink); err != nil { + c.RemoveControlFDLocked(symlink.ID()) + return err + } + cu.Release() + return nil + }); err != nil { + return 0, err + } + + respLen := uint32(resp.SizeBytes()) + resp.MarshalUnsafe(comm.PayloadBuf(respLen)) + return respLen, nil +} + +// Link implements lisafs.ControlFDImpl.Link. +func (fd *controlFDLisa) Link(c *lisafs.Connection, comm lisafs.Communicator, dir lisafs.ControlFDImpl, name string) (uint32, error) { + var resp lisafs.LinkAtResp + if err := c.Server().WithRenameReadLock(func() error { + dirFD := dir.(*controlFDLisa) + if err := unix.Linkat(fd.hostFD, "", dirFD.hostFD, name, unix.AT_EMPTY_PATH); err != nil { + return err + } + cu := cleanup.Make(func() { + // Best effort attempt to remove the hard link in case of failure. + if err := unix.Unlinkat(dirFD.hostFD, name, 0); err != nil { + log.Warningf("error unlinking file %q after failure: %v", path.Join(dirFD.FilePathLocked(), name), err) + } + }) + defer cu.Clean() + + linkFD, linkStat, err := tryStepLocked(c, name, dirFD, func(flags int) (int, error) { + return unix.Openat(dirFD.hostFD, name, flags, 0) + }) + if err != nil { + return err + } + cu.Release() + + linkFD.initInodeWithStat(&resp.Link, &linkStat) + return nil + }); err != nil { + return 0, err + } + + respLen := uint32(resp.SizeBytes()) + resp.MarshalUnsafe(comm.PayloadBuf(respLen)) + return respLen, nil +} + +// StatFS implements lisafs.ControlFDImpl.StatFS. +func (fd *controlFDLisa) StatFS(c *lisafs.Connection, comm lisafs.Communicator) (uint32, error) { + var s unix.Statfs_t + if err := unix.Fstatfs(fd.hostFD, &s); err != nil { + return 0, err + } + + resp := lisafs.StatFS{ + Type: uint64(s.Type), + BlockSize: s.Bsize, + Blocks: s.Blocks, + BlocksFree: s.Bfree, + BlocksAvailable: s.Bavail, + Files: s.Files, + FilesFree: s.Ffree, + NameLength: uint64(s.Namelen), + } + respLen := uint32(resp.SizeBytes()) + resp.MarshalUnsafe(comm.PayloadBuf(respLen)) + return respLen, nil +} + +// Readlink implements lisafs.ControlFDImpl.Readlink. +func (fd *controlFDLisa) Readlink(c *lisafs.Connection, comm lisafs.Communicator) (uint32, error) { + // We will manually marshal lisafs.ReadLinkAtResp, which just contains a + // lisafs.SizedString. Let unix.Readlinkat directly write into the payload + // buffer and manually write the string size before it. + + // This is similar to what os.Readlink does. + const limit = primitive.Uint32(1024 * 1024) + for linkLen := primitive.Uint32(128); linkLen < limit; linkLen *= 2 { + b := comm.PayloadBuf(uint32(linkLen) + uint32(linkLen.SizeBytes())) + n, err := unix.Readlinkat(fd.hostFD, "", b[linkLen.SizeBytes():]) + if err != nil { + return 0, err + } + if n < int(linkLen) { + linkLen = primitive.Uint32(n) + linkLen.MarshalUnsafe(b[:linkLen.SizeBytes()]) + return uint32(linkLen) + uint32(linkLen.SizeBytes()), nil + } + } + return 0, unix.ENOMEM +} + +// Connect implements lisafs.ControlFDImpl.Connect. +func (fd *controlFDLisa) Connect(c *lisafs.Connection, comm lisafs.Communicator, sockType uint32) error { + s := c.ServerImpl().(*LisafsServer) + if !s.config.HostUDS { + return unix.ECONNREFUSED + } + + // Lock RenameMu so that the hostPath read stays valid and is not tampered + // with until it is actually connected to. + s.RenameMu.RLock() + defer s.RenameMu.RUnlock() + + // TODO(gvisor.dev/issue/1003): Due to different app vs replacement + // mappings, the app path may have fit in the sockaddr, but we can't fit + // hostPath in our sockaddr. We'd need to redirect through a shorter path + // in order to actually connect to this socket. + hostPath := fd.FilePathLocked() + if len(hostPath) > 108 { // UNIX_PATH_MAX = 108 is defined in afunix.h. + return unix.ECONNREFUSED + } + + // Only the following types are supported. + switch sockType { + case unix.SOCK_STREAM, unix.SOCK_DGRAM, unix.SOCK_SEQPACKET: + default: + return unix.ENXIO + } + + sock, err := unix.Socket(unix.AF_UNIX, int(sockType), 0) + if err != nil { + return err + } + if err := comm.DonateFD(sock); err != nil { + return err + } + + sa := unix.SockaddrUnix{Name: hostPath} + if err := unix.Connect(sock, &sa); err != nil { + return err + } + return nil +} + +// Unlink implements lisafs.ControlFDImpl.Unlink. +func (fd *controlFDLisa) Unlink(c *lisafs.Connection, name string, flags uint32) error { + return c.Server().WithRenameReadLock(func() error { + return unix.Unlinkat(fd.hostFD, name, int(flags)) + }) +} + +// RenameLocked implements lisafs.ControlFDImpl.RenameLocked. +func (fd *controlFDLisa) RenameLocked(c *lisafs.Connection, newDir lisafs.ControlFDImpl, newName string) (func(lisafs.ControlFDImpl), func(), error) { + // Note that there is no controlFDLisa specific update needed on rename. + return nil, nil, renameat(fd.ParentLocked().(*controlFDLisa).hostFD, fd.NameLocked(), newDir.(*controlFDLisa).hostFD, newName) +} + +// GetXattr implements lisafs.ControlFDImpl.GetXattr. +func (fd *controlFDLisa) GetXattr(c *lisafs.Connection, comm lisafs.Communicator, name string, size uint32) (uint32, error) { + if !c.ServerImpl().(*LisafsServer).config.EnableVerityXattr { + return 0, unix.EOPNOTSUPP + } + if _, ok := verityXattrs[name]; !ok { + return 0, unix.EOPNOTSUPP + } + + // Manually marshal lisafs.FGetXattrResp to avoid allocations and copying. + var valueLen primitive.Uint32 + buf := comm.PayloadBuf(uint32(valueLen.SizeBytes()) + size) + n, err := unix.Fgetxattr(fd.hostFD, name, buf[valueLen.SizeBytes():]) + if err != nil { + return 0, err + } + valueLen = primitive.Uint32(n) + valueLen.MarshalBytes(buf[:valueLen.SizeBytes()]) + + return uint32(valueLen.SizeBytes() + n), nil +} + +// SetXattr implements lisafs.ControlFDImpl.SetXattr. +func (fd *controlFDLisa) SetXattr(c *lisafs.Connection, name string, value string, flags uint32) error { + if !c.ServerImpl().(*LisafsServer).config.EnableVerityXattr { + return unix.EOPNOTSUPP + } + if _, ok := verityXattrs[name]; !ok { + return unix.EOPNOTSUPP + } + return unix.Fsetxattr(fd.hostFD, name, []byte(value) /* sigh */, int(flags)) +} + +// ListXattr implements lisafs.ControlFDImpl.ListXattr. +func (fd *controlFDLisa) ListXattr(c *lisafs.Connection, comm lisafs.Communicator, size uint64) (uint32, error) { + return 0, unix.EOPNOTSUPP +} + +// RemoveXattr implements lisafs.ControlFDImpl.RemoveXattr. +func (fd *controlFDLisa) RemoveXattr(c *lisafs.Connection, comm lisafs.Communicator, name string) error { + return unix.EOPNOTSUPP +} + +// openFDLisa implements lisafs.OpenFDImpl. +type openFDLisa struct { + lisafs.OpenFD + + // hostFD is the host file descriptor which can be used to make syscalls. + hostFD int +} + +var _ lisafs.OpenFDImpl = (*openFDLisa)(nil) + +func (fd *controlFDLisa) newOpenFDLisa(hostFD int, flags uint32) *openFDLisa { + newFD := &openFDLisa{ + hostFD: hostFD, + } + newFD.OpenFD.Init(fd.FD(), flags, newFD) + return newFD +} + +// FD implements lisafs.OpenFDImpl.FD. +func (fd *openFDLisa) FD() *lisafs.OpenFD { + if fd == nil { + return nil + } + return &fd.OpenFD +} + +// Close implements lisafs.OpenFDImpl.Close. +func (fd *openFDLisa) Close(c *lisafs.Connection) { + if fd.hostFD >= 0 { + _ = unix.Close(fd.hostFD) + fd.hostFD = -1 + } +} + +// Stat implements lisafs.OpenFDImpl.Stat. +func (fd *openFDLisa) Stat(c *lisafs.Connection, comm lisafs.Communicator) (uint32, error) { + var resp linux.Statx + if err := fstatTo(fd.hostFD, &resp); err != nil { + return 0, err + } + + respLen := uint32(resp.SizeBytes()) + resp.MarshalUnsafe(comm.PayloadBuf(respLen)) + return respLen, nil +} + +// Sync implements lisafs.OpenFDImpl.Sync. +func (fd *openFDLisa) Sync(c *lisafs.Connection) error { + return unix.Fsync(fd.hostFD) +} + +// Write implements lisafs.OpenFDImpl.Write. +func (fd *openFDLisa) Write(c *lisafs.Connection, comm lisafs.Communicator, buf []byte, off uint64) (uint32, error) { + rw := rwfd.NewReadWriter(fd.hostFD) + n, err := rw.WriteAt(buf, int64(off)) + if err != nil { + return 0, err + } + + resp := &lisafs.PWriteResp{Count: uint64(n)} + respLen := uint32(resp.SizeBytes()) + resp.MarshalUnsafe(comm.PayloadBuf(respLen)) + return respLen, nil +} + +// Read implements lisafs.OpenFDImpl.Read. +func (fd *openFDLisa) Read(c *lisafs.Connection, comm lisafs.Communicator, off uint64, count uint32) (uint32, error) { + // To save an allocation and a copy, we directly read into the payload + // buffer. The rest of the response message is manually marshalled. + var resp lisafs.PReadResp + respMetaSize := uint32(resp.NumBytes.SizeBytes()) + maxRespLen := respMetaSize + count + + payloadBuf := comm.PayloadBuf(maxRespLen) + rw := rwfd.NewReadWriter(fd.hostFD) + n, err := rw.ReadAt(payloadBuf[respMetaSize:], int64(off)) + if err != nil && err != io.EOF { + return 0, err + } + + // Write the response metadata onto the payload buffer. The response contents + // already have been written immediately after it. + resp.NumBytes = primitive.Uint32(n) + resp.NumBytes.MarshalUnsafe(payloadBuf[:respMetaSize]) + return respMetaSize + uint32(n), nil +} + +// Allocate implements lisafs.OpenFDImpl.Allocate. +func (fd *openFDLisa) Allocate(c *lisafs.Connection, mode, off, length uint64) error { + return unix.Fallocate(fd.hostFD, uint32(mode), int64(off), int64(length)) +} + +// Flush implements lisafs.OpenFDImpl.Flush. +func (fd *openFDLisa) Flush(c *lisafs.Connection) error { + return nil +} + +// Getdent64 implements lisafs.OpenFDImpl.Getdent64. +func (fd *openFDLisa) Getdent64(c *lisafs.Connection, comm lisafs.Communicator, count uint32, seek0 bool) (uint32, error) { + if seek0 { + if _, err := unix.Seek(fd.hostFD, 0, 0); err != nil { + return 0, err + } + } + + // We will manually marshal the response lisafs.Getdents64Resp. + + // numDirents is the number of dirents marshalled into the payload. + var numDirents primitive.Uint32 + // The payload starts with numDirents, dirents go right after that. + // payloadBufPos represents the position at which to write the next dirent. + payloadBufPos := uint32(numDirents.SizeBytes()) + // Request enough payloadBuf for 10 dirents, we will extend when needed. + payloadBuf := comm.PayloadBuf(payloadBufPos + 10*unixDirentMaxSize) + + var direntsBuf [8192]byte + var bytesRead int + for bytesRead < int(count) { + bufEnd := len(direntsBuf) + if remaining := int(count) - bytesRead; remaining < bufEnd { + bufEnd = remaining + } + n, err := unix.Getdents(fd.hostFD, direntsBuf[:bufEnd]) + if err != nil { + if err == unix.EINVAL && bufEnd < 268 { + // getdents64(2) returns EINVAL is returned when the result + // buffer is too small. If bufEnd is smaller than the max + // size of unix.Dirent, then just break here to return all + // dirents collected till now. + break + } + return 0, err + } + if n <= 0 { + break + } + bytesRead += n + + var statErr error + parseDirents(direntsBuf[:n], func(ino uint64, off int64, ftype uint8, name string) bool { + dirent := lisafs.Dirent64{ + Ino: primitive.Uint64(ino), + Off: primitive.Uint64(off), + Type: primitive.Uint8(ftype), + Name: lisafs.SizedString(name), + } + + // The client also wants the device ID, which annoyingly incurs an + // additional syscall per dirent. Live with it. + stat, err := statAt(fd.hostFD, name) + if err != nil { + statErr = err + return false + } + dirent.DevMinor = primitive.Uint32(unix.Minor(stat.Dev)) + dirent.DevMajor = primitive.Uint32(unix.Major(stat.Dev)) + + // Paste the dirent into the payload buffer without having the dirent + // escape. Request a larger buffer if needed. + if int(payloadBufPos)+dirent.SizeBytes() > len(payloadBuf) { + // Ask for 10 large dirents worth of more space. + payloadBuf = comm.PayloadBuf(payloadBufPos + 10*unixDirentMaxSize) + } + dirent.MarshalBytes(payloadBuf[payloadBufPos:]) + payloadBufPos += uint32(dirent.SizeBytes()) + numDirents++ + return true + }) + if statErr != nil { + return 0, statErr + } + } + + // The number of dirents goes at the beginning of the payload. + numDirents.MarshalUnsafe(payloadBuf) + return payloadBufPos, nil +} + +// tryStepLocked tries to walk via open() with different modes as documented. +// It then initializes and returns the control FD. +// +// Precondition: server's rename mutex must at least be read locked. +func tryStepLocked(c *lisafs.Connection, name string, parent *controlFDLisa, open func(flags int) (int, error)) (*controlFDLisa, unix.Stat_t, error) { + // Attempt to open file in the following in order: + // 1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs. + // Use non-blocking to prevent getting stuck inside open(2) for + // FIFOs. This option has no effect on regular files. + // 2. PATH: for symlinks, sockets. + options := []struct { + flag int + readable bool + }{ + { + flag: unix.O_RDONLY | unix.O_NONBLOCK, + readable: true, + }, + { + flag: unix.O_PATH, + readable: false, + }, + } + + for i, option := range options { + hostFD, err := open(option.flag | openFlags) + if err == nil { + var stat unix.Stat_t + if err = unix.Fstat(hostFD, &stat); err == nil { + return newControlFDLisaLocked(c, hostFD, parent, name, linux.FileMode(stat.Mode)), stat, nil + } + unix.Close(hostFD) + } + + e := extractErrno(err) + if e == unix.ENOENT { + // File doesn't exist, no point in retrying. + return nil, unix.Stat_t{}, e + } + if i < len(options)-1 { + continue + } + return nil, unix.Stat_t{}, e + } + panic("unreachable") +} + +func fstatTo(hostFD int, stat *linux.Statx) error { + var unixStat unix.Stat_t + if err := unix.Fstat(hostFD, &unixStat); err != nil { + return err + } + + unixToLinuxStat(&unixStat, stat) + return nil +} + +func unixToLinuxStat(from *unix.Stat_t, to *linux.Statx) { + to.Mask = unix.STATX_TYPE | unix.STATX_MODE | unix.STATX_INO | unix.STATX_NLINK | unix.STATX_UID | unix.STATX_GID | unix.STATX_SIZE | unix.STATX_BLOCKS | unix.STATX_ATIME | unix.STATX_MTIME | unix.STATX_CTIME + to.Mode = uint16(from.Mode) + to.DevMinor = unix.Minor(from.Dev) + to.DevMajor = unix.Major(from.Dev) + to.Ino = from.Ino + to.Nlink = uint32(from.Nlink) + to.UID = from.Uid + to.GID = from.Gid + to.RdevMinor = unix.Minor(from.Rdev) + to.RdevMajor = unix.Major(from.Rdev) + to.Size = uint64(from.Size) + to.Blksize = uint32(from.Blksize) + to.Blocks = uint64(from.Blocks) + to.Atime.Sec = from.Atim.Sec + to.Atime.Nsec = uint32(from.Atim.Nsec) + to.Mtime.Sec = from.Mtim.Sec + to.Mtime.Nsec = uint32(from.Mtim.Nsec) + to.Ctime.Sec = from.Ctim.Sec + to.Ctime.Nsec = uint32(from.Ctim.Nsec) +} diff --git a/runsc/fsgofer/lisafs_test.go b/runsc/fsgofer/lisafs_test.go new file mode 100644 index 000000000..4653f9955 --- /dev/null +++ b/runsc/fsgofer/lisafs_test.go @@ -0,0 +1,56 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package lisafs_test + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/lisafs" + "gvisor.dev/gvisor/pkg/lisafs/testsuite" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/runsc/fsgofer" +) + +// Note that these are not supposed to be extensive or robust tests. These unit +// tests provide a sanity check that all RPCs at least work in obvious ways. + +func init() { + log.SetLevel(log.Debug) + if err := fsgofer.OpenProcSelfFD(); err != nil { + panic(err) + } +} + +// tester implements testsuite.Tester. +type tester struct{} + +// NewServer implements testsuite.Tester.NewServer. +func (tester) NewServer(t *testing.T) *lisafs.Server { + return &fsgofer.NewLisafsServer(fsgofer.Config{HostUDS: true, EnableVerityXattr: true}).Server +} + +// LinkSupported implements testsuite.Tester.LinkSupported. +func (tester) LinkSupported() bool { + return true +} + +// SetUserGroupIDSupported implements testsuite.Tester.SetUserGroupIDSupported. +func (tester) SetUserGroupIDSupported() bool { + return true +} + +func TestFSGofer(t *testing.T) { + testsuite.RunAllLocalFSTests(t, tester{}) +} diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index bc4a3fa32..d625230dd 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -17,12 +17,14 @@ go_library( "//pkg/control/client", "//pkg/control/server", "//pkg/coverage", + "//pkg/eventchannel", "//pkg/log", "//pkg/sentry/control", "//pkg/sentry/platform", "//pkg/sync", "//pkg/tcpip/header", "//pkg/tcpip/stack", + "//pkg/unet", "//pkg/urpc", "//runsc/boot", "//runsc/boot/platforms", diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index b15572a98..f4a37cedc 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -35,10 +35,12 @@ import ( "gvisor.dev/gvisor/pkg/control/client" "gvisor.dev/gvisor/pkg/control/server" "gvisor.dev/gvisor/pkg/coverage" + "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/boot/platforms" @@ -488,6 +490,61 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn cmd.Args = append(cmd.Args, "--start-sync-fd="+strconv.Itoa(nextFD)) nextFD++ + if conf.ProfileBlock != "" { + blockFile, err := os.OpenFile(conf.ProfileBlock, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return fmt.Errorf("opening block profiling file %q: %v", conf.ProfileBlock, err) + } + defer blockFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, blockFile) + cmd.Args = append(cmd.Args, "--profile-block-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + + if conf.ProfileCPU != "" { + cpuFile, err := os.OpenFile(conf.ProfileCPU, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return fmt.Errorf("opening cpu profiling file %q: %v", conf.ProfileCPU, err) + } + defer cpuFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, cpuFile) + cmd.Args = append(cmd.Args, "--profile-cpu-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + + if conf.ProfileHeap != "" { + heapFile, err := os.OpenFile(conf.ProfileHeap, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return fmt.Errorf("opening heap profiling file %q: %v", conf.ProfileHeap, err) + } + defer heapFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, heapFile) + cmd.Args = append(cmd.Args, "--profile-heap-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + + if conf.ProfileMutex != "" { + mutexFile, err := os.OpenFile(conf.ProfileMutex, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return fmt.Errorf("opening mutex profiling file %q: %v", conf.ProfileMutex, err) + } + defer mutexFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, mutexFile) + cmd.Args = append(cmd.Args, "--profile-mutex-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + + if conf.TraceFile != "" { + traceFile, err := os.OpenFile(conf.TraceFile, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return fmt.Errorf("opening trace file %q: %v", conf.TraceFile, err) + } + defer traceFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, traceFile) + cmd.Args = append(cmd.Args, "--trace-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + // If there is a gofer, sends all socket ends to the sandbox. for _, f := range args.IOFiles { defer f.Close() @@ -1020,6 +1077,90 @@ func (s *Sandbox) Cat(cid string, files []string, out *os.File) error { return nil } +// Usage sends the collect call for a container in the sandbox. +func (s *Sandbox) Usage(cid string, Full bool) (control.MemoryUsage, error) { + log.Debugf("Usage sandbox %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return control.MemoryUsage{}, err + } + defer conn.Close() + + var m control.MemoryUsage + err = conn.Call(boot.UsageCollect, &control.MemoryUsageOpts{ + Full: Full, + }, &m) + return m, err +} + +// UsageFD sends the usagefd call for a container in the sandbox. +func (s *Sandbox) UsageFD(cid string) (*control.MemoryUsageRecord, error) { + log.Debugf("Usage sandbox %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return nil, err + } + defer conn.Close() + + var m control.MemoryUsageFile + if err := conn.Call(boot.UsageUsageFD, &control.MemoryUsageFileOpts{ + Version: 1, + }, &m); err != nil { + return nil, fmt.Errorf("UsageFD failed: %v", err) + } + + if len(m.FilePayload.Files) != 2 { + return nil, fmt.Errorf("wants exactly two fds") + } + + return control.NewMemoryUsageRecord(*m.FilePayload.Files[0], *m.FilePayload.Files[1]) +} + +// Reduce sends the reduce call for a container in the sandbox. +func (s *Sandbox) Reduce(cid string, wait bool) error { + log.Debugf("Reduce sandbox %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + return conn.Call(boot.UsageReduce, &control.UsageReduceOpts{ + Wait: wait, + }, nil) +} + +// Stream sends the AttachDebugEmitter call for a container in the sandbox, and +// dumps filtered events to out. +func (s *Sandbox) Stream(cid string, filters []string, out *os.File) error { + log.Debugf("Stream sandbox %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + r, w, err := unet.SocketPair(false) + if err != nil { + return err + } + + wfd, err := w.Release() + if err != nil { + return fmt.Errorf("failed to release write socket FD: %v", err) + } + + if err := conn.Call(boot.EventsAttachDebugEmitter, &control.EventsOpts{ + FilePayload: urpc.FilePayload{Files: []*os.File{ + os.NewFile(uintptr(wfd), "event sink"), + }}, + }, nil); err != nil { + return fmt.Errorf("AttachDebugEmitter failed: %v", err) + } + + return eventchannel.ProcessAll(r, filters, out) +} + // IsRunning returns true if the sandbox or gofer process is running. func (s *Sandbox) IsRunning() bool { if s.Pid != 0 { diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 5365b5b1b..9d3b97277 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -332,9 +332,9 @@ func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth. return auth.CapabilitySetOfMany(caps), nil } -// Is9PMount returns true if the given mount can be mounted as an external +// IsGoferMount returns true if the given mount can be mounted as an external // gofer. -func Is9PMount(m specs.Mount, vfs2Enabled bool) bool { +func IsGoferMount(m specs.Mount, vfs2Enabled bool) bool { MaybeConvertToBindMount(&m) return m.Type == "bind" && m.Source != "" && IsSupportedDevMount(m, vfs2Enabled) } |