From 57772db2e7351511de422baeecf807785709ee5d Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 5 Jun 2019 18:39:30 -0700 Subject: Shutdown host sockets on internal shutdown This is required to make the shutdown visible to peers outside the sandbox. The readClosed / writeClosed fields were dropped, as they were preventing a shutdown socket from reading the remainder of queued bytes. The host syscalls will return the appropriate errors for shutdown. The control message tests have been split out of socket_unix.cc to make the (few) remaining tests accessible to testing inherited host UDS, which don't support sending control messages. Updates #273 PiperOrigin-RevId: 251763060 --- runsc/boot/filter/config.go | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'runsc') diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 652da1cef..ef2dbfad2 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -246,6 +246,10 @@ var allowedSyscalls = seccomp.SyscallRules{ }, syscall.SYS_SETITIMER: {}, syscall.SYS_SHUTDOWN: []seccomp.Rule{ + // Used by fs/host to shutdown host sockets. + {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)}, + {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)}, + // Used by unet to shutdown connections. {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, }, syscall.SYS_SIGALTSTACK: {}, -- cgit v1.2.3 From 85be01b42d4ac48698d1e8f50a4cf2607a4fc50b Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Thu, 6 Jun 2019 08:05:46 -0700 Subject: Add multi-fd support to fdbased endpoint. This allows an fdbased endpoint to have multiple underlying fd's from which packets can be read and dispatched/written to. This should allow for higher throughput as well as better scalability of the network stack as number of connections increases. Updates #231 PiperOrigin-RevId: 251852825 --- pkg/tcpip/link/fdbased/endpoint.go | 168 ++++++++++++++++++++++--------- pkg/tcpip/link/fdbased/endpoint_test.go | 2 +- pkg/tcpip/sample/tun_tcp_connect/main.go | 2 +- pkg/tcpip/sample/tun_tcp_echo/main.go | 2 +- pkg/urpc/urpc.go | 2 +- runsc/boot/config.go | 6 ++ runsc/boot/network.go | 39 ++++--- runsc/main.go | 25 +++-- runsc/sandbox/network.go | 119 +++++++++++++--------- runsc/test/testutil/testutil.go | 1 + 10 files changed, 249 insertions(+), 117 deletions(-) (limited to 'runsc') diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 1f889c2a0..b88e2e7bf 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -21,12 +21,29 @@ // FD based endpoints can be used in the networking stack by calling New() to // create a new endpoint, and then passing it as an argument to // Stack.CreateNIC(). +// +// FD based endpoints can use more than one file descriptor to read incoming +// packets. If there are more than one FDs specified and the underlying FD is an +// AF_PACKET then the endpoint will enable FANOUT mode on the socket so that the +// host kernel will consistently hash the packets to the sockets. This ensures +// that packets for the same TCP streams are not reordered. +// +// Similarly if more than one FD's are specified where the underlying FD is not +// AF_PACKET then it's the caller's responsibility to ensure that all inbound +// packets on the descriptors are consistently 5 tuple hashed to one of the +// descriptors to prevent TCP reordering. +// +// Since netstack today does not compute 5 tuple hashes for outgoing packets we +// only use the first FD to write outbound packets. Once 5 tuple hashes for +// all outbound packets are available we will make use of all underlying FD's to +// write outbound packets. package fdbased import ( "fmt" "syscall" + "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" "gvisor.googlesource.com/gvisor/pkg/tcpip/header" @@ -65,8 +82,10 @@ const ( ) type endpoint struct { - // fd is the file descriptor used to send and receive packets. - fd int + // fds is the set of file descriptors each identifying one inbound/outbound + // channel. The endpoint will dispatch from all inbound channels as well as + // hash outbound packets to specific channels based on the packet hash. + fds []int // mtu (maximum transmission unit) is the maximum size of a packet. mtu uint32 @@ -85,8 +104,8 @@ type endpoint struct { // its end of the communication pipe. closed func(*tcpip.Error) - inboundDispatcher linkDispatcher - dispatcher stack.NetworkDispatcher + inboundDispatchers []linkDispatcher + dispatcher stack.NetworkDispatcher // packetDispatchMode controls the packet dispatcher used by this // endpoint. @@ -99,17 +118,47 @@ type endpoint struct { // Options specify the details about the fd-based endpoint to be created. type Options struct { - FD int - MTU uint32 - EthernetHeader bool - ClosedFunc func(*tcpip.Error) - Address tcpip.LinkAddress - SaveRestore bool - DisconnectOk bool - GSOMaxSize uint32 + // FDs is a set of FDs used to read/write packets. + FDs []int + + // MTU is the mtu to use for this endpoint. + MTU uint32 + + // EthernetHeader if true, indicates that the endpoint should read/write + // ethernet frames instead of IP packets. + EthernetHeader bool + + // ClosedFunc is a function to be called when an endpoint's peer (if + // any) closes its end of the communication pipe. + ClosedFunc func(*tcpip.Error) + + // Address is the link address for this endpoint. Only used if + // EthernetHeader is true. + Address tcpip.LinkAddress + + // SaveRestore if true, indicates that this NIC capability set should + // include CapabilitySaveRestore + SaveRestore bool + + // DisconnectOk if true, indicates that this NIC capability set should + // include CapabilityDisconnectOk. + DisconnectOk bool + + // GSOMaxSize is the maximum GSO packet size. It is zero if GSO is + // disabled. + GSOMaxSize uint32 + + // PacketDispatchMode specifies the type of inbound dispatcher to be + // used for this endpoint. PacketDispatchMode PacketDispatchMode - TXChecksumOffload bool - RXChecksumOffload bool + + // TXChecksumOffload if true, indicates that this endpoints capability + // set should include CapabilityTXChecksumOffload. + TXChecksumOffload bool + + // RXChecksumOffload if true, indicates that this endpoints capability + // set should include CapabilityRXChecksumOffload. + RXChecksumOffload bool } // New creates a new fd-based endpoint. @@ -117,10 +166,6 @@ type Options struct { // Makes fd non-blocking, but does not take ownership of fd, which must remain // open for the lifetime of the returned endpoint. func New(opts *Options) (tcpip.LinkEndpointID, error) { - if err := syscall.SetNonblock(opts.FD, true); err != nil { - return 0, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", opts.FD, err) - } - caps := stack.LinkEndpointCapabilities(0) if opts.RXChecksumOffload { caps |= stack.CapabilityRXChecksumOffload @@ -144,8 +189,12 @@ func New(opts *Options) (tcpip.LinkEndpointID, error) { caps |= stack.CapabilityDisconnectOk } + if len(opts.FDs) == 0 { + return 0, fmt.Errorf("opts.FD is empty, at least one FD must be specified") + } + e := &endpoint{ - fd: opts.FD, + fds: opts.FDs, mtu: opts.MTU, caps: caps, closed: opts.ClosedFunc, @@ -154,46 +203,71 @@ func New(opts *Options) (tcpip.LinkEndpointID, error) { packetDispatchMode: opts.PacketDispatchMode, } - isSocket, err := isSocketFD(e.fd) - if err != nil { - return 0, err - } - if isSocket { - if opts.GSOMaxSize != 0 { - e.caps |= stack.CapabilityGSO - e.gsoMaxSize = opts.GSOMaxSize + // Create per channel dispatchers. + for i := 0; i < len(e.fds); i++ { + fd := e.fds[i] + if err := syscall.SetNonblock(fd, true); err != nil { + return 0, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", fd, err) } - } - e.inboundDispatcher, err = createInboundDispatcher(e, isSocket) - if err != nil { - return 0, fmt.Errorf("createInboundDispatcher(...) = %v", err) + + isSocket, err := isSocketFD(fd) + if err != nil { + return 0, err + } + if isSocket { + if opts.GSOMaxSize != 0 { + e.caps |= stack.CapabilityGSO + e.gsoMaxSize = opts.GSOMaxSize + } + } + inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket) + if err != nil { + return 0, fmt.Errorf("createInboundDispatcher(...) = %v", err) + } + e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher) } return stack.RegisterLinkEndpoint(e), nil } -func createInboundDispatcher(e *endpoint, isSocket bool) (linkDispatcher, error) { +func createInboundDispatcher(e *endpoint, fd int, isSocket bool) (linkDispatcher, error) { // By default use the readv() dispatcher as it works with all kinds of // FDs (tap/tun/unix domain sockets and af_packet). - inboundDispatcher, err := newReadVDispatcher(e.fd, e) + inboundDispatcher, err := newReadVDispatcher(fd, e) if err != nil { - return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", e.fd, e, err) + return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err) } if isSocket { + sa, err := unix.Getsockname(fd) + if err != nil { + return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err) + } + switch sa.(type) { + case *unix.SockaddrLinklayer: + // enable PACKET_FANOUT mode is the underlying socket is + // of type AF_PACKET. + const fanoutID = 1 + const fanoutType = 0x8000 // PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG + fanoutArg := fanoutID | fanoutType<<16 + if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil { + return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err) + } + } + switch e.packetDispatchMode { case PacketMMap: - inboundDispatcher, err = newPacketMMapDispatcher(e.fd, e) + inboundDispatcher, err = newPacketMMapDispatcher(fd, e) if err != nil { - return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", e.fd, e, err) + return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err) } case RecvMMsg: // If the provided FD is a socket then we optimize // packet reads by using recvmmsg() instead of read() to // read packets in a batch. - inboundDispatcher, err = newRecvMMsgDispatcher(e.fd, e) + inboundDispatcher, err = newRecvMMsgDispatcher(fd, e) if err != nil { - return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", e.fd, e, err) + return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err) } } } @@ -215,7 +289,9 @@ func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { // Link endpoints are not savable. When transportation endpoints are // saved, they stop sending outgoing packets and all incoming packets // are rejected. - go e.dispatchLoop() // S/R-SAFE: See above. + for i := range e.inboundDispatchers { + go e.dispatchLoop(e.inboundDispatchers[i]) // S/R-SAFE: See above. + } } // IsAttached implements stack.LinkEndpoint.IsAttached. @@ -305,26 +381,26 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen } } - return rawfile.NonBlockingWrite3(e.fd, vnetHdrBuf, hdr.View(), payload.ToView()) + return rawfile.NonBlockingWrite3(e.fds[0], vnetHdrBuf, hdr.View(), payload.ToView()) } if payload.Size() == 0 { - return rawfile.NonBlockingWrite(e.fd, hdr.View()) + return rawfile.NonBlockingWrite(e.fds[0], hdr.View()) } - return rawfile.NonBlockingWrite3(e.fd, hdr.View(), payload.ToView(), nil) + return rawfile.NonBlockingWrite3(e.fds[0], hdr.View(), payload.ToView(), nil) } // WriteRawPacket writes a raw packet directly to the file descriptor. func (e *endpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error { - return rawfile.NonBlockingWrite(e.fd, packet) + return rawfile.NonBlockingWrite(e.fds[0], packet) } // dispatchLoop reads packets from the file descriptor in a loop and dispatches // them to the network stack. -func (e *endpoint) dispatchLoop() *tcpip.Error { +func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) *tcpip.Error { for { - cont, err := e.inboundDispatcher.dispatch() + cont, err := inboundDispatcher.dispatch() if err != nil || !cont { if e.closed != nil { e.closed(err) @@ -363,7 +439,7 @@ func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabiliti syscall.SetNonblock(fd, true) e := &InjectableEndpoint{endpoint: endpoint{ - fd: fd, + fds: []int{fd}, mtu: mtu, caps: capabilities, }} diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go index fd1722074..ba3e09192 100644 --- a/pkg/tcpip/link/fdbased/endpoint_test.go +++ b/pkg/tcpip/link/fdbased/endpoint_test.go @@ -67,7 +67,7 @@ func newContext(t *testing.T, opt *Options) *context { done <- struct{}{} } - opt.FD = fds[1] + opt.FDs = []int{fds[1]} epID, err := New(opt) if err != nil { t.Fatalf("Failed to create FD endpoint: %v", err) diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go index 1681de56e..1fa899e7e 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/main.go +++ b/pkg/tcpip/sample/tun_tcp_connect/main.go @@ -137,7 +137,7 @@ func main() { log.Fatal(err) } - linkID, err := fdbased.New(&fdbased.Options{FD: fd, MTU: mtu}) + linkID, err := fdbased.New(&fdbased.Options{FDs: []int{fd}, MTU: mtu}) if err != nil { log.Fatal(err) } diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go index 642607f83..d47085581 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/main.go +++ b/pkg/tcpip/sample/tun_tcp_echo/main.go @@ -129,7 +129,7 @@ func main() { } linkID, err := fdbased.New(&fdbased.Options{ - FD: fd, + FDs: []int{fd}, MTU: mtu, EthernetHeader: *tap, Address: tcpip.LinkAddress(maddr), diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go index 0f155ec74..4ea684659 100644 --- a/pkg/urpc/urpc.go +++ b/pkg/urpc/urpc.go @@ -35,7 +35,7 @@ import ( ) // maxFiles determines the maximum file payload. -const maxFiles = 16 +const maxFiles = 32 // ErrTooManyFiles is returned when too many file descriptors are mapped. var ErrTooManyFiles = errors.New("too many files") diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 15f624f9b..8564c502d 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -221,6 +221,11 @@ type Config struct { // user, and without chrooting the sandbox process. This can be // necessary in test environments that have limited capabilities. TestOnlyAllowRunAsCurrentUserWithoutChroot bool + + // NumNetworkChannels controls the number of AF_PACKET sockets that map + // to the same underlying network device. This allows netstack to better + // scale for high throughput use cases. + NumNetworkChannels int } // ToFlags returns a slice of flags that correspond to the given Config. @@ -244,6 +249,7 @@ func (c *Config) ToFlags() []string { "--panic-signal=" + strconv.Itoa(c.PanicSignal), "--profile=" + strconv.FormatBool(c.ProfileEnable), "--net-raw=" + strconv.FormatBool(c.EnableRaw), + "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels), } if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { // Only include if set since it is never to be used by users. diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 0a154d90b..82c259f47 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -57,6 +57,10 @@ type FDBasedLink struct { Routes []Route GSOMaxSize uint32 LinkAddress []byte + + // NumChannels controls how many underlying FD's are to be used to + // create this endpoint. + NumChannels int } // LoopbackLink configures a loopback li nk. @@ -68,8 +72,9 @@ type LoopbackLink struct { // CreateLinksAndRoutesArgs are arguments to CreateLinkAndRoutes. type CreateLinksAndRoutesArgs struct { - // FilePayload contains the fds associated with the FDBasedLinks. The - // two slices must have the same length. + // FilePayload contains the fds associated with the FDBasedLinks. The + // number of fd's should match the sum of the NumChannels field of the + // FDBasedLink entries below. urpc.FilePayload LoopbackLinks []LoopbackLink @@ -95,8 +100,12 @@ func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route { // CreateLinksAndRoutes creates links and routes in a network stack. It should // only be called once. func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error { - if len(args.FilePayload.Files) != len(args.FDBasedLinks) { - return fmt.Errorf("FilePayload must be same length at FDBasedLinks") + wantFDs := 0 + for _, l := range args.FDBasedLinks { + wantFDs += l.NumChannels + } + if got := len(args.FilePayload.Files); got != wantFDs { + return fmt.Errorf("args.FilePayload.Files has %d FD's but we need %d entries based on FDBasedLinks", got, wantFDs) } var nicID tcpip.NICID @@ -123,20 +132,26 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } } - for i, link := range args.FDBasedLinks { + fdOffset := 0 + for _, link := range args.FDBasedLinks { nicID++ nicids[link.Name] = nicID - // Copy the underlying FD. - oldFD := args.FilePayload.Files[i].Fd() - newFD, err := syscall.Dup(int(oldFD)) - if err != nil { - return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) + FDs := []int{} + for j := 0; j < link.NumChannels; j++ { + // Copy the underlying FD. + oldFD := args.FilePayload.Files[fdOffset].Fd() + newFD, err := syscall.Dup(int(oldFD)) + if err != nil { + return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) + } + FDs = append(FDs, newFD) + fdOffset++ } mac := tcpip.LinkAddress(link.LinkAddress) linkEP, err := fdbased.New(&fdbased.Options{ - FD: newFD, + FDs: FDs, MTU: uint32(link.MTU), EthernetHeader: true, Address: mac, @@ -148,7 +163,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct return err } - log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac) + log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil { return err } diff --git a/runsc/main.go b/runsc/main.go index 11bc73f75..44ad23cba 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -60,16 +60,16 @@ var ( straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs") // Flags that control sandbox runtime behavior. - platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") - network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") - gso = flag.Bool("gso", true, "enable generic segmenation offload") - fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") - overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") - watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") - panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") - profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") - netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.") - + platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") + network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") + gso = flag.Bool("gso", true, "enable generic segmenation offload") + fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") + overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") + watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") + panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") + profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") + netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.") + numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.") testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") ) @@ -141,6 +141,10 @@ func main() { cmd.Fatalf("%v", err) } + if *numNetworkChannels <= 0 { + cmd.Fatalf("num_network_channels must be > 0, got: %d", *numNetworkChannels) + } + // Create a new Config from the flags. conf := &boot.Config{ RootDir: *rootDir, @@ -162,6 +166,7 @@ func main() { ProfileEnable: *profile, EnableRaw: *netRaw, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, + NumNetworkChannels: *numNetworkChannels, } if len(*straceSyscalls) != 0 { conf.StraceSyscalls = strings.Split(*straceSyscalls, ",") diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 0460d5f1a..1fd091514 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -68,7 +68,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi // Build the path to the net namespace of the sandbox process. // This is what we will copy. nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") - if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO); err != nil { + if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO, conf.NumNetworkChannels); err != nil { return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err) } case boot.NetworkHost: @@ -138,7 +138,7 @@ func isRootNS() (bool, error) { // createInterfacesAndRoutesFromNS scrapes the interface and routes from the // net namespace with the given path, creates them in the sandbox, and removes // them from the host. -func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool) error { +func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool, numNetworkChannels int) error { // Join the network namespace that we will be copying. restore, err := joinNetNS(nsPath) if err != nil { @@ -202,25 +202,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO continue } - // Create the socket. - const protocol = 0x0300 // htons(ETH_P_ALL) - fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol) - if err != nil { - return fmt.Errorf("unable to create raw socket: %v", err) - } - deviceFile := os.NewFile(uintptr(fd), "raw-device-fd") - - // Bind to the appropriate device. - ll := syscall.SockaddrLinklayer{ - Protocol: protocol, - Ifindex: iface.Index, - Hatype: 0, // No ARP type. - Pkttype: syscall.PACKET_OTHERHOST, - } - if err := syscall.Bind(fd, &ll); err != nil { - return fmt.Errorf("unable to bind to %q: %v", iface.Name, err) - } - // Scrape the routes before removing the address, since that // will remove the routes as well. routes, def, err := routesForIface(iface) @@ -236,9 +217,10 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO } link := boot.FDBasedLink{ - Name: iface.Name, - MTU: iface.MTU, - Routes: routes, + Name: iface.Name, + MTU: iface.MTU, + Routes: routes, + NumChannels: numNetworkChannels, } // Get the link for the interface. @@ -248,30 +230,23 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO } link.LinkAddress = []byte(ifaceLink.Attrs().HardwareAddr) - if enableGSO { - gso, err := isGSOEnabled(fd, iface.Name) + log.Debugf("Setting up network channels") + // Create the socket for the device. + for i := 0; i < link.NumChannels; i++ { + log.Debugf("Creating Channel %d", i) + socketEntry, err := createSocket(iface, ifaceLink, enableGSO) if err != nil { - return fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err) + return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err) } - if gso { - if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil { - return fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err) - } - link.GSOMaxSize = ifaceLink.Attrs().GSOMaxSize + if i == 0 { + link.GSOMaxSize = socketEntry.gsoMaxSize } else { - log.Infof("GSO not available in host.") + if link.GSOMaxSize != socketEntry.gsoMaxSize { + return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s", + link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name) + } } - } - - // Use SO_RCVBUFFORCE because on linux the receive buffer for an - // AF_PACKET socket is capped by "net.core.rmem_max". rmem_max - // defaults to a unusually low value of 208KB. This is too low - // for gVisor to be able to receive packets at high throughputs - // without incurring packet drops. - const rcvBufSize = 4 << 20 // 4MB. - - if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil { - return fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err) + args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile) } // Collect the addresses for the interface, enable forwarding, @@ -285,7 +260,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO } } - args.FilePayload.Files = append(args.FilePayload.Files, deviceFile) args.FDBasedLinks = append(args.FDBasedLinks, link) } @@ -296,6 +270,61 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO return nil } +type socketEntry struct { + deviceFile *os.File + gsoMaxSize uint32 +} + +// createSocket creates an underlying AF_PACKET socket and configures it for use by +// the sentry and returns an *os.File that wraps the underlying socket fd. +func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) { + // Create the socket. + const protocol = 0x0300 // htons(ETH_P_ALL) + fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol) + if err != nil { + return nil, fmt.Errorf("unable to create raw socket: %v", err) + } + deviceFile := os.NewFile(uintptr(fd), "raw-device-fd") + // Bind to the appropriate device. + ll := syscall.SockaddrLinklayer{ + Protocol: protocol, + Ifindex: iface.Index, + Hatype: 0, // No ARP type. + Pkttype: syscall.PACKET_OTHERHOST, + } + if err := syscall.Bind(fd, &ll); err != nil { + return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err) + } + + gsoMaxSize := uint32(0) + if enableGSO { + gso, err := isGSOEnabled(fd, iface.Name) + if err != nil { + return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err) + } + if gso { + if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil { + return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err) + } + gsoMaxSize = ifaceLink.Attrs().GSOMaxSize + } else { + log.Infof("GSO not available in host.") + } + } + + // Use SO_RCVBUFFORCE because on linux the receive buffer for an + // AF_PACKET socket is capped by "net.core.rmem_max". rmem_max + // defaults to a unusually low value of 208KB. This is too low + // for gVisor to be able to receive packets at high throughputs + // without incurring packet drops. + const rcvBufSize = 4 << 20 // 4MB. + + if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil { + return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err) + } + return &socketEntry{deviceFile, gsoMaxSize}, nil +} + // loopbackLinks collects the links for a loopback interface. func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, error) { var links []boot.LoopbackLink diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 9efb1ba8e..727b648a6 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -136,6 +136,7 @@ func TestConfig() *boot.Config { Strace: true, FileAccess: boot.FileAccessExclusive, TestOnlyAllowRunAsCurrentUserWithoutChroot: true, + NumNetworkChannels: 1, } } -- cgit v1.2.3 From 720ec3590d9bbf6dc2f9533ed5ef2cbc0b01627a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 6 Jun 2019 10:48:19 -0700 Subject: Send error message to docker/kubectl exec on failure Containerd uses the last error message sent to the log to print as failure cause for create/exec. This required a few changes in the logging logic for runsc: - cmd.Errorf/Fatalf: now writes a message with 'error' level to containerd log, in addition to stderr and debug logs, like before. - log.Infof/Warningf/Fatalf: are not sent to containerd log anymore. They are mostly used for debugging and not useful to containerd. In most cases, --debug-log is enabled and this avoids the logs messages from being duplicated. - stderr is not used as default log destination anymore. Some commands assume stdio is for the container/process running inside the sandbox and it's better to never use it for logging. By default, logs are supressed now. PiperOrigin-RevId: 251881815 --- runsc/cmd/BUILD | 1 + runsc/cmd/cmd.go | 19 ---------- runsc/cmd/create.go | 1 - runsc/cmd/error.go | 72 +++++++++++++++++++++++++++++++++++++ runsc/cmd/exec.go | 28 +++++++++------ runsc/cmd/start.go | 1 - runsc/main.go | 60 +++++++++++++++---------------- runsc/test/integration/exec_test.go | 23 ++++++++++++ 8 files changed, 142 insertions(+), 63 deletions(-) create mode 100644 runsc/cmd/error.go (limited to 'runsc') diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index b7551a5ab..173b7671e 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -14,6 +14,7 @@ go_library( "debug.go", "delete.go", "do.go", + "error.go", "events.go", "exec.go", "gofer.go", diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go index a2fc377d1..5b4cc4a39 100644 --- a/runsc/cmd/cmd.go +++ b/runsc/cmd/cmd.go @@ -17,34 +17,15 @@ package cmd import ( "fmt" - "os" "runtime" "strconv" "syscall" - "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/runsc/specutils" ) -// Errorf logs to stderr and returns subcommands.ExitFailure. -func Errorf(s string, args ...interface{}) subcommands.ExitStatus { - // If runsc is being invoked by docker or cri-o, then we might not have - // access to stderr, so we log a serious-looking warning in addition to - // writing to stderr. - log.Warningf("FATAL ERROR: "+s, args...) - fmt.Fprintf(os.Stderr, s+"\n", args...) - // Return an error that is unlikely to be used by the application. - return subcommands.ExitFailure -} - -// Fatalf logs to stderr and exits with a failure status code. -func Fatalf(s string, args ...interface{}) { - Errorf(s, args...) - os.Exit(128) -} - // intFlags can be used with int flags that appear multiple times. type intFlags []int diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index 629c198fd..8bf9b7dcf 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -16,7 +16,6 @@ package cmd import ( "context" - "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/runsc/boot" diff --git a/runsc/cmd/error.go b/runsc/cmd/error.go new file mode 100644 index 000000000..700b19f14 --- /dev/null +++ b/runsc/cmd/error.go @@ -0,0 +1,72 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "encoding/json" + "fmt" + "io" + "os" + "time" + + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/pkg/log" +) + +// ErrorLogger is where error messages should be written to. These messages are +// consumed by containerd and show up to users of command line tools, +// like docker/kubectl. +var ErrorLogger io.Writer + +type jsonError struct { + Msg string `json:"msg"` + Level string `json:"level"` + Time time.Time `json:"time"` +} + +// Errorf logs error to containerd log (--log), to stderr, and debug logs. It +// returns subcommands.ExitFailure for convenience with subcommand.Execute() +// methods: +// return Errorf("Danger! Danger!") +// +func Errorf(format string, args ...interface{}) subcommands.ExitStatus { + // If runsc is being invoked by docker or cri-o, then we might not have + // access to stderr, so we log a serious-looking warning in addition to + // writing to stderr. + log.Warningf("FATAL ERROR: "+format, args...) + fmt.Fprintf(os.Stderr, format+"\n", args...) + + j := jsonError{ + Msg: fmt.Sprintf(format, args...), + Level: "error", + Time: time.Now(), + } + b, err := json.Marshal(j) + if err != nil { + panic(err) + } + if ErrorLogger != nil { + ErrorLogger.Write(b) + } + + return subcommands.ExitFailure +} + +// Fatalf logs the same way as Errorf() does, plus *exits* the process. +func Fatalf(format string, args ...interface{}) { + Errorf(format, args...) + // Return an error that is unlikely to be used by the application. + os.Exit(128) +} diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 8cd070e61..0eeaaadba 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -143,13 +143,16 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // write the child's PID to the pid file. So when the container returns, the // child process will also return and signal containerd. if ex.detach { - return ex.execAndWait(waitStatus) + return ex.execChildAndWait(waitStatus) } + return ex.exec(c, e, waitStatus) +} +func (ex *Exec) exec(c *container.Container, e *control.ExecArgs, waitStatus *syscall.WaitStatus) subcommands.ExitStatus { // Start the new process and get it pid. pid, err := c.Execute(e) if err != nil { - Fatalf("executing processes for container: %v", err) + return Errorf("executing processes for container: %v", err) } if e.StdioIsPty { @@ -163,29 +166,29 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if ex.internalPidFile != "" { pidStr := []byte(strconv.Itoa(int(pid))) if err := ioutil.WriteFile(ex.internalPidFile, pidStr, 0644); err != nil { - Fatalf("writing internal pid file %q: %v", ex.internalPidFile, err) + return Errorf("writing internal pid file %q: %v", ex.internalPidFile, err) } } - // Generate the pid file after the internal pid file is generated, so that users - // can safely assume that the internal pid file is ready after `runsc exec -d` - // returns. + // Generate the pid file after the internal pid file is generated, so that + // users can safely assume that the internal pid file is ready after + // `runsc exec -d` returns. if ex.pidFile != "" { if err := ioutil.WriteFile(ex.pidFile, []byte(strconv.Itoa(os.Getpid())), 0644); err != nil { - Fatalf("writing pid file: %v", err) + return Errorf("writing pid file: %v", err) } } // Wait for the process to exit. ws, err := c.WaitPID(pid) if err != nil { - Fatalf("waiting on pid %d: %v", pid, err) + return Errorf("waiting on pid %d: %v", pid, err) } *waitStatus = ws return subcommands.ExitSuccess } -func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus { +func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus { var args []string for _, a := range os.Args[1:] { if !strings.Contains(a, "detach") { @@ -193,7 +196,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat } } - // The command needs to write a pid file so that execAndWait can tell + // The command needs to write a pid file so that execChildAndWait can tell // when it has started. If no pid-file was provided, we should use a // filename in a temp directory. pidFile := ex.pidFile @@ -262,7 +265,10 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat return false, nil } if err := specutils.WaitForReady(cmd.Process.Pid, 10*time.Second, ready); err != nil { - Fatalf("unexpected error waiting for PID file, err: %v", err) + // Don't log fatal error here, otherwise it will override the error logged + // by the child process that has failed to start. + log.Warningf("Unexpected error waiting for PID file, err: %v", err) + return subcommands.ExitFailure } *waitStatus = 0 diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go index 657726251..31e8f42bb 100644 --- a/runsc/cmd/start.go +++ b/runsc/cmd/start.go @@ -16,7 +16,6 @@ package cmd import ( "context" - "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/runsc/boot" diff --git a/runsc/main.go b/runsc/main.go index 44ad23cba..39c43507c 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -117,6 +117,22 @@ func main() { os.Exit(0) } + var errorLogger io.Writer + if *logFD > -1 { + errorLogger = os.NewFile(uintptr(*logFD), "error log file") + + } else if *logFilename != "" { + // We must set O_APPEND and not O_TRUNC because Docker passes + // the same log file for all commands (and also parses these + // log files), so we can't destroy them on each command. + var err error + errorLogger, err = os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644) + if err != nil { + cmd.Fatalf("error opening log file %q: %v", *logFilename, err) + } + } + cmd.ErrorLogger = errorLogger + platformType, err := boot.MakePlatformType(*platform) if err != nil { cmd.Fatalf("%v", err) @@ -179,24 +195,7 @@ func main() { subcommand := flag.CommandLine.Arg(0) - var logFile io.Writer = os.Stderr - if *logFD > -1 { - logFile = os.NewFile(uintptr(*logFD), "log file") - } else if *logFilename != "" { - // We must set O_APPEND and not O_TRUNC because Docker passes - // the same log file for all commands (and also parses these - // log files), so we can't destroy them on each command. - f, err := os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644) - if err != nil { - cmd.Fatalf("error opening log file %q: %v", *logFilename, err) - } - logFile = f - } else if subcommand == "do" { - logFile = ioutil.Discard - } - - e := newEmitter(*logFormat, logFile) - + var e log.Emitter if *debugLogFD > -1 { f := os.NewFile(uintptr(*debugLogFD), "debug log file") @@ -206,28 +205,27 @@ func main() { cmd.Fatalf("flag --debug-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand) } - // If we are the boot process, then we own our stdio FDs and - // can do what we want with them. Since Docker and Containerd - // both eat boot's stderr, we dup our stderr to the provided - // log FD so that panics will appear in the logs, rather than - // just disappear. + // If we are the boot process, then we own our stdio FDs and can do what we + // want with them. Since Docker and Containerd both eat boot's stderr, we + // dup our stderr to the provided log FD so that panics will appear in the + // logs, rather than just disappear. if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil { cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err) } - if logFile == os.Stderr { - // Suppress logging to stderr when debug log is enabled. Otherwise all - // messages will be duplicated in the debug log (see Dup2() call above). - e = newEmitter(*debugLogFormat, f) - } else { - e = log.MultiEmitter{e, newEmitter(*debugLogFormat, f)} - } + e = newEmitter(*debugLogFormat, f) + } else if *debugLog != "" { f, err := specutils.DebugLogFile(*debugLog, subcommand) if err != nil { cmd.Fatalf("error opening debug log file in %q: %v", *debugLog, err) } - e = log.MultiEmitter{e, newEmitter(*debugLogFormat, f)} + e = newEmitter(*debugLogFormat, f) + + } else { + // Stderr is reserved for the application, just discard the logs if no debug + // log is specified. + e = newEmitter("text", ioutil.Discard) } log.SetTarget(e) diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go index 7af064d79..7c0e61ac3 100644 --- a/runsc/test/integration/exec_test.go +++ b/runsc/test/integration/exec_test.go @@ -29,6 +29,7 @@ package integration import ( "fmt" "strconv" + "strings" "syscall" "testing" "time" @@ -136,3 +137,25 @@ func TestExecJobControl(t *testing.T) { t.Errorf("ws.ExitedStatus got %d, want %d", got, want) } } + +// Test that failure to exec returns proper error message. +func TestExecError(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatalf("docker pull failed: %v", err) + } + d := testutil.MakeDocker("exec-error-test") + + // Start the container. + if err := d.Run("alpine", "sleep", "1000"); err != nil { + t.Fatalf("docker run failed: %v", err) + } + defer d.CleanUp() + + _, err := d.Exec("no_can_find") + if err == nil { + t.Fatalf("docker exec didn't fail") + } + if want := `error finding executable "no_can_find" in PATH`; !strings.Contains(err.Error(), want) { + t.Fatalf("docker exec wrong error, got: %s, want: .*%s.*", err.Error(), want) + } +} -- cgit v1.2.3 From a26043ee53a2f38b81c9eaa098d115025e87f4c3 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 6 Jun 2019 16:26:00 -0700 Subject: Implement reclaim-driven MemoryFile eviction. PiperOrigin-RevId: 251950660 --- pkg/sentry/hostmm/BUILD | 18 ++++++ pkg/sentry/hostmm/cgroup.go | 111 ++++++++++++++++++++++++++++++++++++ pkg/sentry/hostmm/hostmm.go | 130 ++++++++++++++++++++++++++++++++++++++++++ pkg/sentry/pgalloc/BUILD | 1 + pkg/sentry/pgalloc/pgalloc.go | 63 +++++++++++++++++--- runsc/boot/loader.go | 3 + 6 files changed, 317 insertions(+), 9 deletions(-) create mode 100644 pkg/sentry/hostmm/BUILD create mode 100644 pkg/sentry/hostmm/cgroup.go create mode 100644 pkg/sentry/hostmm/hostmm.go (limited to 'runsc') diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD new file mode 100644 index 000000000..1a4632a54 --- /dev/null +++ b/pkg/sentry/hostmm/BUILD @@ -0,0 +1,18 @@ +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "hostmm", + srcs = [ + "cgroup.go", + "hostmm.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/hostmm", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/fd", + "//pkg/log", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/hostmm/cgroup.go b/pkg/sentry/hostmm/cgroup.go new file mode 100644 index 000000000..e5cc26ab2 --- /dev/null +++ b/pkg/sentry/hostmm/cgroup.go @@ -0,0 +1,111 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostmm + +import ( + "bufio" + "fmt" + "os" + "path" + "strings" +) + +// currentCgroupDirectory returns the directory for the cgroup for the given +// controller in which the calling process resides. +func currentCgroupDirectory(ctrl string) (string, error) { + root, err := cgroupRootDirectory(ctrl) + if err != nil { + return "", err + } + cg, err := currentCgroup(ctrl) + if err != nil { + return "", err + } + return path.Join(root, cg), nil +} + +// cgroupRootDirectory returns the root directory for the cgroup hierarchy in +// which the given cgroup controller is mounted in the calling process' mount +// namespace. +func cgroupRootDirectory(ctrl string) (string, error) { + const path = "/proc/self/mounts" + file, err := os.Open(path) + if err != nil { + return "", err + } + defer file.Close() + + // Per proc(5) -> fstab(5): + // Each line of /proc/self/mounts describes a mount. + scanner := bufio.NewScanner(file) + for scanner.Scan() { + // Each line consists of 6 space-separated fields. Find the line for + // which the third field (fs_vfstype) is cgroup, and the fourth field + // (fs_mntops, a comma-separated list of mount options) contains + // ctrl. + var spec, file, vfstype, mntopts, freq, passno string + const nrfields = 6 + line := scanner.Text() + n, err := fmt.Sscan(line, &spec, &file, &vfstype, &mntopts, &freq, &passno) + if err != nil { + return "", fmt.Errorf("failed to parse %s: %v", path, err) + } + if n != nrfields { + return "", fmt.Errorf("failed to parse %s: line %q: got %d fields, wanted %d", path, line, n, nrfields) + } + if vfstype != "cgroup" { + continue + } + for _, mntopt := range strings.Split(mntopts, ",") { + if mntopt == ctrl { + return file, nil + } + } + } + return "", fmt.Errorf("no cgroup hierarchy mounted for controller %s", ctrl) +} + +// currentCgroup returns the cgroup for the given controller in which the +// calling process resides. The returned string is a path that should be +// interpreted as relative to cgroupRootDirectory(ctrl). +func currentCgroup(ctrl string) (string, error) { + const path = "/proc/self/cgroup" + file, err := os.Open(path) + if err != nil { + return "", err + } + defer file.Close() + + // Per proc(5) -> cgroups(7): + // Each line of /proc/self/cgroups describes a cgroup hierarchy. + scanner := bufio.NewScanner(file) + for scanner.Scan() { + // Each line consists of 3 colon-separated fields. Find the line for + // which the second field (controller-list, a comma-separated list of + // cgroup controllers) contains ctrl. + line := scanner.Text() + const nrfields = 3 + fields := strings.Split(line, ":") + if len(fields) != nrfields { + return "", fmt.Errorf("failed to parse %s: line %q: got %d fields, wanted %d", path, line, len(fields), nrfields) + } + for _, controller := range strings.Split(fields[1], ",") { + if controller == ctrl { + return fields[2], nil + } + } + } + return "", fmt.Errorf("not a member of a cgroup hierarchy for controller %s", ctrl) +} diff --git a/pkg/sentry/hostmm/hostmm.go b/pkg/sentry/hostmm/hostmm.go new file mode 100644 index 000000000..5432cada9 --- /dev/null +++ b/pkg/sentry/hostmm/hostmm.go @@ -0,0 +1,130 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package hostmm provides tools for interacting with the host Linux kernel's +// virtual memory management subsystem. +package hostmm + +import ( + "fmt" + "os" + "path" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// NotifyCurrentMemcgPressureCallback requests that f is called whenever the +// calling process' memory cgroup indicates memory pressure of the given level, +// as specified by Linux's Documentation/cgroup-v1/memory.txt. +// +// If NotifyCurrentMemcgPressureCallback succeeds, it returns a function that +// terminates the requested memory pressure notifications. This function may be +// called at most once. +func NotifyCurrentMemcgPressureCallback(f func(), level string) (func(), error) { + cgdir, err := currentCgroupDirectory("memory") + if err != nil { + return nil, err + } + + pressurePath := path.Join(cgdir, "memory.pressure_level") + pressureFile, err := os.Open(pressurePath) + if err != nil { + return nil, err + } + defer pressureFile.Close() + + eventControlPath := path.Join(cgdir, "cgroup.event_control") + eventControlFile, err := os.OpenFile(eventControlPath, os.O_WRONLY, 0) + if err != nil { + return nil, err + } + defer eventControlFile.Close() + + eventFD, err := newEventFD() + if err != nil { + return nil, err + } + + // Don't use fmt.Fprintf since the whole string needs to be written in a + // single syscall. + eventControlStr := fmt.Sprintf("%d %d %s", eventFD.FD(), pressureFile.Fd(), level) + if n, err := eventControlFile.Write([]byte(eventControlStr)); n != len(eventControlStr) || err != nil { + eventFD.Close() + return nil, fmt.Errorf("error writing %q to %s: got (%d, %v), wanted (%d, nil)", eventControlStr, eventControlPath, n, err, len(eventControlStr)) + } + + log.Debugf("Receiving memory pressure level notifications from %s at level %q", pressurePath, level) + const sizeofUint64 = 8 + // The most significant bit of the eventfd value is set by the stop + // function, which is practically unambiguous since it's not plausible for + // 2**63 pressure events to occur between eventfd reads. + const stopVal = 1 << 63 + stopCh := make(chan struct{}) + go func() { // S/R-SAFE: f provides synchronization if necessary + rw := fd.NewReadWriter(eventFD.FD()) + var buf [sizeofUint64]byte + for { + n, err := rw.Read(buf[:]) + if err != nil { + if err == syscall.EINTR { + continue + } + panic(fmt.Sprintf("failed to read from memory pressure level eventfd: %v", err)) + } + if n != sizeofUint64 { + panic(fmt.Sprintf("short read from memory pressure level eventfd: got %d bytes, wanted %d", n, sizeofUint64)) + } + val := usermem.ByteOrder.Uint64(buf[:]) + if val >= stopVal { + // Assume this was due to the notifier's "destructor" (the + // function returned by NotifyCurrentMemcgPressureCallback + // below) being called. + eventFD.Close() + close(stopCh) + return + } + f() + } + }() + return func() { + rw := fd.NewReadWriter(eventFD.FD()) + var buf [sizeofUint64]byte + usermem.ByteOrder.PutUint64(buf[:], stopVal) + for { + n, err := rw.Write(buf[:]) + if err != nil { + if err == syscall.EINTR { + continue + } + panic(fmt.Sprintf("failed to write to memory pressure level eventfd: %v", err)) + } + if n != sizeofUint64 { + panic(fmt.Sprintf("short write to memory pressure level eventfd: got %d bytes, wanted %d", n, sizeofUint64)) + } + break + } + <-stopCh + }, nil +} + +func newEventFD() (*fd.FD, error) { + f, _, e := syscall.Syscall(syscall.SYS_EVENTFD2, 0, 0, 0) + if e != 0 { + return nil, fmt.Errorf("failed to create eventfd: %v", e) + } + return fd.New(int(f)), nil +} diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index 8a8a0e4e4..bbdb1f922 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -65,6 +65,7 @@ go_library( "//pkg/log", "//pkg/sentry/arch", "//pkg/sentry/context", + "//pkg/sentry/hostmm", "//pkg/sentry/memutil", "//pkg/sentry/platform", "//pkg/sentry/safemem", diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index 2b9924ad7..6d91f1a7b 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -32,6 +32,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/hostmm" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" @@ -162,6 +163,11 @@ type MemoryFile struct { // evictionWG counts the number of goroutines currently performing evictions. evictionWG sync.WaitGroup + + // stopNotifyPressure stops memory cgroup pressure level + // notifications used to drive eviction. stopNotifyPressure is + // immutable. + stopNotifyPressure func() } // MemoryFileOpts provides options to NewMemoryFile. @@ -169,6 +175,11 @@ type MemoryFileOpts struct { // DelayedEviction controls the extent to which the MemoryFile may delay // eviction of evictable allocations. DelayedEviction DelayedEvictionType + + // If UseHostMemcgPressure is true, use host memory cgroup pressure level + // notifications to determine when eviction is necessary. This option has + // no effect unless DelayedEviction is DelayedEvictionEnabled. + UseHostMemcgPressure bool } // DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction. @@ -186,9 +197,14 @@ const ( // evictable allocations until doing so is considered necessary to avoid // performance degradation due to host memory pressure, or OOM kills. // - // As of this writing, DelayedEvictionEnabled delays evictions until the - // reclaimer goroutine is out of work (pages to reclaim), then evicts all - // pending evictable allocations immediately. + // As of this writing, the behavior of DelayedEvictionEnabled depends on + // whether or not MemoryFileOpts.UseHostMemcgPressure is enabled: + // + // - If UseHostMemcgPressure is true, evictions are delayed until memory + // pressure is indicated. + // + // - Otherwise, evictions are only delayed until the reclaimer goroutine + // is out of work (pages to reclaim). DelayedEvictionEnabled // DelayedEvictionManual requires that evictable allocations are only @@ -292,6 +308,22 @@ func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) { } f.mappings.Store(make([]uintptr, initialSize/chunkSize)) f.reclaimCond.L = &f.mu + + if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure { + stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() { + f.mu.Lock() + startedAny := f.startEvictionsLocked() + f.mu.Unlock() + if startedAny { + log.Debugf("pgalloc.MemoryFile performing evictions due to memcg pressure") + } + }, "low") + if err != nil { + return nil, fmt.Errorf("failed to configure memcg pressure level notifications: %v", err) + } + f.stopNotifyPressure = stop + } + go f.runReclaim() // S/R-SAFE: f.mu // The Linux kernel contains an optional feature called "Integrity @@ -692,9 +724,11 @@ func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) // Kick off eviction immediately. f.startEvictionGoroutineLocked(user, info) case DelayedEvictionEnabled: - // Ensure that the reclaimer goroutine is running, so that it can - // start eviction when necessary. - f.reclaimCond.Signal() + if !f.opts.UseHostMemcgPressure { + // Ensure that the reclaimer goroutine is running, so that it + // can start eviction when necessary. + f.reclaimCond.Signal() + } } } } @@ -992,11 +1026,12 @@ func (f *MemoryFile) runReclaim() { } f.markReclaimed(fr) } + // We only get here if findReclaimable finds f.destroyed set and returns // false. f.mu.Lock() - defer f.mu.Unlock() if !f.destroyed { + f.mu.Unlock() panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set") } f.file.Close() @@ -1016,6 +1051,13 @@ func (f *MemoryFile) runReclaim() { } // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) f.mappings.Store([]uintptr{}) + f.mu.Unlock() + + // This must be called without holding f.mu to avoid circular lock + // ordering. + if f.stopNotifyPressure != nil { + f.stopNotifyPressure() + } } func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { @@ -1029,7 +1071,7 @@ func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { if f.reclaimable { break } - if f.opts.DelayedEviction == DelayedEvictionEnabled { + if f.opts.DelayedEviction == DelayedEvictionEnabled && !f.opts.UseHostMemcgPressure { // No work to do. Evict any pending evictable allocations to // get more reclaimable pages before going to sleep. f.startEvictionsLocked() @@ -1089,14 +1131,17 @@ func (f *MemoryFile) StartEvictions() { } // Preconditions: f.mu must be locked. -func (f *MemoryFile) startEvictionsLocked() { +func (f *MemoryFile) startEvictionsLocked() bool { + startedAny := false for user, info := range f.evictable { // Don't start multiple goroutines to evict the same user's // allocations. if !info.evicting { f.startEvictionGoroutineLocked(user, info) + startedAny = true } } + return startedAny } // Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index a997776f8..ef4ccd0bd 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -424,6 +424,9 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { return nil, fmt.Errorf("error creating memfd: %v", err) } memfile := os.NewFile(uintptr(memfd), memfileName) + // We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if + // there are memory cgroups specified, because at this point we're already + // in a mount namespace in which the relevant cgroupfs is not visible. mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) if err != nil { memfile.Close() -- cgit v1.2.3 From 02ab1f187cd24c67b754b004229421d189cee264 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 6 Jun 2019 16:44:40 -0700 Subject: Copy up parent when binding UDS on overlayfs Overlayfs was expecting the parent to exist when bind(2) was called, which may not be the case. The fix is to copy the parent directory to the upper layer before binding the UDS. There is not good place to add tests for it. Syscall tests would be ideal, but it's hard to guarantee that the directory where the socket is created hasn't been touched before (and thus copied the parent to the upper layer). Added it to runsc integration tests for now. If it turns out we have lots of these kind of tests, we can consider moving them somewhere more appropriate. PiperOrigin-RevId: 251954156 --- pkg/sentry/fs/dirent.go | 2 +- pkg/sentry/fs/inode.go | 4 +-- pkg/sentry/fs/inode_overlay.go | 12 ++++----- runsc/test/integration/BUILD | 1 + runsc/test/integration/regression_test.go | 45 +++++++++++++++++++++++++++++++ 5 files changed, 55 insertions(+), 9 deletions(-) create mode 100644 runsc/test/integration/regression_test.go (limited to 'runsc') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index c0bc261a2..a0a35c242 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -805,7 +805,7 @@ func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data trans var childDir *Dirent err := d.genericCreate(ctx, root, name, func() error { var e error - childDir, e = d.Inode.Bind(ctx, name, data, perms) + childDir, e = d.Inode.Bind(ctx, d, name, data, perms) if e != nil { return e } diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index aef1a1cb9..0b54c2e77 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -220,9 +220,9 @@ func (i *Inode) Rename(ctx context.Context, oldParent *Dirent, renamed *Dirent, } // Bind calls i.InodeOperations.Bind with i as the directory. -func (i *Inode) Bind(ctx context.Context, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) { +func (i *Inode) Bind(ctx context.Context, parent *Dirent, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) { if i.overlay != nil { - return overlayBind(ctx, i.overlay, name, data, perm) + return overlayBind(ctx, i.overlay, parent, name, data, perm) } return i.InodeOperations.Bind(ctx, i, name, data, perm) } diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index cdffe173b..06506fb20 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -398,14 +398,14 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena return nil } -func overlayBind(ctx context.Context, o *overlayEntry, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) { +func overlayBind(ctx context.Context, o *overlayEntry, parent *Dirent, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) { + if err := copyUp(ctx, parent); err != nil { + return nil, err + } + o.copyMu.RLock() defer o.copyMu.RUnlock() - // We do not support doing anything exciting with sockets unless there - // is already a directory in the upper filesystem. - if o.upper == nil { - return nil, syserror.EOPNOTSUPP - } + d, err := o.upper.InodeOperations.Bind(ctx, o.upper, name, data, perm) if err != nil { return nil, err diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD index 0c4e4fa80..04ed885c6 100644 --- a/runsc/test/integration/BUILD +++ b/runsc/test/integration/BUILD @@ -8,6 +8,7 @@ go_test( srcs = [ "exec_test.go", "integration_test.go", + "regression_test.go", ], embed = [":integration"], tags = [ diff --git a/runsc/test/integration/regression_test.go b/runsc/test/integration/regression_test.go new file mode 100644 index 000000000..80bae9970 --- /dev/null +++ b/runsc/test/integration/regression_test.go @@ -0,0 +1,45 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package integration + +import ( + "strings" + "testing" + + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +// Test that UDS can be created using overlay when parent directory is in lower +// layer only (b/134090485). +// +// Prerequisite: the directory where the socket file is created must not have +// been open for write before bind(2) is called. +func TestBindOverlay(t *testing.T) { + if err := testutil.Pull("ubuntu:trusty"); err != nil { + t.Fatal("docker pull failed:", err) + } + d := testutil.MakeDocker("bind-overlay-test") + + cmd := "nc -l -U /var/run/sock& sleep 1 && echo foobar-asdf | nc -U /var/run/sock" + got, err := d.RunFg("ubuntu:trusty", "bash", "-c", cmd) + if err != nil { + t.Fatal("docker run failed:", err) + } + + if want := "foobar-asdf"; !strings.Contains(got, want) { + t.Fatalf("docker run output is missing %q: %s", want, got) + } + defer d.CleanUp() +} -- cgit v1.2.3 From 2e43dcb26b4ccbc4d4f314be61806a82f073a50e Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 6 Jun 2019 17:48:53 -0700 Subject: Add alsologtostderr option When set sends log messages to the error log: sudo ./runsc --logtostderr do ls I0531 17:59:58.105064 144564 x:0] *************************** I0531 17:59:58.105087 144564 x:0] Args: [runsc --logtostderr do ls] I0531 17:59:58.105112 144564 x:0] PID: 144564 I0531 17:59:58.105125 144564 x:0] UID: 0, GID: 0 [...] PiperOrigin-RevId: 251964377 --- runsc/main.go | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'runsc') diff --git a/runsc/main.go b/runsc/main.go index 39c43507c..6f8e6e378 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -48,11 +48,12 @@ var ( // system that are not covered by the runtime spec. // Debugging flags. - debugLog = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.") - logPackets = flag.Bool("log-packets", false, "enable network packet logging") - logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") - debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.") - debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s") + debugLog = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.") + logPackets = flag.Bool("log-packets", false, "enable network packet logging") + logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") + debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.") + debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s") + alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr") // Debugging flags: strace related strace = flag.Bool("strace", false, "enable strace") @@ -228,6 +229,10 @@ func main() { e = newEmitter("text", ioutil.Discard) } + if *alsoLogToStderr { + e = log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)} + } + log.SetTarget(e) log.Infof("***************************") -- cgit v1.2.3 From e5fb3aab122c546441c595c2135a273468c5a997 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Thu, 6 Jun 2019 22:08:49 -0700 Subject: BUILD: Use runsc to generate version This also ensures BUILD files are correctly formatted. PiperOrigin-RevId: 251990267 --- runsc/BUILD | 10 ++++++---- test/BUILD | 6 +----- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'runsc') diff --git a/runsc/BUILD b/runsc/BUILD index af8e928c5..3d6c92e4c 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -1,6 +1,4 @@ -package( - licenses = ["notice"], # Apache 2.0 -) +package(licenses = ["notice"]) # Apache 2.0 load("@io_bazel_rules_go//go:def.bzl", "go_binary") load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_deb", "pkg_tar") @@ -84,8 +82,9 @@ pkg_tar( genrule( name = "deb-version", outs = ["version.txt"], - cmd = "cat bazel-out/volatile-status.txt | grep VERSION | sed 's/^[^0-9]*//' >$@", + cmd = "$(location :runsc) -version | head -n 1 | sed 's/^[^0-9]*//' > $@", stamp = 1, + tools = [":runsc"], ) pkg_deb( @@ -98,4 +97,7 @@ pkg_deb( package = "runsc", postinst = "debian/postinst.sh", version_file = ":version.txt", + visibility = [ + "//visibility:public", + ], ) diff --git a/test/BUILD b/test/BUILD index e99b4e501..8e1dc5228 100644 --- a/test/BUILD +++ b/test/BUILD @@ -1,8 +1,4 @@ -# gVisor is a general-purpose sandbox. - -package(licenses = ["notice"]) - -exports_files(["LICENSE"]) +package(licenses = ["notice"]) # Apache 2.0 # We need to define a bazel platform and toolchain to specify dockerPrivileged # and dockerRunAsRoot options, they are required to run tests on the RBE -- cgit v1.2.3 From 48961d27a8bcc76b3783a7cc4a4a5ebcd5532d25 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 7 Jun 2019 14:51:18 -0700 Subject: Move //pkg/sentry/memutil to //pkg/memutil. PiperOrigin-RevId: 252124156 --- pkg/memutil/BUILD | 11 +++++++ pkg/memutil/memutil_unsafe.go | 42 +++++++++++++++++++++++++++ pkg/sentry/context/contexttest/BUILD | 2 +- pkg/sentry/context/contexttest/contexttest.go | 2 +- pkg/sentry/memutil/BUILD | 14 --------- pkg/sentry/memutil/memutil.go | 16 ---------- pkg/sentry/memutil/memutil_unsafe.go | 39 ------------------------- pkg/sentry/pgalloc/BUILD | 2 +- pkg/sentry/usage/BUILD | 2 +- pkg/sentry/usage/memory.go | 2 +- runsc/boot/BUILD | 2 +- runsc/boot/loader.go | 2 +- 12 files changed, 60 insertions(+), 76 deletions(-) create mode 100644 pkg/memutil/BUILD create mode 100644 pkg/memutil/memutil_unsafe.go delete mode 100644 pkg/sentry/memutil/BUILD delete mode 100644 pkg/sentry/memutil/memutil.go delete mode 100644 pkg/sentry/memutil/memutil_unsafe.go (limited to 'runsc') diff --git a/pkg/memutil/BUILD b/pkg/memutil/BUILD new file mode 100644 index 000000000..71b48a972 --- /dev/null +++ b/pkg/memutil/BUILD @@ -0,0 +1,11 @@ +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "memutil", + srcs = ["memutil_unsafe.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/memutil", + visibility = ["//visibility:public"], + deps = ["@org_golang_x_sys//unix:go_default_library"], +) diff --git a/pkg/memutil/memutil_unsafe.go b/pkg/memutil/memutil_unsafe.go new file mode 100644 index 000000000..979d942a9 --- /dev/null +++ b/pkg/memutil/memutil_unsafe.go @@ -0,0 +1,42 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux + +// Package memutil provides a wrapper for the memfd_create() system call. +package memutil + +import ( + "fmt" + "syscall" + "unsafe" + + "golang.org/x/sys/unix" +) + +// CreateMemFD creates a memfd file and returns the fd. +func CreateMemFD(name string, flags int) (int, error) { + p, err := syscall.BytePtrFromString(name) + if err != nil { + return -1, err + } + fd, _, e := syscall.Syscall(unix.SYS_MEMFD_CREATE, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) + if e != 0 { + if e == syscall.ENOSYS { + return -1, fmt.Errorf("memfd_create(2) is not implemented. Check that you have Linux 3.17 or higher") + } + return -1, e + } + return int(fd), nil +} diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD index ce4f1e42c..d17b1bdcf 100644 --- a/pkg/sentry/context/contexttest/BUILD +++ b/pkg/sentry/context/contexttest/BUILD @@ -9,11 +9,11 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest", visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/memutil", "//pkg/sentry/context", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", - "//pkg/sentry/memutil", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/platform/ptrace", diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go index 210a235d2..83da40711 100644 --- a/pkg/sentry/context/contexttest/contexttest.go +++ b/pkg/sentry/context/contexttest/contexttest.go @@ -21,11 +21,11 @@ import ( "testing" "time" + "gvisor.googlesource.com/gvisor/pkg/memutil" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" - "gvisor.googlesource.com/gvisor/pkg/sentry/memutil" "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace" diff --git a/pkg/sentry/memutil/BUILD b/pkg/sentry/memutil/BUILD deleted file mode 100644 index 68b03d4cc..000000000 --- a/pkg/sentry/memutil/BUILD +++ /dev/null @@ -1,14 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "memutil", - srcs = [ - "memutil.go", - "memutil_unsafe.go", - ], - importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/memutil", - visibility = ["//pkg/sentry:internal"], - deps = ["@org_golang_x_sys//unix:go_default_library"], -) diff --git a/pkg/sentry/memutil/memutil.go b/pkg/sentry/memutil/memutil.go deleted file mode 100644 index a4154c42a..000000000 --- a/pkg/sentry/memutil/memutil.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package memutil contains the utility functions for memory operations. -package memutil diff --git a/pkg/sentry/memutil/memutil_unsafe.go b/pkg/sentry/memutil/memutil_unsafe.go deleted file mode 100644 index 92eab8a26..000000000 --- a/pkg/sentry/memutil/memutil_unsafe.go +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package memutil - -import ( - "fmt" - "syscall" - "unsafe" - - "golang.org/x/sys/unix" -) - -// CreateMemFD creates a memfd file and returns the fd. -func CreateMemFD(name string, flags int) (int, error) { - p, err := syscall.BytePtrFromString(name) - if err != nil { - return -1, err - } - fd, _, e := syscall.Syscall(unix.SYS_MEMFD_CREATE, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) - if e != 0 { - if e == syscall.ENOSYS { - return -1, fmt.Errorf("memfd_create(2) is not implemented. Check that you have Linux 3.17 or higher") - } - return -1, e - } - return int(fd), nil -} diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index bbdb1f922..ca2d5ba6f 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -63,10 +63,10 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/log", + "//pkg/memutil", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/hostmm", - "//pkg/sentry/memutil", "//pkg/sentry/platform", "//pkg/sentry/safemem", "//pkg/sentry/usage", diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD index 09198496b..860733061 100644 --- a/pkg/sentry/usage/BUILD +++ b/pkg/sentry/usage/BUILD @@ -17,6 +17,6 @@ go_library( ], deps = [ "//pkg/bits", - "//pkg/sentry/memutil", + "//pkg/memutil", ], ) diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index c316f1597..9ed974ccb 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -22,7 +22,7 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/bits" - "gvisor.googlesource.com/gvisor/pkg/sentry/memutil" + "gvisor.googlesource.com/gvisor/pkg/memutil" ) // MemoryKind represents a type of memory used by the application. diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index df9907e52..ac28c4339 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -30,6 +30,7 @@ go_library( "//pkg/cpuid", "//pkg/eventchannel", "//pkg/log", + "//pkg/memutil", "//pkg/rand", "//pkg/sentry/arch", "//pkg/sentry/arch:registers_go_proto", @@ -51,7 +52,6 @@ go_library( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/limits", "//pkg/sentry/loader", - "//pkg/sentry/memutil", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/platform/kvm", diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index ef4ccd0bd..42bddb2e8 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -29,6 +29,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/cpuid" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/memutil" "gvisor.googlesource.com/gvisor/pkg/rand" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/control" @@ -37,7 +38,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/loader" - "gvisor.googlesource.com/gvisor/pkg/sentry/memutil" "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" -- cgit v1.2.3