From d28f71adcf60793a81490f3b4d25da36901e769e Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 3 Jun 2019 18:14:52 -0700 Subject: Remove 'clearStatus' option from container.Wait*PID() clearStatus was added to allow detached execution to wait on the exec'd process and retrieve its exit status. However, it's not currently used. Both docker and gvisor-containerd-shim wait on the "shim" process and retrieve the exit status from there. We could change gvisor-containerd-shim to use waits, but it will end up also consuming a process for the wait, which is similar to having the shim process. Closes #234 PiperOrigin-RevId: 251349490 --- runsc/sandbox/sandbox.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'runsc/sandbox') diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 47a66afb2..032190636 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -649,7 +649,7 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { // WaitPID waits for process 'pid' in the container's sandbox and returns its // WaitStatus. -func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.WaitStatus, error) { +func (s *Sandbox) WaitPID(cid string, pid int32) (syscall.WaitStatus, error) { log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID) var ws syscall.WaitStatus conn, err := s.sandboxConnect() @@ -659,9 +659,8 @@ func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.Wait defer conn.Close() args := &boot.WaitPIDArgs{ - PID: pid, - CID: cid, - ClearStatus: clearStatus, + PID: pid, + CID: cid, } if err := conn.Call(boot.ContainerWaitPID, args, &ws); err != nil { return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %v", pid, s.ID, err) -- cgit v1.2.3 From 85be01b42d4ac48698d1e8f50a4cf2607a4fc50b Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Thu, 6 Jun 2019 08:05:46 -0700 Subject: Add multi-fd support to fdbased endpoint. This allows an fdbased endpoint to have multiple underlying fd's from which packets can be read and dispatched/written to. This should allow for higher throughput as well as better scalability of the network stack as number of connections increases. Updates #231 PiperOrigin-RevId: 251852825 --- pkg/tcpip/link/fdbased/endpoint.go | 168 ++++++++++++++++++++++--------- pkg/tcpip/link/fdbased/endpoint_test.go | 2 +- pkg/tcpip/sample/tun_tcp_connect/main.go | 2 +- pkg/tcpip/sample/tun_tcp_echo/main.go | 2 +- pkg/urpc/urpc.go | 2 +- runsc/boot/config.go | 6 ++ runsc/boot/network.go | 39 ++++--- runsc/main.go | 25 +++-- runsc/sandbox/network.go | 119 +++++++++++++--------- runsc/test/testutil/testutil.go | 1 + 10 files changed, 249 insertions(+), 117 deletions(-) (limited to 'runsc/sandbox') diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 1f889c2a0..b88e2e7bf 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -21,12 +21,29 @@ // FD based endpoints can be used in the networking stack by calling New() to // create a new endpoint, and then passing it as an argument to // Stack.CreateNIC(). +// +// FD based endpoints can use more than one file descriptor to read incoming +// packets. If there are more than one FDs specified and the underlying FD is an +// AF_PACKET then the endpoint will enable FANOUT mode on the socket so that the +// host kernel will consistently hash the packets to the sockets. This ensures +// that packets for the same TCP streams are not reordered. +// +// Similarly if more than one FD's are specified where the underlying FD is not +// AF_PACKET then it's the caller's responsibility to ensure that all inbound +// packets on the descriptors are consistently 5 tuple hashed to one of the +// descriptors to prevent TCP reordering. +// +// Since netstack today does not compute 5 tuple hashes for outgoing packets we +// only use the first FD to write outbound packets. Once 5 tuple hashes for +// all outbound packets are available we will make use of all underlying FD's to +// write outbound packets. package fdbased import ( "fmt" "syscall" + "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" "gvisor.googlesource.com/gvisor/pkg/tcpip/header" @@ -65,8 +82,10 @@ const ( ) type endpoint struct { - // fd is the file descriptor used to send and receive packets. - fd int + // fds is the set of file descriptors each identifying one inbound/outbound + // channel. The endpoint will dispatch from all inbound channels as well as + // hash outbound packets to specific channels based on the packet hash. + fds []int // mtu (maximum transmission unit) is the maximum size of a packet. mtu uint32 @@ -85,8 +104,8 @@ type endpoint struct { // its end of the communication pipe. closed func(*tcpip.Error) - inboundDispatcher linkDispatcher - dispatcher stack.NetworkDispatcher + inboundDispatchers []linkDispatcher + dispatcher stack.NetworkDispatcher // packetDispatchMode controls the packet dispatcher used by this // endpoint. @@ -99,17 +118,47 @@ type endpoint struct { // Options specify the details about the fd-based endpoint to be created. type Options struct { - FD int - MTU uint32 - EthernetHeader bool - ClosedFunc func(*tcpip.Error) - Address tcpip.LinkAddress - SaveRestore bool - DisconnectOk bool - GSOMaxSize uint32 + // FDs is a set of FDs used to read/write packets. + FDs []int + + // MTU is the mtu to use for this endpoint. + MTU uint32 + + // EthernetHeader if true, indicates that the endpoint should read/write + // ethernet frames instead of IP packets. + EthernetHeader bool + + // ClosedFunc is a function to be called when an endpoint's peer (if + // any) closes its end of the communication pipe. + ClosedFunc func(*tcpip.Error) + + // Address is the link address for this endpoint. Only used if + // EthernetHeader is true. + Address tcpip.LinkAddress + + // SaveRestore if true, indicates that this NIC capability set should + // include CapabilitySaveRestore + SaveRestore bool + + // DisconnectOk if true, indicates that this NIC capability set should + // include CapabilityDisconnectOk. + DisconnectOk bool + + // GSOMaxSize is the maximum GSO packet size. It is zero if GSO is + // disabled. + GSOMaxSize uint32 + + // PacketDispatchMode specifies the type of inbound dispatcher to be + // used for this endpoint. PacketDispatchMode PacketDispatchMode - TXChecksumOffload bool - RXChecksumOffload bool + + // TXChecksumOffload if true, indicates that this endpoints capability + // set should include CapabilityTXChecksumOffload. + TXChecksumOffload bool + + // RXChecksumOffload if true, indicates that this endpoints capability + // set should include CapabilityRXChecksumOffload. + RXChecksumOffload bool } // New creates a new fd-based endpoint. @@ -117,10 +166,6 @@ type Options struct { // Makes fd non-blocking, but does not take ownership of fd, which must remain // open for the lifetime of the returned endpoint. func New(opts *Options) (tcpip.LinkEndpointID, error) { - if err := syscall.SetNonblock(opts.FD, true); err != nil { - return 0, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", opts.FD, err) - } - caps := stack.LinkEndpointCapabilities(0) if opts.RXChecksumOffload { caps |= stack.CapabilityRXChecksumOffload @@ -144,8 +189,12 @@ func New(opts *Options) (tcpip.LinkEndpointID, error) { caps |= stack.CapabilityDisconnectOk } + if len(opts.FDs) == 0 { + return 0, fmt.Errorf("opts.FD is empty, at least one FD must be specified") + } + e := &endpoint{ - fd: opts.FD, + fds: opts.FDs, mtu: opts.MTU, caps: caps, closed: opts.ClosedFunc, @@ -154,46 +203,71 @@ func New(opts *Options) (tcpip.LinkEndpointID, error) { packetDispatchMode: opts.PacketDispatchMode, } - isSocket, err := isSocketFD(e.fd) - if err != nil { - return 0, err - } - if isSocket { - if opts.GSOMaxSize != 0 { - e.caps |= stack.CapabilityGSO - e.gsoMaxSize = opts.GSOMaxSize + // Create per channel dispatchers. + for i := 0; i < len(e.fds); i++ { + fd := e.fds[i] + if err := syscall.SetNonblock(fd, true); err != nil { + return 0, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", fd, err) } - } - e.inboundDispatcher, err = createInboundDispatcher(e, isSocket) - if err != nil { - return 0, fmt.Errorf("createInboundDispatcher(...) = %v", err) + + isSocket, err := isSocketFD(fd) + if err != nil { + return 0, err + } + if isSocket { + if opts.GSOMaxSize != 0 { + e.caps |= stack.CapabilityGSO + e.gsoMaxSize = opts.GSOMaxSize + } + } + inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket) + if err != nil { + return 0, fmt.Errorf("createInboundDispatcher(...) = %v", err) + } + e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher) } return stack.RegisterLinkEndpoint(e), nil } -func createInboundDispatcher(e *endpoint, isSocket bool) (linkDispatcher, error) { +func createInboundDispatcher(e *endpoint, fd int, isSocket bool) (linkDispatcher, error) { // By default use the readv() dispatcher as it works with all kinds of // FDs (tap/tun/unix domain sockets and af_packet). - inboundDispatcher, err := newReadVDispatcher(e.fd, e) + inboundDispatcher, err := newReadVDispatcher(fd, e) if err != nil { - return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", e.fd, e, err) + return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err) } if isSocket { + sa, err := unix.Getsockname(fd) + if err != nil { + return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err) + } + switch sa.(type) { + case *unix.SockaddrLinklayer: + // enable PACKET_FANOUT mode is the underlying socket is + // of type AF_PACKET. + const fanoutID = 1 + const fanoutType = 0x8000 // PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG + fanoutArg := fanoutID | fanoutType<<16 + if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil { + return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err) + } + } + switch e.packetDispatchMode { case PacketMMap: - inboundDispatcher, err = newPacketMMapDispatcher(e.fd, e) + inboundDispatcher, err = newPacketMMapDispatcher(fd, e) if err != nil { - return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", e.fd, e, err) + return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err) } case RecvMMsg: // If the provided FD is a socket then we optimize // packet reads by using recvmmsg() instead of read() to // read packets in a batch. - inboundDispatcher, err = newRecvMMsgDispatcher(e.fd, e) + inboundDispatcher, err = newRecvMMsgDispatcher(fd, e) if err != nil { - return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", e.fd, e, err) + return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err) } } } @@ -215,7 +289,9 @@ func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { // Link endpoints are not savable. When transportation endpoints are // saved, they stop sending outgoing packets and all incoming packets // are rejected. - go e.dispatchLoop() // S/R-SAFE: See above. + for i := range e.inboundDispatchers { + go e.dispatchLoop(e.inboundDispatchers[i]) // S/R-SAFE: See above. + } } // IsAttached implements stack.LinkEndpoint.IsAttached. @@ -305,26 +381,26 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen } } - return rawfile.NonBlockingWrite3(e.fd, vnetHdrBuf, hdr.View(), payload.ToView()) + return rawfile.NonBlockingWrite3(e.fds[0], vnetHdrBuf, hdr.View(), payload.ToView()) } if payload.Size() == 0 { - return rawfile.NonBlockingWrite(e.fd, hdr.View()) + return rawfile.NonBlockingWrite(e.fds[0], hdr.View()) } - return rawfile.NonBlockingWrite3(e.fd, hdr.View(), payload.ToView(), nil) + return rawfile.NonBlockingWrite3(e.fds[0], hdr.View(), payload.ToView(), nil) } // WriteRawPacket writes a raw packet directly to the file descriptor. func (e *endpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error { - return rawfile.NonBlockingWrite(e.fd, packet) + return rawfile.NonBlockingWrite(e.fds[0], packet) } // dispatchLoop reads packets from the file descriptor in a loop and dispatches // them to the network stack. -func (e *endpoint) dispatchLoop() *tcpip.Error { +func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) *tcpip.Error { for { - cont, err := e.inboundDispatcher.dispatch() + cont, err := inboundDispatcher.dispatch() if err != nil || !cont { if e.closed != nil { e.closed(err) @@ -363,7 +439,7 @@ func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabiliti syscall.SetNonblock(fd, true) e := &InjectableEndpoint{endpoint: endpoint{ - fd: fd, + fds: []int{fd}, mtu: mtu, caps: capabilities, }} diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go index fd1722074..ba3e09192 100644 --- a/pkg/tcpip/link/fdbased/endpoint_test.go +++ b/pkg/tcpip/link/fdbased/endpoint_test.go @@ -67,7 +67,7 @@ func newContext(t *testing.T, opt *Options) *context { done <- struct{}{} } - opt.FD = fds[1] + opt.FDs = []int{fds[1]} epID, err := New(opt) if err != nil { t.Fatalf("Failed to create FD endpoint: %v", err) diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go index 1681de56e..1fa899e7e 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/main.go +++ b/pkg/tcpip/sample/tun_tcp_connect/main.go @@ -137,7 +137,7 @@ func main() { log.Fatal(err) } - linkID, err := fdbased.New(&fdbased.Options{FD: fd, MTU: mtu}) + linkID, err := fdbased.New(&fdbased.Options{FDs: []int{fd}, MTU: mtu}) if err != nil { log.Fatal(err) } diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go index 642607f83..d47085581 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/main.go +++ b/pkg/tcpip/sample/tun_tcp_echo/main.go @@ -129,7 +129,7 @@ func main() { } linkID, err := fdbased.New(&fdbased.Options{ - FD: fd, + FDs: []int{fd}, MTU: mtu, EthernetHeader: *tap, Address: tcpip.LinkAddress(maddr), diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go index 0f155ec74..4ea684659 100644 --- a/pkg/urpc/urpc.go +++ b/pkg/urpc/urpc.go @@ -35,7 +35,7 @@ import ( ) // maxFiles determines the maximum file payload. -const maxFiles = 16 +const maxFiles = 32 // ErrTooManyFiles is returned when too many file descriptors are mapped. var ErrTooManyFiles = errors.New("too many files") diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 15f624f9b..8564c502d 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -221,6 +221,11 @@ type Config struct { // user, and without chrooting the sandbox process. This can be // necessary in test environments that have limited capabilities. TestOnlyAllowRunAsCurrentUserWithoutChroot bool + + // NumNetworkChannels controls the number of AF_PACKET sockets that map + // to the same underlying network device. This allows netstack to better + // scale for high throughput use cases. + NumNetworkChannels int } // ToFlags returns a slice of flags that correspond to the given Config. @@ -244,6 +249,7 @@ func (c *Config) ToFlags() []string { "--panic-signal=" + strconv.Itoa(c.PanicSignal), "--profile=" + strconv.FormatBool(c.ProfileEnable), "--net-raw=" + strconv.FormatBool(c.EnableRaw), + "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels), } if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { // Only include if set since it is never to be used by users. diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 0a154d90b..82c259f47 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -57,6 +57,10 @@ type FDBasedLink struct { Routes []Route GSOMaxSize uint32 LinkAddress []byte + + // NumChannels controls how many underlying FD's are to be used to + // create this endpoint. + NumChannels int } // LoopbackLink configures a loopback li nk. @@ -68,8 +72,9 @@ type LoopbackLink struct { // CreateLinksAndRoutesArgs are arguments to CreateLinkAndRoutes. type CreateLinksAndRoutesArgs struct { - // FilePayload contains the fds associated with the FDBasedLinks. The - // two slices must have the same length. + // FilePayload contains the fds associated with the FDBasedLinks. The + // number of fd's should match the sum of the NumChannels field of the + // FDBasedLink entries below. urpc.FilePayload LoopbackLinks []LoopbackLink @@ -95,8 +100,12 @@ func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route { // CreateLinksAndRoutes creates links and routes in a network stack. It should // only be called once. func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error { - if len(args.FilePayload.Files) != len(args.FDBasedLinks) { - return fmt.Errorf("FilePayload must be same length at FDBasedLinks") + wantFDs := 0 + for _, l := range args.FDBasedLinks { + wantFDs += l.NumChannels + } + if got := len(args.FilePayload.Files); got != wantFDs { + return fmt.Errorf("args.FilePayload.Files has %d FD's but we need %d entries based on FDBasedLinks", got, wantFDs) } var nicID tcpip.NICID @@ -123,20 +132,26 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } } - for i, link := range args.FDBasedLinks { + fdOffset := 0 + for _, link := range args.FDBasedLinks { nicID++ nicids[link.Name] = nicID - // Copy the underlying FD. - oldFD := args.FilePayload.Files[i].Fd() - newFD, err := syscall.Dup(int(oldFD)) - if err != nil { - return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) + FDs := []int{} + for j := 0; j < link.NumChannels; j++ { + // Copy the underlying FD. + oldFD := args.FilePayload.Files[fdOffset].Fd() + newFD, err := syscall.Dup(int(oldFD)) + if err != nil { + return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) + } + FDs = append(FDs, newFD) + fdOffset++ } mac := tcpip.LinkAddress(link.LinkAddress) linkEP, err := fdbased.New(&fdbased.Options{ - FD: newFD, + FDs: FDs, MTU: uint32(link.MTU), EthernetHeader: true, Address: mac, @@ -148,7 +163,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct return err } - log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac) + log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil { return err } diff --git a/runsc/main.go b/runsc/main.go index 11bc73f75..44ad23cba 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -60,16 +60,16 @@ var ( straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs") // Flags that control sandbox runtime behavior. - platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") - network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") - gso = flag.Bool("gso", true, "enable generic segmenation offload") - fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") - overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") - watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") - panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") - profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") - netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.") - + platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") + network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") + gso = flag.Bool("gso", true, "enable generic segmenation offload") + fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") + overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") + watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") + panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") + profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") + netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.") + numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.") testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") ) @@ -141,6 +141,10 @@ func main() { cmd.Fatalf("%v", err) } + if *numNetworkChannels <= 0 { + cmd.Fatalf("num_network_channels must be > 0, got: %d", *numNetworkChannels) + } + // Create a new Config from the flags. conf := &boot.Config{ RootDir: *rootDir, @@ -162,6 +166,7 @@ func main() { ProfileEnable: *profile, EnableRaw: *netRaw, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, + NumNetworkChannels: *numNetworkChannels, } if len(*straceSyscalls) != 0 { conf.StraceSyscalls = strings.Split(*straceSyscalls, ",") diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 0460d5f1a..1fd091514 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -68,7 +68,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi // Build the path to the net namespace of the sandbox process. // This is what we will copy. nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") - if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO); err != nil { + if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO, conf.NumNetworkChannels); err != nil { return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err) } case boot.NetworkHost: @@ -138,7 +138,7 @@ func isRootNS() (bool, error) { // createInterfacesAndRoutesFromNS scrapes the interface and routes from the // net namespace with the given path, creates them in the sandbox, and removes // them from the host. -func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool) error { +func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool, numNetworkChannels int) error { // Join the network namespace that we will be copying. restore, err := joinNetNS(nsPath) if err != nil { @@ -202,25 +202,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO continue } - // Create the socket. - const protocol = 0x0300 // htons(ETH_P_ALL) - fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol) - if err != nil { - return fmt.Errorf("unable to create raw socket: %v", err) - } - deviceFile := os.NewFile(uintptr(fd), "raw-device-fd") - - // Bind to the appropriate device. - ll := syscall.SockaddrLinklayer{ - Protocol: protocol, - Ifindex: iface.Index, - Hatype: 0, // No ARP type. - Pkttype: syscall.PACKET_OTHERHOST, - } - if err := syscall.Bind(fd, &ll); err != nil { - return fmt.Errorf("unable to bind to %q: %v", iface.Name, err) - } - // Scrape the routes before removing the address, since that // will remove the routes as well. routes, def, err := routesForIface(iface) @@ -236,9 +217,10 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO } link := boot.FDBasedLink{ - Name: iface.Name, - MTU: iface.MTU, - Routes: routes, + Name: iface.Name, + MTU: iface.MTU, + Routes: routes, + NumChannels: numNetworkChannels, } // Get the link for the interface. @@ -248,30 +230,23 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO } link.LinkAddress = []byte(ifaceLink.Attrs().HardwareAddr) - if enableGSO { - gso, err := isGSOEnabled(fd, iface.Name) + log.Debugf("Setting up network channels") + // Create the socket for the device. + for i := 0; i < link.NumChannels; i++ { + log.Debugf("Creating Channel %d", i) + socketEntry, err := createSocket(iface, ifaceLink, enableGSO) if err != nil { - return fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err) + return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err) } - if gso { - if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil { - return fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err) - } - link.GSOMaxSize = ifaceLink.Attrs().GSOMaxSize + if i == 0 { + link.GSOMaxSize = socketEntry.gsoMaxSize } else { - log.Infof("GSO not available in host.") + if link.GSOMaxSize != socketEntry.gsoMaxSize { + return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s", + link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name) + } } - } - - // Use SO_RCVBUFFORCE because on linux the receive buffer for an - // AF_PACKET socket is capped by "net.core.rmem_max". rmem_max - // defaults to a unusually low value of 208KB. This is too low - // for gVisor to be able to receive packets at high throughputs - // without incurring packet drops. - const rcvBufSize = 4 << 20 // 4MB. - - if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil { - return fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err) + args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile) } // Collect the addresses for the interface, enable forwarding, @@ -285,7 +260,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO } } - args.FilePayload.Files = append(args.FilePayload.Files, deviceFile) args.FDBasedLinks = append(args.FDBasedLinks, link) } @@ -296,6 +270,61 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO return nil } +type socketEntry struct { + deviceFile *os.File + gsoMaxSize uint32 +} + +// createSocket creates an underlying AF_PACKET socket and configures it for use by +// the sentry and returns an *os.File that wraps the underlying socket fd. +func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) { + // Create the socket. + const protocol = 0x0300 // htons(ETH_P_ALL) + fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol) + if err != nil { + return nil, fmt.Errorf("unable to create raw socket: %v", err) + } + deviceFile := os.NewFile(uintptr(fd), "raw-device-fd") + // Bind to the appropriate device. + ll := syscall.SockaddrLinklayer{ + Protocol: protocol, + Ifindex: iface.Index, + Hatype: 0, // No ARP type. + Pkttype: syscall.PACKET_OTHERHOST, + } + if err := syscall.Bind(fd, &ll); err != nil { + return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err) + } + + gsoMaxSize := uint32(0) + if enableGSO { + gso, err := isGSOEnabled(fd, iface.Name) + if err != nil { + return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err) + } + if gso { + if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil { + return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err) + } + gsoMaxSize = ifaceLink.Attrs().GSOMaxSize + } else { + log.Infof("GSO not available in host.") + } + } + + // Use SO_RCVBUFFORCE because on linux the receive buffer for an + // AF_PACKET socket is capped by "net.core.rmem_max". rmem_max + // defaults to a unusually low value of 208KB. This is too low + // for gVisor to be able to receive packets at high throughputs + // without incurring packet drops. + const rcvBufSize = 4 << 20 // 4MB. + + if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil { + return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err) + } + return &socketEntry{deviceFile, gsoMaxSize}, nil +} + // loopbackLinks collects the links for a loopback interface. func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, error) { var links []boot.LoopbackLink diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 9efb1ba8e..727b648a6 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -136,6 +136,7 @@ func TestConfig() *boot.Config { Strace: true, FileAccess: boot.FileAccessExclusive, TestOnlyAllowRunAsCurrentUserWithoutChroot: true, + NumNetworkChannels: 1, } } -- cgit v1.2.3 From 847c4b9759c49cb30728579cfb0f4a69f1987b94 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 11 Jun 2019 14:30:34 -0700 Subject: Use net.HardwareAddr for FDBasedLink.LinkAddress It prints formatted to the log. PiperOrigin-RevId: 252699551 --- runsc/boot/network.go | 2 +- runsc/sandbox/network.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'runsc/sandbox') diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 82c259f47..d86803252 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -56,7 +56,7 @@ type FDBasedLink struct { Addresses []net.IP Routes []Route GSOMaxSize uint32 - LinkAddress []byte + LinkAddress net.HardwareAddr // NumChannels controls how many underlying FD's are to be used to // create this endpoint. diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 1fd091514..e9e24fc58 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -228,7 +228,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO if err != nil { return fmt.Errorf("getting link for interface %q: %v", iface.Name, err) } - link.LinkAddress = []byte(ifaceLink.Attrs().HardwareAddr) + link.LinkAddress = ifaceLink.Attrs().HardwareAddr log.Debugf("Setting up network channels") // Create the socket for the device. -- cgit v1.2.3 From 356d1be140bb51f2a50d2c7fe24242cbfeedc9d6 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 12 Jun 2019 09:40:50 -0700 Subject: Allow 'runsc do' to run without root '--rootless' flag lets a non-root user execute 'runsc do'. The drawback is that the sandbox and gofer processes will run as root inside a user namespace that is mapped to the caller's user, intead of nobody. And network is defaulted to '--network=host' inside the root network namespace. On the bright side, it's very convenient for testing: runsc --rootless do ls runsc --rootless do curl www.google.com PiperOrigin-RevId: 252840970 --- runsc/boot/config.go | 7 +++ runsc/cmd/boot.go | 22 +++++----- runsc/cmd/capability_test.go | 2 +- runsc/cmd/create.go | 8 +++- runsc/cmd/do.go | 39 ++++++++++++----- runsc/cmd/restore.go | 10 +++-- runsc/cmd/run.go | 8 +++- runsc/container/container_test.go | 3 +- runsc/main.go | 63 +++++++++++++++------------ runsc/sandbox/sandbox.go | 92 +++++++++++++++++++++++---------------- runsc/specutils/BUILD | 5 +-- runsc/specutils/namespace.go | 52 ++++++++++++++++++++++ runsc/test/testutil/BUILD | 1 - runsc/test/testutil/testutil.go | 50 --------------------- tools/run_tests.sh | 4 +- 15 files changed, 212 insertions(+), 154 deletions(-) (limited to 'runsc/sandbox') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 8564c502d..6112b6c0a 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -226,6 +226,12 @@ type Config struct { // to the same underlying network device. This allows netstack to better // scale for high throughput use cases. NumNetworkChannels int + + // Rootless allows the sandbox to be started with a user that is not root. + // Defense is depth measures are weaker with rootless. Specifically, the + // sandbox and Gofer process run as root inside a user namespace with root + // mapped to the caller's user. + Rootless bool } // ToFlags returns a slice of flags that correspond to the given Config. @@ -250,6 +256,7 @@ func (c *Config) ToFlags() []string { "--profile=" + strconv.FormatBool(c.ProfileEnable), "--net-raw=" + strconv.FormatBool(c.EnableRaw), "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels), + "--rootless=" + strconv.FormatBool(c.Rootless), } if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { // Only include if set since it is never to be used by users. diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 3a547d4aa..e0a950e9c 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -130,6 +130,8 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Ensure that if there is a panic, all goroutine stacks are printed. debug.SetTraceback("all") + conf := args[0].(*boot.Config) + if b.setUpRoot { if err := setUpChroot(b.pidns); err != nil { Fatalf("error setting up chroot: %v", err) @@ -143,14 +145,16 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) args = append(args, arg) } } - // Note that we've already read the spec from the spec FD, and - // we will read it again after the exec call. This works - // because the ReadSpecFromFile function seeks to the beginning - // of the file before reading. - if err := callSelfAsNobody(args); err != nil { - Fatalf("%v", err) + if !conf.Rootless { + // Note that we've already read the spec from the spec FD, and + // we will read it again after the exec call. This works + // because the ReadSpecFromFile function seeks to the beginning + // of the file before reading. + if err := callSelfAsNobody(args); err != nil { + Fatalf("%v", err) + } + panic("callSelfAsNobody must never return success") } - panic("callSelfAsNobody must never return success") } } @@ -163,9 +167,6 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } specutils.LogSpec(spec) - conf := args[0].(*boot.Config) - waitStatus := args[1].(*syscall.WaitStatus) - if b.applyCaps { caps := spec.Process.Capabilities if caps == nil { @@ -251,6 +252,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) ws := l.WaitExit() log.Infof("application exiting with %+v", ws) + waitStatus := args[1].(*syscall.WaitStatus) *waitStatus = syscall.WaitStatus(ws.Status()) l.Destroy() return subcommands.ExitSuccess diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go index ee74d33d8..2825dfaa5 100644 --- a/runsc/cmd/capability_test.go +++ b/runsc/cmd/capability_test.go @@ -116,6 +116,6 @@ func TestCapabilities(t *testing.T) { } func TestMain(m *testing.M) { - testutil.RunAsRoot() + specutils.MaybeRunAsRoot() os.Exit(m.Run()) } diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index 8bf9b7dcf..e82e8c667 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -82,13 +82,17 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} id := f.Arg(0) conf := args[0].(*boot.Config) + if conf.Rootless { + return Errorf("Rootless mode not supported with %q", c.Name()) + } + bundleDir := c.bundleDir if bundleDir == "" { bundleDir = getwdOrDie() } spec, err := specutils.ReadSpec(bundleDir) if err != nil { - Fatalf("reading spec: %v", err) + return Errorf("reading spec: %v", err) } specutils.LogSpec(spec) @@ -96,7 +100,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} // container unless the metadata specifies that it should be run in an // existing container. if _, err := container.Create(id, spec, conf, bundleDir, c.consoleSocket, c.pidFile, c.userLog); err != nil { - Fatalf("creating container: %v", err) + return Errorf("creating container: %v", err) } return subcommands.ExitSuccess } diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index 8ea59046c..3f6e46fce 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -39,10 +39,9 @@ import ( // Do implements subcommands.Command for the "do" command. It sets up a simple // sandbox and executes the command inside it. See Usage() for more details. type Do struct { - root string - cwd string - ip string - networkNamespace bool + root string + cwd string + ip string } // Name implements subcommands.Command.Name. @@ -72,7 +71,6 @@ func (c *Do) SetFlags(f *flag.FlagSet) { f.StringVar(&c.root, "root", "/", `path to the root directory, defaults to "/"`) f.StringVar(&c.cwd, "cwd", ".", "path to the current directory, defaults to the current directory") f.StringVar(&c.ip, "ip", "192.168.10.2", "IPv4 address for the sandbox") - f.BoolVar(&c.networkNamespace, "netns", true, "run in a new network namespace") } // Execute implements subcommands.Command.Execute. @@ -85,15 +83,21 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su conf := args[0].(*boot.Config) waitStatus := args[1].(*syscall.WaitStatus) - // Map the entire host file system, but make it readonly with a writable - // overlay on top (ignore --overlay option). - conf.Overlay = true + if conf.Rootless { + if err := specutils.MaybeRunAsRoot(); err != nil { + return Errorf("Error executing inside namespace: %v", err) + } + // Execution will continue here if no more capabilities are needed... + } hostname, err := os.Hostname() if err != nil { return Errorf("Error to retrieve hostname: %v", err) } + // Map the entire host file system, but make it readonly with a writable + // overlay on top (ignore --overlay option). + conf.Overlay = true absRoot, err := resolvePath(c.root) if err != nil { return Errorf("Error resolving root: %v", err) @@ -119,11 +123,22 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su specutils.LogSpec(spec) cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000)) - if !c.networkNamespace { - if conf.Network != boot.NetworkHost { - Fatalf("The current network namespace can be used only if --network=host is set", nil) + if conf.Network == boot.NetworkNone { + netns := specs.LinuxNamespace{ + Type: specs.NetworkNamespace, + } + if spec.Linux != nil { + panic("spec.Linux is not nil") } - } else if conf.Network != boot.NetworkNone { + spec.Linux = &specs.Linux{Namespaces: []specs.LinuxNamespace{netns}} + + } else if conf.Rootless { + if conf.Network == boot.NetworkSandbox { + fmt.Println("*** Rootless requires changing network type to host ***") + conf.Network = boot.NetworkHost + } + + } else { clean, err := c.setupNet(cid, spec) if err != nil { return Errorf("Error setting up network: %v", err) diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go index 3ab2f5676..a78a0dce6 100644 --- a/runsc/cmd/restore.go +++ b/runsc/cmd/restore.go @@ -80,25 +80,29 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{ conf := args[0].(*boot.Config) waitStatus := args[1].(*syscall.WaitStatus) + if conf.Rootless { + return Errorf("Rootless mode not supported with %q", r.Name()) + } + bundleDir := r.bundleDir if bundleDir == "" { bundleDir = getwdOrDie() } spec, err := specutils.ReadSpec(bundleDir) if err != nil { - Fatalf("reading spec: %v", err) + return Errorf("reading spec: %v", err) } specutils.LogSpec(spec) if r.imagePath == "" { - Fatalf("image-path flag must be provided") + return Errorf("image-path flag must be provided") } conf.RestoreFile = filepath.Join(r.imagePath, checkpointFileName) ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.userLog, r.detach) if err != nil { - Fatalf("running container: %v", err) + return Errorf("running container: %v", err) } *waitStatus = ws diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index c228b4f93..abf602239 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -67,19 +67,23 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s conf := args[0].(*boot.Config) waitStatus := args[1].(*syscall.WaitStatus) + if conf.Rootless { + return Errorf("Rootless mode not supported with %q", r.Name()) + } + bundleDir := r.bundleDir if bundleDir == "" { bundleDir = getwdOrDie() } spec, err := specutils.ReadSpec(bundleDir) if err != nil { - Fatalf("reading spec: %v", err) + return Errorf("reading spec: %v", err) } specutils.LogSpec(spec) ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.userLog, r.detach) if err != nil { - Fatalf("running container: %v", err) + return Errorf("running container: %v", err) } *waitStatus = ws diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 72c5ecbb0..867bf8187 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -36,6 +36,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/specutils" "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) @@ -1853,7 +1854,7 @@ func TestMain(m *testing.M) { if err := testutil.ConfigureExePath(); err != nil { panic(err.Error()) } - testutil.RunAsRoot() + specutils.MaybeRunAsRoot() os.Exit(m.Run()) } diff --git a/runsc/main.go b/runsc/main.go index a214f6ba0..cfe3a78d0 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -61,16 +61,19 @@ var ( straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs") // Flags that control sandbox runtime behavior. - platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") - network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") - gso = flag.Bool("gso", true, "enable generic segmenation offload") - fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") - overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") - watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") - panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") - profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") - netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.") - numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.") + platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") + network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") + gso = flag.Bool("gso", true, "enable generic segmenation offload") + fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") + overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") + watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") + panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") + profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") + netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.") + numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.") + rootless = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.") + + // Test flags, not to be used outside tests, ever. testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") ) @@ -166,26 +169,28 @@ func main() { // Create a new Config from the flags. conf := &boot.Config{ - RootDir: *rootDir, - Debug: *debug, - LogFilename: *logFilename, - LogFormat: *logFormat, - DebugLog: *debugLog, - DebugLogFormat: *debugLogFormat, - FileAccess: fsAccess, - Overlay: *overlay, - Network: netType, - GSO: *gso, - LogPackets: *logPackets, - Platform: platformType, - Strace: *strace, - StraceLogSize: *straceLogSize, - WatchdogAction: wa, - PanicSignal: *panicSignal, - ProfileEnable: *profile, - EnableRaw: *netRaw, + RootDir: *rootDir, + Debug: *debug, + LogFilename: *logFilename, + LogFormat: *logFormat, + DebugLog: *debugLog, + DebugLogFormat: *debugLogFormat, + FileAccess: fsAccess, + Overlay: *overlay, + Network: netType, + GSO: *gso, + LogPackets: *logPackets, + Platform: platformType, + Strace: *strace, + StraceLogSize: *straceLogSize, + WatchdogAction: wa, + PanicSignal: *panicSignal, + ProfileEnable: *profile, + EnableRaw: *netRaw, + NumNetworkChannels: *numNetworkChannels, + Rootless: *rootless, + TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, - NumNetworkChannels: *numNetworkChannels, } if len(*straceSyscalls) != 0 { conf.StraceSyscalls = strings.Split(*straceSyscalls, ",") diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 032190636..5ff6f879c 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -515,46 +515,64 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } else if specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) { log.Infof("Sandbox will be started in new user namespace") nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) + cmd.Args = append(cmd.Args, "--setup-root") - // Map nobody in the new namespace to nobody in the parent namespace. - // - // A sandbox process will construct an empty - // root for itself, so it has to have the CAP_SYS_ADMIN - // capability. - // - // FIXME(b/122554829): The current implementations of - // os/exec doesn't allow to set ambient capabilities if - // a process is started in a new user namespace. As a - // workaround, we start the sandbox process with the 0 - // UID and then it constructs a chroot and sets UID to - // nobody. https://github.com/golang/go/issues/2315 - const nobody = 65534 - cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{ - { - ContainerID: int(0), - HostID: int(nobody - 1), - Size: int(1), - }, - { - ContainerID: int(nobody), - HostID: int(nobody), - Size: int(1), - }, - } - cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ - { - ContainerID: int(nobody), - HostID: int(nobody), - Size: int(1), - }, + if conf.Rootless { + log.Infof("Rootless mode: sandbox will run as root inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid()) + cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{ + { + ContainerID: 0, + HostID: os.Getuid(), + Size: 1, + }, + } + cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ + { + ContainerID: 0, + HostID: os.Getgid(), + Size: 1, + }, + } + cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0} + + } else { + // Map nobody in the new namespace to nobody in the parent namespace. + // + // A sandbox process will construct an empty + // root for itself, so it has to have the CAP_SYS_ADMIN + // capability. + // + // FIXME(b/122554829): The current implementations of + // os/exec doesn't allow to set ambient capabilities if + // a process is started in a new user namespace. As a + // workaround, we start the sandbox process with the 0 + // UID and then it constructs a chroot and sets UID to + // nobody. https://github.com/golang/go/issues/2315 + const nobody = 65534 + cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{ + { + ContainerID: 0, + HostID: nobody - 1, + Size: 1, + }, + { + ContainerID: nobody, + HostID: nobody, + Size: 1, + }, + } + cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ + { + ContainerID: nobody, + HostID: nobody, + Size: 1, + }, + } + + // Set credentials to run as user and group nobody. + cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: nobody} } - // Set credentials to run as user and group nobody. - cmd.SysProcAttr.Credential = &syscall.Credential{ - Uid: 0, - Gid: nobody, - } - cmd.Args = append(cmd.Args, "--setup-root") } else { return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID") } diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index 15476de6f..0456e4c4f 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -10,10 +10,7 @@ go_library( "specutils.go", ], importpath = "gvisor.googlesource.com/gvisor/runsc/specutils", - visibility = [ - "//runsc:__subpackages__", - "//test:__subpackages__", - ], + visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", "//pkg/log", diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go index 7d194335c..06c13d1ab 100644 --- a/runsc/specutils/namespace.go +++ b/runsc/specutils/namespace.go @@ -220,3 +220,55 @@ func HasCapabilities(cs ...capability.Cap) bool { } return true } + +// MaybeRunAsRoot ensures the process runs with capabilities needed to create a +// sandbox, e.g. CAP_SYS_ADMIN, CAP_SYS_CHROOT, etc. If capabilities are needed, +// it will create a new user namespace and re-execute the process as root +// inside the namespace with the same arguments and environment. +// +// This function returns immediately when no new capability is needed. If +// another process is executed, it returns straight from here with the same exit +// code as the child. +func MaybeRunAsRoot() error { + if HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT, capability.CAP_SETUID, capability.CAP_SETGID) { + return nil + } + + // Current process doesn't have required capabilities, create user namespace + // and run as root inside the namespace to acquire capabilities. + log.Infof("*** Re-running as root in new user namespace ***") + + cmd := exec.Command("/proc/self/exe", os.Args[1:]...) + + cmd.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS, + // Set current user/group as root inside the namespace. Since we may not + // have CAP_SETUID/CAP_SETGID, just map root to the current user/group. + UidMappings: []syscall.SysProcIDMap{ + {ContainerID: 0, HostID: os.Getuid(), Size: 1}, + }, + GidMappings: []syscall.SysProcIDMap{ + {ContainerID: 0, HostID: os.Getgid(), Size: 1}, + }, + Credential: &syscall.Credential{Uid: 0, Gid: 0}, + GidMappingsEnableSetgroups: false, + } + + cmd.Env = os.Environ() + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + if exit, ok := err.(*exec.ExitError); ok { + if ws, ok := exit.Sys().(syscall.WaitStatus); ok { + os.Exit(ws.ExitStatus()) + } + log.Warningf("No wait status provided, exiting with -1: %v", err) + os.Exit(-1) + } + return fmt.Errorf("re-executing self: %v", err) + } + // Child completed with success. + os.Exit(0) + panic("unreachable") +} diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index ddec81444..eedf962a4 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -18,6 +18,5 @@ go_library( "@com_github_cenkalti_backoff//:go_default_library", "@com_github_kr_pty//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", - "@com_github_syndtr_gocapability//capability:go_default_library", ], ) diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 727b648a6..1bd5adc54 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -30,7 +30,6 @@ import ( "os/exec" "os/signal" "path/filepath" - "runtime" "strings" "sync" "sync/atomic" @@ -39,7 +38,6 @@ import ( "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" - "github.com/syndtr/gocapability/capability" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -284,54 +282,6 @@ func WaitForHTTP(port int, timeout time.Duration) error { return Poll(cb, timeout) } -// RunAsRoot ensures the test runs with CAP_SYS_ADMIN and CAP_SYS_CHROOT. If -// needed it will create a new user namespace and re-execute the test as root -// inside of the namespace. This function returns when it's running as root. If -// it needs to create another process, it will exit from there and not return. -func RunAsRoot() { - if specutils.HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT) { - return - } - - fmt.Println("*** Re-running test as root in new user namespace ***") - - // Current process doesn't have CAP_SYS_ADMIN, create user namespace and run - // as root inside that namespace to get it. - runtime.LockOSThread() - defer runtime.UnlockOSThread() - - cmd := exec.Command("/proc/self/exe", os.Args[1:]...) - cmd.SysProcAttr = &syscall.SysProcAttr{ - Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS, - // Set current user/group as root inside the namespace. - UidMappings: []syscall.SysProcIDMap{ - {ContainerID: 0, HostID: os.Getuid(), Size: 1}, - }, - GidMappings: []syscall.SysProcIDMap{ - {ContainerID: 0, HostID: os.Getgid(), Size: 1}, - }, - GidMappingsEnableSetgroups: false, - Credential: &syscall.Credential{ - Uid: 0, - Gid: 0, - }, - } - cmd.Env = os.Environ() - cmd.Stdin = os.Stdin - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - if err := cmd.Run(); err != nil { - if exit, ok := err.(*exec.ExitError); ok { - if ws, ok := exit.Sys().(syscall.WaitStatus); ok { - os.Exit(ws.ExitStatus()) - } - os.Exit(-1) - } - panic(fmt.Sprint("error running child process:", err.Error())) - } - os.Exit(0) -} - // Reaper reaps child processes. type Reaper struct { // mu protects ch, which will be nil if the reaper is not running. diff --git a/tools/run_tests.sh b/tools/run_tests.sh index 8874794fd..7a1f889dd 100755 --- a/tools/run_tests.sh +++ b/tools/run_tests.sh @@ -212,8 +212,8 @@ run_runsc_do_tests() { local runsc=$(find bazel-bin/runsc -type f -executable -name "runsc" | head -n1) # run runsc do without root privileges. - unshare -Ur ${runsc} --network=none --TESTONLY-unsafe-nonroot do true - unshare -Ur ${runsc} --TESTONLY-unsafe-nonroot --network=host do --netns=false true + ${runsc} --rootless do true + ${runsc} --rootless --network=none do true # run runsc do with root privileges. sudo -n -E ${runsc} do true -- cgit v1.2.3