From c9c52c024cf20c1c66327171af4287129724326e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 26 Aug 2019 09:58:16 -0700 Subject: fsgofer_test.go: a socket file path can't be longer than UNIX_MAX_PATH (108) PiperOrigin-RevId: 265478578 --- runsc/fsgofer/fsgofer_test.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'runsc/fsgofer') diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index c86beaef1..cbbe71019 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -635,7 +635,15 @@ func TestAttachInvalidType(t *testing.T) { t.Fatalf("Mkfifo(%q): %v", fifo, err) } - socket := filepath.Join(dir, "socket") + dirFile, err := os.Open(dir) + if err != nil { + t.Fatalf("Open(%s): %v", dir, err) + } + defer dirFile.Close() + + // Bind a socket via /proc to be sure that a length of a socket path + // is less than UNIX_PATH_MAX. + socket := filepath.Join(fmt.Sprintf("/proc/self/fd/%d", dirFile.Fd()), "socket") l, err := net.Listen("unix", socket) if err != nil { t.Fatalf("net.Listen(unix, %q): %v", socket, err) -- cgit v1.2.3 From c319b360d134cff66000fd036fce8b3816c296ea Mon Sep 17 00:00:00 2001 From: Robert Tonic Date: Mon, 29 Jul 2019 14:57:14 -0700 Subject: First pass at implementing Unix Domain Socket support. No tests. This commit adds support for detecting the socket file type, connecting to a Unix Domain Socket, and providing bidirectional communication (without file descriptor transfer support). --- pkg/fd/fd.go | 16 +++++++++ runsc/fsgofer/filter/config.go | 28 ++++++++++++++++ runsc/fsgofer/fsgofer.go | 73 ++++++++++++++++++++++++++++++------------ 3 files changed, 96 insertions(+), 21 deletions(-) (limited to 'runsc/fsgofer') diff --git a/pkg/fd/fd.go b/pkg/fd/fd.go index 83bcfe220..c3acd0fe2 100644 --- a/pkg/fd/fd.go +++ b/pkg/fd/fd.go @@ -22,6 +22,7 @@ import ( "runtime" "sync/atomic" "syscall" + "net" ) // ReadWriter implements io.ReadWriter, io.ReaderAt, and io.WriterAt for fd. It @@ -185,6 +186,21 @@ func OpenAt(dir *FD, path string, flags int, mode uint32) (*FD, error) { return New(f), nil } +// OpenUnix TODO: DOC +func OpenUnix(path string) (*FD, error) { + addr, _ := net.ResolveUnixAddr("unix", path) + f, err := net.DialUnix("unix", nil, addr); if err != nil { + return nil, fmt.Errorf("unable to open socket: %q, err: %v", path, err) + } + fConnd, err := f.File(); if err != nil { + return nil, fmt.Errorf("unable to convert to os.File: %q, err: %v", path, err) + } + fdConnd, err := NewFromFile(fConnd); if err != nil { + return nil, fmt.Errorf("unable to convert os.File to fd.FD: %q, err: %v", path, err) + } + return fdConnd, nil +} + // Close closes the file descriptor contained in the FD. // // Close is safe to call multiple times, but will return an error after the diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 8ddfa77d6..71f387bd0 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -26,6 +26,31 @@ import ( // allowedSyscalls is the set of syscalls executed by the gofer. var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_ACCEPT: {}, + syscall.SYS_SOCKET: []seccomp.Rule{ + { + seccomp.AllowValue(syscall.AF_UNIX), + }, + }, + syscall.SYS_CONNECT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + }, + }, + syscall.SYS_SETSOCKOPT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + }, + }, + syscall.SYS_GETSOCKNAME: []seccomp.Rule{ + { + seccomp.AllowAny{}, + }, + }, + syscall.SYS_GETPEERNAME: []seccomp.Rule{ + { + seccomp.AllowAny{}, + }, + }, syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ {seccomp.AllowValue(linux.ARCH_GET_FS)}, {seccomp.AllowValue(linux.ARCH_SET_FS)}, @@ -83,6 +108,9 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowAny{}, seccomp.AllowValue(syscall.F_GETFD), }, + { + seccomp.AllowAny{}, + }, }, syscall.SYS_FSTAT: {}, syscall.SYS_FSTATFS: {}, diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 7c4d2b94e..3a78d20a3 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -54,6 +54,7 @@ const ( regular fileType = iota directory symlink + socket unknown ) @@ -66,6 +67,8 @@ func (f fileType) String() string { return "directory" case symlink: return "symlink" + case socket: + return "socket" } return "unknown" } @@ -124,30 +127,56 @@ func (a *attachPoint) Attach() (p9.File, error) { if err != nil { return nil, fmt.Errorf("stat file %q, err: %v", a.prefix, err) } - mode := syscall.O_RDWR - if a.conf.ROMount || (stat.Mode&syscall.S_IFMT) == syscall.S_IFDIR { - mode = syscall.O_RDONLY - } - // Open the root directory. - f, err := fd.Open(a.prefix, openFlags|mode, 0) - if err != nil { - return nil, fmt.Errorf("unable to open file %q, err: %v", a.prefix, err) - } + // Apply the S_IFMT bitmask so we can detect file type appropriately + fmtStat := stat.Mode & syscall.S_IFMT - a.attachedMu.Lock() - defer a.attachedMu.Unlock() - if a.attached { - f.Close() - return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) - } + switch fmtStat{ + case syscall.S_IFSOCK: + // Attempt to open a connection. Bubble up the failures. + f, err := fd.OpenUnix(a.prefix); if err != nil { + return nil, err + } - rv, err := newLocalFile(a, f, a.prefix, stat) - if err != nil { - return nil, err + // Close the connection if the UDS is already attached. + a.attachedMu.Lock() + defer a.attachedMu.Unlock() + if a.attached { + f.Close() + return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) + } + a.attached = true + + // Return a localFile object to the caller with the UDS FD included. + return newLocalFile(a, f, a.prefix, stat) + + default: + // Default to Read/Write permissions. + mode := syscall.O_RDWR + // If the configuration is Read Only & the mount point is a directory, + // set the mode to Read Only. + if a.conf.ROMount || fmtStat == syscall.S_IFDIR { + mode = syscall.O_RDONLY + } + + // Open the mount point & capture the FD. + f, err := fd.Open(a.prefix, openFlags|mode, 0) + if err != nil { + return nil, fmt.Errorf("unable to open file %q, err: %v", a.prefix, err) + } + + // If the mount point has already been attached, close the FD. + a.attachedMu.Lock() + defer a.attachedMu.Unlock() + if a.attached { + f.Close() + return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) + } + a.attached = true + + // Return a localFile object to the caller with the mount point FD + return newLocalFile(a, f, a.prefix, stat) } - a.attached = true - return rv, nil } // makeQID returns a unique QID for the given stat buffer. @@ -304,6 +333,8 @@ func getSupportedFileType(stat syscall.Stat_t) (fileType, error) { ft = directory case syscall.S_IFLNK: ft = symlink + case syscall.S_IFSOCK: + ft = socket default: return unknown, syscall.EPERM } @@ -1026,7 +1057,7 @@ func (l *localFile) Flush() error { // Connect implements p9.File. func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) { - return nil, syscall.ECONNREFUSED + return fd.OpenUnix(l.hostPath) } // Close implements p9.File. -- cgit v1.2.3 From 07d329d89f25e4649731199c3025f4fa0ed52bdb Mon Sep 17 00:00:00 2001 From: Robert Tonic Date: Tue, 30 Jul 2019 14:58:26 -0700 Subject: Restrict seccomp filters for UDS support. This commit further restricts the seccomp filters required for Gofer access ot Unix Domain Sockets (UDS). --- runsc/fsgofer/filter/config.go | 3 +++ 1 file changed, 3 insertions(+) (limited to 'runsc/fsgofer') diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 71f387bd0..c058c433b 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -39,6 +39,8 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_SETSOCKOPT: []seccomp.Rule{ { seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_BROADCAST), }, }, syscall.SYS_GETSOCKNAME: []seccomp.Rule{ @@ -110,6 +112,7 @@ var allowedSyscalls = seccomp.SyscallRules{ }, { seccomp.AllowAny{}, + seccomp.AllowValue(syscall.F_DUPFD_CLOEXEC), }, }, syscall.SYS_FSTAT: {}, -- cgit v1.2.3 From 4288a578832eae652f73ae411ece2d4b6590062e Mon Sep 17 00:00:00 2001 From: Robert Tonic Date: Thu, 5 Sep 2019 15:26:16 -0400 Subject: Remove seccomp permissions, and clean up the Attach logic. --- runsc/fsgofer/filter/config.go | 17 --------------- runsc/fsgofer/fsgofer.go | 49 +++++++++++++++++------------------------- 2 files changed, 20 insertions(+), 46 deletions(-) (limited to 'runsc/fsgofer') diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index c058c433b..73407383d 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -36,23 +36,6 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowAny{}, }, }, - syscall.SYS_SETSOCKOPT: []seccomp.Rule{ - { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_BROADCAST), - }, - }, - syscall.SYS_GETSOCKNAME: []seccomp.Rule{ - { - seccomp.AllowAny{}, - }, - }, - syscall.SYS_GETPEERNAME: []seccomp.Rule{ - { - seccomp.AllowAny{}, - }, - }, syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ {seccomp.AllowValue(linux.ARCH_GET_FS)}, {seccomp.AllowValue(linux.ARCH_SET_FS)}, diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 3a78d20a3..e3b926798 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -128,31 +128,22 @@ func (a *attachPoint) Attach() (p9.File, error) { return nil, fmt.Errorf("stat file %q, err: %v", a.prefix, err) } - // Apply the S_IFMT bitmask so we can detect file type appropriately - fmtStat := stat.Mode & syscall.S_IFMT + // Hold the file descriptor we are converting into a p9.File + var f *fd.FD - switch fmtStat{ - case syscall.S_IFSOCK: + // Apply the S_IFMT bitmask so we can detect file type appropriately + switch fmtStat := stat.Mode & syscall.S_IFMT; { + case fmtStat == syscall.S_IFSOCK: // Attempt to open a connection. Bubble up the failures. - f, err := fd.OpenUnix(a.prefix); if err != nil { + f, err = fd.OpenUnix(a.prefix) + if err != nil { return nil, err } - // Close the connection if the UDS is already attached. - a.attachedMu.Lock() - defer a.attachedMu.Unlock() - if a.attached { - f.Close() - return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) - } - a.attached = true - - // Return a localFile object to the caller with the UDS FD included. - return newLocalFile(a, f, a.prefix, stat) - default: // Default to Read/Write permissions. mode := syscall.O_RDWR + // If the configuration is Read Only & the mount point is a directory, // set the mode to Read Only. if a.conf.ROMount || fmtStat == syscall.S_IFDIR { @@ -160,23 +151,23 @@ func (a *attachPoint) Attach() (p9.File, error) { } // Open the mount point & capture the FD. - f, err := fd.Open(a.prefix, openFlags|mode, 0) + f, err = fd.Open(a.prefix, openFlags|mode, 0) if err != nil { return nil, fmt.Errorf("unable to open file %q, err: %v", a.prefix, err) } + } - // If the mount point has already been attached, close the FD. - a.attachedMu.Lock() - defer a.attachedMu.Unlock() - if a.attached { - f.Close() - return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) - } - a.attached = true - - // Return a localFile object to the caller with the mount point FD - return newLocalFile(a, f, a.prefix, stat) + // Close the connection if the UDS is already attached. + a.attachedMu.Lock() + defer a.attachedMu.Unlock() + if a.attached { + f.Close() + return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) } + a.attached = true + + // Return a localFile object to the caller with the UDS FD included. + return newLocalFile(a, f, a.prefix, stat) } // makeQID returns a unique QID for the given stat buffer. -- cgit v1.2.3 From c2ae77a607b6e103545aa83e8f2c7c5bf649846f Mon Sep 17 00:00:00 2001 From: Robert Tonic Date: Thu, 5 Sep 2019 17:12:09 -0400 Subject: Apply go fmt to the fsgofer changes. --- runsc/fsgofer/fsgofer.go | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'runsc/fsgofer') diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index e3b926798..89171c811 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -131,30 +131,30 @@ func (a *attachPoint) Attach() (p9.File, error) { // Hold the file descriptor we are converting into a p9.File var f *fd.FD - // Apply the S_IFMT bitmask so we can detect file type appropriately + // Apply the S_IFMT bitmask so we can detect file type appropriately switch fmtStat := stat.Mode & syscall.S_IFMT; { case fmtStat == syscall.S_IFSOCK: - // Attempt to open a connection. Bubble up the failures. - f, err = fd.OpenUnix(a.prefix) - if err != nil { - return nil, err - } + // Attempt to open a connection. Bubble up the failures. + f, err = fd.OpenUnix(a.prefix) + if err != nil { + return nil, err + } - default: - // Default to Read/Write permissions. - mode := syscall.O_RDWR + default: + // Default to Read/Write permissions. + mode := syscall.O_RDWR - // If the configuration is Read Only & the mount point is a directory, - // set the mode to Read Only. - if a.conf.ROMount || fmtStat == syscall.S_IFDIR { - mode = syscall.O_RDONLY - } + // If the configuration is Read Only & the mount point is a directory, + // set the mode to Read Only. + if a.conf.ROMount || fmtStat == syscall.S_IFDIR { + mode = syscall.O_RDONLY + } - // Open the mount point & capture the FD. - f, err = fd.Open(a.prefix, openFlags|mode, 0) - if err != nil { - return nil, fmt.Errorf("unable to open file %q, err: %v", a.prefix, err) - } + // Open the mount point & capture the FD. + f, err = fd.Open(a.prefix, openFlags|mode, 0) + if err != nil { + return nil, fmt.Errorf("unable to open file %q, err: %v", a.prefix, err) + } } // Close the connection if the UDS is already attached. -- cgit v1.2.3 From a8834fc555539bd6b0b46936c4a79817812658ff Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Thu, 12 Sep 2019 23:36:18 -0700 Subject: Update p9 to support flipcall. PiperOrigin-RevId: 268845090 --- pkg/p9/BUILD | 3 + pkg/p9/client.go | 280 ++++++++++++++++++++++++++++++++++++++--- pkg/p9/client_test.go | 50 +++++++- pkg/p9/handlers.go | 53 +++++++- pkg/p9/messages.go | 103 +++++++++++---- pkg/p9/p9.go | 2 + pkg/p9/p9test/p9test.go | 2 +- pkg/p9/server.go | 178 +++++++++++++++++++++----- pkg/p9/transport.go | 5 +- pkg/p9/transport_flipcall.go | 254 +++++++++++++++++++++++++++++++++++++ pkg/p9/transport_test.go | 4 +- pkg/p9/version.go | 9 +- runsc/boot/filter/config.go | 14 ++- runsc/fsgofer/filter/BUILD | 1 + runsc/fsgofer/filter/config.go | 36 +++++- 15 files changed, 905 insertions(+), 89 deletions(-) create mode 100644 pkg/p9/transport_flipcall.go (limited to 'runsc/fsgofer') diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD index 6bc4d3bc7..f32244c69 100644 --- a/pkg/p9/BUILD +++ b/pkg/p9/BUILD @@ -20,11 +20,14 @@ go_library( "pool.go", "server.go", "transport.go", + "transport_flipcall.go", "version.go", ], importpath = "gvisor.dev/gvisor/pkg/p9", deps = [ "//pkg/fd", + "//pkg/fdchannel", + "//pkg/flipcall", "//pkg/log", "//pkg/unet", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/p9/client.go b/pkg/p9/client.go index 7dc20aeef..123f54e29 100644 --- a/pkg/p9/client.go +++ b/pkg/p9/client.go @@ -20,6 +20,8 @@ import ( "sync" "syscall" + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/flipcall" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/unet" ) @@ -77,6 +79,45 @@ type Client struct { // fidPool is the collection of available fids. fidPool pool + // messageSize is the maximum total size of a message. + messageSize uint32 + + // payloadSize is the maximum payload size of a read or write. + // + // For large reads and writes this means that the read or write is + // broken up into buffer-size/payloadSize requests. + payloadSize uint32 + + // version is the agreed upon version X of 9P2000.L.Google.X. + // version 0 implies 9P2000.L. + version uint32 + + // sendRecv is the transport function. + // + // This is determined dynamically based on whether or not the server + // supports flipcall channels (preferred as it is faster and more + // efficient, and does not require tags). + sendRecv func(message, message) error + + // -- below corresponds to sendRecvChannel -- + + // channelsMu protects channels. + channelsMu sync.Mutex + + // channelsWg is a wait group for active clients. + channelsWg sync.WaitGroup + + // channels are the set of initialized IPCs channels. + channels []*channel + + // inuse is set when the channels are actually in use. + // + // This is a fixed-size slice, and the entries will be nil when the + // corresponding channel is available. + inuse []*channel + + // -- below corresponds to sendRecvLegacy -- + // pending is the set of pending messages. pending map[Tag]*response pendingMu sync.Mutex @@ -89,19 +130,6 @@ type Client struct { // Whoever writes to this channel is permitted to call recv. When // finished calling recv, this channel should be emptied. recvr chan bool - - // messageSize is the maximum total size of a message. - messageSize uint32 - - // payloadSize is the maximum payload size of a read or write - // request. For large reads and writes this means that the - // read or write is broken up into buffer-size/payloadSize - // requests. - payloadSize uint32 - - // version is the agreed upon version X of 9P2000.L.Google.X. - // version 0 implies 9P2000.L. - version uint32 } // NewClient creates a new client. It performs a Tversion exchange with @@ -138,8 +166,15 @@ func NewClient(socket *unet.Socket, messageSize uint32, version string) (*Client return nil, ErrBadVersionString } for { + // Always exchange the version using the legacy version of the + // protocol. If the protocol supports flipcall, then we switch + // our sendRecv function to use that functionality. Otherwise, + // we stick to sendRecvLegacy. rversion := Rversion{} - err := c.sendRecv(&Tversion{Version: versionString(requested), MSize: messageSize}, &rversion) + err := c.sendRecvLegacy(&Tversion{ + Version: versionString(requested), + MSize: messageSize, + }, &rversion) // The server told us to try again with a lower version. if err == syscall.EAGAIN { @@ -165,9 +200,125 @@ func NewClient(socket *unet.Socket, messageSize uint32, version string) (*Client c.version = version break } + + // Can we switch to use the more advanced channels and create + // independent channels for communication? Prefer it if possible. + if versionSupportsFlipcall(c.version) { + // Attempt to initialize IPC-based communication. + for i := 0; i < channelsPerClient; i++ { + if err := c.openChannel(i); err != nil { + log.Warningf("error opening flipcall channel: %v", err) + break // Stop. + } + } + if len(c.channels) >= 1 { + // At least one channel created. + c.sendRecv = c.sendRecvChannel + + // If we are using channels for communication, then we must poll + // for shutdown events on the main socket. If the socket happens + // to shutdown, then we will close the channels as well. This is + // necessary because channels can hang forever if the server dies + // while we're expecting a response. + go c.watch(socket) // S/R-SAFE: not relevant. + } else { + // Channel setup failed; fallback. + c.sendRecv = c.sendRecvLegacy + } + } else { + // No channels available: use the legacy mechanism. + c.sendRecv = c.sendRecvLegacy + } + return c, nil } +// watch watches the given socket and calls Close on hang up events. +// +// This is intended to be called as a goroutine. +func (c *Client) watch(socket *unet.Socket) { + events := []unix.PollFd{ + unix.PollFd{ + Fd: int32(socket.FD()), + Events: unix.POLLHUP | unix.POLLRDHUP, + }, + } + + for { + // Wait for a shutdown event. + n, err := unix.Ppoll(events, nil, nil) + if n == 0 || err == syscall.EAGAIN { + continue + } + break + } + + // Close everything down: this will kick all active clients off any + // pending requests. Note that Close must be safe to call concurrently, + // and multiple times (see Close below). + c.Close() +} + +// openChannel attempts to open a client channel. +// +// Note that this function returns naked errors which should not be propagated +// directly to a caller. It is expected that the errors will be logged and a +// fallback path will be used instead. +func (c *Client) openChannel(id int) error { + var ( + rchannel0 Rchannel + rchannel1 Rchannel + res = new(channel) + ) + + // Open the data channel. + if err := c.sendRecvLegacy(&Tchannel{ + ID: uint32(id), + Control: 0, + }, &rchannel0); err != nil { + return fmt.Errorf("error handling Tchannel message: %v", err) + } + if rchannel0.FilePayload() == nil { + return fmt.Errorf("missing file descriptor on primary channel") + } + + // We don't need to hold this. + defer rchannel0.FilePayload().Close() + + // Open the channel for file descriptors. + if err := c.sendRecvLegacy(&Tchannel{ + ID: uint32(id), + Control: 1, + }, &rchannel1); err != nil { + return err + } + if rchannel1.FilePayload() == nil { + return fmt.Errorf("missing file descriptor on file descriptor channel") + } + + // Construct the endpoints. + res.desc = flipcall.PacketWindowDescriptor{ + FD: rchannel0.FilePayload().FD(), + Offset: int64(rchannel0.Offset), + Length: int(rchannel0.Length), + } + if err := res.data.Init(flipcall.ClientSide, res.desc); err != nil { + rchannel1.FilePayload().Close() + return err + } + + // The fds channel owns the control payload, and it will be closed when + // the channel object is closed. + res.fds.Init(rchannel1.FilePayload().Release()) + + // Save the channel. + c.channelsMu.Lock() + defer c.channelsMu.Unlock() + c.channels = append(c.channels, res) + c.inuse = append(c.inuse, nil) + return nil +} + // handleOne handles a single incoming message. // // This should only be called with the token from recvr. Note that the received @@ -247,10 +398,10 @@ func (c *Client) waitAndRecv(done chan error) error { } } -// sendRecv performs a roundtrip message exchange. +// sendRecvLegacy performs a roundtrip message exchange. // // This is called by internal functions. -func (c *Client) sendRecv(t message, r message) error { +func (c *Client) sendRecvLegacy(t message, r message) error { tag, ok := c.tagPool.Get() if !ok { return ErrOutOfTags @@ -296,12 +447,107 @@ func (c *Client) sendRecv(t message, r message) error { return nil } +// sendRecvChannel uses channels to send a message. +func (c *Client) sendRecvChannel(t message, r message) error { + c.channelsMu.Lock() + if len(c.channels) == 0 { + // No channel available. + c.channelsMu.Unlock() + return c.sendRecvLegacy(t, r) + } + + // Find the last used channel. + // + // Note that we must add one to the wait group while holding the + // channel mutex, in order for the Wait operation to be race-free + // below. The Wait operation shuts down all in use channels and + // waits for them to return, but must do so holding the mutex. + idx := len(c.channels) - 1 + ch := c.channels[idx] + c.channels = c.channels[:idx] + c.inuse[idx] = ch + c.channelsWg.Add(1) + c.channelsMu.Unlock() + + // Ensure that it's connected. + if !ch.connected { + ch.connected = true + if err := ch.data.Connect(); err != nil { + // The channel is unusable, so don't return it. + ch.Close() + c.channelsWg.Done() + return err + } + } + + // Send the message. + err := ch.sendRecv(c, t, r) + if err != nil { + // On shutdown, we'll see ENOENT. This is a normal situation, and + // we shouldn't generate a spurious warning message in that case. + log.Debugf("error calling sendRecvChannel: %v", err) + } + c.channelsWg.Done() + + // Return the channel. + // + // Note that we check the channel from the inuse slice here. This + // prevents a race where Close is called, which clears inuse, and + // means that we will not actually return the closed channel. + c.channelsMu.Lock() + if c.inuse[idx] != nil { + c.channels = append(c.channels, ch) + c.inuse[idx] = nil + } + c.channelsMu.Unlock() + + return err +} + // Version returns the negotiated 9P2000.L.Google version number. func (c *Client) Version() uint32 { return c.version } -// Close closes the underlying socket. +// Close closes the underlying socket and channels. +// +// Because Close may be called asynchronously from watch, it must be +// safe to call concurrently and multiple times. func (c *Client) Close() error { + c.channelsMu.Lock() + defer c.channelsMu.Unlock() + + // Close all inactive channels. + for _, ch := range c.channels { + ch.Shutdown() + ch.Close() + } + // Close all active channels. + for _, ch := range c.inuse { + if ch != nil { + log.Debugf("shutting down active channel@%p...", ch) + ch.Shutdown() + } + } + + // Wait for active users. + c.channelsWg.Wait() + + // Close all previously active channels. + for i, ch := range c.inuse { + if ch != nil { + ch.Close() + + // Clear the inuse entry here so that it will not be returned + // to the channel slice, which is cleared below. See the + // comment at the end of sendRecvChannel. + c.inuse[i] = nil + } + } + c.channels = nil // Prevent use again. + + // Close the main socket. Note that operation is safe to be called + // multiple times, unlikely the channel Close operations above, which + // we are careful to ensure aren't called twice. return c.socket.Close() } diff --git a/pkg/p9/client_test.go b/pkg/p9/client_test.go index 87b2dd61e..29a0afadf 100644 --- a/pkg/p9/client_test.go +++ b/pkg/p9/client_test.go @@ -35,23 +35,23 @@ func TestVersion(t *testing.T) { go s.Handle(serverSocket) // NewClient does a Tversion exchange, so this is our test for success. - c, err := NewClient(clientSocket, 1024*1024 /* 1M message size */, HighestVersionString()) + c, err := NewClient(clientSocket, DefaultMessageSize, HighestVersionString()) if err != nil { t.Fatalf("got %v, expected nil", err) } // Check a bogus version string. - if err := c.sendRecv(&Tversion{Version: "notokay", MSize: 1024 * 1024}, &Rversion{}); err != syscall.EINVAL { + if err := c.sendRecv(&Tversion{Version: "notokay", MSize: DefaultMessageSize}, &Rversion{}); err != syscall.EINVAL { t.Errorf("got %v expected %v", err, syscall.EINVAL) } // Check a bogus version number. - if err := c.sendRecv(&Tversion{Version: "9P1000.L", MSize: 1024 * 1024}, &Rversion{}); err != syscall.EINVAL { + if err := c.sendRecv(&Tversion{Version: "9P1000.L", MSize: DefaultMessageSize}, &Rversion{}); err != syscall.EINVAL { t.Errorf("got %v expected %v", err, syscall.EINVAL) } // Check a too high version number. - if err := c.sendRecv(&Tversion{Version: versionString(highestSupportedVersion + 1), MSize: 1024 * 1024}, &Rversion{}); err != syscall.EAGAIN { + if err := c.sendRecv(&Tversion{Version: versionString(highestSupportedVersion + 1), MSize: DefaultMessageSize}, &Rversion{}); err != syscall.EAGAIN { t.Errorf("got %v expected %v", err, syscall.EAGAIN) } @@ -60,3 +60,45 @@ func TestVersion(t *testing.T) { t.Errorf("got %v expected %v", err, syscall.EINVAL) } } + +func benchmarkSendRecv(b *testing.B, fn func(c *Client) func(message, message) error) { + // See above. + serverSocket, clientSocket, err := unet.SocketPair(false) + if err != nil { + b.Fatalf("socketpair got err %v expected nil", err) + } + defer clientSocket.Close() + + // See above. + s := NewServer(nil) + go s.Handle(serverSocket) + + // See above. + c, err := NewClient(clientSocket, DefaultMessageSize, HighestVersionString()) + if err != nil { + b.Fatalf("got %v, expected nil", err) + } + + // Initialize messages. + sendRecv := fn(c) + tversion := &Tversion{ + Version: versionString(highestSupportedVersion), + MSize: DefaultMessageSize, + } + rversion := new(Rversion) + + // Run in a loop. + for i := 0; i < b.N; i++ { + if err := sendRecv(tversion, rversion); err != nil { + b.Fatalf("got unexpected err: %v", err) + } + } +} + +func BenchmarkSendRecvLegacy(b *testing.B) { + benchmarkSendRecv(b, func(c *Client) func(message, message) error { return c.sendRecvLegacy }) +} + +func BenchmarkSendRecvChannel(b *testing.B) { + benchmarkSendRecv(b, func(c *Client) func(message, message) error { return c.sendRecvChannel }) +} diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go index 999b4f684..ba9a55d6d 100644 --- a/pkg/p9/handlers.go +++ b/pkg/p9/handlers.go @@ -305,7 +305,9 @@ func (t *Tlopen) handle(cs *connState) message { ref.opened = true ref.openFlags = t.Flags - return &Rlopen{QID: qid, IoUnit: ioUnit, File: osFile} + rlopen := &Rlopen{QID: qid, IoUnit: ioUnit} + rlopen.SetFilePayload(osFile) + return rlopen } func (t *Tlcreate) do(cs *connState, uid UID) (*Rlcreate, error) { @@ -364,7 +366,9 @@ func (t *Tlcreate) do(cs *connState, uid UID) (*Rlcreate, error) { // Replace the FID reference. cs.InsertFID(t.FID, newRef) - return &Rlcreate{Rlopen: Rlopen{QID: qid, IoUnit: ioUnit, File: osFile}}, nil + rlcreate := &Rlcreate{Rlopen: Rlopen{QID: qid, IoUnit: ioUnit}} + rlcreate.SetFilePayload(osFile) + return rlcreate, nil } // handle implements handler.handle. @@ -1287,5 +1291,48 @@ func (t *Tlconnect) handle(cs *connState) message { return newErr(err) } - return &Rlconnect{File: osFile} + rlconnect := &Rlconnect{} + rlconnect.SetFilePayload(osFile) + return rlconnect +} + +// handle implements handler.handle. +func (t *Tchannel) handle(cs *connState) message { + // Ensure that channels are enabled. + if err := cs.initializeChannels(); err != nil { + return newErr(err) + } + + // Lookup the given channel. + ch := cs.lookupChannel(t.ID) + if ch == nil { + return newErr(syscall.ENOSYS) + } + + // Return the payload. Note that we need to duplicate the file + // descriptor for the channel allocator, because sending is a + // destructive operation between sendRecvLegacy (and now the newer + // channel send operations). Same goes for the client FD. + rchannel := &Rchannel{ + Offset: uint64(ch.desc.Offset), + Length: uint64(ch.desc.Length), + } + switch t.Control { + case 0: + // Open the main data channel. + mfd, err := syscall.Dup(int(cs.channelAlloc.FD())) + if err != nil { + return newErr(err) + } + rchannel.SetFilePayload(fd.New(mfd)) + case 1: + cfd, err := syscall.Dup(ch.client.FD()) + if err != nil { + return newErr(err) + } + rchannel.SetFilePayload(fd.New(cfd)) + default: + return newErr(syscall.EINVAL) + } + return rchannel } diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go index fd9eb1c5d..ffdd7e8c6 100644 --- a/pkg/p9/messages.go +++ b/pkg/p9/messages.go @@ -64,6 +64,21 @@ type filer interface { SetFilePayload(*fd.FD) } +// filePayload embeds a File object. +type filePayload struct { + File *fd.FD +} + +// FilePayload returns the file payload. +func (f *filePayload) FilePayload() *fd.FD { + return f.File +} + +// SetFilePayload sets the received file. +func (f *filePayload) SetFilePayload(file *fd.FD) { + f.File = file +} + // Tversion is a version request. type Tversion struct { // MSize is the message size to use. @@ -524,10 +539,7 @@ type Rlopen struct { // IoUnit is the recommended I/O unit. IoUnit uint32 - // File may be attached via the socket. - // - // This is an extension specific to this package. - File *fd.FD + filePayload } // Decode implements encoder.Decode. @@ -547,16 +559,6 @@ func (*Rlopen) Type() MsgType { return MsgRlopen } -// FilePayload returns the file payload. -func (r *Rlopen) FilePayload() *fd.FD { - return r.File -} - -// SetFilePayload sets the received file. -func (r *Rlopen) SetFilePayload(file *fd.FD) { - r.File = file -} - // String implements fmt.Stringer. func (r *Rlopen) String() string { return fmt.Sprintf("Rlopen{QID: %s, IoUnit: %d, File: %v}", r.QID, r.IoUnit, r.File) @@ -2171,8 +2173,7 @@ func (t *Tlconnect) String() string { // Rlconnect is a connect response. type Rlconnect struct { - // File is a host socket. - File *fd.FD + filePayload } // Decode implements encoder.Decode. @@ -2186,19 +2187,71 @@ func (*Rlconnect) Type() MsgType { return MsgRlconnect } -// FilePayload returns the file payload. -func (r *Rlconnect) FilePayload() *fd.FD { - return r.File +// String implements fmt.Stringer. +func (r *Rlconnect) String() string { + return fmt.Sprintf("Rlconnect{File: %v}", r.File) } -// SetFilePayload sets the received file. -func (r *Rlconnect) SetFilePayload(file *fd.FD) { - r.File = file +// Tchannel creates a new channel. +type Tchannel struct { + // ID is the channel ID. + ID uint32 + + // Control is 0 if the Rchannel response should provide the flipcall + // component of the channel, and 1 if the Rchannel response should + // provide the fdchannel component of the channel. + Control uint32 +} + +// Decode implements encoder.Decode. +func (t *Tchannel) Decode(b *buffer) { + t.ID = b.Read32() + t.Control = b.Read32() +} + +// Encode implements encoder.Encode. +func (t *Tchannel) Encode(b *buffer) { + b.Write32(t.ID) + b.Write32(t.Control) +} + +// Type implements message.Type. +func (*Tchannel) Type() MsgType { + return MsgTchannel } // String implements fmt.Stringer. -func (r *Rlconnect) String() string { - return fmt.Sprintf("Rlconnect{File: %v}", r.File) +func (t *Tchannel) String() string { + return fmt.Sprintf("Tchannel{ID: %d, Control: %d}", t.ID, t.Control) +} + +// Rchannel is the channel response. +type Rchannel struct { + Offset uint64 + Length uint64 + filePayload +} + +// Decode implements encoder.Decode. +func (r *Rchannel) Decode(b *buffer) { + r.Offset = b.Read64() + r.Length = b.Read64() +} + +// Encode implements encoder.Encode. +func (r *Rchannel) Encode(b *buffer) { + b.Write64(r.Offset) + b.Write64(r.Length) +} + +// Type implements message.Type. +func (*Rchannel) Type() MsgType { + return MsgRchannel +} + +// String implements fmt.Stringer. +func (r *Rchannel) String() string { + return fmt.Sprintf("Rchannel{Offset: %d, Length: %d}", r.Offset, r.Length) } const maxCacheSize = 3 @@ -2356,4 +2409,6 @@ func init() { msgRegistry.register(MsgRlconnect, func() message { return &Rlconnect{} }) msgRegistry.register(MsgTallocate, func() message { return &Tallocate{} }) msgRegistry.register(MsgRallocate, func() message { return &Rallocate{} }) + msgRegistry.register(MsgTchannel, func() message { return &Tchannel{} }) + msgRegistry.register(MsgRchannel, func() message { return &Rchannel{} }) } diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go index e12831dbd..25530adca 100644 --- a/pkg/p9/p9.go +++ b/pkg/p9/p9.go @@ -378,6 +378,8 @@ const ( MsgRlconnect = 137 MsgTallocate = 138 MsgRallocate = 139 + MsgTchannel = 250 + MsgRchannel = 251 ) // QIDType represents the file type for QIDs. diff --git a/pkg/p9/p9test/p9test.go b/pkg/p9/p9test/p9test.go index 95846e5f7..9d74638bb 100644 --- a/pkg/p9/p9test/p9test.go +++ b/pkg/p9/p9test/p9test.go @@ -315,7 +315,7 @@ func NewHarness(t *testing.T) (*Harness, *p9.Client) { }() // Create the client. - client, err := p9.NewClient(clientSocket, 1024, p9.HighestVersionString()) + client, err := p9.NewClient(clientSocket, p9.DefaultMessageSize, p9.HighestVersionString()) if err != nil { serverSocket.Close() clientSocket.Close() diff --git a/pkg/p9/server.go b/pkg/p9/server.go index b294efbb0..69c886a5d 100644 --- a/pkg/p9/server.go +++ b/pkg/p9/server.go @@ -21,6 +21,9 @@ import ( "sync/atomic" "syscall" + "gvisor.dev/gvisor/pkg/fd" + "gvisor.dev/gvisor/pkg/fdchannel" + "gvisor.dev/gvisor/pkg/flipcall" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/unet" ) @@ -45,7 +48,6 @@ type Server struct { } // NewServer returns a new server. -// func NewServer(attacher Attacher) *Server { return &Server{ attacher: attacher, @@ -85,6 +87,8 @@ type connState struct { // version 0 implies 9P2000.L. version uint32 + // -- below relates to the legacy handler -- + // recvOkay indicates that a receive may start. recvOkay chan bool @@ -93,6 +97,20 @@ type connState struct { // sendDone is signalled when a send is finished. sendDone chan error + + // -- below relates to the flipcall handler -- + + // channelMu protects below. + channelMu sync.Mutex + + // channelWg represents active workers. + channelWg sync.WaitGroup + + // channelAlloc allocates channel memory. + channelAlloc *flipcall.PacketWindowAllocator + + // channels are the set of initialized channels. + channels []*channel } // fidRef wraps a node and tracks references. @@ -386,6 +404,99 @@ func (cs *connState) WaitTag(t Tag) { <-ch } +// initializeChannels initializes all channels. +// +// This is a no-op if channels are already initialized. +func (cs *connState) initializeChannels() (err error) { + cs.channelMu.Lock() + defer cs.channelMu.Unlock() + + // Initialize our channel allocator. + if cs.channelAlloc == nil { + alloc, err := flipcall.NewPacketWindowAllocator() + if err != nil { + return err + } + cs.channelAlloc = alloc + } + + // Create all the channels. + for len(cs.channels) < channelsPerClient { + res := &channel{ + done: make(chan struct{}), + } + + res.desc, err = cs.channelAlloc.Allocate(channelSize) + if err != nil { + return err + } + if err := res.data.Init(flipcall.ServerSide, res.desc); err != nil { + return err + } + + socks, err := fdchannel.NewConnectedSockets() + if err != nil { + res.data.Destroy() // Cleanup. + return err + } + res.fds.Init(socks[0]) + res.client = fd.New(socks[1]) + + cs.channels = append(cs.channels, res) + + // Start servicing the channel. + // + // When we call stop, we will close all the channels and these + // routines should finish. We need the wait group to ensure + // that active handlers are actually finished before cleanup. + cs.channelWg.Add(1) + go func() { // S/R-SAFE: Server side. + defer cs.channelWg.Done() + res.service(cs) + }() + } + + return nil +} + +// lookupChannel looks up the channel with given id. +// +// The function returns nil if no such channel is available. +func (cs *connState) lookupChannel(id uint32) *channel { + cs.channelMu.Lock() + defer cs.channelMu.Unlock() + if id >= uint32(len(cs.channels)) { + return nil + } + return cs.channels[id] +} + +// handle handles a single message. +func (cs *connState) handle(m message) (r message) { + defer func() { + if r == nil { + // Don't allow a panic to propagate. + recover() + + // Include a useful log message. + log.Warningf("panic in handler: %s", debug.Stack()) + + // Wrap in an EFAULT error; we don't really have a + // better way to describe this kind of error. It will + // usually manifest as a result of the test framework. + r = newErr(syscall.EFAULT) + } + }() + if handler, ok := m.(handler); ok { + // Call the message handler. + r = handler.handle(cs) + } else { + // Produce an ENOSYS error. + r = newErr(syscall.ENOSYS) + } + return +} + // handleRequest handles a single request. // // The recvDone channel is signaled when recv is done (with a error if @@ -428,41 +539,20 @@ func (cs *connState) handleRequest() { } // Handle the message. - var r message // r is the response. - defer func() { - if r == nil { - // Don't allow a panic to propagate. - recover() + r := cs.handle(m) - // Include a useful log message. - log.Warningf("panic in handler: %s", debug.Stack()) + // Clear the tag before sending. That's because as soon as this hits + // the wire, the client can legally send the same tag. + cs.ClearTag(tag) - // Wrap in an EFAULT error; we don't really have a - // better way to describe this kind of error. It will - // usually manifest as a result of the test framework. - r = newErr(syscall.EFAULT) - } + // Send back the result. + cs.sendMu.Lock() + err = send(cs.conn, tag, r) + cs.sendMu.Unlock() + cs.sendDone <- err - // Clear the tag before sending. That's because as soon as this - // hits the wire, the client can legally send another message - // with the same tag. - cs.ClearTag(tag) - - // Send back the result. - cs.sendMu.Lock() - err = send(cs.conn, tag, r) - cs.sendMu.Unlock() - cs.sendDone <- err - }() - if handler, ok := m.(handler); ok { - // Call the message handler. - r = handler.handle(cs) - } else { - // Produce an ENOSYS error. - r = newErr(syscall.ENOSYS) - } + // Return the message to the cache. msgRegistry.put(m) - m = nil // 'm' should not be touched after this point. } func (cs *connState) handleRequests() { @@ -477,7 +567,27 @@ func (cs *connState) stop() { close(cs.recvDone) close(cs.sendDone) - for _, fidRef := range cs.fids { + // Free the channels. + cs.channelMu.Lock() + for _, ch := range cs.channels { + ch.Shutdown() + } + cs.channelWg.Wait() + for _, ch := range cs.channels { + ch.Close() + } + cs.channels = nil // Clear. + cs.channelMu.Unlock() + + // Free the channel memory. + if cs.channelAlloc != nil { + cs.channelAlloc.Destroy() + } + + // Close all remaining fids. + for fid, fidRef := range cs.fids { + delete(cs.fids, fid) + // Drop final reference in the FID table. Note this should // always close the file, since we've ensured that there are no // handlers running via the wait for Pending => 0 below. @@ -510,7 +620,7 @@ func (cs *connState) service() error { for i := 0; i < pending; i++ { <-cs.sendDone } - return err + return nil } // This handler is now pending. diff --git a/pkg/p9/transport.go b/pkg/p9/transport.go index 5648df589..6e8b4bbcd 100644 --- a/pkg/p9/transport.go +++ b/pkg/p9/transport.go @@ -54,7 +54,10 @@ const ( headerLength uint32 = 7 // maximumLength is the largest possible message. - maximumLength uint32 = 4 * 1024 * 1024 + maximumLength uint32 = 1 << 20 + + // DefaultMessageSize is a sensible default. + DefaultMessageSize uint32 = 64 << 10 // initialBufferLength is the initial data buffer we allocate. initialBufferLength uint32 = 64 diff --git a/pkg/p9/transport_flipcall.go b/pkg/p9/transport_flipcall.go new file mode 100644 index 000000000..aebb54959 --- /dev/null +++ b/pkg/p9/transport_flipcall.go @@ -0,0 +1,254 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package p9 + +import ( + "runtime" + "syscall" + + "gvisor.dev/gvisor/pkg/fd" + "gvisor.dev/gvisor/pkg/fdchannel" + "gvisor.dev/gvisor/pkg/flipcall" + "gvisor.dev/gvisor/pkg/log" +) + +// channelsPerClient is the number of channels to create per client. +// +// While the client and server will generally agree on this number, in reality +// it's completely up to the server. We simply define a minimum of 2, and a +// maximum of 4, and select the number of available processes as a tie-breaker. +// Note that we don't want the number of channels to be too large, because each +// will account for channelSize memory used, which can be large. +var channelsPerClient = func() int { + n := runtime.NumCPU() + if n < 2 { + return 2 + } + if n > 4 { + return 4 + } + return n +}() + +// channelSize is the channel size to create. +// +// We simply ensure that this is larger than the largest possible message size, +// plus the flipcall packet header, plus the two bytes we write below. +const channelSize = int(2 + flipcall.PacketHeaderBytes + 2 + maximumLength) + +// channel is a fast IPC channel. +// +// The same object is used by both the server and client implementations. In +// general, the client will use only the send and recv methods. +type channel struct { + desc flipcall.PacketWindowDescriptor + data flipcall.Endpoint + fds fdchannel.Endpoint + buf buffer + + // -- client only -- + connected bool + + // -- server only -- + client *fd.FD + done chan struct{} +} + +// reset resets the channel buffer. +func (ch *channel) reset(sz uint32) { + ch.buf.data = ch.data.Data()[:sz] +} + +// service services the channel. +func (ch *channel) service(cs *connState) error { + rsz, err := ch.data.RecvFirst() + if err != nil { + return err + } + for rsz > 0 { + m, err := ch.recv(nil, rsz) + if err != nil { + return err + } + r := cs.handle(m) + msgRegistry.put(m) + rsz, err = ch.send(r) + if err != nil { + return err + } + } + return nil // Done. +} + +// Shutdown shuts down the channel. +// +// This must be called before Close. +func (ch *channel) Shutdown() { + ch.data.Shutdown() +} + +// Close closes the channel. +// +// This must only be called once, and cannot return an error. Note that +// synchronization for this method is provided at a high-level, depending on +// whether it is the client or server. This cannot be called while there are +// active callers in either service or sendRecv. +// +// Precondition: the channel should be shutdown. +func (ch *channel) Close() error { + // Close all backing transports. + ch.fds.Destroy() + ch.data.Destroy() + if ch.client != nil { + ch.client.Close() + } + return nil +} + +// send sends the given message. +// +// The return value is the size of the received response. Not that in the +// server case, this is the size of the next request. +func (ch *channel) send(m message) (uint32, error) { + if log.IsLogging(log.Debug) { + log.Debugf("send [channel @%p] %s", ch, m.String()) + } + + // Send any file payload. + sentFD := false + if filer, ok := m.(filer); ok { + if f := filer.FilePayload(); f != nil { + if err := ch.fds.SendFD(f.FD()); err != nil { + return 0, syscall.EIO // Map everything to EIO. + } + f.Close() // Per sendRecvLegacy. + sentFD = true // To mark below. + } + } + + // Encode the message. + // + // Note that IPC itself encodes the length of messages, so we don't + // need to encode a standard 9P header. We write only the message type. + ch.reset(0) + + ch.buf.WriteMsgType(m.Type()) + if sentFD { + ch.buf.Write8(1) // Incoming FD. + } else { + ch.buf.Write8(0) // No incoming FD. + } + m.Encode(&ch.buf) + ssz := uint32(len(ch.buf.data)) // Updated below. + + // Is there a payload? + if payloader, ok := m.(payloader); ok { + p := payloader.Payload() + copy(ch.data.Data()[ssz:], p) + ssz += uint32(len(p)) + } + + // Perform the one-shot communication. + n, err := ch.data.SendRecv(ssz) + if err != nil { + if n > 0 { + return n, nil + } + return 0, syscall.EIO // See above. + } + + return n, nil +} + +// recv decodes a message that exists on the channel. +// +// If the passed r is non-nil, then the type must match or an error will be +// generated. If the passed r is nil, then a new message will be created and +// returned. +func (ch *channel) recv(r message, rsz uint32) (message, error) { + // Decode the response from the inline buffer. + ch.reset(rsz) + t := ch.buf.ReadMsgType() + hasFD := ch.buf.Read8() != 0 + if t == MsgRlerror { + // Change the message type. We check for this special case + // after decoding below, and transform into an error. + r = &Rlerror{} + } else if r == nil { + nr, err := msgRegistry.get(0, t) + if err != nil { + return nil, err + } + r = nr // New message. + } else if t != r.Type() { + // Not an error and not the expected response; propagate. + return nil, &ErrBadResponse{Got: t, Want: r.Type()} + } + + // Is there a payload? Set to the latter portion. + if payloader, ok := r.(payloader); ok { + fs := payloader.FixedSize() + payloader.SetPayload(ch.buf.data[fs:]) + ch.buf.data = ch.buf.data[:fs] + } + + r.Decode(&ch.buf) + if ch.buf.isOverrun() { + // Nothing valid was available. + log.Debugf("recv [got %d bytes, needed more]", rsz) + return nil, ErrNoValidMessage + } + + // Read any FD result. + if hasFD { + if rfd, err := ch.fds.RecvFDNonblock(); err == nil { + f := fd.New(rfd) + if filer, ok := r.(filer); ok { + // Set the payload. + filer.SetFilePayload(f) + } else { + // Don't want the FD. + f.Close() + } + } else { + // The header bit was set but nothing came in. + log.Warningf("expected FD, got err: %v", err) + } + } + + // Log a message. + if log.IsLogging(log.Debug) { + log.Debugf("recv [channel @%p] %s", ch, r.String()) + } + + // Convert errors appropriately; see above. + if rlerr, ok := r.(*Rlerror); ok { + return nil, syscall.Errno(rlerr.Error) + } + + return r, nil +} + +// sendRecv sends the given message over the channel. +// +// This is used by the client. +func (ch *channel) sendRecv(c *Client, m, r message) error { + rsz, err := ch.send(m) + if err != nil { + return err + } + _, err = ch.recv(r, rsz) + return err +} diff --git a/pkg/p9/transport_test.go b/pkg/p9/transport_test.go index cdb3bc841..2f50ff3ea 100644 --- a/pkg/p9/transport_test.go +++ b/pkg/p9/transport_test.go @@ -124,7 +124,9 @@ func TestSendRecvWithFile(t *testing.T) { t.Fatalf("unable to create file: %v", err) } - if err := send(client, Tag(1), &Rlopen{File: f}); err != nil { + rlopen := &Rlopen{} + rlopen.SetFilePayload(f) + if err := send(client, Tag(1), rlopen); err != nil { t.Fatalf("send got err %v expected nil", err) } diff --git a/pkg/p9/version.go b/pkg/p9/version.go index c2a2885ae..f1ffdd23a 100644 --- a/pkg/p9/version.go +++ b/pkg/p9/version.go @@ -26,7 +26,7 @@ const ( // // Clients are expected to start requesting this version number and // to continuously decrement it until a Tversion request succeeds. - highestSupportedVersion uint32 = 7 + highestSupportedVersion uint32 = 8 // lowestSupportedVersion is the lowest supported version X in a // version string of the format 9P2000.L.Google.X. @@ -148,3 +148,10 @@ func VersionSupportsMultiUser(v uint32) bool { func versionSupportsTallocate(v uint32) bool { return v >= 7 } + +// versionSupportsFlipcall returns true if version v supports IPC channels from +// the flipcall package. Note that these must be negotiated, but this version +// string indicates that such a facility exists. +func versionSupportsFlipcall(v uint32) bool { + return v >= 8 +} diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 7ca776b3a..a2ecc6bcb 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -88,14 +88,24 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), seccomp.AllowAny{}, seccomp.AllowAny{}, - seccomp.AllowValue(0), }, { seccomp.AllowAny{}, seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), seccomp.AllowAny{}, + }, + // Non-private variants are included for flipcall support. They are otherwise + // unncessary, as the sentry will use only private futexes internally. + { + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAIT), + seccomp.AllowAny{}, + seccomp.AllowAny{}, + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAKE), seccomp.AllowAny{}, - seccomp.AllowValue(0), }, }, syscall.SYS_GETPID: {}, diff --git a/runsc/fsgofer/filter/BUILD b/runsc/fsgofer/filter/BUILD index e2318a978..02168ad1b 100644 --- a/runsc/fsgofer/filter/BUILD +++ b/runsc/fsgofer/filter/BUILD @@ -17,6 +17,7 @@ go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/flipcall", "//pkg/log", "//pkg/seccomp", "@org_golang_x_sys//unix:go_default_library", diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 8ddfa77d6..2f3f2039a 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -83,6 +83,11 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowAny{}, seccomp.AllowValue(syscall.F_GETFD), }, + // Used by flipcall.PacketWindowAllocator.Init(). + { + seccomp.AllowAny{}, + seccomp.AllowValue(unix.F_ADD_SEALS), + }, }, syscall.SYS_FSTAT: {}, syscall.SYS_FSTATFS: {}, @@ -103,6 +108,19 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowAny{}, seccomp.AllowValue(0), }, + // Non-private futex used for flipcall. + seccomp.Rule{ + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAIT), + seccomp.AllowAny{}, + seccomp.AllowAny{}, + }, + seccomp.Rule{ + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAKE), + seccomp.AllowAny{}, + seccomp.AllowAny{}, + }, }, syscall.SYS_GETDENTS64: {}, syscall.SYS_GETPID: {}, @@ -112,6 +130,7 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_LINKAT: {}, syscall.SYS_LSEEK: {}, syscall.SYS_MADVISE: {}, + unix.SYS_MEMFD_CREATE: {}, /// Used by flipcall.PacketWindowAllocator.Init(). syscall.SYS_MKDIRAT: {}, syscall.SYS_MMAP: []seccomp.Rule{ { @@ -160,6 +179,13 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_RT_SIGPROCMASK: {}, syscall.SYS_SCHED_YIELD: {}, syscall.SYS_SENDMSG: []seccomp.Rule{ + // Used by fdchannel.Endpoint.SendFD(). + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(0), + }, + // Used by unet.SocketWriter.WriteVec(). { seccomp.AllowAny{}, seccomp.AllowAny{}, @@ -170,7 +196,15 @@ var allowedSyscalls = seccomp.SyscallRules{ {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, }, syscall.SYS_SIGALTSTACK: {}, - syscall.SYS_SYMLINKAT: {}, + // Used by fdchannel.NewConnectedSockets(). + syscall.SYS_SOCKETPAIR: { + { + seccomp.AllowValue(syscall.AF_UNIX), + seccomp.AllowValue(syscall.SOCK_SEQPACKET | syscall.SOCK_CLOEXEC), + seccomp.AllowValue(0), + }, + }, + syscall.SYS_SYMLINKAT: {}, syscall.SYS_TGKILL: []seccomp.Rule{ { seccomp.AllowValue(uint64(os.Getpid())), -- cgit v1.2.3 From ac38a7ead0870118d27d570a8a98a90a7a225a12 Mon Sep 17 00:00:00 2001 From: Robert Tonic Date: Thu, 19 Sep 2019 12:37:15 -0400 Subject: Place the host UDS mounting behind --fsgofer-host-uds-allowed. This commit allows the use of the `--fsgofer-host-uds-allowed` flag to enable mounting sockets and add the appropriate seccomp filters. --- runsc/boot/config.go | 3 ++ runsc/cmd/gofer.go | 25 +++++++++++----- runsc/container/container.go | 5 ++++ runsc/fsgofer/filter/config.go | 23 ++++++++------- runsc/fsgofer/filter/filter.go | 12 ++++++++ runsc/fsgofer/fsgofer.go | 18 +++++++++--- runsc/main.go | 66 ++++++++++++++++++++++-------------------- 7 files changed, 98 insertions(+), 54 deletions(-) (limited to 'runsc/fsgofer') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 7ae0dd05d..954ad2c2a 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -138,6 +138,9 @@ type Config struct { // Overlay is whether to wrap the root filesystem in an overlay. Overlay bool + // fsGoferHostUDSAllowed enables the gofer to mount a host UDS + FSGoferHostUDSAllowed bool + // Network indicates what type of network to use. Network NetworkType diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 9faabf494..8e63c80e0 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -56,10 +56,11 @@ var goferCaps = &specs.LinuxCapabilities{ // Gofer implements subcommands.Command for the "gofer" command, which starts a // filesystem gofer. This command should not be called directly. type Gofer struct { - bundleDir string - ioFDs intFlags - applyCaps bool - setUpRoot bool + bundleDir string + ioFDs intFlags + applyCaps bool + hostUDSAllowed bool + setUpRoot bool panicOnWrite bool specFD int @@ -86,6 +87,7 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) { f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") + f.BoolVar(&g.hostUDSAllowed, "host-uds-allowed", false, "if true, allow the Gofer to mount a host UDS") f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected") f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process") f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec") @@ -180,8 +182,9 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) for _, m := range spec.Mounts { if specutils.Is9PMount(m) { cfg := fsgofer.Config{ - ROMount: isReadonlyMount(m.Options), - PanicOnWrite: g.panicOnWrite, + ROMount: isReadonlyMount(m.Options), + PanicOnWrite: g.panicOnWrite, + HostUDSAllowed: g.hostUDSAllowed, } ap, err := fsgofer.NewAttachPoint(m.Destination, cfg) if err != nil { @@ -200,8 +203,14 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) } - if err := filter.Install(); err != nil { - Fatalf("installing seccomp filters: %v", err) + if g.hostUDSAllowed { + if err := filter.InstallUDS(); err != nil { + Fatalf("installing UDS seccomp filters: %v", err) + } + } else { + if err := filter.Install(); err != nil { + Fatalf("installing seccomp filters: %v", err) + } } runServers(ats, g.ioFDs) diff --git a/runsc/container/container.go b/runsc/container/container.go index bbb364214..ceadb38aa 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -941,6 +941,11 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund args = append(args, "--panic-on-write=true") } + // Add support for mounting host UDS in the gofer + if conf.FSGoferHostUDSAllowed { + args = append(args, "--host-uds-allowed=true") + } + // Open the spec file to donate to the sandbox. specFile, err := specutils.OpenSpec(bundleDir) if err != nil { diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 73407383d..8989cdb2f 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -26,16 +26,6 @@ import ( // allowedSyscalls is the set of syscalls executed by the gofer. var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_ACCEPT: {}, - syscall.SYS_SOCKET: []seccomp.Rule{ - { - seccomp.AllowValue(syscall.AF_UNIX), - }, - }, - syscall.SYS_CONNECT: []seccomp.Rule{ - { - seccomp.AllowAny{}, - }, - }, syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ {seccomp.AllowValue(linux.ARCH_GET_FS)}, {seccomp.AllowValue(linux.ARCH_SET_FS)}, @@ -194,3 +184,16 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_UTIMENSAT: {}, syscall.SYS_WRITE: {}, } + +var udsSyscalls = seccomp.SyscallRules{ + syscall.SYS_SOCKET: []seccomp.Rule{ + { + seccomp.AllowValue(syscall.AF_UNIX), + }, + }, + syscall.SYS_CONNECT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + }, + }, +} diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go index 65053415f..12ef19d18 100644 --- a/runsc/fsgofer/filter/filter.go +++ b/runsc/fsgofer/filter/filter.go @@ -31,3 +31,15 @@ func Install() error { return seccomp.Install(s) } + +// InstallUDS installs the standard Gofer seccomp filters along with filters +// allowing the gofer to connect to a host UDS. +func InstallUDS() error { + // Use the base syscall + s := allowedSyscalls + + // Add additional filters required for connecting to the host's sockets. + s.Merge(udsSyscalls) + + return seccomp.Install(s) +} diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 89171c811..d9f3ba8d6 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -85,6 +85,9 @@ type Config struct { // PanicOnWrite panics on attempts to write to RO mounts. PanicOnWrite bool + + // HostUDS prevents + HostUDSAllowed bool } type attachPoint struct { @@ -128,12 +131,21 @@ func (a *attachPoint) Attach() (p9.File, error) { return nil, fmt.Errorf("stat file %q, err: %v", a.prefix, err) } + // Acquire the attach point lock + a.attachedMu.Lock() + defer a.attachedMu.Unlock() + // Hold the file descriptor we are converting into a p9.File var f *fd.FD // Apply the S_IFMT bitmask so we can detect file type appropriately switch fmtStat := stat.Mode & syscall.S_IFMT; { case fmtStat == syscall.S_IFSOCK: + // Check to see if the CLI option has been set to allow the UDS mount + if !a.conf.HostUDSAllowed { + return nil, fmt.Errorf("host UDS support is disabled") + } + // Attempt to open a connection. Bubble up the failures. f, err = fd.OpenUnix(a.prefix) if err != nil { @@ -144,7 +156,7 @@ func (a *attachPoint) Attach() (p9.File, error) { // Default to Read/Write permissions. mode := syscall.O_RDWR - // If the configuration is Read Only & the mount point is a directory, + // If the configuration is Read Only or the mount point is a directory, // set the mode to Read Only. if a.conf.ROMount || fmtStat == syscall.S_IFDIR { mode = syscall.O_RDONLY @@ -157,9 +169,7 @@ func (a *attachPoint) Attach() (p9.File, error) { } } - // Close the connection if the UDS is already attached. - a.attachedMu.Lock() - defer a.attachedMu.Unlock() + // Close the connection if already attached. if a.attached { f.Close() return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) diff --git a/runsc/main.go b/runsc/main.go index c61583441..5eba949f6 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -63,17 +63,18 @@ var ( straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs") // Flags that control sandbox runtime behavior. - platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") - network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") - gso = flag.Bool("gso", true, "enable generic segmenation offload") - fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") - overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") - watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") - panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") - profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") - netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.") - numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.") - rootless = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.") + platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") + network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") + gso = flag.Bool("gso", true, "enable generic segmenation offload") + fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") + fsGoferHostUDSAllowed = flag.Bool("fsgofer-host-uds-allowed", false, "Allow the gofer to mount Unix Domain Sockets.") + overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") + watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") + panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") + profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") + netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.") + numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.") + rootless = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.") // Test flags, not to be used outside tests, ever. testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") @@ -171,27 +172,28 @@ func main() { // Create a new Config from the flags. conf := &boot.Config{ - RootDir: *rootDir, - Debug: *debug, - LogFilename: *logFilename, - LogFormat: *logFormat, - DebugLog: *debugLog, - DebugLogFormat: *debugLogFormat, - FileAccess: fsAccess, - Overlay: *overlay, - Network: netType, - GSO: *gso, - LogPackets: *logPackets, - Platform: platformType, - Strace: *strace, - StraceLogSize: *straceLogSize, - WatchdogAction: wa, - PanicSignal: *panicSignal, - ProfileEnable: *profile, - EnableRaw: *netRaw, - NumNetworkChannels: *numNetworkChannels, - Rootless: *rootless, - AlsoLogToStderr: *alsoLogToStderr, + RootDir: *rootDir, + Debug: *debug, + LogFilename: *logFilename, + LogFormat: *logFormat, + DebugLog: *debugLog, + DebugLogFormat: *debugLogFormat, + FileAccess: fsAccess, + FSGoferHostUDSAllowed: *fsGoferHostUDSAllowed, + Overlay: *overlay, + Network: netType, + GSO: *gso, + LogPackets: *logPackets, + Platform: platformType, + Strace: *strace, + StraceLogSize: *straceLogSize, + WatchdogAction: wa, + PanicSignal: *panicSignal, + ProfileEnable: *profile, + EnableRaw: *netRaw, + NumNetworkChannels: *numNetworkChannels, + Rootless: *rootless, + AlsoLogToStderr: *alsoLogToStderr, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, } -- cgit v1.2.3 From 46beb919121f02d8bd110a54fb8f6de5dfd2891e Mon Sep 17 00:00:00 2001 From: Robert Tonic Date: Thu, 19 Sep 2019 17:10:50 -0400 Subject: Fix documentation, clean up seccomp filter installation, rename helpers. Filter installation has been streamlined and functions renamed. Documentation has been fixed to be standards compliant, and missing documentation added. gofmt has also been applied to modified files. --- pkg/fd/fd.go | 6 +++--- runsc/boot/config.go | 2 +- runsc/cmd/gofer.go | 12 +++++------- runsc/fsgofer/filter/filter.go | 19 ++++++------------- runsc/fsgofer/fsgofer.go | 21 +++++++++++---------- 5 files changed, 26 insertions(+), 34 deletions(-) (limited to 'runsc/fsgofer') diff --git a/pkg/fd/fd.go b/pkg/fd/fd.go index 7f1f9d984..24e959944 100644 --- a/pkg/fd/fd.go +++ b/pkg/fd/fd.go @@ -17,12 +17,12 @@ package fd import ( "fmt" + "gvisor.dev/gvisor/pkg/unet" "io" "os" "runtime" "sync/atomic" "syscall" - "gvisor.dev/gvisor/pkg/unet" ) // ReadWriter implements io.ReadWriter, io.ReaderAt, and io.WriterAt for fd. It @@ -186,8 +186,8 @@ func OpenAt(dir *FD, path string, flags int, mode uint32) (*FD, error) { return New(f), nil } -// OpenUnix Open a Unix Domain Socket and return the file descriptor for it. -func OpenUnix(path string) (*FD, error) { +// DialUnix connects to a Unix Domain Socket and return the file descriptor. +func DialUnix(path string) (*FD, error) { socket, err := unet.Connect(path, false) return New(socket.FD()), err } diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 954ad2c2a..f1adaba01 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -138,7 +138,7 @@ type Config struct { // Overlay is whether to wrap the root filesystem in an overlay. Overlay bool - // fsGoferHostUDSAllowed enables the gofer to mount a host UDS + // FSGoferHostUDSAllowed enables the gofer to mount a host UDS. FSGoferHostUDSAllowed bool // Network indicates what type of network to use. diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 8e63c80e0..fa4f0034d 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -204,13 +204,11 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } if g.hostUDSAllowed { - if err := filter.InstallUDS(); err != nil { - Fatalf("installing UDS seccomp filters: %v", err) - } - } else { - if err := filter.Install(); err != nil { - Fatalf("installing seccomp filters: %v", err) - } + filter.InstallUDSFilters() + } + + if err := filter.Install(); err != nil { + Fatalf("installing seccomp filters: %v", err) } runServers(ats, g.ioFDs) diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go index 12ef19d18..8d4ec9c24 100644 --- a/runsc/fsgofer/filter/filter.go +++ b/runsc/fsgofer/filter/filter.go @@ -23,23 +23,16 @@ import ( // Install installs seccomp filters. func Install() error { - s := allowedSyscalls - // Set of additional filters used by -race and -msan. Returns empty // when not enabled. - s.Merge(instrumentationFilters()) + allowedSyscalls.Merge(instrumentationFilters()) - return seccomp.Install(s) + return seccomp.Install(allowedSyscalls) } -// InstallUDS installs the standard Gofer seccomp filters along with filters -// allowing the gofer to connect to a host UDS. -func InstallUDS() error { - // Use the base syscall - s := allowedSyscalls - +// InstallUDSFilters installs the seccomp filters required to let the gofer connect +// to a host UDS. +func InstallUDSFilters() { // Add additional filters required for connecting to the host's sockets. - s.Merge(udsSyscalls) - - return seccomp.Install(s) + allowedSyscalls.Merge(udsSyscalls) } diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index d9f3ba8d6..357d712c6 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -21,6 +21,7 @@ package fsgofer import ( + "errors" "fmt" "io" "math" @@ -86,7 +87,7 @@ type Config struct { // PanicOnWrite panics on attempts to write to RO mounts. PanicOnWrite bool - // HostUDS prevents + // HostUDSAllowed signals whether the gofer can mount a host's UDS. HostUDSAllowed bool } @@ -131,23 +132,23 @@ func (a *attachPoint) Attach() (p9.File, error) { return nil, fmt.Errorf("stat file %q, err: %v", a.prefix, err) } - // Acquire the attach point lock + // Acquire the attach point lock. a.attachedMu.Lock() defer a.attachedMu.Unlock() - // Hold the file descriptor we are converting into a p9.File + // Hold the file descriptor we are converting into a p9.File. var f *fd.FD - // Apply the S_IFMT bitmask so we can detect file type appropriately - switch fmtStat := stat.Mode & syscall.S_IFMT; { - case fmtStat == syscall.S_IFSOCK: - // Check to see if the CLI option has been set to allow the UDS mount + // Apply the S_IFMT bitmask so we can detect file type appropriately. + switch fmtStat := stat.Mode & syscall.S_IFMT; fmtStat { + case syscall.S_IFSOCK: + // Check to see if the CLI option has been set to allow the UDS mount. if !a.conf.HostUDSAllowed { - return nil, fmt.Errorf("host UDS support is disabled") + return nil, errors.New("host UDS support is disabled") } // Attempt to open a connection. Bubble up the failures. - f, err = fd.OpenUnix(a.prefix) + f, err = fd.DialUnix(a.prefix) if err != nil { return nil, err } @@ -1058,7 +1059,7 @@ func (l *localFile) Flush() error { // Connect implements p9.File. func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) { - return fd.OpenUnix(l.hostPath) + return fd.DialUnix(l.hostPath) } // Close implements p9.File. -- cgit v1.2.3 From e975184bc50944e82a6bf5f4c57bbe970933fdc5 Mon Sep 17 00:00:00 2001 From: Robert Tonic Date: Thu, 19 Sep 2019 17:44:46 -0400 Subject: Update InstallUDSFilters documentation to be accurate to functionality. --- runsc/fsgofer/filter/filter.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'runsc/fsgofer') diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go index 8d4ec9c24..289886720 100644 --- a/runsc/fsgofer/filter/filter.go +++ b/runsc/fsgofer/filter/filter.go @@ -30,8 +30,8 @@ func Install() error { return seccomp.Install(allowedSyscalls) } -// InstallUDSFilters installs the seccomp filters required to let the gofer connect -// to a host UDS. +// InstallUDSFilters extends the allowed syscalls to include those necessary for +// connecting to a host UDS. func InstallUDSFilters() { // Add additional filters required for connecting to the host's sockets. allowedSyscalls.Merge(udsSyscalls) -- cgit v1.2.3 From 7810b30983ec4d3a706df01163c29814cd21d6ca Mon Sep 17 00:00:00 2001 From: Robert Tonic Date: Tue, 24 Sep 2019 18:24:10 -0400 Subject: Refactor command line options and remove the allowed terminology for uds --- runsc/boot/config.go | 5 ++-- runsc/cmd/gofer.go | 18 ++++++------ runsc/container/container.go | 5 ---- runsc/fsgofer/fsgofer.go | 10 +++++-- runsc/main.go | 68 ++++++++++++++++++++++---------------------- 5 files changed, 52 insertions(+), 54 deletions(-) (limited to 'runsc/fsgofer') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index f1adaba01..b76b0e574 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -138,8 +138,8 @@ type Config struct { // Overlay is whether to wrap the root filesystem in an overlay. Overlay bool - // FSGoferHostUDSAllowed enables the gofer to mount a host UDS. - FSGoferHostUDSAllowed bool + // FSGoferHostUDS enables the gofer to mount a host UDS. + FSGoferHostUDS bool // Network indicates what type of network to use. Network NetworkType @@ -217,6 +217,7 @@ func (c *Config) ToFlags() []string { "--debug-log-format=" + c.DebugLogFormat, "--file-access=" + c.FileAccess.String(), "--overlay=" + strconv.FormatBool(c.Overlay), + "--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS), "--network=" + c.Network.String(), "--log-packets=" + strconv.FormatBool(c.LogPackets), "--platform=" + c.Platform, diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index fa4f0034d..fbd579fb8 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -56,11 +56,10 @@ var goferCaps = &specs.LinuxCapabilities{ // Gofer implements subcommands.Command for the "gofer" command, which starts a // filesystem gofer. This command should not be called directly. type Gofer struct { - bundleDir string - ioFDs intFlags - applyCaps bool - hostUDSAllowed bool - setUpRoot bool + bundleDir string + ioFDs intFlags + applyCaps bool + setUpRoot bool panicOnWrite bool specFD int @@ -87,7 +86,6 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) { f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") - f.BoolVar(&g.hostUDSAllowed, "host-uds-allowed", false, "if true, allow the Gofer to mount a host UDS") f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected") f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process") f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec") @@ -182,9 +180,9 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) for _, m := range spec.Mounts { if specutils.Is9PMount(m) { cfg := fsgofer.Config{ - ROMount: isReadonlyMount(m.Options), - PanicOnWrite: g.panicOnWrite, - HostUDSAllowed: g.hostUDSAllowed, + ROMount: isReadonlyMount(m.Options), + PanicOnWrite: g.panicOnWrite, + HostUDS: conf.FSGoferHostUDS, } ap, err := fsgofer.NewAttachPoint(m.Destination, cfg) if err != nil { @@ -203,7 +201,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) } - if g.hostUDSAllowed { + if conf.FSGoferHostUDS { filter.InstallUDSFilters() } diff --git a/runsc/container/container.go b/runsc/container/container.go index ceadb38aa..bbb364214 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -941,11 +941,6 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund args = append(args, "--panic-on-write=true") } - // Add support for mounting host UDS in the gofer - if conf.FSGoferHostUDSAllowed { - args = append(args, "--host-uds-allowed=true") - } - // Open the spec file to donate to the sandbox. specFile, err := specutils.OpenSpec(bundleDir) if err != nil { diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 357d712c6..507d52b50 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -87,8 +87,8 @@ type Config struct { // PanicOnWrite panics on attempts to write to RO mounts. PanicOnWrite bool - // HostUDSAllowed signals whether the gofer can mount a host's UDS. - HostUDSAllowed bool + // HostUDS signals whether the gofer can mount a host's UDS. + HostUDS bool } type attachPoint struct { @@ -143,7 +143,7 @@ func (a *attachPoint) Attach() (p9.File, error) { switch fmtStat := stat.Mode & syscall.S_IFMT; fmtStat { case syscall.S_IFSOCK: // Check to see if the CLI option has been set to allow the UDS mount. - if !a.conf.HostUDSAllowed { + if !a.conf.HostUDS { return nil, errors.New("host UDS support is disabled") } @@ -1059,6 +1059,10 @@ func (l *localFile) Flush() error { // Connect implements p9.File. func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) { + // Check to see if the CLI option has been set to allow the UDS mount. + if !l.attachPoint.conf.HostUDS { + return nil, errors.New("host UDS support is disabled") + } return fd.DialUnix(l.hostPath) } diff --git a/runsc/main.go b/runsc/main.go index 5eba949f6..b788b1f76 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -63,18 +63,18 @@ var ( straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs") // Flags that control sandbox runtime behavior. - platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") - network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") - gso = flag.Bool("gso", true, "enable generic segmenation offload") - fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") - fsGoferHostUDSAllowed = flag.Bool("fsgofer-host-uds-allowed", false, "Allow the gofer to mount Unix Domain Sockets.") - overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") - watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") - panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") - profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") - netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.") - numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.") - rootless = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.") + platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") + network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") + gso = flag.Bool("gso", true, "enable generic segmenation offload") + fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") + fsGoferHostUDS = flag.Bool("fsgofer-host-uds", false, "Allow the gofer to mount Unix Domain Sockets.") + overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") + watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") + panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") + profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") + netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.") + numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.") + rootless = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.") // Test flags, not to be used outside tests, ever. testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") @@ -172,28 +172,28 @@ func main() { // Create a new Config from the flags. conf := &boot.Config{ - RootDir: *rootDir, - Debug: *debug, - LogFilename: *logFilename, - LogFormat: *logFormat, - DebugLog: *debugLog, - DebugLogFormat: *debugLogFormat, - FileAccess: fsAccess, - FSGoferHostUDSAllowed: *fsGoferHostUDSAllowed, - Overlay: *overlay, - Network: netType, - GSO: *gso, - LogPackets: *logPackets, - Platform: platformType, - Strace: *strace, - StraceLogSize: *straceLogSize, - WatchdogAction: wa, - PanicSignal: *panicSignal, - ProfileEnable: *profile, - EnableRaw: *netRaw, - NumNetworkChannels: *numNetworkChannels, - Rootless: *rootless, - AlsoLogToStderr: *alsoLogToStderr, + RootDir: *rootDir, + Debug: *debug, + LogFilename: *logFilename, + LogFormat: *logFormat, + DebugLog: *debugLog, + DebugLogFormat: *debugLogFormat, + FileAccess: fsAccess, + FSGoferHostUDS: *fsGoferHostUDS, + Overlay: *overlay, + Network: netType, + GSO: *gso, + LogPackets: *logPackets, + Platform: platformType, + Strace: *strace, + StraceLogSize: *straceLogSize, + WatchdogAction: wa, + PanicSignal: *panicSignal, + ProfileEnable: *profile, + EnableRaw: *netRaw, + NumNetworkChannels: *numNetworkChannels, + Rootless: *rootless, + AlsoLogToStderr: *alsoLogToStderr, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, } -- cgit v1.2.3 From 9ebd498a55fa87129cdc60cdc3bca66f26c49454 Mon Sep 17 00:00:00 2001 From: Robert Tonic Date: Tue, 24 Sep 2019 18:37:25 -0400 Subject: Remove unecessary seccomp permission. This removes the F_DUPFD_CLOEXEC support for the gofer, previously required when depending on the STL net package. --- runsc/fsgofer/filter/config.go | 4 ---- 1 file changed, 4 deletions(-) (limited to 'runsc/fsgofer') diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 8989cdb2f..a3f104a58 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -83,10 +83,6 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowAny{}, seccomp.AllowValue(syscall.F_GETFD), }, - { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_DUPFD_CLOEXEC), - }, }, syscall.SYS_FSTAT: {}, syscall.SYS_FSTATFS: {}, -- cgit v1.2.3 From 8337e4f50955863c6aa3a7df70b1446b9dba66ae Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 26 Sep 2019 18:14:45 -0700 Subject: Disallow opening of sockets if --fsgofer-host-uds=false Updates #235 PiperOrigin-RevId: 271475319 --- runsc/fsgofer/fsgofer.go | 19 ++++++++++--------- runsc/fsgofer/fsgofer_test.go | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'runsc/fsgofer') diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index a570f1a41..29a82138e 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -136,6 +136,10 @@ func (a *attachPoint) Attach() (p9.File, error) { a.attachedMu.Lock() defer a.attachedMu.Unlock() + if a.attached { + return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) + } + // Hold the file descriptor we are converting into a p9.File. var f *fd.FD @@ -170,12 +174,6 @@ func (a *attachPoint) Attach() (p9.File, error) { } } - // Close the connection if already attached. - if a.attached { - f.Close() - return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) - } - // Return a localFile object to the caller with the UDS FD included. rv, err := newLocalFile(a, f, a.prefix, stat) if err != nil { @@ -330,7 +328,7 @@ func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) return file, nil } -func getSupportedFileType(stat syscall.Stat_t) (fileType, error) { +func getSupportedFileType(stat syscall.Stat_t, permitSocket bool) (fileType, error) { var ft fileType switch stat.Mode & syscall.S_IFMT { case syscall.S_IFREG: @@ -340,6 +338,9 @@ func getSupportedFileType(stat syscall.Stat_t) (fileType, error) { case syscall.S_IFLNK: ft = symlink case syscall.S_IFSOCK: + if !permitSocket { + return unknown, syscall.EPERM + } ft = socket default: return unknown, syscall.EPERM @@ -348,7 +349,7 @@ func getSupportedFileType(stat syscall.Stat_t) (fileType, error) { } func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) (*localFile, error) { - ft, err := getSupportedFileType(stat) + ft, err := getSupportedFileType(stat, a.conf.HostUDS) if err != nil { return nil, err } @@ -1065,7 +1066,7 @@ func (l *localFile) Flush() error { func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) { // Check to see if the CLI option has been set to allow the UDS mount. if !l.attachPoint.conf.HostUDS { - return nil, errors.New("host UDS support is disabled") + return nil, syscall.ECONNREFUSED } return fd.DialUnix(l.hostPath) } diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index cbbe71019..05af7e397 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -665,7 +665,7 @@ func TestAttachInvalidType(t *testing.T) { } f, err := a.Attach() if f != nil || err == nil { - t.Fatalf("Attach should have failed, got (%v, nil)", f) + t.Fatalf("Attach should have failed, got (%v, %v)", f, err) } }) } -- cgit v1.2.3