diff options
author | Googler <noreply@google.com> | 2018-04-27 10:37:02 -0700 |
---|---|---|
committer | Adin Scannell <ascannell@google.com> | 2018-04-28 01:44:26 -0400 |
commit | d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch) | |
tree | 54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/socket/rpcinet | |
parent | f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff) |
Check in gVisor.
PiperOrigin-RevId: 194583126
Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
Diffstat (limited to 'pkg/sentry/socket/rpcinet')
-rw-r--r-- | pkg/sentry/socket/rpcinet/BUILD | 59 | ||||
-rw-r--r-- | pkg/sentry/socket/rpcinet/conn/BUILD | 17 | ||||
-rw-r--r-- | pkg/sentry/socket/rpcinet/conn/conn.go | 167 | ||||
-rw-r--r-- | pkg/sentry/socket/rpcinet/device.go | 19 | ||||
-rw-r--r-- | pkg/sentry/socket/rpcinet/notifier/BUILD | 15 | ||||
-rw-r--r-- | pkg/sentry/socket/rpcinet/notifier/notifier.go | 230 | ||||
-rw-r--r-- | pkg/sentry/socket/rpcinet/rpcinet.go | 16 | ||||
-rw-r--r-- | pkg/sentry/socket/rpcinet/socket.go | 567 | ||||
-rw-r--r-- | pkg/sentry/socket/rpcinet/stack.go | 175 | ||||
-rw-r--r-- | pkg/sentry/socket/rpcinet/stack_unsafe.go | 193 | ||||
-rw-r--r-- | pkg/sentry/socket/rpcinet/syscall_rpc.proto | 351 |
11 files changed, 1809 insertions, 0 deletions
diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD new file mode 100644 index 000000000..b0351b363 --- /dev/null +++ b/pkg/sentry/socket/rpcinet/BUILD @@ -0,0 +1,59 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "rpcinet", + srcs = [ + "device.go", + "rpcinet.go", + "socket.go", + "stack.go", + "stack_unsafe.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet", + visibility = ["//pkg/sentry:internal"], + deps = [ + ":syscall_rpc_go_proto", + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/socket", + "//pkg/sentry/socket/hostinet", + "//pkg/sentry/socket/rpcinet/conn", + "//pkg/sentry/socket/rpcinet/notifier", + "//pkg/sentry/usermem", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/tcpip/buffer", + "//pkg/tcpip/transport/unix", + "//pkg/unet", + "//pkg/waiter", + ], +) + +proto_library( + name = "syscall_rpc_proto", + srcs = ["syscall_rpc.proto"], + visibility = [ + "//visibility:public", + ], +) + +go_proto_library( + name = "syscall_rpc_go_proto", + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto", + proto = ":syscall_rpc_proto", + visibility = [ + "//visibility:public", + ], +) diff --git a/pkg/sentry/socket/rpcinet/conn/BUILD b/pkg/sentry/socket/rpcinet/conn/BUILD new file mode 100644 index 000000000..4923dee4b --- /dev/null +++ b/pkg/sentry/socket/rpcinet/conn/BUILD @@ -0,0 +1,17 @@ +package(licenses = ["notice"]) # BSD + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "conn", + srcs = ["conn.go"], + importpath = 
"gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/conn", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/binary", + "//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto", + "//pkg/syserr", + "//pkg/unet", + "@com_github_golang_protobuf//proto:go_default_library", + ], +) diff --git a/pkg/sentry/socket/rpcinet/conn/conn.go b/pkg/sentry/socket/rpcinet/conn/conn.go new file mode 100644 index 000000000..ea6ec87ed --- /dev/null +++ b/pkg/sentry/socket/rpcinet/conn/conn.go @@ -0,0 +1,167 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package conn is an RPC connection to a syscall RPC server. +package conn + +import ( + "fmt" + "sync" + "sync/atomic" + "syscall" + + "github.com/golang/protobuf/proto" + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/unet" + + pb "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" +) + +type request struct { + response []byte + ready chan struct{} + ignoreResult bool +} + +// RPCConnection represents a single RPC connection to a syscall gofer. +type RPCConnection struct { + // reqID is the ID of the last request and must be accessed atomically. + reqID uint64 + + sendMu sync.Mutex + socket *unet.Socket + + reqMu sync.Mutex + requests map[uint64]request +} + +// NewRPCConnection initializes a RPC connection to a socket gofer. 
+func NewRPCConnection(s *unet.Socket) *RPCConnection { + conn := &RPCConnection{socket: s, requests: map[uint64]request{}} + go func() { // S/R-FIXME + var nums [16]byte + for { + for n := 0; n < len(nums); { + nn, err := conn.socket.Read(nums[n:]) + if err != nil { + panic(fmt.Sprint("error reading length from socket rpc gofer: ", err)) + } + n += nn + } + + b := make([]byte, binary.LittleEndian.Uint64(nums[:8])) + id := binary.LittleEndian.Uint64(nums[8:]) + + for n := 0; n < len(b); { + nn, err := conn.socket.Read(b[n:]) + if err != nil { + panic(fmt.Sprint("error reading request from socket rpc gofer: ", err)) + } + n += nn + } + + conn.reqMu.Lock() + r := conn.requests[id] + if r.ignoreResult { + delete(conn.requests, id) + } else { + r.response = b + conn.requests[id] = r + } + conn.reqMu.Unlock() + close(r.ready) + } + }() + return conn +} + +// NewRequest makes a request to the RPC gofer and returns the request ID and a +// channel which will be closed once the request completes. 
+func (c *RPCConnection) NewRequest(req pb.SyscallRequest, ignoreResult bool) (uint64, chan struct{}) { + b, err := proto.Marshal(&req) + if err != nil { + panic(fmt.Sprint("invalid proto: ", err)) + } + + id := atomic.AddUint64(&c.reqID, 1) + ch := make(chan struct{}) + + c.reqMu.Lock() + c.requests[id] = request{ready: ch, ignoreResult: ignoreResult} + c.reqMu.Unlock() + + c.sendMu.Lock() + defer c.sendMu.Unlock() + + var nums [16]byte + binary.LittleEndian.PutUint64(nums[:8], uint64(len(b))) + binary.LittleEndian.PutUint64(nums[8:], id) + for n := 0; n < len(nums); { + nn, err := c.socket.Write(nums[n:]) + if err != nil { + panic(fmt.Sprint("error writing length and ID to socket gofer: ", err)) + } + n += nn + } + + for n := 0; n < len(b); { + nn, err := c.socket.Write(b[n:]) + if err != nil { + panic(fmt.Sprint("error writing request to socket gofer: ", err)) + } + n += nn + } + + return id, ch +} + +// RPCReadFile will execute the ReadFile helper RPC method which avoids the +// common pattern of open(2), read(2), close(2) by doing all three operations +// as a single RPC. It will read the entire file or return EFBIG if the file +// was too large. +func (c *RPCConnection) RPCReadFile(path string) ([]byte, *syserr.Error) { + req := &pb.SyscallRequest_ReadFile{&pb.ReadFileRequest{ + Path: path, + }} + + id, ch := c.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) + <-ch + + res := c.Request(id).Result.(*pb.SyscallResponse_ReadFile).ReadFile.Result + if e, ok := res.(*pb.ReadFileResponse_ErrorNumber); ok { + return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + return res.(*pb.ReadFileResponse_Data).Data, nil +} + +// Request retrieves the request corresponding to the given request ID. +// +// The channel returned by NewRequest must have been closed before Request can +// be called. This will happen automatically, do not manually close the +// channel. 
+func (c *RPCConnection) Request(id uint64) pb.SyscallResponse { + c.reqMu.Lock() + r := c.requests[id] + delete(c.requests, id) + c.reqMu.Unlock() + + var resp pb.SyscallResponse + if err := proto.Unmarshal(r.response, &resp); err != nil { + panic(fmt.Sprint("invalid proto: ", err)) + } + + return resp +} diff --git a/pkg/sentry/socket/rpcinet/device.go b/pkg/sentry/socket/rpcinet/device.go new file mode 100644 index 000000000..f7b63436e --- /dev/null +++ b/pkg/sentry/socket/rpcinet/device.go @@ -0,0 +1,19 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package rpcinet + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +var socketDevice = device.NewAnonDevice() diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD new file mode 100644 index 000000000..6f3b06a05 --- /dev/null +++ b/pkg/sentry/socket/rpcinet/notifier/BUILD @@ -0,0 +1,15 @@ +package(licenses = ["notice"]) # BSD + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "notifier", + srcs = ["notifier.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/notifier", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto", + "//pkg/sentry/socket/rpcinet/conn", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/socket/rpcinet/notifier/notifier.go b/pkg/sentry/socket/rpcinet/notifier/notifier.go new file mode 100644 index 000000000..f88a908ed --- /dev/null +++ b/pkg/sentry/socket/rpcinet/notifier/notifier.go @@ -0,0 +1,230 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package notifier implements an FD notifier implementation over RPC. 
+package notifier + +import ( + "fmt" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/conn" + pb "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +type fdInfo struct { + queue *waiter.Queue + waiting bool +} + +// Notifier holds all the state necessary to issue notifications when IO events +// occur in the observed FDs. +type Notifier struct { + // rpcConn is the connection that is used for sending RPCs. + rpcConn *conn.RPCConnection + + // epFD is the epoll file descriptor used to register for io + // notifications. + epFD uint32 + + // mu protects fdMap. + mu sync.Mutex + + // fdMap maps file descriptors to their notification queues and waiting + // status. + fdMap map[uint32]*fdInfo +} + +// NewRPCNotifier creates a new notifier object. +func NewRPCNotifier(cn *conn.RPCConnection) (*Notifier, error) { + id, c := cn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollCreate1{&pb.EpollCreate1Request{}}}, false /* ignoreResult */) + <-c + + res := cn.Request(id).Result.(*pb.SyscallResponse_EpollCreate1).EpollCreate1.Result + if e, ok := res.(*pb.EpollCreate1Response_ErrorNumber); ok { + return nil, syscall.Errno(e.ErrorNumber) + } + + w := &Notifier{ + rpcConn: cn, + epFD: res.(*pb.EpollCreate1Response_Fd).Fd, + fdMap: make(map[uint32]*fdInfo), + } + + go w.waitAndNotify() // S/R-FIXME + + return w, nil +} + +// waitFD waits on mask for fd. The fdMap mutex must be hold. 
+func (n *Notifier) waitFD(fd uint32, fi *fdInfo, mask waiter.EventMask) error { + if !fi.waiting && mask == 0 { + return nil + } + + e := pb.EpollEvent{ + Events: uint32(mask) | -syscall.EPOLLET, + Fd: fd, + } + + switch { + case !fi.waiting && mask != 0: + id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollCtl{&pb.EpollCtlRequest{Epfd: n.epFD, Op: syscall.EPOLL_CTL_ADD, Fd: fd, Event: &e}}}, false /* ignoreResult */) + <-c + + e := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_EpollCtl).EpollCtl.ErrorNumber + if e != 0 { + return syscall.Errno(e) + } + + fi.waiting = true + case fi.waiting && mask == 0: + id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollCtl{&pb.EpollCtlRequest{Epfd: n.epFD, Op: syscall.EPOLL_CTL_DEL, Fd: fd}}}, false /* ignoreResult */) + <-c + n.rpcConn.Request(id) + + fi.waiting = false + case fi.waiting && mask != 0: + id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollCtl{&pb.EpollCtlRequest{Epfd: n.epFD, Op: syscall.EPOLL_CTL_MOD, Fd: fd, Event: &e}}}, false /* ignoreResult */) + <-c + + e := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_EpollCtl).EpollCtl.ErrorNumber + if e != 0 { + return syscall.Errno(e) + } + } + + return nil +} + +// addFD adds an FD to the list of FDs observed by n. +func (n *Notifier) addFD(fd uint32, queue *waiter.Queue) { + n.mu.Lock() + defer n.mu.Unlock() + + // Panic if we're already notifying on this FD. + if _, ok := n.fdMap[fd]; ok { + panic(fmt.Sprintf("File descriptor %d added twice", fd)) + } + + // We have nothing to wait for at the moment. Just add it to the map. + n.fdMap[fd] = &fdInfo{queue: queue} +} + +// updateFD updates the set of events the FD needs to be notified on. 
+func (n *Notifier) updateFD(fd uint32) error { + n.mu.Lock() + defer n.mu.Unlock() + + if fi, ok := n.fdMap[fd]; ok { + return n.waitFD(fd, fi, fi.queue.Events()) + } + + return nil +} + +// RemoveFD removes an FD from the list of FDs observed by n. +func (n *Notifier) removeFD(fd uint32) { + n.mu.Lock() + defer n.mu.Unlock() + + // Remove from map, then from epoll object. + n.waitFD(fd, n.fdMap[fd], 0) + delete(n.fdMap, fd) +} + +// hasFD returns true if the FD is in the list of observed FDs. +func (n *Notifier) hasFD(fd uint32) bool { + n.mu.Lock() + defer n.mu.Unlock() + + _, ok := n.fdMap[fd] + return ok +} + +// waitAndNotify loops waiting for io event notifications from the epoll +// object. Once notifications arrive, they are dispatched to the +// registered queue. +func (n *Notifier) waitAndNotify() error { + for { + id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollWait{&pb.EpollWaitRequest{Fd: n.epFD, NumEvents: 100, Msec: -1}}}, false /* ignoreResult */) + <-c + + res := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_EpollWait).EpollWait.Result + if e, ok := res.(*pb.EpollWaitResponse_ErrorNumber); ok { + err := syscall.Errno(e.ErrorNumber) + // NOTE: I don't think epoll_wait can return EAGAIN but I'm being + // conseratively careful here since exiting the notification thread + // would be really bad. + if err == syscall.EINTR || err == syscall.EAGAIN { + continue + } + return err + } + + n.mu.Lock() + for _, e := range res.(*pb.EpollWaitResponse_Events).Events.Events { + if fi, ok := n.fdMap[e.Fd]; ok { + fi.queue.Notify(waiter.EventMask(e.Events)) + } + } + n.mu.Unlock() + } +} + +// AddFD adds an FD to the list of observed FDs. +func (n *Notifier) AddFD(fd uint32, queue *waiter.Queue) error { + n.addFD(fd, queue) + return nil +} + +// UpdateFD updates the set of events the FD needs to be notified on. 
+func (n *Notifier) UpdateFD(fd uint32) error { + return n.updateFD(fd) +} + +// RemoveFD removes an FD from the list of observed FDs. +func (n *Notifier) RemoveFD(fd uint32) { + n.removeFD(fd) +} + +// HasFD returns true if the FD is in the list of observed FDs. +// +// This should only be used by tests to assert that FDs are correctly +// registered. +func (n *Notifier) HasFD(fd uint32) bool { + return n.hasFD(fd) +} + +// NonBlockingPoll polls the given fd in non-blocking fashion. It is used just +// to query the FD's current state; this method will block on the RPC response +// although the syscall is non-blocking. +func (n *Notifier) NonBlockingPoll(fd uint32, mask waiter.EventMask) waiter.EventMask { + for { + id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Poll{&pb.PollRequest{Fd: fd, Events: uint32(mask)}}}, false /* ignoreResult */) + <-c + + res := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_Poll).Poll.Result + if e, ok := res.(*pb.PollResponse_ErrorNumber); ok { + if syscall.Errno(e.ErrorNumber) == syscall.EINTR { + continue + } + return mask + } + + return waiter.EventMask(res.(*pb.PollResponse_Events).Events) + } +} diff --git a/pkg/sentry/socket/rpcinet/rpcinet.go b/pkg/sentry/socket/rpcinet/rpcinet.go new file mode 100644 index 000000000..10b0dedc2 --- /dev/null +++ b/pkg/sentry/socket/rpcinet/rpcinet.go @@ -0,0 +1,16 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Package rpcinet implements sockets using an RPC for each syscall. +package rpcinet diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go new file mode 100644 index 000000000..574d99ba5 --- /dev/null +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -0,0 +1,567 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package rpcinet + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/conn" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/notifier" + pb "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// socketOperations implements 
fs.FileOperations and socket.Socket for a socket +// implemented using a host socket. +type socketOperations struct { + socket.ReceiveTimeout + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + + fd uint32 // must be O_NONBLOCK + wq *waiter.Queue + rpcConn *conn.RPCConnection + notifier *notifier.Notifier +} + +// Verify that we actually implement socket.Socket. +var _ = socket.Socket(&socketOperations{}) + +// New creates a new RPC socket. +func newSocketFile(ctx context.Context, stack *Stack, family int, skType int, protocol int) (*fs.File, *syserr.Error) { + id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Socket{&pb.SocketRequest{Family: int64(family), Type: int64(skType | syscall.SOCK_NONBLOCK), Protocol: int64(protocol)}}}, false /* ignoreResult */) + <-c + + res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Socket).Socket.Result + if e, ok := res.(*pb.SocketResponse_ErrorNumber); ok { + return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + fd := res.(*pb.SocketResponse_Fd).Fd + + var wq waiter.Queue + stack.notifier.AddFD(fd, &wq) + + dirent := socket.NewDirent(ctx, socketDevice) + return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &socketOperations{ + wq: &wq, + fd: fd, + rpcConn: stack.rpcConn, + notifier: stack.notifier, + }), nil +} + +func isBlockingErrno(err error) bool { + return err == syscall.EAGAIN || err == syscall.EWOULDBLOCK +} + +func translateIOSyscallError(err error) error { + if isBlockingErrno(err) { + return syserror.ErrWouldBlock + } + return err +} + +// Release implements fs.FileOperations.Release. +func (s *socketOperations) Release() { + s.notifier.RemoveFD(s.fd) + + // We always need to close the FD. 
+ _, _ = s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Close{&pb.CloseRequest{Fd: s.fd}}}, true /* ignoreResult */) +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *socketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.notifier.NonBlockingPoll(s.fd, mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *socketOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.wq.EventRegister(e, mask) + s.notifier.UpdateFD(s.fd) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *socketOperations) EventUnregister(e *waiter.Entry) { + s.wq.EventUnregister(e) + s.notifier.UpdateFD(s.fd) +} + +func rpcRead(t *kernel.Task, req *pb.SyscallRequest_Read) (*pb.ReadResponse_Data, *syserr.Error) { + s := t.NetworkContext().(*Stack) + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) + <-c + + res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Read).Read.Result + if e, ok := res.(*pb.ReadResponse_ErrorNumber); ok { + return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + return res.(*pb.ReadResponse_Data), nil +} + +// Read implements fs.FileOperations.Read. +func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + req := &pb.SyscallRequest_Read{&pb.ReadRequest{ + Fd: s.fd, + Length: uint32(dst.NumBytes()), + }} + + res, se := rpcRead(ctx.(*kernel.Task), req) + if se == nil { + n, e := dst.CopyOut(ctx, res.Data) + return int64(n), e + } + if se != syserr.ErrWouldBlock { + return 0, se.ToError() + } + + // We'll have to block. Register for notifications and read again when ready. 
+ e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + + for { + res, se := rpcRead(ctx.(*kernel.Task), req) + if se == nil { + n, e := dst.CopyOut(ctx, res.Data) + return int64(n), e + } + if se != syserr.ErrWouldBlock { + return 0, se.ToError() + } + + if err := ctx.(*kernel.Task).Block(ch); err != nil { + return 0, err + } + } +} + +func rpcWrite(t *kernel.Task, req *pb.SyscallRequest_Write) (uint32, *syserr.Error) { + s := t.NetworkContext().(*Stack) + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) + <-c + + res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Write).Write.Result + if e, ok := res.(*pb.WriteResponse_ErrorNumber); ok { + return 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + return res.(*pb.WriteResponse_Length).Length, nil +} + +// Write implements fs.FileOperations.Write. +func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + t := ctx.(*kernel.Task) + v := buffer.NewView(int(src.NumBytes())) + + // Copy all the data into the buffer. + if _, err := src.CopyIn(t, v); err != nil { + return 0, err + } + + n, err := rpcWrite(t, &pb.SyscallRequest_Write{&pb.WriteRequest{Fd: s.fd, Data: v}}) + return int64(n), err.ToError() +} + +func rpcConnect(t *kernel.Task, fd uint32, sockaddr []byte) *syserr.Error { + s := t.NetworkContext().(*Stack) + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Connect{&pb.ConnectRequest{Fd: uint32(fd), Address: sockaddr}}}, false /* ignoreResult */) + <-c + + if e := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Connect).Connect.ErrorNumber; e != 0 { + return syserr.FromHost(syscall.Errno(e)) + } + return nil +} + +// Connect implements socket.Socket.Connect. 
+func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { + if !blocking { + return rpcConnect(t, s.fd, sockaddr) + } + + // Register for notification when the endpoint becomes writable, then + // initiate the connection. + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventOut) + defer s.EventUnregister(&e) + + if err := rpcConnect(t, s.fd, sockaddr); err != syserr.ErrConnectStarted && err != syserr.ErrAlreadyConnecting { + return err + } + + // It's pending, so we have to wait for a notification, and fetch the + // result once the wait completes. + if err := t.Block(ch); err != nil { + return syserr.FromError(err) + } + + // Call Connect() again after blocking to find connect's result. + return rpcConnect(t, s.fd, sockaddr) +} + +func rpcAccept(t *kernel.Task, fd uint32, peer bool) (*pb.AcceptResponse_ResultPayload, *syserr.Error) { + stack := t.NetworkContext().(*Stack) + id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Accept{&pb.AcceptRequest{Fd: fd, Peer: peer, Flags: syscall.SOCK_NONBLOCK}}}, false /* ignoreResult */) + <-c + + res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Accept).Accept.Result + if e, ok := res.(*pb.AcceptResponse_ErrorNumber); ok { + return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + return res.(*pb.AcceptResponse_Payload).Payload, nil +} + +// Accept implements socket.Socket.Accept. +func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { + payload, se := rpcAccept(t, s.fd, peerRequested) + + // Check if we need to block. + if blocking && se == syserr.ErrWouldBlock { + // Register for notifications. + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + + // Try to accept the connection again; if it fails, then wait until we + // get a notification. 
+ for { + if payload, se = rpcAccept(t, s.fd, peerRequested); se != syserr.ErrWouldBlock { + break + } + + if err := t.Block(ch); err != nil { + return 0, nil, 0, syserr.FromError(err) + } + } + } + + // Handle any error from accept. + if se != nil { + return 0, nil, 0, se + } + + var wq waiter.Queue + s.notifier.AddFD(payload.Fd, &wq) + + dirent := socket.NewDirent(t, socketDevice) + file := fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonBlocking: flags&linux.SOCK_NONBLOCK != 0}, &socketOperations{ + wq: &wq, + fd: payload.Fd, + notifier: s.notifier, + }) + + fdFlags := kernel.FDFlags{ + CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, + } + fd, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits()) + if err != nil { + return 0, nil, 0, syserr.FromError(err) + } + + return fd, payload.Address.Address, payload.Address.Length, nil +} + +// Bind implements socket.Socket.Bind. +func (s *socketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { + stack := t.NetworkContext().(*Stack) + id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Bind{&pb.BindRequest{Fd: s.fd, Address: sockaddr}}}, false /* ignoreResult */) + <-c + + if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Bind).Bind.ErrorNumber; e != 0 { + syserr.FromHost(syscall.Errno(e)) + } + return nil +} + +// Listen implements socket.Socket.Listen. +func (s *socketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { + stack := t.NetworkContext().(*Stack) + id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Listen{&pb.ListenRequest{Fd: s.fd, Backlog: int64(backlog)}}}, false /* ignoreResult */) + <-c + + if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Listen).Listen.ErrorNumber; e != 0 { + syserr.FromHost(syscall.Errno(e)) + } + return nil +} + +// Shutdown implements socket.Socket.Shutdown. 
+func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { + stack := t.NetworkContext().(*Stack) + id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Shutdown{&pb.ShutdownRequest{Fd: s.fd, How: int64(how)}}}, false /* ignoreResult */) + <-c + + if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Shutdown).Shutdown.ErrorNumber; e != 0 { + return syserr.FromHost(syscall.Errno(e)) + } + return nil +} + +// GetSockOpt implements socket.Socket.GetSockOpt. +func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error) { + stack := t.NetworkContext().(*Stack) + id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetSockOpt{&pb.GetSockOptRequest{Fd: s.fd, Level: int64(level), Name: int64(name), Length: uint32(outLen)}}}, false /* ignoreResult */) + <-c + + res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_GetSockOpt).GetSockOpt.Result + if e, ok := res.(*pb.GetSockOptResponse_ErrorNumber); ok { + return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + return res.(*pb.GetSockOptResponse_Opt).Opt, nil +} + +// SetSockOpt implements socket.Socket.SetSockOpt. +func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { + stack := t.NetworkContext().(*Stack) + id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_SetSockOpt{&pb.SetSockOptRequest{Fd: s.fd, Level: int64(level), Name: int64(name), Opt: opt}}}, false /* ignoreResult */) + <-c + + if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_SetSockOpt).SetSockOpt.ErrorNumber; e != 0 { + syserr.FromHost(syscall.Errno(e)) + } + return nil +} + +// GetPeerName implements socket.Socket.GetPeerName. 
+func (s *socketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) { + stack := t.NetworkContext().(*Stack) + id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetPeerName{&pb.GetPeerNameRequest{Fd: s.fd}}}, false /* ignoreResult */) + <-c + + res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_GetPeerName).GetPeerName.Result + if e, ok := res.(*pb.GetPeerNameResponse_ErrorNumber); ok { + return nil, 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + addr := res.(*pb.GetPeerNameResponse_Address).Address + return addr.Address, addr.Length, nil +} + +// GetSockName implements socket.Socket.GetSockName. +func (s *socketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) { + stack := t.NetworkContext().(*Stack) + id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetSockName{&pb.GetSockNameRequest{Fd: s.fd}}}, false /* ignoreResult */) + <-c + + res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_GetSockName).GetSockName.Result + if e, ok := res.(*pb.GetSockNameResponse_ErrorNumber); ok { + return nil, 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + addr := res.(*pb.GetSockNameResponse_Address).Address + return addr.Address, addr.Length, nil +} + +// Ioctl implements fs.FileOperations.Ioctl. 
+func (s *socketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return 0, syserror.ENOTTY +} + +func rpcRecvMsg(t *kernel.Task, req *pb.SyscallRequest_Recvmsg) (*pb.RecvmsgResponse_ResultPayload, *syserr.Error) { + s := t.NetworkContext().(*Stack) + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) + <-c + + res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Recvmsg).Recvmsg.Result + if e, ok := res.(*pb.RecvmsgResponse_ErrorNumber); ok { + return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + return res.(*pb.RecvmsgResponse_Payload).Payload, nil +} + +// RecvMsg implements socket.Socket.RecvMsg. +func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, unix.ControlMessages, *syserr.Error) { + req := &pb.SyscallRequest_Recvmsg{&pb.RecvmsgRequest{ + Fd: s.fd, + Length: uint32(dst.NumBytes()), + Sender: senderRequested, + Trunc: flags&linux.MSG_TRUNC != 0, + Peek: flags&linux.MSG_PEEK != 0, + }} + + res, err := rpcRecvMsg(t, req) + if err == nil { + n, e := dst.CopyOut(t, res.Data) + return int(n), res.Address.GetAddress(), res.Address.GetLength(), unix.ControlMessages{}, syserr.FromError(e) + } + if err != syserr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + return 0, nil, 0, unix.ControlMessages{}, err + } + + // We'll have to block. Register for notifications and keep trying to + // receive the data. 
+ e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + + for { + res, err := rpcRecvMsg(t, req) + if err == nil { + n, e := dst.CopyOut(t, res.Data) + return int(n), res.Address.GetAddress(), res.Address.GetLength(), unix.ControlMessages{}, syserr.FromError(e) + } + if err != syserr.ErrWouldBlock { + return 0, nil, 0, unix.ControlMessages{}, err + } + + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + return 0, nil, 0, unix.ControlMessages{}, syserr.ErrTryAgain + } + return 0, nil, 0, unix.ControlMessages{}, syserr.FromError(err) + } + } +} + +func rpcSendMsg(t *kernel.Task, req *pb.SyscallRequest_Sendmsg) (uint32, *syserr.Error) { + s := t.NetworkContext().(*Stack) + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) + <-c + + res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Sendmsg).Sendmsg.Result + if e, ok := res.(*pb.SendmsgResponse_ErrorNumber); ok { + return 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + return res.(*pb.SendmsgResponse_Length).Length, nil +} + +// SendMsg implements socket.Socket.SendMsg. +func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { + // Whitelist flags. + if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { + return 0, syserr.ErrInvalidArgument + } + + // Reject control messages. + if !controlMessages.Empty() { + return 0, syserr.ErrInvalidArgument + } + + v := buffer.NewView(int(src.NumBytes())) + + // Copy all the data into the buffer. + if _, err := src.CopyIn(t, v); err != nil { + return 0, syserr.FromError(err) + } + + // TODO: this needs to change to map directly to a SendMsg syscall + // in the RPC. 
+ req := &pb.SyscallRequest_Sendmsg{&pb.SendmsgRequest{ + Fd: uint32(s.fd), + Data: v, + Address: to, + More: flags&linux.MSG_MORE != 0, + EndOfRecord: flags&linux.MSG_EOR != 0, + }} + + n, err := rpcSendMsg(t, req) + if err != syserr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + return int(n), err + } + + // We'll have to block. Register for notification and keep trying to + // send all the data. + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventOut) + defer s.EventUnregister(&e) + + for { + n, err := rpcSendMsg(t, req) + if err != syserr.ErrWouldBlock { + return int(n), err + } + + if err := t.Block(ch); err != nil { + return 0, syserr.FromError(err) + } + } +} + +type socketProvider struct { + family int +} + +// Socket implements socket.Provider.Socket. +func (p *socketProvider) Socket(t *kernel.Task, stypeflags unix.SockType, protocol int) (*fs.File, *syserr.Error) { + // Check that we are using the RPC network stack. + stack := t.NetworkContext() + if stack == nil { + return nil, nil + } + + s, ok := stack.(*Stack) + if !ok { + return nil, nil + } + + // Only accept TCP and UDP. + // + // Try to restrict the flags we will accept to minimize backwards + // incompatibility with netstack. + stype := int(stypeflags) & linux.SOCK_TYPE_MASK + switch stype { + case syscall.SOCK_STREAM: + switch protocol { + case 0, syscall.IPPROTO_TCP: + // ok + default: + return nil, nil + } + case syscall.SOCK_DGRAM: + switch protocol { + case 0, syscall.IPPROTO_UDP: + // ok + default: + return nil, nil + } + default: + return nil, nil + } + + return newSocketFile(t, s, p.family, stype, 0) +} + +// Pair implements socket.Provider.Pair. +func (p *socketProvider) Pair(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { + // Not supported by AF_INET/AF_INET6. 
+ return nil, nil, nil +} + +func init() { + for _, family := range []int{syscall.AF_INET, syscall.AF_INET6} { + socket.RegisterProvider(family, &socketProvider{family}) + } +} diff --git a/pkg/sentry/socket/rpcinet/stack.go b/pkg/sentry/socket/rpcinet/stack.go new file mode 100644 index 000000000..503e0e932 --- /dev/null +++ b/pkg/sentry/socket/rpcinet/stack.go @@ -0,0 +1,175 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package rpcinet + +import ( + "fmt" + "strings" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/hostinet" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/conn" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/notifier" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/unet" +) + +// Stack implements inet.Stack for RPC backed sockets. +type Stack struct { + // We intentionally do not allow these values to be changed to remain + // consistent with the other networking stacks. 
+ interfaces map[int32]inet.Interface + interfaceAddrs map[int32][]inet.InterfaceAddr + supportsIPv6 bool + tcpRecvBufSize inet.TCPBufferSize + tcpSendBufSize inet.TCPBufferSize + tcpSACKEnabled bool + rpcConn *conn.RPCConnection + notifier *notifier.Notifier +} + +func readTCPBufferSizeFile(conn *conn.RPCConnection, filename string) (inet.TCPBufferSize, error) { + contents, se := conn.RPCReadFile(filename) + if se != nil { + return inet.TCPBufferSize{}, fmt.Errorf("failed to read %s: %v", filename, se) + } + ioseq := usermem.BytesIOSequence(contents) + fields := make([]int32, 3) + if n, err := usermem.CopyInt32StringsInVec(context.Background(), ioseq.IO, ioseq.Addrs, fields, ioseq.Opts); n != ioseq.NumBytes() || err != nil { + return inet.TCPBufferSize{}, fmt.Errorf("failed to parse %s (%q): got %v after %d/%d bytes", filename, contents, err, n, ioseq.NumBytes()) + } + return inet.TCPBufferSize{ + Min: int(fields[0]), + Default: int(fields[1]), + Max: int(fields[2]), + }, nil +} + +// NewStack returns a Stack containing the current state of the host network +// stack. +func NewStack(fd int32) (*Stack, error) { + sock, err := unet.NewSocket(int(fd)) + if err != nil { + return nil, err + } + + stack := &Stack{ + interfaces: make(map[int32]inet.Interface), + interfaceAddrs: make(map[int32][]inet.InterfaceAddr), + rpcConn: conn.NewRPCConnection(sock), + } + + var e error + stack.notifier, e = notifier.NewRPCNotifier(stack.rpcConn) + if e != nil { + return nil, e + } + + // Load the configuration values from procfs. 
+ tcpRMem, e := readTCPBufferSizeFile(stack.rpcConn, "/proc/sys/net/ipv4/tcp_rmem") + if e != nil { + return nil, e + } + stack.tcpRecvBufSize = tcpRMem + + tcpWMem, e := readTCPBufferSizeFile(stack.rpcConn, "/proc/sys/net/ipv4/tcp_wmem") + if e != nil { + return nil, e + } + stack.tcpSendBufSize = tcpWMem + + ipv6, se := stack.rpcConn.RPCReadFile("/proc/net/if_inet6") + if len(string(ipv6)) > 0 { + stack.supportsIPv6 = true + } + + sackFile := "/proc/sys/net/ipv4/tcp_sack" + sack, se := stack.rpcConn.RPCReadFile(sackFile) + if se != nil { + return nil, fmt.Errorf("failed to read %s: %v", sackFile, se) + } + stack.tcpSACKEnabled = strings.TrimSpace(string(sack)) != "0" + + links, err := stack.DoNetlinkRouteRequest(syscall.RTM_GETLINK) + if err != nil { + return nil, fmt.Errorf("RTM_GETLINK failed: %v", err) + } + + addrs, err := stack.DoNetlinkRouteRequest(syscall.RTM_GETADDR) + if err != nil { + return nil, fmt.Errorf("RTM_GETADDR failed: %v", err) + } + + e = hostinet.ExtractHostInterfaces(links, addrs, stack.interfaces, stack.interfaceAddrs) + if e != nil { + return nil, e + } + + return stack, nil +} + +// Interfaces implements inet.Stack.Interfaces. +func (s *Stack) Interfaces() map[int32]inet.Interface { + return s.interfaces +} + +// InterfaceAddrs implements inet.Stack.InterfaceAddrs. +func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { + return s.interfaceAddrs +} + +// SupportsIPv6 implements inet.Stack.SupportsIPv6. +func (s *Stack) SupportsIPv6() bool { + return s.supportsIPv6 +} + +// TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize. +func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { + return s.tcpRecvBufSize, nil +} + +// SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize. +func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error { + // To keep all the supported stacks consistent we don't allow changing this + // value even though it would be possible via an RPC. 
+ return syserror.EACCES +} + +// TCPSendBufferSize implements inet.Stack.TCPSendBufferSize. +func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) { + return s.tcpSendBufSize, nil +} + +// SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize. +func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error { + // To keep all the supported stacks consistent we don't allow changing this + // value even though it would be possible via an RPC. + return syserror.EACCES +} + +// TCPSACKEnabled implements inet.Stack.TCPSACKEnabled. +func (s *Stack) TCPSACKEnabled() (bool, error) { + return s.tcpSACKEnabled, nil +} + +// SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled. +func (s *Stack) SetTCPSACKEnabled(enabled bool) error { + // To keep all the supported stacks consistent we don't allow changing this + // value even though it would be possible via an RPC. + return syserror.EACCES +} diff --git a/pkg/sentry/socket/rpcinet/stack_unsafe.go b/pkg/sentry/socket/rpcinet/stack_unsafe.go new file mode 100644 index 000000000..9a896c623 --- /dev/null +++ b/pkg/sentry/socket/rpcinet/stack_unsafe.go @@ -0,0 +1,193 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package rpcinet + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/binary" + pb "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" +) + +// newNetlinkRouteRequest builds a netlink message for getting the RIB, +// the routing information base. +func newNetlinkRouteRequest(proto, seq, family int) []byte { + rr := &syscall.NetlinkRouteRequest{} + rr.Header.Len = uint32(syscall.NLMSG_HDRLEN + syscall.SizeofRtGenmsg) + rr.Header.Type = uint16(proto) + rr.Header.Flags = syscall.NLM_F_DUMP | syscall.NLM_F_REQUEST + rr.Header.Seq = uint32(seq) + rr.Data.Family = uint8(family) + return netlinkRRtoWireFormat(rr) +} + +func netlinkRRtoWireFormat(rr *syscall.NetlinkRouteRequest) []byte { + b := make([]byte, rr.Header.Len) + *(*uint32)(unsafe.Pointer(&b[0:4][0])) = rr.Header.Len + *(*uint16)(unsafe.Pointer(&b[4:6][0])) = rr.Header.Type + *(*uint16)(unsafe.Pointer(&b[6:8][0])) = rr.Header.Flags + *(*uint32)(unsafe.Pointer(&b[8:12][0])) = rr.Header.Seq + *(*uint32)(unsafe.Pointer(&b[12:16][0])) = rr.Header.Pid + b[16] = byte(rr.Data.Family) + return b +} + +func (s *Stack) getNetlinkFd() (uint32, *syserr.Error) { + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Socket{&pb.SocketRequest{Family: int64(syscall.AF_NETLINK), Type: int64(syscall.SOCK_RAW | syscall.SOCK_NONBLOCK), Protocol: int64(syscall.NETLINK_ROUTE)}}}, false /* ignoreResult */) + <-c + + res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Socket).Socket.Result + if e, ok := res.(*pb.SocketResponse_ErrorNumber); ok { + return 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + return res.(*pb.SocketResponse_Fd).Fd, nil +} + +func (s *Stack) bindNetlinkFd(fd uint32, sockaddr []byte) *syserr.Error { + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: 
&pb.SyscallRequest_Bind{&pb.BindRequest{Fd: fd, Address: sockaddr}}}, false /* ignoreResult */) + <-c + + if e := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Bind).Bind.ErrorNumber; e != 0 { + return syserr.FromHost(syscall.Errno(e)) + } + return nil +} + +func (s *Stack) closeNetlinkFd(fd uint32) { + _, _ = s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Close{&pb.CloseRequest{Fd: fd}}}, true /* ignoreResult */) +} + +func (s *Stack) rpcSendMsg(req *pb.SyscallRequest_Sendmsg) (uint32, *syserr.Error) { + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) + <-c + + res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Sendmsg).Sendmsg.Result + if e, ok := res.(*pb.SendmsgResponse_ErrorNumber); ok { + return 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + return res.(*pb.SendmsgResponse_Length).Length, nil +} + +func (s *Stack) sendMsg(fd uint32, buf []byte, to []byte, flags int) (int, *syserr.Error) { + // Whitelist flags. 
+ if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { + return 0, syserr.ErrInvalidArgument + } + + req := &pb.SyscallRequest_Sendmsg{&pb.SendmsgRequest{ + Fd: fd, + Data: buf, + Address: to, + More: flags&linux.MSG_MORE != 0, + EndOfRecord: flags&linux.MSG_EOR != 0, + }} + + n, err := s.rpcSendMsg(req) + return int(n), err +} + +func (s *Stack) rpcRecvMsg(req *pb.SyscallRequest_Recvmsg) (*pb.RecvmsgResponse_ResultPayload, *syserr.Error) { + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) + <-c + + res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Recvmsg).Recvmsg.Result + if e, ok := res.(*pb.RecvmsgResponse_ErrorNumber); ok { + return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + return res.(*pb.RecvmsgResponse_Payload).Payload, nil +} + +func (s *Stack) recvMsg(fd, l, flags uint32) ([]byte, *syserr.Error) { + req := &pb.SyscallRequest_Recvmsg{&pb.RecvmsgRequest{ + Fd: fd, + Length: l, + Sender: false, + Trunc: flags&linux.MSG_TRUNC != 0, + Peek: flags&linux.MSG_PEEK != 0, + }} + + res, err := s.rpcRecvMsg(req) + if err != nil { + return nil, err + } + return res.Data, nil +} + +func (s *Stack) netlinkRequest(proto, family int) ([]byte, error) { + fd, err := s.getNetlinkFd() + if err != nil { + return nil, err.ToError() + } + defer s.closeNetlinkFd(fd) + + lsa := syscall.SockaddrNetlink{Family: syscall.AF_NETLINK} + b := binary.Marshal(nil, usermem.ByteOrder, &lsa) + if err := s.bindNetlinkFd(fd, b); err != nil { + return nil, err.ToError() + } + + wb := newNetlinkRouteRequest(proto, 1, family) + _, err = s.sendMsg(fd, wb, b, 0) + if err != nil { + return nil, err.ToError() + } + + var tab []byte +done: + for { + rb, err := s.recvMsg(fd, uint32(syscall.Getpagesize()), 0) + nr := len(rb) + if err != nil { + return nil, err.ToError() + } + + if nr < syscall.NLMSG_HDRLEN { + return nil, syserr.ErrInvalidArgument.ToError() + } + + tab = append(tab, 
rb...) + msgs, e := syscall.ParseNetlinkMessage(rb) + if e != nil { + return nil, e + } + + for _, m := range msgs { + if m.Header.Type == syscall.NLMSG_DONE { + break done + } + if m.Header.Type == syscall.NLMSG_ERROR { + return nil, syserr.ErrInvalidArgument.ToError() + } + } + } + + return tab, nil +} + +// DoNetlinkRouteRequest returns routing information base, also known as RIB, +// which consists of network facility information, states and parameters. +func (s *Stack) DoNetlinkRouteRequest(req int) ([]syscall.NetlinkMessage, error) { + data, err := s.netlinkRequest(req, syscall.AF_UNSPEC) + if err != nil { + return nil, err + } + return syscall.ParseNetlinkMessage(data) +} diff --git a/pkg/sentry/socket/rpcinet/syscall_rpc.proto b/pkg/sentry/socket/rpcinet/syscall_rpc.proto new file mode 100644 index 000000000..b845b1bce --- /dev/null +++ b/pkg/sentry/socket/rpcinet/syscall_rpc.proto @@ -0,0 +1,351 @@ +syntax = "proto3"; + +// package syscall_rpc is a set of networking related system calls that can be +// forwarded to a socket gofer. +// +// TODO: Document individual RPCs. 
+package syscall_rpc; + +message SendmsgRequest { + uint32 fd = 1; + bytes data = 2; + bytes address = 3; + bool more = 4; + bool end_of_record = 5; +} + +message SendmsgResponse { + oneof result { + uint32 error_number = 1; + uint32 length = 2; + } +} + +message IOCtlRequest { + uint32 fd = 1; + uint32 cmd = 2; + uint64 arg = 3; +} + +message IOCtlResponse { + oneof result { + uint32 error_number = 1; + uint64 value = 2; + } +} + +message RecvmsgRequest { + uint32 fd = 1; + uint32 length = 2; + bool sender = 3; + bool peek = 4; + bool trunc = 5; +} + +message OpenRequest { + bytes path = 1; + uint32 flags = 2; + uint32 mode = 3; +} + +message OpenResponse { + oneof result { + uint32 error_number = 1; + uint32 fd = 2; + } +} + +message ReadRequest { + uint32 fd = 1; + uint32 length = 2; +} + +message ReadResponse { + oneof result { + uint32 error_number = 1; + bytes data = 2; + } +} + +message ReadFileRequest { + string path = 1; +} + +message ReadFileResponse { + oneof result { + uint32 error_number = 1; + bytes data = 2; + } +} + +message WriteRequest { + uint32 fd = 1; + bytes data = 2; +} + +message WriteResponse { + oneof result { + uint32 error_number = 1; + uint32 length = 2; + } +} + +message WriteFileRequest { + string path = 1; + bytes content = 2; +} + +message WriteFileResponse { + uint32 error_number = 1; + uint32 written = 2; +} + +message AddressResponse { + bytes address = 1; + uint32 length = 2; +} + +message RecvmsgResponse { + message ResultPayload { + bytes data = 1; + AddressResponse address = 2; + uint32 length = 3; + } + oneof result { + uint32 error_number = 1; + ResultPayload payload = 2; + } +} + +message BindRequest { + uint32 fd = 1; + bytes address = 2; +} + +message BindResponse { + uint32 error_number = 1; +} + +message AcceptRequest { + uint32 fd = 1; + bool peer = 2; + int64 flags = 3; +} + +message AcceptResponse { + message ResultPayload { + uint32 fd = 1; + AddressResponse address = 2; + } + oneof result { + uint32 error_number = 
1; + ResultPayload payload = 2; + } +} + +message ConnectRequest { + uint32 fd = 1; + bytes address = 2; +} + +message ConnectResponse { + uint32 error_number = 1; +} + +message ListenRequest { + uint32 fd = 1; + int64 backlog = 2; +} + +message ListenResponse { + uint32 error_number = 1; +} + +message ShutdownRequest { + uint32 fd = 1; + int64 how = 2; +} + +message ShutdownResponse { + uint32 error_number = 1; +} + +message CloseRequest { + uint32 fd = 1; +} + +message CloseResponse { + uint32 error_number = 1; +} + +message GetSockOptRequest { + uint32 fd = 1; + int64 level = 2; + int64 name = 3; + uint32 length = 4; +} + +message GetSockOptResponse { + oneof result { + uint32 error_number = 1; + bytes opt = 2; + } +} + +message SetSockOptRequest { + uint32 fd = 1; + int64 level = 2; + int64 name = 3; + bytes opt = 4; +} + +message SetSockOptResponse { + uint32 error_number = 1; +} + +message GetSockNameRequest { + uint32 fd = 1; +} + +message GetSockNameResponse { + oneof result { + uint32 error_number = 1; + AddressResponse address = 2; + } +} + +message GetPeerNameRequest { + uint32 fd = 1; +} + +message GetPeerNameResponse { + oneof result { + uint32 error_number = 1; + AddressResponse address = 2; + } +} + +message SocketRequest { + int64 family = 1; + int64 type = 2; + int64 protocol = 3; +} + +message SocketResponse { + oneof result { + uint32 error_number = 1; + uint32 fd = 2; + } +} + +message EpollWaitRequest { + uint32 fd = 1; + uint32 num_events = 2; + sint64 msec = 3; +} + +message EpollEvent { + uint32 fd = 1; + uint32 events = 2; +} + +message EpollEvents { + repeated EpollEvent events = 1; +} + +message EpollWaitResponse { + oneof result { + uint32 error_number = 1; + EpollEvents events = 2; + } +} + +message EpollCtlRequest { + uint32 epfd = 1; + int64 op = 2; + uint32 fd = 3; + EpollEvent event = 4; +} + +message EpollCtlResponse { + uint32 error_number = 1; +} + +message EpollCreate1Request { + int64 flag = 1; +} + +message 
EpollCreate1Response { + oneof result { + uint32 error_number = 1; + uint32 fd = 2; + } +} + +message PollRequest { + uint32 fd = 1; + uint32 events = 2; +} + +message PollResponse { + oneof result { + uint32 error_number = 1; + uint32 events = 2; + } +} + +message SyscallRequest { + oneof args { + SocketRequest socket = 1; + SendmsgRequest sendmsg = 2; + RecvmsgRequest recvmsg = 3; + BindRequest bind = 4; + AcceptRequest accept = 5; + ConnectRequest connect = 6; + ListenRequest listen = 7; + ShutdownRequest shutdown = 8; + CloseRequest close = 9; + GetSockOptRequest get_sock_opt = 10; + SetSockOptRequest set_sock_opt = 11; + GetSockNameRequest get_sock_name = 12; + GetPeerNameRequest get_peer_name = 13; + EpollWaitRequest epoll_wait = 14; + EpollCtlRequest epoll_ctl = 15; + EpollCreate1Request epoll_create1 = 16; + PollRequest poll = 17; + ReadRequest read = 18; + WriteRequest write = 19; + OpenRequest open = 20; + IOCtlRequest ioctl = 21; + WriteFileRequest write_file = 22; + ReadFileRequest read_file = 23; + } +} + +message SyscallResponse { + oneof result { + SocketResponse socket = 1; + SendmsgResponse sendmsg = 2; + RecvmsgResponse recvmsg = 3; + BindResponse bind = 4; + AcceptResponse accept = 5; + ConnectResponse connect = 6; + ListenResponse listen = 7; + ShutdownResponse shutdown = 8; + CloseResponse close = 9; + GetSockOptResponse get_sock_opt = 10; + SetSockOptResponse set_sock_opt = 11; + GetSockNameResponse get_sock_name = 12; + GetPeerNameResponse get_peer_name = 13; + EpollWaitResponse epoll_wait = 14; + EpollCtlResponse epoll_ctl = 15; + EpollCreate1Response epoll_create1 = 16; + PollResponse poll = 17; + ReadResponse read = 18; + WriteResponse write = 19; + OpenResponse open = 20; + IOCtlResponse ioctl = 21; + WriteFileResponse write_file = 22; + ReadFileResponse read_file = 23; + } +} |