diff options
author | gVisor bot <gvisor-bot@google.com> | 2019-06-02 06:44:55 +0000 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2019-06-02 06:44:55 +0000 |
commit | ceb0d792f328d1fc0692197d8856a43c3936a571 (patch) | |
tree | 83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/sentry/socket/netlink | |
parent | deb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff) | |
parent | 216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff) |
Merge 216da0b7 (automated)
Diffstat (limited to 'pkg/sentry/socket/netlink')
-rw-r--r-- | pkg/sentry/socket/netlink/message.go | 159 | ||||
-rwxr-xr-x | pkg/sentry/socket/netlink/netlink_state_autogen.go | 36 | ||||
-rw-r--r-- | pkg/sentry/socket/netlink/port/port.go | 116 | ||||
-rwxr-xr-x | pkg/sentry/socket/netlink/port/port_state_autogen.go | 22 | ||||
-rw-r--r-- | pkg/sentry/socket/netlink/provider.go | 105 | ||||
-rw-r--r-- | pkg/sentry/socket/netlink/route/protocol.go | 197 | ||||
-rwxr-xr-x | pkg/sentry/socket/netlink/route/route_state_autogen.go | 20 | ||||
-rw-r--r-- | pkg/sentry/socket/netlink/socket.go | 618 |
8 files changed, 1273 insertions, 0 deletions
diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go new file mode 100644 index 000000000..5bd3b49ce --- /dev/null +++ b/pkg/sentry/socket/netlink/message.go @@ -0,0 +1,159 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netlink + +import ( + "fmt" + "math" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// alignUp rounds a length up to an alignment. +// +// Preconditions: align is a power of two. +func alignUp(length int, align uint) int { + return (length + int(align) - 1) &^ (int(align) - 1) +} + +// Message contains a complete serialized netlink message. +type Message struct { + buf []byte +} + +// NewMessage creates a new Message containing the passed header. +// +// The header length will be updated by Finalize. +func NewMessage(hdr linux.NetlinkMessageHeader) *Message { + return &Message{ + buf: binary.Marshal(nil, usermem.ByteOrder, hdr), + } +} + +// Finalize returns the []byte containing the entire message, with the total +// length set in the message header. The Message must not be modified after +// calling Finalize. +func (m *Message) Finalize() []byte { + // Update length, which is the first 4 bytes of the header. + usermem.ByteOrder.PutUint32(m.buf, uint32(len(m.buf))) + + // Align the message. 
Note that the message length in the header (set + // above) is the useful length of the message, not the total aligned + // length. See net/netlink/af_netlink.c:__nlmsg_put. + aligned := alignUp(len(m.buf), linux.NLMSG_ALIGNTO) + m.putZeros(aligned - len(m.buf)) + return m.buf +} + +// putZeros adds n zeros to the message. +func (m *Message) putZeros(n int) { + for n > 0 { + m.buf = append(m.buf, 0) + n-- + } +} + +// Put serializes v into the message. +func (m *Message) Put(v interface{}) { + m.buf = binary.Marshal(m.buf, usermem.ByteOrder, v) +} + +// PutAttr adds v to the message as a netlink attribute. +// +// Preconditions: The serialized attribute (linux.NetlinkAttrHeaderSize + +// binary.Size(v) fits in math.MaxUint16 bytes. +func (m *Message) PutAttr(atype uint16, v interface{}) { + l := linux.NetlinkAttrHeaderSize + int(binary.Size(v)) + if l > math.MaxUint16 { + panic(fmt.Sprintf("attribute too large: %d", l)) + } + + m.Put(linux.NetlinkAttrHeader{ + Type: atype, + Length: uint16(l), + }) + m.Put(v) + + // Align the attribute. + aligned := alignUp(l, linux.NLA_ALIGNTO) + m.putZeros(aligned - l) +} + +// PutAttrString adds s to the message as a netlink attribute. +func (m *Message) PutAttrString(atype uint16, s string) { + l := linux.NetlinkAttrHeaderSize + len(s) + 1 + m.Put(linux.NetlinkAttrHeader{ + Type: atype, + Length: uint16(l), + }) + + // String + NUL-termination. + m.Put([]byte(s)) + m.putZeros(1) + + // Align the attribute. + aligned := alignUp(l, linux.NLA_ALIGNTO) + m.putZeros(aligned - l) +} + +// MessageSet contains a series of netlink messages. +type MessageSet struct { + // Multi indicates that this a multi-part message, to be terminated by + // NLMSG_DONE. NLMSG_DONE is sent even if the set contains only one + // Message. + // + // If Multi is set, all added messages will have NLM_F_MULTI set. + Multi bool + + // PortID is the destination port for all messages. + PortID int32 + + // Seq is the sequence counter for all messages in the set. 
+ Seq uint32 + + // Messages contains the messages in the set. + Messages []*Message +} + +// NewMessageSet creates a new MessageSet. +// +// portID is the destination port to set as PortID in all messages. +// +// seq is the sequence counter to set as seq in all messages in the set. +func NewMessageSet(portID int32, seq uint32) *MessageSet { + return &MessageSet{ + PortID: portID, + Seq: seq, + } +} + +// AddMessage adds a new message to the set and returns it for further +// additions. +// +// The passed header will have Seq, PortID and the multi flag set +// automatically. +func (ms *MessageSet) AddMessage(hdr linux.NetlinkMessageHeader) *Message { + hdr.Seq = ms.Seq + hdr.PortID = uint32(ms.PortID) + if ms.Multi { + hdr.Flags |= linux.NLM_F_MULTI + } + + m := NewMessage(hdr) + ms.Messages = append(ms.Messages, m) + return m +} diff --git a/pkg/sentry/socket/netlink/netlink_state_autogen.go b/pkg/sentry/socket/netlink/netlink_state_autogen.go new file mode 100755 index 000000000..59d902798 --- /dev/null +++ b/pkg/sentry/socket/netlink/netlink_state_autogen.go @@ -0,0 +1,36 @@ +// automatically generated by stateify. 

package netlink

import (
	"gvisor.googlesource.com/gvisor/pkg/state"
)

// beforeSave is an empty hook; save calls it before writing any field.
func (x *Socket) beforeSave() {}

// save writes each persisted Socket field into m under its field name.
func (x *Socket) save(m state.Map) {
	x.beforeSave()
	m.Save("SendReceiveTimeout", &x.SendReceiveTimeout)
	m.Save("ports", &x.ports)
	m.Save("protocol", &x.protocol)
	m.Save("ep", &x.ep)
	m.Save("connection", &x.connection)
	m.Save("bound", &x.bound)
	m.Save("portID", &x.portID)
	m.Save("sendBufferSize", &x.sendBufferSize)
}

// afterLoad is an empty hook.
func (x *Socket) afterLoad() {}

// load reads back the same fields that save writes, under the same names.
func (x *Socket) load(m state.Map) {
	m.Load("SendReceiveTimeout", &x.SendReceiveTimeout)
	m.Load("ports", &x.ports)
	m.Load("protocol", &x.protocol)
	m.Load("ep", &x.ep)
	m.Load("connection", &x.connection)
	m.Load("bound", &x.bound)
	m.Load("portID", &x.portID)
	m.Load("sendBufferSize", &x.sendBufferSize)
}

// init registers Socket with the state package as "netlink.Socket".
func init() {
	state.Register("netlink.Socket", (*Socket)(nil), state.Fns{Save: (*Socket).save, Load: (*Socket).load})
}

// diff --git a/pkg/sentry/socket/netlink/port/port.go b/pkg/sentry/socket/netlink/port/port.go
// new file mode 100644
// index 000000000..e9d3275b1
// --- /dev/null
// +++ b/pkg/sentry/socket/netlink/port/port.go
// @@ -0,0 +1,116 @@

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package port provides port ID allocation for netlink sockets.
//
// A netlink port is any int32 value. Positive ports are typically equivalent
// to the PID of the binding process.
If that port is unavailable, negative +// ports are searched to find a free port that will not conflict with other +// PIDS. +package port + +import ( + "fmt" + "math" + "math/rand" + "sync" +) + +// maxPorts is a sanity limit on the maximum number of ports to allocate per +// protocol. +const maxPorts = 10000 + +// Manager allocates netlink port IDs. +// +// +stateify savable +type Manager struct { + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // ports contains a map of allocated ports for each protocol. + ports map[int]map[int32]struct{} +} + +// New creates a new Manager. +func New() *Manager { + return &Manager{ + ports: make(map[int]map[int32]struct{}), + } +} + +// Allocate reserves a new port ID for protocol. hint will be taken if +// available. +func (m *Manager) Allocate(protocol int, hint int32) (int32, bool) { + m.mu.Lock() + defer m.mu.Unlock() + + proto, ok := m.ports[protocol] + if !ok { + proto = make(map[int32]struct{}) + // Port 0 is reserved for the kernel. + proto[0] = struct{}{} + m.ports[protocol] = proto + } + + if len(proto) >= maxPorts { + return 0, false + } + + if _, ok := proto[hint]; !ok { + // Hint is available, reserve it. + proto[hint] = struct{}{} + return hint, true + } + + // Search for any free port in [math.MinInt32, -4096). The positive + // port space is left open for pid-based allocations. This behavior is + // consistent with Linux. + start := int32(math.MinInt32 + rand.Int63n(math.MaxInt32-4096+1)) + curr := start + for { + if _, ok := proto[curr]; !ok { + proto[curr] = struct{}{} + return curr, true + } + + curr-- + if curr >= -4096 { + curr = -4097 + } + if curr == start { + // Nothing found. We should always find a free port + // because maxPorts < -4096 - MinInt32. + panic(fmt.Sprintf("No free port found in %+v", proto)) + } + } +} + +// Release frees the specified port for protocol. +// +// Preconditions: port is already allocated. 
+func (m *Manager) Release(protocol int, port int32) { + m.mu.Lock() + defer m.mu.Unlock() + + proto, ok := m.ports[protocol] + if !ok { + panic(fmt.Sprintf("Released port %d for protocol %d which has no allocations", port, protocol)) + } + + if _, ok := proto[port]; !ok { + panic(fmt.Sprintf("Released port %d for protocol %d is not allocated", port, protocol)) + } + + delete(proto, port) +} diff --git a/pkg/sentry/socket/netlink/port/port_state_autogen.go b/pkg/sentry/socket/netlink/port/port_state_autogen.go new file mode 100755 index 000000000..f01d9704f --- /dev/null +++ b/pkg/sentry/socket/netlink/port/port_state_autogen.go @@ -0,0 +1,22 @@ +// automatically generated by stateify. + +package port + +import ( + "gvisor.googlesource.com/gvisor/pkg/state" +) + +func (x *Manager) beforeSave() {} +func (x *Manager) save(m state.Map) { + x.beforeSave() + m.Save("ports", &x.ports) +} + +func (x *Manager) afterLoad() {} +func (x *Manager) load(m state.Map) { + m.Load("ports", &x.ports) +} + +func init() { + state.Register("port.Manager", (*Manager)(nil), state.Fns{Save: (*Manager).save, Load: (*Manager).load}) +} diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go new file mode 100644 index 000000000..76cf12fd4 --- /dev/null +++ b/pkg/sentry/socket/netlink/provider.go @@ -0,0 +1,105 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package netlink + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.googlesource.com/gvisor/pkg/syserr" +) + +// Protocol is the implementation of a netlink socket protocol. +type Protocol interface { + // Protocol returns the Linux netlink protocol value. + Protocol() int + + // ProcessMessage processes a single message from userspace. + // + // If err == nil, any messages added to ms will be sent back to the + // other end of the socket. Setting ms.Multi will cause an NLMSG_DONE + // message to be sent even if ms contains no messages. + ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *MessageSet) *syserr.Error +} + +// Provider is a function that creates a new Protocol for a specific netlink +// protocol. +// +// Note that this is distinct from socket.Provider, which is used for all +// socket families. +type Provider func(t *kernel.Task) (Protocol, *syserr.Error) + +// protocols holds a map of all known address protocols and their provider. +var protocols = make(map[int]Provider) + +// RegisterProvider registers the provider of a given address protocol so that +// netlink sockets of that type can be created via socket(2). +// +// Preconditions: May only be called before any netlink sockets are created. +func RegisterProvider(protocol int, provider Provider) { + if p, ok := protocols[protocol]; ok { + panic(fmt.Sprintf("Netlink protocol %d already provided by %+v", protocol, p)) + } + + protocols[protocol] = provider +} + +// socketProvider implements socket.Provider. +type socketProvider struct { +} + +// Socket implements socket.Provider.Socket. 
+func (*socketProvider) Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) { + // Netlink sockets must be specified as datagram or raw, but they + // behave the same regardless of type. + if stype != transport.SockDgram && stype != transport.SockRaw { + return nil, syserr.ErrSocketNotSupported + } + + provider, ok := protocols[protocol] + if !ok { + return nil, syserr.ErrProtocolNotSupported + } + + p, err := provider(t) + if err != nil { + return nil, err + } + + s, err := NewSocket(t, p) + if err != nil { + return nil, err + } + + d := socket.NewDirent(t, netlinkSocketDevice) + defer d.DecRef() + return fs.NewFile(t, d, fs.FileFlags{Read: true, Write: true}, s), nil +} + +// Pair implements socket.Provider.Pair by returning an error. +func (*socketProvider) Pair(*kernel.Task, transport.SockType, int) (*fs.File, *fs.File, *syserr.Error) { + // Netlink sockets never supports creating socket pairs. + return nil, nil, syserr.ErrNotSupported +} + +// init registers the socket provider. +func init() { + socket.RegisterProvider(linux.AF_NETLINK, &socketProvider{}) +} diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go new file mode 100644 index 000000000..9f0a81403 --- /dev/null +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -0,0 +1,197 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Package route provides a NETLINK_ROUTE socket protocol. +package route + +import ( + "bytes" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink" + "gvisor.googlesource.com/gvisor/pkg/syserr" +) + +// commandKind describes the operational class of a message type. +// +// The route message types use the lower 2 bits of the type to describe class +// of command. +type commandKind int + +const ( + kindNew commandKind = 0x0 + kindDel = 0x1 + kindGet = 0x2 + kindSet = 0x3 +) + +func typeKind(typ uint16) commandKind { + return commandKind(typ & 0x3) +} + +// Protocol implements netlink.Protocol. +// +// +stateify savable +type Protocol struct{} + +var _ netlink.Protocol = (*Protocol)(nil) + +// NewProtocol creates a NETLINK_ROUTE netlink.Protocol. +func NewProtocol(t *kernel.Task) (netlink.Protocol, *syserr.Error) { + return &Protocol{}, nil +} + +// Protocol implements netlink.Protocol.Protocol. +func (p *Protocol) Protocol() int { + return linux.NETLINK_ROUTE +} + +// dumpLinks handles RTM_GETLINK + NLM_F_DUMP requests. +func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { + // NLM_F_DUMP + RTM_GETLINK messages are supposed to include an + // ifinfomsg. However, Linux <3.9 only checked for rtgenmsg, and some + // userspace applications (including glibc) still include rtgenmsg. + // Linux has a workaround based on the total message length. + // + // We don't bother to check for either, since we don't support any + // extra attributes that may be included anyways. + // + // The message may also contain netlink attribute IFLA_EXT_MASK, which + // we don't support. 
+ + // The RTM_GETLINK dump response is a set of messages each containing + // an InterfaceInfoMessage followed by a set of netlink attributes. + + // We always send back an NLMSG_DONE. + ms.Multi = true + + stack := inet.StackFromContext(ctx) + if stack == nil { + // No network devices. + return nil + } + + for id, i := range stack.Interfaces() { + m := ms.AddMessage(linux.NetlinkMessageHeader{ + Type: linux.RTM_NEWLINK, + }) + + m.Put(linux.InterfaceInfoMessage{ + Family: linux.AF_UNSPEC, + Type: i.DeviceType, + Index: id, + Flags: i.Flags, + }) + + m.PutAttrString(linux.IFLA_IFNAME, i.Name) + m.PutAttr(linux.IFLA_MTU, i.MTU) + + mac := make([]byte, 6) + brd := mac + if len(i.Addr) > 0 { + mac = i.Addr + brd = bytes.Repeat([]byte{0xff}, len(i.Addr)) + } + m.PutAttr(linux.IFLA_ADDRESS, mac) + m.PutAttr(linux.IFLA_BROADCAST, brd) + + // TODO(b/68878065): There are many more attributes. + } + + return nil +} + +// dumpAddrs handles RTM_GETADDR + NLM_F_DUMP requests. +func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { + // RTM_GETADDR dump requests need not contain anything more than the + // netlink header and 1 byte protocol family common to all + // NETLINK_ROUTE requests. + // + // TODO(b/68878065): Filter output by passed protocol family. + + // The RTM_GETADDR dump response is a set of RTM_NEWADDR messages each + // containing an InterfaceAddrMessage followed by a set of netlink + // attributes. + + // We always send back an NLMSG_DONE. + ms.Multi = true + + stack := inet.StackFromContext(ctx) + if stack == nil { + // No network devices. 
+ return nil + } + + for id, as := range stack.InterfaceAddrs() { + for _, a := range as { + m := ms.AddMessage(linux.NetlinkMessageHeader{ + Type: linux.RTM_NEWADDR, + }) + + m.Put(linux.InterfaceAddrMessage{ + Family: a.Family, + PrefixLen: a.PrefixLen, + Index: uint32(id), + }) + + m.PutAttr(linux.IFA_ADDRESS, []byte(a.Addr)) + + // TODO(b/68878065): There are many more attributes. + } + } + + return nil +} + +// ProcessMessage implements netlink.Protocol.ProcessMessage. +func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { + // All messages start with a 1 byte protocol family. + if len(data) < 1 { + // Linux ignores messages missing the protocol family. See + // net/core/rtnetlink.c:rtnetlink_rcv_msg. + return nil + } + + // Non-GET message types require CAP_NET_ADMIN. + if typeKind(hdr.Type) != kindGet { + creds := auth.CredentialsFromContext(ctx) + if !creds.HasCapability(linux.CAP_NET_ADMIN) { + return syserr.ErrPermissionDenied + } + } + + // TODO(b/68878065): Only the dump variant of the types below are + // supported. + if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP { + return syserr.ErrNotSupported + } + + switch hdr.Type { + case linux.RTM_GETLINK: + return p.dumpLinks(ctx, hdr, data, ms) + case linux.RTM_GETADDR: + return p.dumpAddrs(ctx, hdr, data, ms) + default: + return syserr.ErrNotSupported + } +} + +// init registers the NETLINK_ROUTE provider. +func init() { + netlink.RegisterProvider(linux.NETLINK_ROUTE, NewProtocol) +} diff --git a/pkg/sentry/socket/netlink/route/route_state_autogen.go b/pkg/sentry/socket/netlink/route/route_state_autogen.go new file mode 100755 index 000000000..8431bb3d5 --- /dev/null +++ b/pkg/sentry/socket/netlink/route/route_state_autogen.go @@ -0,0 +1,20 @@ +// automatically generated by stateify. 

package route

import (
	"gvisor.googlesource.com/gvisor/pkg/state"
)

// beforeSave is an empty hook; save calls it first.
func (x *Protocol) beforeSave() {}

// save writes nothing beyond calling beforeSave.
func (x *Protocol) save(m state.Map) {
	x.beforeSave()
}

// afterLoad is an empty hook.
func (x *Protocol) afterLoad() {}

// load reads nothing.
func (x *Protocol) load(m state.Map) {
}

// init registers Protocol with the state package as "route.Protocol".
func init() {
	state.Register("route.Protocol", (*Protocol)(nil), state.Fns{Save: (*Protocol).save, Load: (*Protocol).load})
}

// diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
// new file mode 100644
// index 000000000..afd06ca33
// --- /dev/null
// +++ b/pkg/sentry/socket/netlink/socket.go
// @@ -0,0 +1,618 @@

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package netlink provides core functionality for netlink sockets.
+package netlink + +import ( + "math" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +const sizeOfInt32 int = 4 + +const ( + // minBufferSize is the smallest size of a send buffer. + minSendBufferSize = 4 << 10 // 4096 bytes. + + // defaultSendBufferSize is the default size for the send buffer. + defaultSendBufferSize = 16 * 1024 + + // maxBufferSize is the largest size a send buffer can grow to. + maxSendBufferSize = 4 << 20 // 4MB +) + +// netlinkSocketDevice is the netlink socket virtual device. +var netlinkSocketDevice = device.NewAnonDevice() + +// Socket is the base socket type for netlink sockets. +// +// This implementation only supports userspace sending and receiving messages +// to/from the kernel. +// +// Socket implements socket.Socket. 
+// +// +stateify savable +type Socket struct { + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + socket.SendReceiveTimeout + + // ports provides netlink port allocation. + ports *port.Manager + + // protocol is the netlink protocol implementation. + protocol Protocol + + // ep is a datagram unix endpoint used to buffer messages sent from the + // kernel to userspace. RecvMsg reads messages from this endpoint. + ep transport.Endpoint + + // connection is the kernel's connection to ep, used to write messages + // sent to userspace. + connection transport.ConnectedEndpoint + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // bound indicates that portid is valid. + bound bool + + // portID is the port ID allocated for this socket. + portID int32 + + // sendBufferSize is the send buffer "size". We don't actually have a + // fixed buffer but only consume this many bytes. + sendBufferSize uint32 +} + +var _ socket.Socket = (*Socket)(nil) + +// NewSocket creates a new Socket. +func NewSocket(t *kernel.Task, protocol Protocol) (*Socket, *syserr.Error) { + // Datagram endpoint used to buffer kernel -> user messages. + ep := transport.NewConnectionless() + + // Bind the endpoint for good measure so we can connect to it. The + // bound address will never be exposed. + if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil { + ep.Close() + return nil, err + } + + // Create a connection from which the kernel can write messages. 
+ connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect() + if err != nil { + ep.Close() + return nil, err + } + + return &Socket{ + ports: t.Kernel().NetlinkPorts(), + protocol: protocol, + ep: ep, + connection: connection, + sendBufferSize: defaultSendBufferSize, + }, nil +} + +// Release implements fs.FileOperations.Release. +func (s *Socket) Release() { + s.connection.Release() + s.ep.Close() + + if s.bound { + s.ports.Release(s.protocol.Protocol(), s.portID) + } +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *Socket) Readiness(mask waiter.EventMask) waiter.EventMask { + // ep holds messages to be read and thus handles EventIn readiness. + ready := s.ep.Readiness(mask) + + if mask&waiter.EventOut == waiter.EventOut { + // sendMsg handles messages synchronously and is thus always + // ready for writing. + ready |= waiter.EventOut + } + + return ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *Socket) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.ep.EventRegister(e, mask) + // Writable readiness never changes, so no registration is needed. +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *Socket) EventUnregister(e *waiter.Entry) { + s.ep.EventUnregister(e) +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (s *Socket) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // TODO(b/68878065): no ioctls supported. + return 0, syserror.ENOTTY +} + +// ExtractSockAddr extracts the SockAddrNetlink from b. 
+func ExtractSockAddr(b []byte) (*linux.SockAddrNetlink, *syserr.Error) { + if len(b) < linux.SockAddrNetlinkSize { + return nil, syserr.ErrBadAddress + } + + var sa linux.SockAddrNetlink + binary.Unmarshal(b[:linux.SockAddrNetlinkSize], usermem.ByteOrder, &sa) + + if sa.Family != linux.AF_NETLINK { + return nil, syserr.ErrInvalidArgument + } + + return &sa, nil +} + +// bindPort binds this socket to a port, preferring 'port' if it is available. +// +// port of 0 defaults to the ThreadGroup ID. +// +// Preconditions: mu is held. +func (s *Socket) bindPort(t *kernel.Task, port int32) *syserr.Error { + if s.bound { + // Re-binding is only allowed if the port doesn't change. + if port != s.portID { + return syserr.ErrInvalidArgument + } + + return nil + } + + if port == 0 { + port = int32(t.ThreadGroup().ID()) + } + port, ok := s.ports.Allocate(s.protocol.Protocol(), port) + if !ok { + return syserr.ErrBusy + } + + s.portID = port + s.bound = true + return nil +} + +// Bind implements socket.Socket.Bind. +func (s *Socket) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { + a, err := ExtractSockAddr(sockaddr) + if err != nil { + return err + } + + // No support for multicast groups yet. + if a.Groups != 0 { + return syserr.ErrPermissionDenied + } + + s.mu.Lock() + defer s.mu.Unlock() + + return s.bindPort(t, int32(a.PortID)) +} + +// Connect implements socket.Socket.Connect. +func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { + a, err := ExtractSockAddr(sockaddr) + if err != nil { + return err + } + + // No support for multicast groups yet. + if a.Groups != 0 { + return syserr.ErrPermissionDenied + } + + s.mu.Lock() + defer s.mu.Unlock() + + if a.PortID == 0 { + // Netlink sockets default to connected to the kernel, but + // connecting anyways automatically binds if not already bound. + if !s.bound { + // Pass port 0 to get an auto-selected port ID. 
			return s.bindPort(t, 0)
		}
		return nil
	}

	// We don't support non-kernel destination ports. Linux returns EPERM
	// if applications attempt to do this without NL_CFG_F_NONROOT_SEND, so
	// we emulate that.
	return syserr.ErrPermissionDenied
}

// Accept implements socket.Socket.Accept.
//
// It unconditionally fails with syserr.ErrNotSupported; none of the
// arguments are inspected.
func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) {
	// Netlink sockets never support accept.
	return 0, nil, 0, syserr.ErrNotSupported
}

// Listen implements socket.Socket.Listen.
//
// It unconditionally fails with syserr.ErrNotSupported.
func (s *Socket) Listen(t *kernel.Task, backlog int) *syserr.Error {
	// Netlink sockets never support listen.
	return syserr.ErrNotSupported
}

// Shutdown implements socket.Socket.Shutdown.
//
// It unconditionally fails with syserr.ErrNotSupported.
func (s *Socket) Shutdown(t *kernel.Task, how int) *syserr.Error {
	// Netlink sockets never support shutdown.
	return syserr.ErrNotSupported
}

// GetSockOpt implements socket.Socket.GetSockOpt.
//
// Only SOL_SOCKET/SO_SNDBUF and SOL_SOCKET/SO_RCVBUF produce a value, and both
// require outLen to be at least sizeOfInt32 (otherwise
// syserr.ErrInvalidArgument is returned). Every other option ends at the final
// return and yields syserr.ErrProtocolNotAvailable; the options recognised
// below additionally emit an "unimplemented" event first.
func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error) {
	switch level {
	case linux.SOL_SOCKET:
		switch name {
		case linux.SO_SNDBUF:
			if outLen < sizeOfInt32 {
				return nil, syserr.ErrInvalidArgument
			}
			// sendBufferSize is read under s.mu; SetSockOpt writes it under
			// the same lock.
			s.mu.Lock()
			defer s.mu.Unlock()
			return int32(s.sendBufferSize), nil

		case linux.SO_RCVBUF:
			if outLen < sizeOfInt32 {
				return nil, syserr.ErrInvalidArgument
			}
			// We don't have limit on receiving size.
			return int32(math.MaxInt32), nil

		default:
			// Report the unhandled option, then fall out of the switch to
			// the common error return.
			socket.GetSockOptEmitUnimplementedEvent(t, name)
		}
	case linux.SOL_NETLINK:
		switch name {
		case linux.NETLINK_BROADCAST_ERROR,
			linux.NETLINK_CAP_ACK,
			linux.NETLINK_DUMP_STRICT_CHK,
			linux.NETLINK_EXT_ACK,
			linux.NETLINK_LIST_MEMBERSHIPS,
			linux.NETLINK_NO_ENOBUFS,
			linux.NETLINK_PKTINFO:

			// Known netlink options that are not implemented here: emit an
			// event and fall through to the error return.
			t.Kernel().EmitUnimplementedEvent(t)
		}
	}
	// TODO(b/68878065): other sockopts are not supported.
	return nil, syserr.ErrProtocolNotAvailable
}

// SetSockOpt implements socket.Socket.SetSockOpt.
//
// SOL_SOCKET/SO_SNDBUF stores a clamped send buffer size and
// SOL_SOCKET/SO_RCVBUF is accepted and ignored; both require opt to hold at
// least sizeOfInt32 bytes (otherwise syserr.ErrInvalidArgument is returned).
// Every other option yields syserr.ErrProtocolNotAvailable; the options
// recognised below additionally emit an "unimplemented" event first.
func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
	switch level {
	case linux.SOL_SOCKET:
		switch name {
		case linux.SO_SNDBUF:
			if len(opt) < sizeOfInt32 {
				return syserr.ErrInvalidArgument
			}
			// Decode the requested size as an unsigned 32-bit value and clamp
			// it into [minSendBufferSize, maxSendBufferSize].
			size := usermem.ByteOrder.Uint32(opt)
			if size < minSendBufferSize {
				size = minSendBufferSize
			} else if size > maxSendBufferSize {
				size = maxSendBufferSize
			}
			s.mu.Lock()
			s.sendBufferSize = size
			s.mu.Unlock()
			return nil
		case linux.SO_RCVBUF:
			if len(opt) < sizeOfInt32 {
				return syserr.ErrInvalidArgument
			}
			// We don't have limit on receiving size. So just accept anything as
			// valid for compatibility.
			return nil
		default:
			// Report the unhandled option, then fall out of the switch to
			// the common error return.
			socket.SetSockOptEmitUnimplementedEvent(t, name)
		}

	case linux.SOL_NETLINK:
		switch name {
		case linux.NETLINK_ADD_MEMBERSHIP,
			linux.NETLINK_BROADCAST_ERROR,
			linux.NETLINK_CAP_ACK,
			linux.NETLINK_DROP_MEMBERSHIP,
			linux.NETLINK_DUMP_STRICT_CHK,
			linux.NETLINK_EXT_ACK,
			linux.NETLINK_LISTEN_ALL_NSID,
			linux.NETLINK_NO_ENOBUFS,
			linux.NETLINK_PKTINFO:

			// Known netlink options that are not implemented here: emit an
			// event and fall through to the error return.
			t.Kernel().EmitUnimplementedEvent(t)
		}

	}
	// TODO(b/68878065): other sockopts are not supported.
	return syserr.ErrProtocolNotAvailable
}

// GetSockName implements socket.Socket.GetSockName.
//
// It returns an AF_NETLINK address carrying this socket's port ID, together
// with the encoded size of that address. The port ID is read under s.mu.
func (s *Socket) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	sa := linux.SockAddrNetlink{
		Family: linux.AF_NETLINK,
		PortID: uint32(s.portID),
	}
	return sa, uint32(binary.Size(sa)), nil
}

// GetPeerName implements socket.Socket.GetPeerName.
//
// The returned AF_NETLINK address always has port ID 0, together with the
// encoded size of that address.
func (s *Socket) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
	sa := linux.SockAddrNetlink{
		Family: linux.AF_NETLINK,
		// TODO(b/68878065): Support non-kernel peers. For now the peer
		// must be the kernel.
		PortID: 0,
	}
	return sa, uint32(binary.Size(sa)), nil
}

// RecvMsg implements socket.Socket.RecvMsg.
//
// The results are, in order: the byte count, the message flags, the sender
// address (always AF_NETLINK with port ID 0), the sender address length, an
// empty set of control messages, and the error.
//
// flags handling:
//   - MSG_PEEK is forwarded to the endpoint reader.
//   - MSG_DONTWAIT makes the first attempt final, even if it would block.
//   - MSG_TRUNC makes the returned byte count the full message size rather
//     than the number of bytes copied.
//
// Independently of the caller's flags, MSG_TRUNC is set in the returned
// message flags whenever fewer bytes were copied than the message contained.
//
// If the wait for data times out, syserr.ErrTryAgain is returned.
// senderRequested and controlDataLen are not consulted.
func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, interface{}, uint32, socket.ControlMessages, *syserr.Error) {
	from := linux.SockAddrNetlink{
		Family: linux.AF_NETLINK,
		PortID: 0,
	}
	fromLen := uint32(binary.Size(from))

	trunc := flags&linux.MSG_TRUNC != 0

	r := unix.EndpointReader{
		Endpoint: s.ep,
		Peek:     flags&linux.MSG_PEEK != 0,
	}

	// First attempt. Its result is final unless it would block and the caller
	// did not ask for a non-blocking receive.
	if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 {
		var mflags int
		if n < int64(r.MsgSize) {
			mflags |= linux.MSG_TRUNC
		}
		if trunc {
			n = int64(r.MsgSize)
		}
		return int(n), mflags, from, fromLen, socket.ControlMessages{}, syserr.FromError(err)
	}

	// We'll have to block. Register for notification and keep trying to
	// receive all the data.
	e, ch := waiter.NewChannelEntry(nil)
	s.EventRegister(&e, waiter.EventIn)
	defer s.EventUnregister(&e)

	for {
		// Same result handling as the first attempt, minus the MSG_DONTWAIT
		// escape.
		if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock {
			var mflags int
			if n < int64(r.MsgSize) {
				mflags |= linux.MSG_TRUNC
			}
			if trunc {
				n = int64(r.MsgSize)
			}
			return int(n), mflags, from, fromLen, socket.ControlMessages{}, syserr.FromError(err)
		}

		// Wait for a readiness notification (or the deadline) and retry.
		if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
			if err == syserror.ETIMEDOUT {
				return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
			}
			return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
		}
	}
}

// Read implements fs.FileOperations.Read.
+func (s *Socket) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + if dst.NumBytes() == 0 { + return 0, nil + } + return dst.CopyOutFrom(ctx, &unix.EndpointReader{ + Endpoint: s.ep, + }) +} + +// sendResponse sends the response messages in ms back to userspace. +func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error { + // Linux combines multiple netlink messages into a single datagram. + bufs := make([][]byte, 0, len(ms.Messages)) + for _, m := range ms.Messages { + bufs = append(bufs, m.Finalize()) + } + + if len(bufs) > 0 { + // RecvMsg never receives the address, so we don't need to send + // one. + _, notify, err := s.connection.Send(bufs, transport.ControlMessages{}, tcpip.FullAddress{}) + // If the buffer is full, we simply drop messages, just like + // Linux. + if err != nil && err != syserr.ErrWouldBlock { + return err + } + if notify { + s.connection.SendNotify() + } + } + + // N.B. multi-part messages should still send NLMSG_DONE even if + // MessageSet contains no messages. + // + // N.B. NLMSG_DONE is always sent in a different datagram. See + // net/netlink/af_netlink.c:netlink_dump. + if ms.Multi { + m := NewMessage(linux.NetlinkMessageHeader{ + Type: linux.NLMSG_DONE, + Flags: linux.NLM_F_MULTI, + Seq: ms.Seq, + PortID: uint32(ms.PortID), + }) + + _, notify, err := s.connection.Send([][]byte{m.Finalize()}, transport.ControlMessages{}, tcpip.FullAddress{}) + if err != nil && err != syserr.ErrWouldBlock { + return err + } + if notify { + s.connection.SendNotify() + } + } + + return nil +} + +// processMessages handles each message in buf, passing it to the protocol +// handler for final handling. +func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error { + for len(buf) > 0 { + if len(buf) < linux.NetlinkMessageHeaderSize { + // Linux ignores messages that are too short. See + // net/netlink/af_netlink.c:netlink_rcv_skb. 
+ break + } + + var hdr linux.NetlinkMessageHeader + binary.Unmarshal(buf[:linux.NetlinkMessageHeaderSize], usermem.ByteOrder, &hdr) + + if hdr.Length < linux.NetlinkMessageHeaderSize || uint64(hdr.Length) > uint64(len(buf)) { + // Linux ignores malformed messages. See + // net/netlink/af_netlink.c:netlink_rcv_skb. + break + } + + // Data from this message. + data := buf[linux.NetlinkMessageHeaderSize:hdr.Length] + + // Advance to the next message. + next := alignUp(int(hdr.Length), linux.NLMSG_ALIGNTO) + if next >= len(buf)-1 { + next = len(buf) - 1 + } + buf = buf[next:] + + // Ignore control messages. + if hdr.Type < linux.NLMSG_MIN_TYPE { + continue + } + + // TODO(b/68877377): ACKs not supported yet. + if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK { + return syserr.ErrNotSupported + } + + ms := NewMessageSet(s.portID, hdr.Seq) + if err := s.protocol.ProcessMessage(ctx, hdr, data, ms); err != nil { + return err + } + + if err := s.sendResponse(ctx, ms); err != nil { + return err + } + } + + return nil +} + +// sendMsg is the core of message send, used for SendMsg and Write. +func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { + dstPort := int32(0) + + if len(to) != 0 { + a, err := ExtractSockAddr(to) + if err != nil { + return 0, err + } + + // No support for multicast groups yet. + if a.Groups != 0 { + return 0, syserr.ErrPermissionDenied + } + + dstPort = int32(a.PortID) + } + + if dstPort != 0 { + // Non-kernel destinations not supported yet. Treat as if + // NL_CFG_F_NONROOT_SEND is not set. + return 0, syserr.ErrPermissionDenied + } + + s.mu.Lock() + defer s.mu.Unlock() + + // For simplicity, and consistency with Linux, we copy in the entire + // message up front. 
+ if src.NumBytes() > int64(s.sendBufferSize) { + return 0, syserr.ErrMessageTooLong + } + + buf := make([]byte, src.NumBytes()) + n, err := src.CopyIn(ctx, buf) + if err != nil { + // Don't partially consume messages. + return 0, syserr.FromError(err) + } + + if err := s.processMessages(ctx, buf); err != nil { + return 0, err + } + + return n, nil +} + +// SendMsg implements socket.Socket.SendMsg. +func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { + return s.sendMsg(t, src, to, flags, controlMessages) +} + +// Write implements fs.FileOperations.Write. +func (s *Socket) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) + return int64(n), err.ToError() +} |