From 661b2b9f69855b6900195af84bf549341bdda0fe Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Fri, 16 Aug 2019 17:33:23 -0700 Subject: procfs: Migrate seqfile implementations. Migrates all (except 3) seqfile implementations to the vfs.DynamicBytesSource interface. There should not be any change in functionality due to this migration itself. Please note that the following seqfile implementations have not been migrated: - /proc/filesystems in proc/filesystems.go - /proc/[pid]/mountinfo in proc/mounts.go - /proc/[pid]/mounts in proc/mounts.go This is because these depend on pending changes in /pkg/senty/vfs. PiperOrigin-RevId: 263880719 --- pkg/sentry/fsimpl/proc/BUILD | 49 +++++ pkg/sentry/fsimpl/proc/filesystems.go | 25 +++ pkg/sentry/fsimpl/proc/loadavg.go | 40 ++++ pkg/sentry/fsimpl/proc/meminfo.go | 77 ++++++++ pkg/sentry/fsimpl/proc/mounts.go | 33 ++++ pkg/sentry/fsimpl/proc/net.go | 338 ++++++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/proc/net_test.go | 78 ++++++++ pkg/sentry/fsimpl/proc/proc.go | 16 ++ pkg/sentry/fsimpl/proc/stat.go | 127 +++++++++++++ pkg/sentry/fsimpl/proc/sys.go | 51 +++++ pkg/sentry/fsimpl/proc/task.go | 261 ++++++++++++++++++++++++++ pkg/sentry/fsimpl/proc/version.go | 68 +++++++ pkg/sentry/mm/procfs.go | 92 +++++++-- 13 files changed, 1235 insertions(+), 20 deletions(-) create mode 100644 pkg/sentry/fsimpl/proc/BUILD create mode 100644 pkg/sentry/fsimpl/proc/filesystems.go create mode 100644 pkg/sentry/fsimpl/proc/loadavg.go create mode 100644 pkg/sentry/fsimpl/proc/meminfo.go create mode 100644 pkg/sentry/fsimpl/proc/mounts.go create mode 100644 pkg/sentry/fsimpl/proc/net.go create mode 100644 pkg/sentry/fsimpl/proc/net_test.go create mode 100644 pkg/sentry/fsimpl/proc/proc.go create mode 100644 pkg/sentry/fsimpl/proc/stat.go create mode 100644 pkg/sentry/fsimpl/proc/sys.go create mode 100644 pkg/sentry/fsimpl/proc/task.go create mode 100644 pkg/sentry/fsimpl/proc/version.go diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD new file mode 100644 index 000000000..3d8a4deaf --- /dev/null +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -0,0 +1,49 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "proc", + srcs = [ + "filesystems.go", + "loadavg.go", + "meminfo.go", + "mounts.go", + "net.go", + "proc.go", + "stat.go", + "sys.go", + "task.go", + "version.go", + ], + importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc", + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/log", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/limits", + "//pkg/sentry/mm", + "//pkg/sentry/socket", + "//pkg/sentry/socket/unix", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/sentry/vfs", + ], +) + +go_test( + name = "proc_test", + size = "small", + srcs = ["net_test.go"], + embed = [":proc"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/inet", + ], +) diff --git a/pkg/sentry/fsimpl/proc/filesystems.go b/pkg/sentry/fsimpl/proc/filesystems.go new file mode 100644 index 000000000..c36c4aff5 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/filesystems.go @@ -0,0 +1,25 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +// filesystemsData implements vfs.DynamicBytesSource for /proc/filesystems. +// +// +stateify savable +type filesystemsData struct{} + +// TODO(b/138862512): Implement vfs.DynamicBytesSource.Generate for +// filesystemsData. We would need to retrive filesystem names from +// vfs.VirtualFilesystem. Also needs vfs replacement for +// fs.Filesystem.AllowUserList() and fs.FilesystemRequiresDev. diff --git a/pkg/sentry/fsimpl/proc/loadavg.go b/pkg/sentry/fsimpl/proc/loadavg.go new file mode 100644 index 000000000..9135afef1 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/loadavg.go @@ -0,0 +1,40 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// loadavgData backs /proc/loadavg. +// +// +stateify savable +type loadavgData struct{} + +var _ vfs.DynamicBytesSource = (*loadavgData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { + // TODO(b/62345059): Include real data in fields. + // Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods. + // Column 4-5: currently running processes and the total number of processes. + // Column 6: the last process ID used. + fmt.Fprintf(buf, "%.2f %.2f %.2f %d/%d %d\n", 0.00, 0.00, 0.00, 0, 0, 0) + return nil +} diff --git a/pkg/sentry/fsimpl/proc/meminfo.go b/pkg/sentry/fsimpl/proc/meminfo.go new file mode 100644 index 000000000..9a827cd66 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/meminfo.go @@ -0,0 +1,77 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// meminfoData implements vfs.DynamicBytesSource for /proc/meminfo. +// +// +stateify savable +type meminfoData struct { + // k is the owning Kernel. + k *kernel.Kernel +} + +var _ vfs.DynamicBytesSource = (*meminfoData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { + mf := d.k.MemoryFile() + mf.UpdateUsage() + snapshot, totalUsage := usage.MemoryAccounting.Copy() + totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) + anon := snapshot.Anonymous + snapshot.Tmpfs + file := snapshot.PageCache + snapshot.Mapped + // We don't actually have active/inactive LRUs, so just make up numbers. + activeFile := (file / 2) &^ (usermem.PageSize - 1) + inactiveFile := file - activeFile + + fmt.Fprintf(buf, "MemTotal: %8d kB\n", totalSize/1024) + memFree := (totalSize - totalUsage) / 1024 + // We use MemFree as MemAvailable because we don't swap. + // TODO(rahat): When reclaim is implemented the value of MemAvailable + // should change. + fmt.Fprintf(buf, "MemFree: %8d kB\n", memFree) + fmt.Fprintf(buf, "MemAvailable: %8d kB\n", memFree) + fmt.Fprintf(buf, "Buffers: 0 kB\n") // memory usage by block devices + fmt.Fprintf(buf, "Cached: %8d kB\n", (file+snapshot.Tmpfs)/1024) + // Emulate a system with no swap, which disables inactivation of anon pages. + fmt.Fprintf(buf, "SwapCache: 0 kB\n") + fmt.Fprintf(buf, "Active: %8d kB\n", (anon+activeFile)/1024) + fmt.Fprintf(buf, "Inactive: %8d kB\n", inactiveFile/1024) + fmt.Fprintf(buf, "Active(anon): %8d kB\n", anon/1024) + fmt.Fprintf(buf, "Inactive(anon): 0 kB\n") + fmt.Fprintf(buf, "Active(file): %8d kB\n", activeFile/1024) + fmt.Fprintf(buf, "Inactive(file): %8d kB\n", inactiveFile/1024) + fmt.Fprintf(buf, "Unevictable: 0 kB\n") // TODO(b/31823263) + fmt.Fprintf(buf, "Mlocked: 0 kB\n") // TODO(b/31823263) + fmt.Fprintf(buf, "SwapTotal: 0 kB\n") + fmt.Fprintf(buf, "SwapFree: 0 kB\n") + fmt.Fprintf(buf, "Dirty: 0 kB\n") + fmt.Fprintf(buf, "Writeback: 0 kB\n") + fmt.Fprintf(buf, "AnonPages: %8d kB\n", anon/1024) + fmt.Fprintf(buf, "Mapped: %8d kB\n", file/1024) // doesn't count mapped tmpfs, which we don't know + fmt.Fprintf(buf, "Shmem: %8d kB\n", snapshot.Tmpfs/1024) + return nil +} diff --git a/pkg/sentry/fsimpl/proc/mounts.go b/pkg/sentry/fsimpl/proc/mounts.go new file mode 100644 index 000000000..e81b1e910 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/mounts.go @@ -0,0 +1,33 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import "gvisor.dev/gvisor/pkg/sentry/kernel" + +// TODO(b/138862512): Implement mountInfoFile and mountsFile. + +// mountInfoFile implements vfs.DynamicBytesSource for /proc/[pid]/mountinfo. +// +// +stateify savable +type mountInfoFile struct { + t *kernel.Task +} + +// mountsFile implements vfs.DynamicBytesSource for /proc/[pid]/mounts. +// +// +stateify savable +type mountsFile struct { + t *kernel.Task +} diff --git a/pkg/sentry/fsimpl/proc/net.go b/pkg/sentry/fsimpl/proc/net.go new file mode 100644 index 000000000..fd46eebf8 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/net.go @@ -0,0 +1,338 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/unix" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6. +// +// +stateify savable +type ifinet6 struct { + s inet.Stack +} + +var _ vfs.DynamicBytesSource = (*ifinet6)(nil) + +func (n *ifinet6) contents() []string { + var lines []string + nics := n.s.Interfaces() + for id, naddrs := range n.s.InterfaceAddrs() { + nic, ok := nics[id] + if !ok { + // NIC was added after NICNames was called. We'll just + // ignore it. + continue + } + + for _, a := range naddrs { + // IPv6 only. + if a.Family != linux.AF_INET6 { + continue + } + + // Fields: + // IPv6 address displayed in 32 hexadecimal chars without colons + // Netlink device number (interface index) in hexadecimal (use nic id) + // Prefix length in hexadecimal + // Scope value (use 0) + // Interface flags + // Device name + lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name)) + } + } + return lines +} + +// Generate implements vfs.DynamicBytesSource.Generate. +func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error { + for _, l := range n.contents() { + buf.WriteString(l) + } + return nil +} + +// netDev implements vfs.DynamicBytesSource for /proc/net/dev. +// +// +stateify savable +type netDev struct { + s inet.Stack +} + +var _ vfs.DynamicBytesSource = (*netDev)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error { + interfaces := n.s.Interfaces() + buf.WriteString("Inter-| Receive | Transmit\n") + buf.WriteString(" face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n") + + for _, i := range interfaces { + // Implements the same format as + // net/core/net-procfs.c:dev_seq_printf_stats. + var stats inet.StatDev + if err := n.s.Statistics(&stats, i.Name); err != nil { + log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err) + continue + } + fmt.Fprintf( + buf, + "%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n", + i.Name, + // Received + stats[0], // bytes + stats[1], // packets + stats[2], // errors + stats[3], // dropped + stats[4], // fifo + stats[5], // frame + stats[6], // compressed + stats[7], // multicast + // Transmitted + stats[8], // bytes + stats[9], // packets + stats[10], // errors + stats[11], // dropped + stats[12], // fifo + stats[13], // frame + stats[14], // compressed + stats[15], // multicast + ) + } + + return nil +} + +// netUnix implements vfs.DynamicBytesSource for /proc/net/unix. +// +// +stateify savable +type netUnix struct { + k *kernel.Kernel +} + +var _ vfs.DynamicBytesSource = (*netUnix)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (n *netUnix) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString("Num RefCount Protocol Flags Type St Inode Path\n") + for _, se := range n.k.ListSockets() { + s := se.Sock.Get() + if s == nil { + log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock) + continue + } + sfile := s.(*fs.File) + if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX { + s.DecRef() + // Not a unix socket. + continue + } + sops := sfile.FileOperations.(*unix.SocketOperations) + + addr, err := sops.Endpoint().GetLocalAddress() + if err != nil { + log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err) + addr.Addr = "" + } + + sockFlags := 0 + if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok { + if ce.Listening() { + // For unix domain sockets, linux reports a single flag + // value if the socket is listening, of __SO_ACCEPTCON. + sockFlags = linux.SO_ACCEPTCON + } + } + + // In the socket entry below, the value for the 'Num' field requires + // some consideration. Linux prints the address to the struct + // unix_sock representing a socket in the kernel, but may redact the + // value for unprivileged users depending on the kptr_restrict + // sysctl. + // + // One use for this field is to allow a privileged user to + // introspect into the kernel memory to determine information about + // a socket not available through procfs, such as the socket's peer. + // + // In gvisor, returning a pointer to our internal structures would + // be pointless, as it wouldn't match the memory layout for struct + // unix_sock, making introspection difficult. We could populate a + // struct unix_sock with the appropriate data, but even that + // requires consideration for which kernel version to emulate, as + // the definition of this struct changes over time. + // + // For now, we always redact this pointer. + fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d", + (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. + sfile.ReadRefs()-1, // RefCount, don't count our own ref. + 0, // Protocol, always 0 for UDS. + sockFlags, // Flags. + sops.Endpoint().Type(), // Type. + sops.State(), // State. + sfile.InodeID(), // Inode. + ) + + // Path + if len(addr.Addr) != 0 { + if addr.Addr[0] == 0 { + // Abstract path. + fmt.Fprintf(buf, " @%s", string(addr.Addr[1:])) + } else { + fmt.Fprintf(buf, " %s", string(addr.Addr)) + } + } + fmt.Fprintf(buf, "\n") + + s.DecRef() + } + return nil +} + +// netTCP implements vfs.DynamicBytesSource for /proc/net/tcp. +// +// +stateify savable +type netTCP struct { + k *kernel.Kernel +} + +var _ vfs.DynamicBytesSource = (*netTCP)(nil) + +func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error { + t := kernel.TaskFromContext(ctx) + buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n") + for _, se := range n.k.ListSockets() { + s := se.Sock.Get() + if s == nil { + log.Debugf("Couldn't resolve weakref %+v in socket table, racing with destruction?", se.Sock) + continue + } + sfile := s.(*fs.File) + sops, ok := sfile.FileOperations.(socket.Socket) + if !ok { + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) + } + if family, stype, _ := sops.Type(); !(family == linux.AF_INET && stype == linux.SOCK_STREAM) { + s.DecRef() + // Not tcp4 sockets. + continue + } + + // Linux's documentation for the fields below can be found at + // https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt. + // For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock(). + // Note that the header doesn't contain labels for all the fields. + + // Field: sl; entry number. + fmt.Fprintf(buf, "%4d: ", se.ID) + + portBuf := make([]byte, 2) + + // Field: local_adddress. + var localAddr linux.SockAddrInet + if local, _, err := sops.GetSockName(t); err == nil { + localAddr = *local.(*linux.SockAddrInet) + } + binary.LittleEndian.PutUint16(portBuf, localAddr.Port) + fmt.Fprintf(buf, "%08X:%04X ", + binary.LittleEndian.Uint32(localAddr.Addr[:]), + portBuf) + + // Field: rem_address. + var remoteAddr linux.SockAddrInet + if remote, _, err := sops.GetPeerName(t); err == nil { + remoteAddr = *remote.(*linux.SockAddrInet) + } + binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port) + fmt.Fprintf(buf, "%08X:%04X ", + binary.LittleEndian.Uint32(remoteAddr.Addr[:]), + portBuf) + + // Field: state; socket state. + fmt.Fprintf(buf, "%02X ", sops.State()) + + // Field: tx_queue, rx_queue; number of packets in the transmit and + // receive queue. Unimplemented. + fmt.Fprintf(buf, "%08X:%08X ", 0, 0) + + // Field: tr, tm->when; timer active state and number of jiffies + // until timer expires. Unimplemented. + fmt.Fprintf(buf, "%02X:%08X ", 0, 0) + + // Field: retrnsmt; number of unrecovered RTO timeouts. + // Unimplemented. + fmt.Fprintf(buf, "%08X ", 0) + + // Field: uid. + uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) + fmt.Fprintf(buf, "%5d ", 0) + } else { + fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow())) + } + + // Field: timeout; number of unanswered 0-window probes. + // Unimplemented. + fmt.Fprintf(buf, "%8d ", 0) + + // Field: inode. + fmt.Fprintf(buf, "%8d ", sfile.InodeID()) + + // Field: refcount. Don't count the ref we obtain while deferencing + // the weakref to this socket. + fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) + + // Field: Socket struct address. Redacted due to the same reason as + // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. + fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil)) + + // Field: retransmit timeout. Unimplemented. + fmt.Fprintf(buf, "%d ", 0) + + // Field: predicted tick of soft clock (delayed ACK control data). + // Unimplemented. + fmt.Fprintf(buf, "%d ", 0) + + // Field: (ack.quick<<1)|ack.pingpong, Unimplemented. + fmt.Fprintf(buf, "%d ", 0) + + // Field: sending congestion window, Unimplemented. + fmt.Fprintf(buf, "%d ", 0) + + // Field: Slow start size threshold, -1 if threshold >= 0xFFFF. + // Unimplemented, report as large threshold. + fmt.Fprintf(buf, "%d", -1) + + fmt.Fprintf(buf, "\n") + + s.DecRef() + } + + return nil +} diff --git a/pkg/sentry/fsimpl/proc/net_test.go b/pkg/sentry/fsimpl/proc/net_test.go new file mode 100644 index 000000000..20a77a8ca --- /dev/null +++ b/pkg/sentry/fsimpl/proc/net_test.go @@ -0,0 +1,78 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "reflect" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/inet" +) + +func newIPv6TestStack() *inet.TestStack { + s := inet.NewTestStack() + s.SupportsIPv6Flag = true + return s +} + +func TestIfinet6NoAddresses(t *testing.T) { + n := &ifinet6{s: newIPv6TestStack()} + var buf bytes.Buffer + n.Generate(contexttest.Context(t), &buf) + if buf.Len() > 0 { + t.Errorf("n.Generate() generated = %v, want = %v", buf.Bytes(), []byte{}) + } +} + +func TestIfinet6(t *testing.T) { + s := newIPv6TestStack() + s.InterfacesMap[1] = inet.Interface{Name: "eth0"} + s.InterfaceAddrsMap[1] = []inet.InterfaceAddr{ + { + Family: linux.AF_INET6, + PrefixLen: 128, + Addr: []byte("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"), + }, + } + s.InterfacesMap[2] = inet.Interface{Name: "eth1"} + s.InterfaceAddrsMap[2] = []inet.InterfaceAddr{ + { + Family: linux.AF_INET6, + PrefixLen: 128, + Addr: []byte("\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"), + }, + } + want := map[string]struct{}{ + "000102030405060708090a0b0c0d0e0f 01 80 00 00 eth0\n": {}, + "101112131415161718191a1b1c1d1e1f 02 80 00 00 eth1\n": {}, + } + + n := &ifinet6{s: s} + contents := n.contents() + if len(contents) != len(want) { + t.Errorf("Got len(n.contents()) = %d, want = %d", len(contents), len(want)) + } + got := map[string]struct{}{} + for _, l := range contents { + got[l] = struct{}{} + } + + if !reflect.DeepEqual(got, want) { + t.Errorf("Got n.contents() = %v, want = %v", got, want) + } +} diff --git a/pkg/sentry/fsimpl/proc/proc.go b/pkg/sentry/fsimpl/proc/proc.go new file mode 100644 index 000000000..31dec36de --- /dev/null +++ b/pkg/sentry/fsimpl/proc/proc.go @@ -0,0 +1,16 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package proc implements a partial in-memory file system for procfs. +package proc diff --git a/pkg/sentry/fsimpl/proc/stat.go b/pkg/sentry/fsimpl/proc/stat.go new file mode 100644 index 000000000..720db3828 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/stat.go @@ -0,0 +1,127 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// cpuStats contains the breakdown of CPU time for /proc/stat. +type cpuStats struct { + // user is time spent in userspace tasks with non-positive niceness. + user uint64 + + // nice is time spent in userspace tasks with positive niceness. + nice uint64 + + // system is time spent in non-interrupt kernel context. + system uint64 + + // idle is time spent idle. + idle uint64 + + // ioWait is time spent waiting for IO. + ioWait uint64 + + // irq is time spent in interrupt context. + irq uint64 + + // softirq is time spent in software interrupt context. + softirq uint64 + + // steal is involuntary wait time. + steal uint64 + + // guest is time spent in guests with non-positive niceness. + guest uint64 + + // guestNice is time spent in guests with positive niceness. + guestNice uint64 +} + +// String implements fmt.Stringer. +func (c cpuStats) String() string { + return fmt.Sprintf("%d %d %d %d %d %d %d %d %d %d", c.user, c.nice, c.system, c.idle, c.ioWait, c.irq, c.softirq, c.steal, c.guest, c.guestNice) +} + +// statData implements vfs.DynamicBytesSource for /proc/stat. +// +// +stateify savable +type statData struct { + // k is the owning Kernel. + k *kernel.Kernel +} + +var _ vfs.DynamicBytesSource = (*statData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error { + // TODO(b/37226836): We currently export only zero CPU stats. We could + // at least provide some aggregate stats. + var cpu cpuStats + fmt.Fprintf(buf, "cpu %s\n", cpu) + + for c, max := uint(0), s.k.ApplicationCores(); c < max; c++ { + fmt.Fprintf(buf, "cpu%d %s\n", c, cpu) + } + + // The total number of interrupts is dependent on the CPUs and PCI + // devices on the system. See arch_probe_nr_irqs. + // + // Since we don't report real interrupt stats, just choose an arbitrary + // value from a representative VM. + const numInterrupts = 256 + + // The Kernel doesn't handle real interrupts, so report all zeroes. + // TODO(b/37226836): We could count page faults as #PF. + fmt.Fprintf(buf, "intr 0") // total + for i := 0; i < numInterrupts; i++ { + fmt.Fprintf(buf, " 0") + } + fmt.Fprintf(buf, "\n") + + // Total number of context switches. + // TODO(b/37226836): Count this. + fmt.Fprintf(buf, "ctxt 0\n") + + // CLOCK_REALTIME timestamp from boot, in seconds. + fmt.Fprintf(buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds()) + + // Total number of clones. + // TODO(b/37226836): Count this. + fmt.Fprintf(buf, "processes 0\n") + + // Number of runnable tasks. + // TODO(b/37226836): Count this. + fmt.Fprintf(buf, "procs_running 0\n") + + // Number of tasks waiting on IO. + // TODO(b/37226836): Count this. + fmt.Fprintf(buf, "procs_blocked 0\n") + + // Number of each softirq handled. + fmt.Fprintf(buf, "softirq 0") // total + for i := 0; i < linux.NumSoftIRQ; i++ { + fmt.Fprintf(buf, " 0") + } + fmt.Fprintf(buf, "\n") + return nil +} diff --git a/pkg/sentry/fsimpl/proc/sys.go b/pkg/sentry/fsimpl/proc/sys.go new file mode 100644 index 000000000..b88256e12 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/sys.go @@ -0,0 +1,51 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// mmapMinAddrData implements vfs.DynamicBytesSource for +// /proc/sys/vm/mmap_min_addr. +// +// +stateify savable +type mmapMinAddrData struct { + k *kernel.Kernel +} + +var _ vfs.DynamicBytesSource = (*mmapMinAddrData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress()) + return nil +} + +// +stateify savable +type overcommitMemory struct{} + +var _ vfs.DynamicBytesSource = (*overcommitMemory)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *overcommitMemory) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "0\n") + return nil +} diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go new file mode 100644 index 000000000..c46e05c3a --- /dev/null +++ b/pkg/sentry/fsimpl/proc/task.go @@ -0,0 +1,261 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// mapsCommon is embedded by mapsData and smapsData. +type mapsCommon struct { + t *kernel.Task +} + +// mm gets the kernel task's MemoryManager. No additional reference is taken on +// mm here. This is safe because MemoryManager.destroy is required to leave the +// MemoryManager in a state where it's still usable as a DynamicBytesSource. +func (md *mapsCommon) mm() *mm.MemoryManager { + var tmm *mm.MemoryManager + md.t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + tmm = mm + } + }) + return tmm +} + +// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. +// +// +stateify savable +type mapsData struct { + mapsCommon +} + +var _ vfs.DynamicBytesSource = (*mapsData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (md *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { + if mm := md.mm(); mm != nil { + mm.ReadMapsDataInto(ctx, buf) + } + return nil +} + +// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps. +// +// +stateify savable +type smapsData struct { + mapsCommon +} + +var _ vfs.DynamicBytesSource = (*smapsData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (sd *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { + if mm := sd.mm(); mm != nil { + mm.ReadSmapsDataInto(ctx, buf) + } + return nil +} + +// +stateify savable +type taskStatData struct { + t *kernel.Task + + // If tgstats is true, accumulate fault stats (not implemented) and CPU + // time across all tasks in t's thread group. + tgstats bool + + // pidns is the PID namespace associated with the proc filesystem that + // includes the file using this statData. + pidns *kernel.PIDNamespace +} + +var _ vfs.DynamicBytesSource = (*taskStatData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t)) + fmt.Fprintf(buf, "(%s) ", s.t.Name()) + fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0]) + ppid := kernel.ThreadID(0) + if parent := s.t.Parent(); parent != nil { + ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) + } + fmt.Fprintf(buf, "%d ", ppid) + fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup())) + fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session())) + fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */) + fmt.Fprintf(buf, "0 " /* flags */) + fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */) + var cputime usage.CPUStats + if s.tgstats { + cputime = s.t.ThreadGroup().CPUStats() + } else { + cputime = s.t.CPUStats() + } + fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) + cputime = s.t.ThreadGroup().JoinedChildCPUStats() + fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) + fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness()) + fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count()) + + // itrealvalue. Since kernel 2.6.17, this field is no longer + // maintained, and is hard coded as 0. + fmt.Fprintf(buf, "0 ") + + // Start time is relative to boot time, expressed in clock ticks. + fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime()))) + + var vss, rss uint64 + s.t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } + }) + fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize) + + // rsslim. + fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur) + + fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */) + fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) + fmt.Fprintf(buf, "0 0 " /* nswap cnswap */) + terminationSignal := linux.Signal(0) + if s.t == s.t.ThreadGroup().Leader() { + terminationSignal = s.t.ThreadGroup().TerminationSignal() + } + fmt.Fprintf(buf, "%d ", terminationSignal) + fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */) + fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */) + fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */) + fmt.Fprintf(buf, "0\n" /* exit_code */) + + return nil +} + +// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm. +// +// +stateify savable +type statmData struct { + t *kernel.Task +} + +var _ vfs.DynamicBytesSource = (*statmData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { + var vss, rss uint64 + s.t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } + }) + + fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize) + return nil +} + +// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status. +// +// +stateify savable +type statusData struct { + t *kernel.Task + pidns *kernel.PIDNamespace +} + +var _ vfs.DynamicBytesSource = (*statusData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name()) + fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus()) + fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup())) + fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t)) + ppid := kernel.ThreadID(0) + if parent := s.t.Parent(); parent != nil { + ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) + } + fmt.Fprintf(buf, "PPid:\t%d\n", ppid) + tpid := kernel.ThreadID(0) + if tracer := s.t.Tracer(); tracer != nil { + tpid = s.pidns.IDOfTask(tracer) + } + fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid) + var fds int + var vss, rss, data uint64 + s.t.WithMuLocked(func(t *kernel.Task) { + if fdTable := t.FDTable(); fdTable != nil { + fds = fdTable.Size() + } + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + data = mm.VirtualDataSize() + } + }) + fmt.Fprintf(buf, "FDSize:\t%d\n", fds) + fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) + fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10) + fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10) + fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count()) + creds := s.t.Credentials() + fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps) + fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps) + fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) + fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps) + fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode()) + return nil +} + +// ioUsage is the /proc//io and /proc//task//io data provider. +type ioUsage interface { + // IOUsage returns the io usage data. + IOUsage() *usage.IO +} + +// +stateify savable +type ioData struct { + ioUsage +} + +var _ vfs.DynamicBytesSource = (*ioData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error { + io := usage.IO{} + io.Accumulate(i.IOUsage()) + + fmt.Fprintf(buf, "char: %d\n", io.CharsRead) + fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten) + fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls) + fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls) + fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead) + fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten) + fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled) + return nil +} diff --git a/pkg/sentry/fsimpl/proc/version.go b/pkg/sentry/fsimpl/proc/version.go new file mode 100644 index 000000000..e1643d4e0 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/version.go @@ -0,0 +1,68 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// versionData implements vfs.DynamicBytesSource for /proc/version. +// +// +stateify savable +type versionData struct { + // k is the owning Kernel. + k *kernel.Kernel +} + +var _ vfs.DynamicBytesSource = (*versionData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { + init := v.k.GlobalInit() + if init == nil { + // Attempted to read before the init Task is created. This can + // only occur during startup, which should never need to read + // this file. + panic("Attempted to read version before initial Task is available") + } + + // /proc/version takes the form: + // + // "SYSNAME version RELEASE (COMPILE_USER@COMPILE_HOST) + // (COMPILER_VERSION) VERSION" + // + // where: + // - SYSNAME, RELEASE, and VERSION are the same as returned by + // sys_utsname + // - COMPILE_USER is the user that build the kernel + // - COMPILE_HOST is the hostname of the machine on which the kernel + // was built + // - COMPILER_VERSION is the version reported by the building compiler + // + // Since we don't really want to expose build information to + // applications, those fields are omitted. + // + // FIXME(mpratt): Using Version from the init task SyscallTable + // disregards the different version a task may have (e.g., in a uts + // namespace). + ver := init.Leader().SyscallTable().Version + fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version) + return nil +} diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go index a8819aa84..8c2246bb4 100644 --- a/pkg/sentry/mm/procfs.go +++ b/pkg/sentry/mm/procfs.go @@ -58,6 +58,34 @@ func (mm *MemoryManager) NeedsUpdate(generation int64) bool { return true } +// ReadMapsDataInto is called by fsimpl/proc.mapsData.Generate to +// implement /proc/[pid]/maps. +func (mm *MemoryManager) ReadMapsDataInto(ctx context.Context, buf *bytes.Buffer) { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + var start usermem.Addr + + for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { + // FIXME(b/30793614): If we use a usermem.Addr for the handle, we get + // "panic: autosave error: type usermem.Addr is not registered". + mm.appendVMAMapsEntryLocked(ctx, vseg, buf) + } + + // We always emulate vsyscall, so advertise it here. Everything about a + // vsyscall region is static, so just hard code the maps entry since we + // don't have a real vma backing it. The vsyscall region is at the end of + // the virtual address space so nothing should be mapped after it (if + // something is really mapped in the tiny ~10 MiB segment afterwards, we'll + // get the sorting on the maps file wrong at worst; but that's not possible + // on any current platform). + // + // Artifically adjust the seqfile handle so we only output vsyscall entry once. + if start != vsyscallEnd { + // FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd. + buf.WriteString(vsyscallMapsEntry) + } +} + // ReadMapsSeqFileData is called by fs/proc.mapsData.ReadSeqFileData to // implement /proc/[pid]/maps. func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { @@ -151,6 +179,27 @@ func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaI b.WriteString("\n") } +// ReadSmapsDataInto is called by fsimpl/proc.smapsData.Generate to +// implement /proc/[pid]/maps. +func (mm *MemoryManager) ReadSmapsDataInto(ctx context.Context, buf *bytes.Buffer) { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + var start usermem.Addr + + for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { + // FIXME(b/30793614): If we use a usermem.Addr for the handle, we get + // "panic: autosave error: type usermem.Addr is not registered". + mm.vmaSmapsEntryIntoLocked(ctx, vseg, buf) + } + + // We always emulate vsyscall, so advertise it here. See + // ReadMapsSeqFileData for additional commentary. + if start != vsyscallEnd { + // FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd. + buf.WriteString(vsyscallSmapsEntry) + } +} + // ReadSmapsSeqFileData is called by fs/proc.smapsData.ReadSeqFileData to // implement /proc/[pid]/smaps. func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { @@ -190,7 +239,12 @@ func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfil // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte { var b bytes.Buffer - mm.appendVMAMapsEntryLocked(ctx, vseg, &b) + mm.vmaSmapsEntryIntoLocked(ctx, vseg, &b) + return b.Bytes() +} + +func (mm *MemoryManager) vmaSmapsEntryIntoLocked(ctx context.Context, vseg vmaIterator, b *bytes.Buffer) { + mm.appendVMAMapsEntryLocked(ctx, vseg, b) vma := vseg.ValuePtr() // We take mm.activeMu here in each call to vmaSmapsEntryLocked, instead of @@ -211,40 +265,40 @@ func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterat } mm.activeMu.RUnlock() - fmt.Fprintf(&b, "Size: %8d kB\n", vseg.Range().Length()/1024) - fmt.Fprintf(&b, "Rss: %8d kB\n", rss/1024) + fmt.Fprintf(b, "Size: %8d kB\n", vseg.Range().Length()/1024) + fmt.Fprintf(b, "Rss: %8d kB\n", rss/1024) // Currently we report PSS = RSS, i.e. we pretend each page mapped by a pma // is only mapped by that pma. This avoids having to query memmap.Mappables // for reference count information on each page. As a corollary, all pages // are accounted as "private" whether or not the vma is private; compare // Linux's fs/proc/task_mmu.c:smaps_account(). - fmt.Fprintf(&b, "Pss: %8d kB\n", rss/1024) - fmt.Fprintf(&b, "Shared_Clean: %8d kB\n", 0) - fmt.Fprintf(&b, "Shared_Dirty: %8d kB\n", 0) + fmt.Fprintf(b, "Pss: %8d kB\n", rss/1024) + fmt.Fprintf(b, "Shared_Clean: %8d kB\n", 0) + fmt.Fprintf(b, "Shared_Dirty: %8d kB\n", 0) // Pretend that all pages are dirty if the vma is writable, and clean otherwise. clean := rss if vma.effectivePerms.Write { clean = 0 } - fmt.Fprintf(&b, "Private_Clean: %8d kB\n", clean/1024) - fmt.Fprintf(&b, "Private_Dirty: %8d kB\n", (rss-clean)/1024) + fmt.Fprintf(b, "Private_Clean: %8d kB\n", clean/1024) + fmt.Fprintf(b, "Private_Dirty: %8d kB\n", (rss-clean)/1024) // Pretend that all pages are "referenced" (recently touched). - fmt.Fprintf(&b, "Referenced: %8d kB\n", rss/1024) - fmt.Fprintf(&b, "Anonymous: %8d kB\n", anon/1024) + fmt.Fprintf(b, "Referenced: %8d kB\n", rss/1024) + fmt.Fprintf(b, "Anonymous: %8d kB\n", anon/1024) // Hugepages (hugetlb and THP) are not implemented. - fmt.Fprintf(&b, "AnonHugePages: %8d kB\n", 0) - fmt.Fprintf(&b, "Shared_Hugetlb: %8d kB\n", 0) - fmt.Fprintf(&b, "Private_Hugetlb: %7d kB\n", 0) + fmt.Fprintf(b, "AnonHugePages: %8d kB\n", 0) + fmt.Fprintf(b, "Shared_Hugetlb: %8d kB\n", 0) + fmt.Fprintf(b, "Private_Hugetlb: %7d kB\n", 0) // Swap is not implemented. - fmt.Fprintf(&b, "Swap: %8d kB\n", 0) - fmt.Fprintf(&b, "SwapPss: %8d kB\n", 0) - fmt.Fprintf(&b, "KernelPageSize: %8d kB\n", usermem.PageSize/1024) - fmt.Fprintf(&b, "MMUPageSize: %8d kB\n", usermem.PageSize/1024) + fmt.Fprintf(b, "Swap: %8d kB\n", 0) + fmt.Fprintf(b, "SwapPss: %8d kB\n", 0) + fmt.Fprintf(b, "KernelPageSize: %8d kB\n", usermem.PageSize/1024) + fmt.Fprintf(b, "MMUPageSize: %8d kB\n", usermem.PageSize/1024) locked := rss if vma.mlockMode == memmap.MLockNone { locked = 0 } - fmt.Fprintf(&b, "Locked: %8d kB\n", locked/1024) + fmt.Fprintf(b, "Locked: %8d kB\n", locked/1024) b.WriteString("VmFlags: ") if vma.realPerms.Read { @@ -284,6 +338,4 @@ func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterat b.WriteString("ac ") } b.WriteString("\n") - - return b.Bytes() } -- cgit v1.2.3 From 3e4102b2ead18aa768e1b8082d9865a9c567ce4e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 16 Aug 2019 19:30:59 -0700 Subject: netstack: disconnect an unix socket only if the address family is AF_UNSPEC Linux allows to call connect for ANY and the zero port. PiperOrigin-RevId: 263892534 --- pkg/sentry/socket/epsocket/epsocket.go | 45 +++++++----- pkg/sentry/socket/unix/unix.go | 2 +- pkg/sentry/strace/socket.go | 2 +- pkg/tcpip/stack/transport_test.go | 5 ++ pkg/tcpip/tcpip.go | 3 + pkg/tcpip/transport/icmp/endpoint.go | 10 +-- pkg/tcpip/transport/raw/endpoint.go | 10 +-- pkg/tcpip/transport/tcp/endpoint.go | 10 +-- pkg/tcpip/transport/udp/endpoint.go | 6 +- test/syscalls/linux/udp_socket.cc | 124 ++++++++++++++++++++++++++++----- 10 files changed, 160 insertions(+), 57 deletions(-) diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 4d8a5ac22..635042263 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -291,18 +291,22 @@ func bytesToIPAddress(addr []byte) tcpip.Address { return tcpip.Address(addr) } -// GetAddress reads an sockaddr struct from the given address and converts it -// to the FullAddress format. It supports AF_UNIX, AF_INET and AF_INET6 -// addresses. -func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syserr.Error) { +// AddressAndFamily reads an sockaddr struct from the given address and +// converts it to the FullAddress format. It supports AF_UNIX, AF_INET and +// AF_INET6 addresses. +// +// strict indicates whether addresses with the AF_UNSPEC family are accepted of not. +// +// AddressAndFamily returns an address, its family. +func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, uint16, *syserr.Error) { // Make sure we have at least 2 bytes for the address family. if len(addr) < 2 { - return tcpip.FullAddress{}, syserr.ErrInvalidArgument + return tcpip.FullAddress{}, 0, syserr.ErrInvalidArgument } family := usermem.ByteOrder.Uint16(addr) if family != uint16(sfamily) && (!strict && family != linux.AF_UNSPEC) { - return tcpip.FullAddress{}, syserr.ErrAddressFamilyNotSupported + return tcpip.FullAddress{}, family, syserr.ErrAddressFamilyNotSupported } // Get the rest of the fields based on the address family. @@ -310,7 +314,7 @@ func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syse case linux.AF_UNIX: path := addr[2:] if len(path) > linux.UnixPathMax { - return tcpip.FullAddress{}, syserr.ErrInvalidArgument + return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } // Drop the terminating NUL (if one exists) and everything after // it for filesystem (non-abstract) addresses. @@ -321,12 +325,12 @@ func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syse } return tcpip.FullAddress{ Addr: tcpip.Address(path), - }, nil + }, family, nil case linux.AF_INET: var a linux.SockAddrInet if len(addr) < sockAddrInetSize { - return tcpip.FullAddress{}, syserr.ErrInvalidArgument + return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } binary.Unmarshal(addr[:sockAddrInetSize], usermem.ByteOrder, &a) @@ -334,12 +338,12 @@ func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syse Addr: bytesToIPAddress(a.Addr[:]), Port: ntohs(a.Port), } - return out, nil + return out, family, nil case linux.AF_INET6: var a linux.SockAddrInet6 if len(addr) < sockAddrInet6Size { - return tcpip.FullAddress{}, syserr.ErrInvalidArgument + return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } binary.Unmarshal(addr[:sockAddrInet6Size], usermem.ByteOrder, &a) @@ -350,13 +354,13 @@ func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syse if isLinkLocal(out.Addr) { out.NIC = tcpip.NICID(a.Scope_id) } - return out, nil + return out, family, nil case linux.AF_UNSPEC: - return tcpip.FullAddress{}, nil + return tcpip.FullAddress{}, family, nil default: - return tcpip.FullAddress{}, syserr.ErrAddressFamilyNotSupported + return tcpip.FullAddress{}, 0, syserr.ErrAddressFamilyNotSupported } } @@ -482,11 +486,18 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { // Connect implements the linux syscall connect(2) for sockets backed by // tpcip.Endpoint. func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { - addr, err := GetAddress(s.family, sockaddr, false /* strict */) + addr, family, err := AddressAndFamily(s.family, sockaddr, false /* strict */) if err != nil { return err } + if family == linux.AF_UNSPEC { + err := s.Endpoint.Disconnect() + if err == tcpip.ErrNotSupported { + return syserr.ErrAddressFamilyNotSupported + } + return syserr.TranslateNetstackError(err) + } // Always return right away in the non-blocking case. if !blocking { return syserr.TranslateNetstackError(s.Endpoint.Connect(addr)) @@ -515,7 +526,7 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo // Bind implements the linux syscall bind(2) for sockets backed by // tcpip.Endpoint. func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { - addr, err := GetAddress(s.family, sockaddr, true /* strict */) + addr, _, err := AddressAndFamily(s.family, sockaddr, true /* strict */) if err != nil { return err } @@ -2023,7 +2034,7 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] var addr *tcpip.FullAddress if len(to) > 0 { - addrBuf, err := GetAddress(s.family, to, true /* strict */) + addrBuf, _, err := AddressAndFamily(s.family, to, true /* strict */) if err != nil { return 0, err } diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 8a3f65236..0d0cb68df 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -116,7 +116,7 @@ func (s *SocketOperations) Endpoint() transport.Endpoint { // extractPath extracts and validates the address. func extractPath(sockaddr []byte) (string, *syserr.Error) { - addr, err := epsocket.GetAddress(linux.AF_UNIX, sockaddr, true /* strict */) + addr, _, err := epsocket.AddressAndFamily(linux.AF_UNIX, sockaddr, true /* strict */) if err != nil { return "", err } diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index 386b40af7..f779186ad 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -332,7 +332,7 @@ func sockAddr(t *kernel.Task, addr usermem.Addr, length uint32) string { switch family { case linux.AF_INET, linux.AF_INET6, linux.AF_UNIX: - fa, err := epsocket.GetAddress(int(family), b, true /* strict */) + fa, _, err := epsocket.AddressAndFamily(int(family), b, true /* strict */) if err != nil { return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err) } diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index 8146e8444..1c811ab68 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -105,6 +105,11 @@ func (*fakeTransportEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { return tcpip.ErrInvalidEndpointState } +// Disconnect implements tcpip.Endpoint.Disconnect. +func (*fakeTransportEndpoint) Disconnect() *tcpip.Error { + return tcpip.ErrNotSupported +} + func (f *fakeTransportEndpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { f.peerAddr = addr.Addr diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index ba2dd85b8..29a6025d9 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -358,6 +358,9 @@ type Endpoint interface { // ErrAddressFamilyNotSupported must be returned. Connect(address FullAddress) *Error + // Disconnect disconnects the endpoint from its peer. + Disconnect() *Error + // Shutdown closes the read and/or write end of the endpoint connection // to its peer. Shutdown(flags ShutdownFlags) *Error diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index 2e8d5d4bf..451d3880e 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -428,16 +428,16 @@ func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (t return netProto, nil } +// Disconnect implements tcpip.Endpoint.Disconnect. +func (*endpoint) Disconnect() *tcpip.Error { + return tcpip.ErrNotSupported +} + // Connect connects the endpoint to its peer. Specifying a NIC is optional. func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { e.mu.Lock() defer e.mu.Unlock() - if addr.Addr == "" { - // AF_UNSPEC isn't supported. - return tcpip.ErrAddressFamilyNotSupported - } - nicid := addr.NIC localPort := uint16(0) switch e.state { diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index 53c9515a4..13e17e2a6 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -349,16 +349,16 @@ func (ep *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) return 0, tcpip.ControlMessages{}, nil } +// Disconnect implements tcpip.Endpoint.Disconnect. +func (*endpoint) Disconnect() *tcpip.Error { + return tcpip.ErrNotSupported +} + // Connect implements tcpip.Endpoint.Connect. func (ep *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { ep.mu.Lock() defer ep.mu.Unlock() - if addr.Addr == "" { - // AF_UNSPEC isn't supported. - return tcpip.ErrAddressFamilyNotSupported - } - if ep.closed { return tcpip.ErrInvalidEndpointState } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index e5f835c20..24b32e4af 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1362,13 +1362,13 @@ func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocol return netProto, nil } +// Disconnect implements tcpip.Endpoint.Disconnect. +func (*endpoint) Disconnect() *tcpip.Error { + return tcpip.ErrNotSupported +} + // Connect connects the endpoint to its peer. func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { - if addr.Addr == "" && addr.Port == 0 { - // AF_UNSPEC isn't supported. - return tcpip.ErrAddressFamilyNotSupported - } - return e.connect(addr, true, true) } diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 640bb8667..8b3356406 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -711,7 +711,8 @@ func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (t return netProto, nil } -func (e *endpoint) disconnect() *tcpip.Error { +// Disconnect implements tcpip.Endpoint.Disconnect. +func (e *endpoint) Disconnect() *tcpip.Error { e.mu.Lock() defer e.mu.Unlock() @@ -750,9 +751,6 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { if err != nil { return err } - if addr.Addr == "" { - return e.disconnect() - } if addr.Port == 0 { // We don't support connecting to port zero. return tcpip.ErrInvalidEndpointState diff --git a/test/syscalls/linux/udp_socket.cc b/test/syscalls/linux/udp_socket.cc index 6ffb65168..111dbacdf 100644 --- a/test/syscalls/linux/udp_socket.cc +++ b/test/syscalls/linux/udp_socket.cc @@ -378,16 +378,17 @@ TEST_P(UdpSocketTest, Connect) { EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0); } -TEST_P(UdpSocketTest, ConnectAny) { +void ConnectAny(AddressFamily family, int sockfd, uint16_t port) { struct sockaddr_storage addr = {}; // Precondition check. { socklen_t addrlen = sizeof(addr); - EXPECT_THAT(getsockname(s_, reinterpret_cast(&addr), &addrlen), - SyscallSucceeds()); + EXPECT_THAT( + getsockname(sockfd, reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); - if (GetParam() == AddressFamily::kIpv4) { + if (family == AddressFamily::kIpv4) { auto addr_out = reinterpret_cast(&addr); EXPECT_EQ(addrlen, sizeof(*addr_out)); EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY)); @@ -400,21 +401,24 @@ TEST_P(UdpSocketTest, ConnectAny) { { socklen_t addrlen = sizeof(addr); - EXPECT_THAT(getpeername(s_, reinterpret_cast(&addr), &addrlen), - SyscallFailsWithErrno(ENOTCONN)); + EXPECT_THAT( + getpeername(sockfd, reinterpret_cast(&addr), &addrlen), + SyscallFailsWithErrno(ENOTCONN)); } struct sockaddr_storage baddr = {}; - if (GetParam() == AddressFamily::kIpv4) { + if (family == AddressFamily::kIpv4) { auto addr_in = reinterpret_cast(&baddr); addrlen = sizeof(*addr_in); addr_in->sin_family = AF_INET; addr_in->sin_addr.s_addr = htonl(INADDR_ANY); + addr_in->sin_port = port; } else { auto addr_in = reinterpret_cast(&baddr); addrlen = sizeof(*addr_in); addr_in->sin6_family = AF_INET6; - if (GetParam() == AddressFamily::kIpv6) { + addr_in->sin6_port = port; + if (family == AddressFamily::kIpv6) { addr_in->sin6_addr = IN6ADDR_ANY_INIT; } else { TestAddress const& v4_mapped_any = V4MappedAny(); @@ -424,21 +428,23 @@ TEST_P(UdpSocketTest, ConnectAny) { } } - ASSERT_THAT(connect(s_, reinterpret_cast(&baddr), addrlen), + // TODO(b/138658473): gVisor doesn't allow connecting to the zero port. + if (port == 0) { + SKIP_IF(IsRunningOnGvisor()); + } + + ASSERT_THAT(connect(sockfd, reinterpret_cast(&baddr), addrlen), SyscallSucceeds()); } - // TODO(b/138658473): gVisor doesn't return the correct local address after - // connecting to the any address. - SKIP_IF(IsRunningOnGvisor()); - // Postcondition check. { socklen_t addrlen = sizeof(addr); - EXPECT_THAT(getsockname(s_, reinterpret_cast(&addr), &addrlen), - SyscallSucceeds()); + EXPECT_THAT( + getsockname(sockfd, reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); - if (GetParam() == AddressFamily::kIpv4) { + if (family == AddressFamily::kIpv4) { auto addr_out = reinterpret_cast(&addr); EXPECT_EQ(addrlen, sizeof(*addr_out)); EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_LOOPBACK)); @@ -446,7 +452,7 @@ TEST_P(UdpSocketTest, ConnectAny) { auto addr_out = reinterpret_cast(&addr); EXPECT_EQ(addrlen, sizeof(*addr_out)); struct in6_addr loopback; - if (GetParam() == AddressFamily::kIpv6) { + if (family == AddressFamily::kIpv6) { loopback = IN6ADDR_LOOPBACK_INIT; } else { TestAddress const& v4_mapped_loopback = V4MappedLoopback(); @@ -459,11 +465,91 @@ TEST_P(UdpSocketTest, ConnectAny) { } addrlen = sizeof(addr); - EXPECT_THAT(getpeername(s_, reinterpret_cast(&addr), &addrlen), - SyscallFailsWithErrno(ENOTCONN)); + if (port == 0) { + EXPECT_THAT( + getpeername(sockfd, reinterpret_cast(&addr), &addrlen), + SyscallFailsWithErrno(ENOTCONN)); + } else { + EXPECT_THAT( + getpeername(sockfd, reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); + } + } +} + +TEST_P(UdpSocketTest, ConnectAny) { ConnectAny(GetParam(), s_, 0); } + +TEST_P(UdpSocketTest, ConnectAnyWithPort) { + auto port = *Port(reinterpret_cast(addr_[1])); + ConnectAny(GetParam(), s_, port); +} + +void DisconnectAfterConnectAny(AddressFamily family, int sockfd, int port) { + struct sockaddr_storage addr = {}; + + socklen_t addrlen = sizeof(addr); + struct sockaddr_storage baddr = {}; + if (family == AddressFamily::kIpv4) { + auto addr_in = reinterpret_cast(&baddr); + addrlen = sizeof(*addr_in); + addr_in->sin_family = AF_INET; + addr_in->sin_addr.s_addr = htonl(INADDR_ANY); + addr_in->sin_port = port; + } else { + auto addr_in = reinterpret_cast(&baddr); + addrlen = sizeof(*addr_in); + addr_in->sin6_family = AF_INET6; + addr_in->sin6_port = port; + if (family == AddressFamily::kIpv6) { + addr_in->sin6_addr = IN6ADDR_ANY_INIT; + } else { + TestAddress const& v4_mapped_any = V4MappedAny(); + addr_in->sin6_addr = + reinterpret_cast(&v4_mapped_any.addr) + ->sin6_addr; + } + } + + // TODO(b/138658473): gVisor doesn't allow connecting to the zero port. + if (port == 0) { + SKIP_IF(IsRunningOnGvisor()); + } + + ASSERT_THAT(connect(sockfd, reinterpret_cast(&baddr), addrlen), + SyscallSucceeds()); + // Now the socket is bound to the loopback address. + + // Disconnect + addrlen = sizeof(addr); + addr.ss_family = AF_UNSPEC; + ASSERT_THAT(connect(sockfd, reinterpret_cast(&addr), addrlen), + SyscallSucceeds()); + + // Check that after disconnect the socket is bound to the ANY address. + EXPECT_THAT(getsockname(sockfd, reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); + if (family == AddressFamily::kIpv4) { + auto addr_out = reinterpret_cast(&addr); + EXPECT_EQ(addrlen, sizeof(*addr_out)); + EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY)); + } else { + auto addr_out = reinterpret_cast(&addr); + EXPECT_EQ(addrlen, sizeof(*addr_out)); + struct in6_addr loopback = IN6ADDR_ANY_INIT; + + EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0); } } +TEST_P(UdpSocketTest, DisconnectAfterConnectAny) { + DisconnectAfterConnectAny(GetParam(), s_, 0); +} + +TEST_P(UdpSocketTest, DisconnectAfterConnectAnyWithPort) { + auto port = *Port(reinterpret_cast(addr_[1])); + DisconnectAfterConnectAny(GetParam(), s_, port); +} + TEST_P(UdpSocketTest, DisconnectAfterBind) { ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds()); // Connect the socket. -- cgit v1.2.3 From bd826092fecf32a87a3207a23c3795e819babce7 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Mon, 19 Aug 2019 10:04:54 -0700 Subject: Read iptables via sockopts. PiperOrigin-RevId: 264180125 --- pkg/sentry/socket/epsocket/stack.go | 4 +- pkg/sentry/socket/netfilter/BUILD | 1 + pkg/sentry/socket/netfilter/netfilter.go | 254 ++++++++++++++++++++++++++++++- test/syscalls/BUILD | 4 + test/syscalls/linux/BUILD | 18 +++ test/syscalls/linux/iptables.cc | 204 +++++++++++++++++++++++++ 6 files changed, 476 insertions(+), 9 deletions(-) create mode 100644 test/syscalls/linux/iptables.cc diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go index 8f1572bf4..1b11f4b2d 100644 --- a/pkg/sentry/socket/epsocket/stack.go +++ b/pkg/sentry/socket/epsocket/stack.go @@ -198,8 +198,8 @@ func (s *Stack) IPTables() (iptables.IPTables, error) { // FillDefaultIPTables sets the stack's iptables to the default tables, which // allow and do not modify all traffic. -func (s *Stack) FillDefaultIPTables() error { - return netfilter.FillDefaultIPTables(s.Stack) +func (s *Stack) FillDefaultIPTables() { + netfilter.FillDefaultIPTables(s.Stack) } // Resume implements inet.Stack.Resume. diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD index 3021f83e7..354a0d6ee 100644 --- a/pkg/sentry/socket/netfilter/BUILD +++ b/pkg/sentry/socket/netfilter/BUILD @@ -13,6 +13,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/binary", "//pkg/sentry/kernel", "//pkg/sentry/usermem", "//pkg/syserr", diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index efdb42903..9f87c32f1 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -17,7 +17,10 @@ package netfilter import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" @@ -26,21 +29,258 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/stack" ) +// errorTargetName is used to mark targets as error targets. Error targets +// shouldn't be reached - an error has occurred if we fall through to one. +const errorTargetName = "ERROR" + +// metadata is opaque to netstack. It holds data that we need to translate +// between Linux's and netstack's iptables representations. +type metadata struct { + HookEntry [linux.NF_INET_NUMHOOKS]uint32 + Underflow [linux.NF_INET_NUMHOOKS]uint32 + NumEntries uint32 + Size uint32 +} + // GetInfo returns information about iptables. func GetInfo(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) { - // TODO(b/129292233): Implement. - return linux.IPTGetinfo{}, syserr.ErrInvalidArgument + // Read in the struct and table name. + var info linux.IPTGetinfo + if _, err := t.CopyIn(outPtr, &info); err != nil { + return linux.IPTGetinfo{}, syserr.FromError(err) + } + + // Find the appropriate table. + table, err := findTable(ep, info.TableName()) + if err != nil { + return linux.IPTGetinfo{}, err + } + + // Get the hooks that apply to this table. + info.ValidHooks = table.ValidHooks() + + // Grab the metadata struct, which is used to store information (e.g. + // the number of entries) that applies to the user's encoding of + // iptables, but not netstack's. + metadata := table.Metadata().(metadata) + + // Set values from metadata. + info.HookEntry = metadata.HookEntry + info.Underflow = metadata.Underflow + info.NumEntries = metadata.NumEntries + info.Size = metadata.Size + + return info, nil } // GetEntries returns netstack's iptables rules encoded for the iptables tool. func GetEntries(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) { - // TODO(b/129292233): Implement. - return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument + // Read in the struct and table name. + var userEntries linux.IPTGetEntries + if _, err := t.CopyIn(outPtr, &userEntries); err != nil { + return linux.KernelIPTGetEntries{}, syserr.FromError(err) + } + + // Find the appropriate table. + table, err := findTable(ep, userEntries.TableName()) + if err != nil { + return linux.KernelIPTGetEntries{}, err + } + + // Convert netstack's iptables rules to something that the iptables + // tool can understand. + entries, _, err := convertNetstackToBinary(userEntries.TableName(), table) + if err != nil { + return linux.KernelIPTGetEntries{}, err + } + if binary.Size(entries) > uintptr(outLen) { + return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument + } + + return entries, nil +} + +func findTable(ep tcpip.Endpoint, tableName string) (iptables.Table, *syserr.Error) { + ipt, err := ep.IPTables() + if err != nil { + return iptables.Table{}, syserr.FromError(err) + } + table, ok := ipt.Tables[tableName] + if !ok { + return iptables.Table{}, syserr.ErrInvalidArgument + } + return table, nil } // FillDefaultIPTables sets stack's IPTables to the default tables and // populates them with metadata. -func FillDefaultIPTables(stack *stack.Stack) error { - stack.SetIPTables(iptables.DefaultTables()) - return nil +func FillDefaultIPTables(stack *stack.Stack) { + ipt := iptables.DefaultTables() + + // In order to fill in the metadata, we have to translate ipt from its + // netstack format to Linux's giant-binary-blob format. + for name, table := range ipt.Tables { + _, metadata, err := convertNetstackToBinary(name, table) + if err != nil { + panic(fmt.Errorf("Unable to set default IP tables: %v", err)) + } + table.SetMetadata(metadata) + ipt.Tables[name] = table + } + + stack.SetIPTables(ipt) +} + +// convertNetstackToBinary converts the iptables as stored in netstack to the +// format expected by the iptables tool. Linux stores each table as a binary +// blob that can only be traversed by parsing a bit, reading some offsets, +// jumping to those offsets, parsing again, etc. +func convertNetstackToBinary(name string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, *syserr.Error) { + // Return values. + var entries linux.KernelIPTGetEntries + var meta metadata + + // The table name has to fit in the struct. + if linux.XT_TABLE_MAXNAMELEN < len(name) { + return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument + } + copy(entries.Name[:], name) + + // Deal with the built in chains first (INPUT, OUTPUT, etc.). Each of + // these chains ends with an unconditional policy entry. + for hook := iptables.Prerouting; hook < iptables.NumHooks; hook++ { + chain, ok := table.BuiltinChains[hook] + if !ok { + // This table doesn't support this hook. + continue + } + + // Sanity check. + if len(chain.Rules) < 1 { + return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument + } + + for ruleIdx, rule := range chain.Rules { + // If this is the first rule of a builtin chain, set + // the metadata hook entry point. + if ruleIdx == 0 { + meta.HookEntry[hook] = entries.Size + } + + // Each rule corresponds to an entry. + entry := linux.KernelIPTEntry{ + IPTEntry: linux.IPTEntry{ + NextOffset: linux.SizeOfIPTEntry, + TargetOffset: linux.SizeOfIPTEntry, + }, + } + + for _, matcher := range rule.Matchers { + // Serialize the matcher and add it to the + // entry. + serialized := marshalMatcher(matcher) + entry.Elems = append(entry.Elems, serialized...) + entry.NextOffset += uint16(len(serialized)) + entry.TargetOffset += uint16(len(serialized)) + } + + // Serialize and append the target. + serialized := marshalTarget(rule.Target) + entry.Elems = append(entry.Elems, serialized...) + entry.NextOffset += uint16(len(serialized)) + + // The underflow rule is the last rule in the chain, + // and is an unconditional rule (i.e. it matches any + // packet). This is enforced when saving iptables. + if ruleIdx == len(chain.Rules)-1 { + meta.Underflow[hook] = entries.Size + } + + entries.Size += uint32(entry.NextOffset) + entries.Entrytable = append(entries.Entrytable, entry) + meta.NumEntries++ + } + + } + + // TODO(gvisor.dev/issue/170): Deal with the user chains here. Each of + // these starts with an error node holding the chain's name and ends + // with an unconditional return. + + // Lastly, each table ends with an unconditional error target rule as + // its final entry. + errorEntry := linux.KernelIPTEntry{ + IPTEntry: linux.IPTEntry{ + NextOffset: linux.SizeOfIPTEntry, + TargetOffset: linux.SizeOfIPTEntry, + }, + } + var errorTarget linux.XTErrorTarget + errorTarget.Target.TargetSize = linux.SizeOfXTErrorTarget + copy(errorTarget.ErrorName[:], errorTargetName) + copy(errorTarget.Target.Name[:], errorTargetName) + + // Serialize and add it to the list of entries. + errorTargetBuf := make([]byte, 0, linux.SizeOfXTErrorTarget) + serializedErrorTarget := binary.Marshal(errorTargetBuf, usermem.ByteOrder, errorTarget) + errorEntry.Elems = append(errorEntry.Elems, serializedErrorTarget...) + errorEntry.NextOffset += uint16(len(serializedErrorTarget)) + + entries.Size += uint32(errorEntry.NextOffset) + entries.Entrytable = append(entries.Entrytable, errorEntry) + meta.NumEntries++ + meta.Size = entries.Size + + return entries, meta, nil +} + +func marshalMatcher(matcher iptables.Matcher) []byte { + switch matcher.(type) { + default: + // TODO(gvisor.dev/issue/170): We don't support any matchers yet, so + // any call to marshalMatcher will panic. + panic(fmt.Errorf("unknown matcher of type %T", matcher)) + } +} + +func marshalTarget(target iptables.Target) []byte { + switch target.(type) { + case iptables.UnconditionalAcceptTarget: + return marshalUnconditionalAcceptTarget() + default: + panic(fmt.Errorf("unknown target of type %T", target)) + } +} + +func marshalUnconditionalAcceptTarget() []byte { + // The target's name will be the empty string. + target := linux.XTStandardTarget{ + Target: linux.XTEntryTarget{ + TargetSize: linux.SizeOfXTStandardTarget, + }, + Verdict: translateStandardVerdict(iptables.Accept), + } + + ret := make([]byte, 0, linux.SizeOfXTStandardTarget) + return binary.Marshal(ret, usermem.ByteOrder, target) +} + +// translateStandardVerdict translates verdicts the same way as the iptables +// tool. +func translateStandardVerdict(verdict iptables.Verdict) int32 { + switch verdict { + case iptables.Accept: + return -linux.NF_ACCEPT - 1 + case iptables.Drop: + return -linux.NF_DROP - 1 + case iptables.Queue: + return -linux.NF_QUEUE - 1 + case iptables.Return: + return linux.NF_RETURN + case iptables.Jump: + // TODO(gvisor.dev/issue/170): Support Jump. + panic("Jump isn't supported yet") + default: + panic(fmt.Sprintf("unknown standard verdict: %d", verdict)) + } } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index f50d83f38..6b8c4a39d 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -179,6 +179,10 @@ syscall_test( test = "//test/syscalls/linux:ioctl_test", ) +syscall_test( + test = "//test/syscalls/linux:iptables_test", +) + syscall_test( size = "medium", test = "//test/syscalls/linux:itimer_test", diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index db0a1e661..e2b0716ef 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -912,6 +912,24 @@ cc_library( ], ) +cc_binary( + name = "iptables_test", + testonly = 1, + srcs = [ + "iptables.cc", + ], + linkstatic = 1, + deps = [ + ":iptables_types", + ":socket_test_util", + "//test/util:capability_util", + "//test/util:file_descriptor", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + cc_binary( name = "itimer_test", testonly = 1, diff --git a/test/syscalls/linux/iptables.cc b/test/syscalls/linux/iptables.cc new file mode 100644 index 000000000..b8e4ece64 --- /dev/null +++ b/test/syscalls/linux/iptables.cc @@ -0,0 +1,204 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/iptables.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "gtest/gtest.h" +#include "test/util/capability_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +constexpr char kNatTablename[] = "nat"; +constexpr char kErrorTarget[] = "ERROR"; +constexpr size_t kEmptyStandardEntrySize = + sizeof(struct ipt_entry) + sizeof(struct ipt_standard_target); +constexpr size_t kEmptyErrorEntrySize = + sizeof(struct ipt_entry) + sizeof(struct ipt_error_target); + +TEST(IPTablesBasic, CreateSocket) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + int sock; + ASSERT_THAT(sock = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP), + SyscallSucceeds()); + + ASSERT_THAT(close(sock), SyscallSucceeds()); +} + +TEST(IPTablesBasic, FailSockoptNonRaw) { + // Even if the user has CAP_NET_RAW, they shouldn't be able to use the + // iptables sockopts with a non-raw socket. + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + int sock; + ASSERT_THAT(sock = socket(AF_INET, SOCK_DGRAM, 0), SyscallSucceeds()); + + struct ipt_getinfo info = {}; + snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename); + socklen_t info_size = sizeof(info); + EXPECT_THAT(getsockopt(sock, IPPROTO_IP, SO_GET_INFO, &info, &info_size), + SyscallFailsWithErrno(ENOPROTOOPT)); + + ASSERT_THAT(close(sock), SyscallSucceeds()); +} + +// Fixture for iptables tests. +class IPTablesTest : public ::testing::Test { + protected: + // Creates a socket to be used in tests. + void SetUp() override; + + // Closes the socket created by SetUp(). + void TearDown() override; + + // The socket via which to manipulate iptables. + int s_; +}; + +void IPTablesTest::SetUp() { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT(s_ = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP), SyscallSucceeds()); +} + +void IPTablesTest::TearDown() { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + EXPECT_THAT(close(s_), SyscallSucceeds()); +} + +// This tests the initial state of a machine with empty iptables. We don't have +// a guarantee that the iptables are empty when running in native, but we can +// test that gVisor has the same initial state that a newly-booted Linux machine +// would have. +TEST_F(IPTablesTest, InitialState) { + SKIP_IF(!IsRunningOnGvisor()); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + // + // Get info via sockopt. + // + struct ipt_getinfo info = {}; + snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename); + socklen_t info_size = sizeof(info); + ASSERT_THAT(getsockopt(s_, IPPROTO_IP, SO_GET_INFO, &info, &info_size), + SyscallSucceeds()); + + // The nat table supports PREROUTING, and OUTPUT. + unsigned int valid_hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT) | + (1 << NF_IP_POST_ROUTING) | (1 << NF_IP_LOCAL_IN); + + EXPECT_EQ(info.valid_hooks, valid_hooks); + + // Each chain consists of an empty entry with a standard target.. + EXPECT_EQ(info.hook_entry[NF_IP_PRE_ROUTING], 0); + EXPECT_EQ(info.hook_entry[NF_IP_LOCAL_IN], kEmptyStandardEntrySize); + EXPECT_EQ(info.hook_entry[NF_IP_LOCAL_OUT], kEmptyStandardEntrySize * 2); + EXPECT_EQ(info.hook_entry[NF_IP_POST_ROUTING], kEmptyStandardEntrySize * 3); + + // The underflow points are the same as the entry points. + EXPECT_EQ(info.underflow[NF_IP_PRE_ROUTING], 0); + EXPECT_EQ(info.underflow[NF_IP_LOCAL_IN], kEmptyStandardEntrySize); + EXPECT_EQ(info.underflow[NF_IP_LOCAL_OUT], kEmptyStandardEntrySize * 2); + EXPECT_EQ(info.underflow[NF_IP_POST_ROUTING], kEmptyStandardEntrySize * 3); + + // One entry for each chain, plus an error entry at the end. + EXPECT_EQ(info.num_entries, 5); + + EXPECT_EQ(info.size, 4 * kEmptyStandardEntrySize + kEmptyErrorEntrySize); + EXPECT_EQ(strcmp(info.name, kNatTablename), 0); + + // + // Use info to get entries. + // + socklen_t entries_size = sizeof(struct ipt_get_entries) + info.size; + struct ipt_get_entries* entries = + static_cast(malloc(entries_size)); + snprintf(entries->name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename); + entries->size = info.size; + ASSERT_THAT( + getsockopt(s_, IPPROTO_IP, SO_GET_ENTRIES, entries, &entries_size), + SyscallSucceeds()); + + // Verify the name and size. + ASSERT_EQ(info.size, entries->size); + ASSERT_EQ(strcmp(entries->name, kNatTablename), 0); + + // Verify that the entrytable is 4 entries with accept targets and no matches + // followed by a single error target. + size_t entry_offset = 0; + while (entry_offset < entries->size) { + struct ipt_entry* entry = reinterpret_cast( + reinterpret_cast(entries->entrytable) + entry_offset); + + // ip should be zeroes. + struct ipt_ip zeroed = {}; + EXPECT_EQ(memcmp(static_cast(&zeroed), + static_cast(&entry->ip), sizeof(zeroed)), + 0); + + // target_offset should be zero. + EXPECT_EQ(entry->target_offset, sizeof(ipt_entry)); + + if (entry_offset < kEmptyStandardEntrySize * 4) { + // The first 4 entries are standard targets + struct ipt_standard_target* target = + reinterpret_cast(entry->elems); + EXPECT_EQ(entry->next_offset, kEmptyStandardEntrySize); + EXPECT_EQ(target->target.u.user.target_size, sizeof(*target)); + EXPECT_EQ(strcmp(target->target.u.user.name, ""), 0); + EXPECT_EQ(target->target.u.user.revision, 0); + // This is what's returned for an accept verdict. I don't know why. + EXPECT_EQ(target->verdict, -NF_ACCEPT - 1); + } else { + // The last entry is an error target + struct ipt_error_target* target = + reinterpret_cast(entry->elems); + EXPECT_EQ(entry->next_offset, kEmptyErrorEntrySize); + EXPECT_EQ(target->target.u.user.target_size, sizeof(*target)); + EXPECT_EQ(strcmp(target->target.u.user.name, kErrorTarget), 0); + EXPECT_EQ(target->target.u.user.revision, 0); + EXPECT_EQ(strcmp(target->errorname, kErrorTarget), 0); + } + + entry_offset += entry->next_offset; + } + + free(entries); +} + +} // namespace + +} // namespace testing +} // namespace gvisor -- cgit v1.2.3 From a63f88855f9c5d9340028e5828617a263dbf3023 Mon Sep 17 00:00:00 2001 From: Jianfeng Tan Date: Mon, 19 Aug 2019 12:09:08 -0700 Subject: hostinet: fix parsing route netlink message We wrongly parses output interface as gateway address. The fix is straightforward. Fixes #638 Signed-off-by: Jianfeng Tan Change-Id: Ia4bab31f3c238b0278ea57ab22590fad00eaf061 COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/684 from tanjianfeng:fix-638 b940e810367ad1273519bfa594f4371bdd293e83 PiperOrigin-RevId: 264211336 --- pkg/sentry/socket/hostinet/stack.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index 1902fe155..3a4fdec47 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -203,8 +203,14 @@ func ExtractHostRoutes(routeMsgs []syscall.NetlinkMessage) ([]inet.Route, error) inetRoute.DstAddr = attr.Value case syscall.RTA_SRC: inetRoute.SrcAddr = attr.Value - case syscall.RTA_OIF: + case syscall.RTA_GATEWAY: inetRoute.GatewayAddr = attr.Value + case syscall.RTA_OIF: + expected := int(binary.Size(inetRoute.OutputInterface)) + if len(attr.Value) != expected { + return nil, fmt.Errorf("RTM_GETROUTE returned RTM_NEWROUTE message with invalid attribute data length (%d bytes, expected %d bytes)", len(attr.Value), expected) + } + binary.Unmarshal(attr.Value, usermem.ByteOrder, &inetRoute.OutputInterface) } } -- cgit v1.2.3 From 3ffbdffd7e96e406fd3136818e0402cf0c2f0d4d Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Mon, 19 Aug 2019 12:41:48 -0700 Subject: Internal change. PiperOrigin-RevId: 264218306 --- pkg/tcpip/link/sniffer/sniffer.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index fc584c6a4..36c8c46fc 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -360,10 +360,9 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie if fragmentOffset == 0 && len(udp) >= header.UDPMinimumSize { srcPort = udp.SourcePort() dstPort = udp.DestinationPort() + details = fmt.Sprintf("xsum: 0x%x", udp.Checksum()) + size -= header.UDPMinimumSize } - size -= header.UDPMinimumSize - - details = fmt.Sprintf("xsum: 0x%x", udp.Checksum()) case header.TCPProtocolNumber: transName = "tcp" -- cgit v1.2.3