diff options
author | Ian Lewis <ianmlewis@gmail.com> | 2020-08-17 21:44:31 -0400 |
---|---|---|
committer | Ian Lewis <ianmlewis@gmail.com> | 2020-08-17 21:44:31 -0400 |
commit | ac324f646ee3cb7955b0b45a7453aeb9671cbdf1 (patch) | |
tree | 0cbc5018e8807421d701d190dc20525726c7ca76 /pkg/abi/linux | |
parent | 352ae1022ce19de28fc72e034cc469872ad79d06 (diff) | |
parent | 6d0c5803d557d453f15ac6f683697eeb46dab680 (diff) |
Merge branch 'master' into ip-forwarding
- Merges aleksej-paschenko's with HEAD
- Adds vfs2 support for ip_forward
Diffstat (limited to 'pkg/abi/linux')
34 files changed, 1814 insertions, 107 deletions
diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 51774c6b6..b5c5cc20b 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") # Package linux contains the constants and types needed to interface with a # Linux kernel. It should be used instead of syscall or golang.org/x/sys/unix @@ -11,6 +10,7 @@ go_library( name = "linux", srcs = [ "aio.go", + "arch_amd64.go", "audit.go", "bpf.go", "capability.go", @@ -18,17 +18,22 @@ go_library( "dev.go", "elf.go", "epoll.go", + "epoll_amd64.go", + "epoll_arm64.go", "errors.go", "eventfd.go", "exec.go", + "fadvise.go", "fcntl.go", "file.go", "file_amd64.go", "file_arm64.go", "fs.go", + "fuse.go", "futex.go", "inotify.go", "ioctl.go", + "ioctl_tun.go", "ip.go", "ipc.go", "limits.go", @@ -36,11 +41,15 @@ go_library( "mm.go", "netdevice.go", "netfilter.go", + "netfilter_ipv6.go", "netlink.go", "netlink_route.go", "poll.go", "prctl.go", "ptrace.go", + "ptrace_amd64.go", + "ptrace_arm64.go", + "rseq.go", "rusage.go", "sched.go", "seccomp.go", @@ -57,13 +66,17 @@ go_library( "uio.go", "utsname.go", "wait.go", + "xattr.go", ], - importpath = "gvisor.dev/gvisor/pkg/abi/linux", + marshal = True, visibility = ["//visibility:public"], deps = [ "//pkg/abi", "//pkg/binary", "//pkg/bits", + "//pkg/usermem", + "//tools/go_marshal/marshal", + "//tools/go_marshal/primitive", ], ) @@ -71,7 +84,7 @@ go_test( name = "linux_test", size = "small", srcs = ["netfilter_test.go"], - embed = [":linux"], + library = ":linux", deps = [ "//pkg/binary", ], diff --git a/pkg/abi/linux/aio.go b/pkg/abi/linux/aio.go index 3c6e0079d..86ee3f8b5 100644 --- a/pkg/abi/linux/aio.go +++ b/pkg/abi/linux/aio.go @@ -14,7 +14,63 @@ package linux +import "encoding/binary" + +// AIORingSize is sizeof(struct aio_ring). +const AIORingSize = 32 + +// I/O commands. const ( - // AIORingSize is sizeof(struct aio_ring). - AIORingSize = 32 + IOCB_CMD_PREAD = 0 + IOCB_CMD_PWRITE = 1 + IOCB_CMD_FSYNC = 2 + IOCB_CMD_FDSYNC = 3 + // 4 was the experimental IOCB_CMD_PREADX. + IOCB_CMD_POLL = 5 + IOCB_CMD_NOOP = 6 + IOCB_CMD_PREADV = 7 + IOCB_CMD_PWRITEV = 8 ) + +// I/O flags. +const ( + IOCB_FLAG_RESFD = 1 + IOCB_FLAG_IOPRIO = 2 +) + +// IOCallback describes an I/O request. +// +// The priority field is currently ignored in the implementation below. Also +// note that the IOCB_FLAG_RESFD feature is not supported. +type IOCallback struct { + Data uint64 + Key uint32 + _ uint32 + + OpCode uint16 + ReqPrio int16 + FD int32 + + Buf uint64 + Bytes uint64 + Offset int64 + + Reserved2 uint64 + Flags uint32 + + // eventfd to signal if IOCB_FLAG_RESFD is set in flags. + ResFD int32 +} + +// IOEvent describes an I/O result. +// +// +stateify savable +type IOEvent struct { + Data uint64 + Obj uint64 + Result int64 + Result2 int64 +} + +// IOEventSize is the size of an ioEvent encoded. +var IOEventSize = binary.Size(IOEvent{}) diff --git a/pkg/abi/linux/arch_amd64.go b/pkg/abi/linux/arch_amd64.go new file mode 100644 index 000000000..0be31e755 --- /dev/null +++ b/pkg/abi/linux/arch_amd64.go @@ -0,0 +1,23 @@ +// Copyright 2020 The gVisor Authors. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package linux + +// Start and end addresses of the vsyscall page. +const ( + VSyscallStartAddr uint64 = 0xffffffffff600000 + VSyscallEndAddr uint64 = 0xffffffffff601000 +) diff --git a/pkg/abi/linux/dev.go b/pkg/abi/linux/dev.go index 421e11256..192e2093b 100644 --- a/pkg/abi/linux/dev.go +++ b/pkg/abi/linux/dev.go @@ -36,9 +36,20 @@ func DecodeDeviceID(rdev uint32) (uint16, uint32) { // // See Documentations/devices.txt and uapi/linux/major.h. const ( + // UNNAMED_MAJOR is the major device number for "unnamed" devices, whose + // minor numbers are dynamically allocated by the kernel. + UNNAMED_MAJOR = 0 + + // MEM_MAJOR is the major device number for "memory" character devices. + MEM_MAJOR = 1 + // TTYAUX_MAJOR is the major device number for alternate TTY devices. TTYAUX_MAJOR = 5 + // MISC_MAJOR is the major device number for non-serial mice, misc feature + // devices. + MISC_MAJOR = 10 + // UNIX98_PTY_MASTER_MAJOR is the initial major device number for // Unix98 PTY masters. UNIX98_PTY_MASTER_MAJOR = 128 diff --git a/pkg/abi/linux/elf.go b/pkg/abi/linux/elf.go index 40f0459a0..7c9a02f20 100644 --- a/pkg/abi/linux/elf.go +++ b/pkg/abi/linux/elf.go @@ -102,4 +102,7 @@ const ( // NT_X86_XSTATE is for x86 extended state using xsave. NT_X86_XSTATE = 0x202 + + // NT_ARM_TLS is for ARM TLS register. + NT_ARM_TLS = 0x401 ) diff --git a/pkg/abi/linux/epoll.go b/pkg/abi/linux/epoll.go index 72083b604..1121a1a92 100644 --- a/pkg/abi/linux/epoll.go +++ b/pkg/abi/linux/epoll.go @@ -14,12 +14,9 @@ package linux -// EpollEvent is equivalent to struct epoll_event from epoll(2). -type EpollEvent struct { - Events uint32 - Fd int32 - Data int32 -} +import ( + "gvisor.dev/gvisor/pkg/binary" +) // Event masks. const ( @@ -38,8 +35,14 @@ const ( // Per-file descriptor flags. const ( - EPOLLET = 0x80000000 - EPOLLONESHOT = 0x40000000 + EPOLLEXCLUSIVE = 1 << 28 + EPOLLWAKEUP = 1 << 29 + EPOLLONESHOT = 1 << 30 + EPOLLET = 1 << 31 + + // EP_PRIVATE_BITS is fs/eventpoll.c:EP_PRIVATE_BITS, the set of all bits + // in an epoll event mask that correspond to flags rather than I/O events. + EP_PRIVATE_BITS = EPOLLEXCLUSIVE | EPOLLWAKEUP | EPOLLONESHOT | EPOLLET ) // Operation flags. @@ -54,3 +57,6 @@ const ( EPOLL_CTL_DEL = 0x2 EPOLL_CTL_MOD = 0x3 ) + +// SizeOfEpollEvent is the size of EpollEvent struct. +var SizeOfEpollEvent = int(binary.Size(EpollEvent{})) diff --git a/pkg/abi/linux/epoll_amd64.go b/pkg/abi/linux/epoll_amd64.go new file mode 100644 index 000000000..7e74b1143 --- /dev/null +++ b/pkg/abi/linux/epoll_amd64.go @@ -0,0 +1,29 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package linux + +// EpollEvent is equivalent to struct epoll_event from epoll(2). +// +// +marshal slice:EpollEventSlice +type EpollEvent struct { + Events uint32 + // Linux makes struct epoll_event::data a __u64. We represent it as + // [2]int32 because, on amd64, Linux also makes struct epoll_event + // __attribute__((packed)), such that there is no padding between Events + // and Data. + Data [2]int32 +} diff --git a/pkg/abi/linux/epoll_arm64.go b/pkg/abi/linux/epoll_arm64.go new file mode 100644 index 000000000..a35939cc9 --- /dev/null +++ b/pkg/abi/linux/epoll_arm64.go @@ -0,0 +1,28 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package linux + +// EpollEvent is equivalent to struct epoll_event from epoll(2). +// +// +marshal slice:EpollEventSlice +type EpollEvent struct { + Events uint32 + // Linux makes struct epoll_event a __u64, necessitating 4 bytes of padding + // here. + _ int32 + Data [2]int32 +} diff --git a/pkg/abi/linux/fadvise.go b/pkg/abi/linux/fadvise.go new file mode 100644 index 000000000..b06ff9964 --- /dev/null +++ b/pkg/abi/linux/fadvise.go @@ -0,0 +1,24 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +const ( + POSIX_FADV_NORMAL = 0 + POSIX_FADV_RANDOM = 1 + POSIX_FADV_SEQUENTIAL = 2 + POSIX_FADV_WILLNEED = 3 + POSIX_FADV_DONTNEED = 4 + POSIX_FADV_NOREUSE = 5 +) diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go index f78315ebf..9242e80a5 100644 --- a/pkg/abi/linux/fcntl.go +++ b/pkg/abi/linux/fcntl.go @@ -16,15 +16,17 @@ package linux // Commands from linux/fcntl.h. const ( - F_DUPFD = 0x0 - F_GETFD = 0x1 - F_SETFD = 0x2 - F_GETFL = 0x3 - F_SETFL = 0x4 - F_SETLK = 0x6 - F_SETLKW = 0x7 - F_SETOWN = 0x8 - F_GETOWN = 0x9 + F_DUPFD = 0 + F_GETFD = 1 + F_SETFD = 2 + F_GETFL = 3 + F_SETFL = 4 + F_SETLK = 6 + F_SETLKW = 7 + F_SETOWN = 8 + F_GETOWN = 9 + F_SETOWN_EX = 15 + F_GETOWN_EX = 16 F_DUPFD_CLOEXEC = 1024 + 6 F_SETPIPE_SZ = 1024 + 7 F_GETPIPE_SZ = 1024 + 8 @@ -32,9 +34,9 @@ const ( // Commands for F_SETLK. const ( - F_RDLCK = 0x0 - F_WRLCK = 0x1 - F_UNLCK = 0x2 + F_RDLCK = 0 + F_WRLCK = 1 + F_UNLCK = 2 ) // Flags for fcntl. @@ -42,7 +44,7 @@ const ( FD_CLOEXEC = 00000001 ) -// Lock structure for F_SETLK. +// Flock is the lock structure for F_SETLK. type Flock struct { Type int16 Whence int16 @@ -52,3 +54,16 @@ type Flock struct { Pid int32 _ [4]byte } + +// Owner types for F_SETOWN_EX and F_GETOWN_EX. +const ( + F_OWNER_TID = 0 + F_OWNER_PID = 1 + F_OWNER_PGRP = 2 +) + +// FOwnerEx is the owner structure for F_SETOWN_EX and F_GETOWN_EX. +type FOwnerEx struct { + Type int32 + PID int32 +} diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index c9ee098f4..e11ca2d62 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -24,27 +24,23 @@ import ( // Constants for open(2). const ( - O_ACCMODE = 000000003 - O_RDONLY = 000000000 - O_WRONLY = 000000001 - O_RDWR = 000000002 - O_CREAT = 000000100 - O_EXCL = 000000200 - O_NOCTTY = 000000400 - O_TRUNC = 000001000 - O_APPEND = 000002000 - O_NONBLOCK = 000004000 - O_DSYNC = 000010000 - O_ASYNC = 000020000 - O_DIRECT = 000040000 - O_LARGEFILE = 000100000 - O_DIRECTORY = 000200000 - O_NOFOLLOW = 000400000 - O_NOATIME = 001000000 - O_CLOEXEC = 002000000 - O_SYNC = 004000000 // __O_SYNC in Linux - O_PATH = 010000000 - O_TMPFILE = 020000000 // __O_TMPFILE in Linux + O_ACCMODE = 000000003 + O_RDONLY = 000000000 + O_WRONLY = 000000001 + O_RDWR = 000000002 + O_CREAT = 000000100 + O_EXCL = 000000200 + O_NOCTTY = 000000400 + O_TRUNC = 000001000 + O_APPEND = 000002000 + O_NONBLOCK = 000004000 + O_DSYNC = 000010000 + O_ASYNC = 000020000 + O_NOATIME = 001000000 + O_CLOEXEC = 002000000 + O_SYNC = 004000000 // __O_SYNC in Linux + O_PATH = 010000000 + O_TMPFILE = 020000000 // __O_TMPFILE in Linux ) // Constants for fstatat(2). @@ -144,9 +140,13 @@ const ( ModeCharacterDevice = S_IFCHR ModeNamedPipe = S_IFIFO - ModeSetUID = 04000 - ModeSetGID = 02000 - ModeSticky = 01000 + S_ISUID = 04000 + S_ISGID = 02000 + S_ISVTX = 01000 + + ModeSetUID = S_ISUID + ModeSetGID = S_ISGID + ModeSticky = S_ISVTX ModeUserAll = 0700 ModeUserRead = 0400 @@ -176,10 +176,24 @@ const ( DT_WHT = 14 ) +// DirentType are the friendly strings for linux_dirent64.d_type. +var DirentType = abi.ValueSet{ + DT_UNKNOWN: "DT_UNKNOWN", + DT_FIFO: "DT_FIFO", + DT_CHR: "DT_CHR", + DT_DIR: "DT_DIR", + DT_BLK: "DT_BLK", + DT_REG: "DT_REG", + DT_LNK: "DT_LNK", + DT_SOCK: "DT_SOCK", + DT_WHT: "DT_WHT", +} + // Values for preadv2/pwritev2. const ( - // Note: gVisor does not implement the RWF_HIPRI feature, but the flag is - // accepted as a valid flag argument for preadv2/pwritev2. + // NOTE(b/120162627): gVisor does not implement the RWF_HIPRI feature, but + // the flag is accepted as a valid flag argument for preadv2/pwritev2 and + // silently ignored. RWF_HIPRI = 0x00000001 RWF_DSYNC = 0x00000002 RWF_SYNC = 0x00000004 @@ -228,6 +242,8 @@ const ( ) // Statx represents struct statx. +// +// +marshal type Statx struct { Mask uint32 Blksize uint32 @@ -251,6 +267,9 @@ type Statx struct { DevMinor uint32 } +// SizeOfStatx is the size of a Statx struct. +var SizeOfStatx = binary.Size(Statx{}) + // FileMode represents a mode_t. type FileMode uint16 @@ -269,6 +288,11 @@ func (m FileMode) ExtraBits() FileMode { return m &^ (PermissionsMask | FileTypeMask) } +// IsDir returns true if file type represents a directory. +func (m FileMode) IsDir() bool { + return m.FileType() == S_IFDIR +} + // String returns a string representation of m. func (m FileMode) String() string { var s []string @@ -282,6 +306,29 @@ func (m FileMode) String() string { return strings.Join(s, "|") } +// DirentType maps file types to dirent types appropriate for (struct +// dirent)::d_type. +func (m FileMode) DirentType() uint8 { + switch m.FileType() { + case ModeSocket: + return DT_SOCK + case ModeSymlink: + return DT_LNK + case ModeRegular: + return DT_REG + case ModeBlockDevice: + return DT_BLK + case ModeDirectory: + return DT_DIR + case ModeCharacterDevice: + return DT_CHR + case ModeNamedPipe: + return DT_FIFO + default: + return DT_UNKNOWN + } +} + var modeExtraBits = abi.FlagSet{ { Flag: ModeSetUID, diff --git a/pkg/abi/linux/file_amd64.go b/pkg/abi/linux/file_amd64.go index 74c554be6..6b72364ea 100644 --- a/pkg/abi/linux/file_amd64.go +++ b/pkg/abi/linux/file_amd64.go @@ -12,9 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build amd64 + package linux +// Constants for open(2). +const ( + O_DIRECT = 000040000 + O_LARGEFILE = 000100000 + O_DIRECTORY = 000200000 + O_NOFOLLOW = 000400000 +) + // Stat represents struct stat. +// +// +marshal type Stat struct { Dev uint64 Ino uint64 diff --git a/pkg/abi/linux/file_arm64.go b/pkg/abi/linux/file_arm64.go index f16c07589..6492c9038 100644 --- a/pkg/abi/linux/file_arm64.go +++ b/pkg/abi/linux/file_arm64.go @@ -12,9 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build arm64 + package linux +// Constants for open(2). +const ( + O_DIRECTORY = 000040000 + O_NOFOLLOW = 000100000 + O_DIRECT = 000200000 + O_LARGEFILE = 000400000 +) + // Stat represents struct stat. +// +// +marshal type Stat struct { Dev uint64 Ino uint64 diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index b416e3472..158d2db5b 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -38,6 +38,8 @@ const ( ) // Statfs is struct statfs, from uapi/asm-generic/statfs.h. +// +// +marshal type Statfs struct { // Type is one of the filesystem magic values, defined above. Type uint64 @@ -92,3 +94,10 @@ const ( SYNC_FILE_RANGE_WRITE = 2 SYNC_FILE_RANGE_WAIT_AFTER = 4 ) + +// Flag argument to renameat2(2), from include/uapi/linux/fs.h. +const ( + RENAME_NOREPLACE = (1 << 0) // Don't overwrite target. + RENAME_EXCHANGE = (1 << 1) // Exchange src and dst. + RENAME_WHITEOUT = (1 << 2) // Whiteout src. +) diff --git a/pkg/abi/linux/fuse.go b/pkg/abi/linux/fuse.go new file mode 100644 index 000000000..7e30483ee --- /dev/null +++ b/pkg/abi/linux/fuse.go @@ -0,0 +1,303 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// +marshal +type FUSEOpcode uint32 + +// +marshal +type FUSEOpID uint64 + +// Opcodes for FUSE operations. Analogous to the opcodes in include/linux/fuse.h. +const ( + FUSE_LOOKUP FUSEOpcode = 1 + FUSE_FORGET = 2 /* no reply */ + FUSE_GETATTR = 3 + FUSE_SETATTR = 4 + FUSE_READLINK = 5 + FUSE_SYMLINK = 6 + _ + FUSE_MKNOD = 8 + FUSE_MKDIR = 9 + FUSE_UNLINK = 10 + FUSE_RMDIR = 11 + FUSE_RENAME = 12 + FUSE_LINK = 13 + FUSE_OPEN = 14 + FUSE_READ = 15 + FUSE_WRITE = 16 + FUSE_STATFS = 17 + FUSE_RELEASE = 18 + _ + FUSE_FSYNC = 20 + FUSE_SETXATTR = 21 + FUSE_GETXATTR = 22 + FUSE_LISTXATTR = 23 + FUSE_REMOVEXATTR = 24 + FUSE_FLUSH = 25 + FUSE_INIT = 26 + FUSE_OPENDIR = 27 + FUSE_READDIR = 28 + FUSE_RELEASEDIR = 29 + FUSE_FSYNCDIR = 30 + FUSE_GETLK = 31 + FUSE_SETLK = 32 + FUSE_SETLKW = 33 + FUSE_ACCESS = 34 + FUSE_CREATE = 35 + FUSE_INTERRUPT = 36 + FUSE_BMAP = 37 + FUSE_DESTROY = 38 + FUSE_IOCTL = 39 + FUSE_POLL = 40 + FUSE_NOTIFY_REPLY = 41 + FUSE_BATCH_FORGET = 42 +) + +const ( + // FUSE_MIN_READ_BUFFER is the minimum size the read can be for any FUSE filesystem. + // This is the minimum size Linux supports. See linux.fuse.h. + FUSE_MIN_READ_BUFFER uint32 = 8192 +) + +// FUSEHeaderIn is the header read by the daemon with each request. +// +// +marshal +type FUSEHeaderIn struct { + // Len specifies the total length of the data, including this header. + Len uint32 + + // Opcode specifies the kind of operation of the request. + Opcode FUSEOpcode + + // Unique specifies the unique identifier for this request. + Unique FUSEOpID + + // NodeID is the ID of the filesystem object being operated on. + NodeID uint64 + + // UID is the UID of the requesting process. + UID uint32 + + // GID is the GID of the requesting process. + GID uint32 + + // PID is the PID of the requesting process. + PID uint32 + + _ uint32 +} + +// FUSEHeaderOut is the header written by the daemon when it processes +// a request and wants to send a reply (almost all operations require a +// reply; if they do not, this will be explicitly documented). +// +// +marshal +type FUSEHeaderOut struct { + // Len specifies the total length of the data, including this header. + Len uint32 + + // Error specifies the error that occurred (0 if none). + Error int32 + + // Unique specifies the unique identifier of the corresponding request. + Unique FUSEOpID +} + +// FUSEWriteIn is the header written by a daemon when it makes a +// write request to the FUSE filesystem. +// +// +marshal +type FUSEWriteIn struct { + // Fh specifies the file handle that is being written to. + Fh uint64 + + // Offset is the offset of the write. + Offset uint64 + + // Size is the size of data being written. + Size uint32 + + // WriteFlags is the flags used during the write. + WriteFlags uint32 + + // LockOwner is the ID of the lock owner. + LockOwner uint64 + + // Flags is the flags for the request. + Flags uint32 + + _ uint32 +} + +// FUSE_INIT flags, consistent with the ones in include/uapi/linux/fuse.h. +const ( + FUSE_ASYNC_READ = 1 << 0 + FUSE_POSIX_LOCKS = 1 << 1 + FUSE_FILE_OPS = 1 << 2 + FUSE_ATOMIC_O_TRUNC = 1 << 3 + FUSE_EXPORT_SUPPORT = 1 << 4 + FUSE_BIG_WRITES = 1 << 5 + FUSE_DONT_MASK = 1 << 6 + FUSE_SPLICE_WRITE = 1 << 7 + FUSE_SPLICE_MOVE = 1 << 8 + FUSE_SPLICE_READ = 1 << 9 + FUSE_FLOCK_LOCKS = 1 << 10 + FUSE_HAS_IOCTL_DIR = 1 << 11 + FUSE_AUTO_INVAL_DATA = 1 << 12 + FUSE_DO_READDIRPLUS = 1 << 13 + FUSE_READDIRPLUS_AUTO = 1 << 14 + FUSE_ASYNC_DIO = 1 << 15 + FUSE_WRITEBACK_CACHE = 1 << 16 + FUSE_NO_OPEN_SUPPORT = 1 << 17 + FUSE_PARALLEL_DIROPS = 1 << 18 + FUSE_HANDLE_KILLPRIV = 1 << 19 + FUSE_POSIX_ACL = 1 << 20 + FUSE_ABORT_ERROR = 1 << 21 + FUSE_MAX_PAGES = 1 << 22 + FUSE_CACHE_SYMLINKS = 1 << 23 + FUSE_NO_OPENDIR_SUPPORT = 1 << 24 + FUSE_EXPLICIT_INVAL_DATA = 1 << 25 + FUSE_MAP_ALIGNMENT = 1 << 26 +) + +// currently supported FUSE protocol version numbers. +const ( + FUSE_KERNEL_VERSION = 7 + FUSE_KERNEL_MINOR_VERSION = 31 +) + +// FUSEInitIn is the request sent by the kernel to the daemon, +// to negotiate the version and flags. +// +// +marshal +type FUSEInitIn struct { + // Major version supported by kernel. + Major uint32 + + // Minor version supported by the kernel. + Minor uint32 + + // MaxReadahead is the maximum number of bytes to read-ahead + // decided by the kernel. + MaxReadahead uint32 + + // Flags of this init request. + Flags uint32 +} + +// FUSEInitOut is the reply sent by the daemon to the kernel +// for FUSEInitIn. +// +// +marshal +type FUSEInitOut struct { + // Major version supported by daemon. + Major uint32 + + // Minor version supported by daemon. + Minor uint32 + + // MaxReadahead is the maximum number of bytes to read-ahead. + // Decided by the daemon, after receiving the value from kernel. + MaxReadahead uint32 + + // Flags of this init reply. + Flags uint32 + + // MaxBackground is the maximum number of pending background requests + // that the daemon wants. + MaxBackground uint16 + + // CongestionThreshold is the daemon-decided threshold for + // the number of the pending background requests. + CongestionThreshold uint16 + + // MaxWrite is the daemon's maximum size of a write buffer. + // Kernel adjusts it to the minimum (fuse/init.go:fuseMinMaxWrite). + // if the value from daemon is too small. + MaxWrite uint32 + + // TimeGran is the daemon's time granularity for mtime and ctime metadata. + // The unit is nanosecond. + // Value should be power of 10. + // 1 indicates full nanosecond granularity support. + TimeGran uint32 + + // MaxPages is the daemon's maximum number of pages for one write operation. + // Kernel adjusts it to the maximum (fuse/init.go:FUSE_MAX_MAX_PAGES). + // if the value from daemon is too large. + MaxPages uint16 + + // MapAlignment is an unknown field and not used by this package at this moment. + // Use as a placeholder to be consistent with the FUSE protocol. + MapAlignment uint16 + + _ [8]uint32 +} + +// FUSEGetAttrIn is the request sent by the kernel to the daemon, +// to get the attribute of a inode. +// +// +marshal +type FUSEGetAttrIn struct { + // GetAttrFlags specifies whether getattr request is sent with a nodeid or + // with a file handle. + GetAttrFlags uint32 + + _ uint32 + + // Fh is the file handler when GetAttrFlags has FUSE_GETATTR_FH bit. If + // used, the operation is analogous to fstat(2). + Fh uint64 +} + +// FUSEAttr is the struct used in the reponse FUSEGetAttrOut. +// +// +marshal +type FUSEAttr struct { + Ino uint64 + Size uint64 + Blocks uint64 + Atime uint64 + Mtime uint64 + Ctime uint64 + AtimeNsec uint32 + MtimeNsec uint32 + CtimeNsec uint32 + Mode uint32 + Nlink uint32 + UID uint32 + GID uint32 + Rdev uint32 + BlkSize uint32 + _ uint32 +} + +// FUSEGetAttrOut is the reply sent by the daemon to the kernel +// for FUSEGetAttrIn. +// +// +marshal +type FUSEGetAttrOut struct { + // AttrValid and AttrValidNsec describe the attribute cache duration + AttrValid uint64 + + // AttrValidNsec is the nanosecond part of the attribute cache duration + AttrValidNsec uint32 + + _ uint32 + + // Attr contains the metadata returned from the FUSE server + Attr FUSEAttr +} diff --git a/pkg/abi/linux/futex.go b/pkg/abi/linux/futex.go index 08bfde3b5..8138088a6 100644 --- a/pkg/abi/linux/futex.go +++ b/pkg/abi/linux/futex.go @@ -60,3 +60,21 @@ const ( FUTEX_WAITERS = 0x80000000 FUTEX_OWNER_DIED = 0x40000000 ) + +// FUTEX_BITSET_MATCH_ANY has all bits set. +const FUTEX_BITSET_MATCH_ANY = 0xffffffff + +// ROBUST_LIST_LIMIT protects against a deliberately circular list. +const ROBUST_LIST_LIMIT = 2048 + +// RobustListHead corresponds to Linux's struct robust_list_head. +// +// +marshal +type RobustListHead struct { + List uint64 + FutexOffset uint64 + ListOpPending uint64 +} + +// SizeOfRobustListHead is the size of a RobustListHead struct. +var SizeOfRobustListHead = (*RobustListHead)(nil).SizeBytes() diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go index 0e18db9ef..2c5e56ae5 100644 --- a/pkg/abi/linux/ioctl.go +++ b/pkg/abi/linux/ioctl.go @@ -67,8 +67,53 @@ const ( // ioctl(2) requests provided by uapi/linux/sockios.h const ( - SIOCGIFMEM = 0x891f - SIOCGIFPFLAGS = 0x8935 - SIOCGMIIPHY = 0x8947 - SIOCGMIIREG = 0x8948 + SIOCGIFNAME = 0x8910 + SIOCGIFCONF = 0x8912 + SIOCGIFFLAGS = 0x8913 + SIOCGIFADDR = 0x8915 + SIOCGIFDSTADDR = 0x8917 + SIOCGIFBRDADDR = 0x8919 + SIOCGIFNETMASK = 0x891b + SIOCGIFMETRIC = 0x891d + SIOCGIFMTU = 0x8921 + SIOCGIFMEM = 0x891f + SIOCGIFHWADDR = 0x8927 + SIOCGIFINDEX = 0x8933 + SIOCGIFPFLAGS = 0x8935 + SIOCGIFTXQLEN = 0x8942 + SIOCETHTOOL = 0x8946 + SIOCGMIIPHY = 0x8947 + SIOCGMIIREG = 0x8948 + SIOCGIFMAP = 0x8970 ) + +// ioctl(2) requests provided by uapi/asm-generic/sockios.h +const ( + SIOCGSTAMP = 0x8906 +) + +// ioctl(2) directions. Used to calculate requests number. +// Constants from asm-generic/ioctl.h. +const ( + _IOC_NONE = 0 + _IOC_WRITE = 1 + _IOC_READ = 2 +) + +// Constants from asm-generic/ioctl.h. +const ( + _IOC_NRBITS = 8 + _IOC_TYPEBITS = 8 + _IOC_SIZEBITS = 14 + _IOC_DIRBITS = 2 + + _IOC_NRSHIFT = 0 + _IOC_TYPESHIFT = _IOC_NRSHIFT + _IOC_NRBITS + _IOC_SIZESHIFT = _IOC_TYPESHIFT + _IOC_TYPEBITS + _IOC_DIRSHIFT = _IOC_SIZESHIFT + _IOC_SIZEBITS +) + +// IOC outputs the result of _IOC macro in asm-generic/ioctl.h. +func IOC(dir, typ, nr, size uint32) uint32 { + return uint32(dir)<<_IOC_DIRSHIFT | typ<<_IOC_TYPESHIFT | nr<<_IOC_NRSHIFT | size<<_IOC_SIZESHIFT +} diff --git a/pkg/abi/linux/ioctl_tun.go b/pkg/abi/linux/ioctl_tun.go new file mode 100644 index 000000000..c59c9c136 --- /dev/null +++ b/pkg/abi/linux/ioctl_tun.go @@ -0,0 +1,29 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// ioctl(2) request numbers from linux/if_tun.h +var ( + TUNSETIFF = IOC(_IOC_WRITE, 'T', 202, 4) + TUNGETIFF = IOC(_IOC_READ, 'T', 210, 4) +) + +// Flags from net/if_tun.h +const ( + IFF_TUN = 0x0001 + IFF_TAP = 0x0002 + IFF_NO_PI = 0x1000 + IFF_NOFILTER = 0x1000 +) diff --git a/pkg/abi/linux/ip.go b/pkg/abi/linux/ip.go index 31e56ffa6..ef6d1093e 100644 --- a/pkg/abi/linux/ip.go +++ b/pkg/abi/linux/ip.go @@ -92,6 +92,16 @@ const ( IP_UNICAST_IF = 50 ) +// IP_MTU_DISCOVER values from uapi/linux/in.h +const ( + IP_PMTUDISC_DONT = 0 + IP_PMTUDISC_WANT = 1 + IP_PMTUDISC_DO = 2 + IP_PMTUDISC_PROBE = 3 + IP_PMTUDISC_INTERFACE = 4 + IP_PMTUDISC_OMIT = 5 +) + // Socket options from uapi/linux/in6.h const ( IPV6_ADDRFORM = 1 diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go index cd043dac3..07cc1895e 100644 --- a/pkg/abi/linux/mm.go +++ b/pkg/abi/linux/mm.go @@ -90,14 +90,19 @@ const ( MS_SYNC = 1 << 2 ) +// NumaPolicy is the NUMA memory policy for a memory range. See numa(7). +// +// +marshal +type NumaPolicy int32 + // Policies for get_mempolicy(2)/set_mempolicy(2). const ( - MPOL_DEFAULT = 0 - MPOL_PREFERRED = 1 - MPOL_BIND = 2 - MPOL_INTERLEAVE = 3 - MPOL_LOCAL = 4 - MPOL_MAX = 5 + MPOL_DEFAULT NumaPolicy = 0 + MPOL_PREFERRED NumaPolicy = 1 + MPOL_BIND NumaPolicy = 2 + MPOL_INTERLEAVE NumaPolicy = 3 + MPOL_LOCAL NumaPolicy = 4 + MPOL_MAX NumaPolicy = 5 ) // Flags for get_mempolicy(2). diff --git a/pkg/abi/linux/netdevice.go b/pkg/abi/linux/netdevice.go index 7866352b4..0faf015c7 100644 --- a/pkg/abi/linux/netdevice.go +++ b/pkg/abi/linux/netdevice.go @@ -22,6 +22,8 @@ const ( ) // IFReq is an interface request. +// +// +marshal type IFReq struct { // IFName is an encoded name, normally null-terminated. This should be // accessed via the Name and SetName functions. @@ -79,6 +81,8 @@ type IFMap struct { // IFConf is used to return a list of interfaces and their addresses. See // netdevice(7) and struct ifconf for more detail on its use. +// +// +marshal type IFConf struct { Len int32 _ [4]byte // Pad to sizeof(struct ifconf). diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go index 269ba5567..91e35366f 100644 --- a/pkg/abi/linux/netfilter.go +++ b/pkg/abi/linux/netfilter.go @@ -14,6 +14,14 @@ package linux +import ( + "io" + + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/tools/go_marshal/marshal" + "gvisor.dev/gvisor/tools/go_marshal/primitive" +) + // This file contains structures required to support netfilter, specifically // the iptables tool. @@ -42,7 +50,16 @@ const ( NF_RETURN = -NF_REPEAT - 1 ) -// Socket options. These correspond to values in +// VerdictStrings maps int verdicts to the strings they represent. It is used +// for debugging. +var VerdictStrings = map[int32]string{ + -NF_DROP - 1: "DROP", + -NF_ACCEPT - 1: "ACCEPT", + -NF_QUEUE - 1: "QUEUE", + NF_RETURN: "RETURN", +} + +// Socket options for SOL_SOCKET. These correspond to values in // include/uapi/linux/netfilter_ipv4/ip_tables.h. const ( IPT_BASE_CTL = 64 @@ -57,6 +74,12 @@ const ( IPT_SO_GET_MAX = IPT_SO_GET_REVISION_TARGET ) +// Socket option for SOL_IP. This corresponds to the value in +// include/uapi/linux/netfilter_ipv4.h. +const ( + SO_ORIGINAL_DST = 80 +) + // Name lengths. These correspond to values in // include/uapi/linux/netfilter/x_tables.h. const ( @@ -67,6 +90,8 @@ const ( // IPTEntry is an iptable rule. It corresponds to struct ipt_entry in // include/uapi/linux/netfilter_ipv4/ip_tables.h. +// +// +marshal type IPTEntry struct { // IP is used to filter packets based on the IP header. IP IPTIP @@ -103,21 +128,41 @@ type IPTEntry struct { // SizeOfIPTEntry is the size of an IPTEntry. const SizeOfIPTEntry = 112 -// KernelIPTEntry is identical to IPTEntry, but includes the Elems field. This -// struct marshaled via the binary package to write an IPTEntry to userspace. +// KernelIPTEntry is identical to IPTEntry, but includes the Elems field. +// KernelIPTEntry itself is not Marshallable but it implements some methods of +// marshal.Marshallable that help in other implementations of Marshallable. type KernelIPTEntry struct { - IPTEntry + Entry IPTEntry // Elems holds the data for all this rule's matches followed by the // target. It is variable length -- users have to iterate over any // matches and use TargetOffset and NextOffset to make sense of the // data. - Elems []byte + Elems primitive.ByteSlice +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (ke *KernelIPTEntry) SizeBytes() int { + return ke.Entry.SizeBytes() + ke.Elems.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (ke *KernelIPTEntry) MarshalBytes(dst []byte) { + ke.Entry.MarshalBytes(dst) + ke.Elems.MarshalBytes(dst[ke.Entry.SizeBytes():]) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (ke *KernelIPTEntry) UnmarshalBytes(src []byte) { + ke.Entry.UnmarshalBytes(src) + ke.Elems.UnmarshalBytes(src[ke.Entry.SizeBytes():]) } // IPTIP contains information for matching a packet's IP header. // It corresponds to struct ipt_ip in // include/uapi/linux/netfilter_ipv4/ip_tables.h. +// +// +marshal type IPTIP struct { // Src is the source IP address. Src InetAddr @@ -137,7 +182,7 @@ type IPTIP struct { // OutputInterface is the output network interface. OutputInterface [IFNAMSIZ]byte - // InputInterfaceMask is the intput interface mask. + // InputInterfaceMask is the input interface mask. InputInterfaceMask [IFNAMSIZ]byte // OuputInterfaceMask is the output interface mask. @@ -149,15 +194,39 @@ type IPTIP struct { // Flags define matching behavior for the IP header. Flags uint8 - // InverseFlags invert the meaning of fields in struct IPTIP. + // InverseFlags invert the meaning of fields in struct IPTIP. See the + // IPT_INV_* flags. InverseFlags uint8 } +// Flags in IPTIP.InverseFlags. Corresponding constants are in +// include/uapi/linux/netfilter_ipv4/ip_tables.h. +const ( + // Invert the meaning of InputInterface. + IPT_INV_VIA_IN = 0x01 + // Invert the meaning of OutputInterface. + IPT_INV_VIA_OUT = 0x02 + // Unclear what this is, as no references to it exist in the kernel. + IPT_INV_TOS = 0x04 + // Invert the meaning of Src. + IPT_INV_SRCIP = 0x08 + // Invert the meaning of Dst. + IPT_INV_DSTIP = 0x10 + // Invert the meaning of the IPT_F_FRAG flag. + IPT_INV_FRAG = 0x20 + // Invert the meaning of the Protocol field. + IPT_INV_PROTO = 0x40 + // Enable all flags. + IPT_INV_MASK = 0x7F +) + // SizeOfIPTIP is the size of an IPTIP. const SizeOfIPTIP = 84 // XTCounters holds packet and byte counts for a rule. It corresponds to struct // xt_counters in include/uapi/linux/netfilter/x_tables.h. +// +// +marshal type XTCounters struct { // Pcnt is the packet count. Pcnt uint64 @@ -179,7 +248,7 @@ const SizeOfXTCounters = 16 // the user data. type XTEntryMatch struct { MatchSize uint16 - Name [XT_EXTENSION_MAXNAMELEN]byte + Name ExtensionName Revision uint8 // Data is omitted here because it would cause XTEntryMatch to be an // extra byte larger (see http://www.catb.org/esr/structure-packing/). @@ -189,6 +258,13 @@ type XTEntryMatch struct { // SizeOfXTEntryMatch is the size of an XTEntryMatch. const SizeOfXTEntryMatch = 32 +// KernelXTEntryMatch is identical to XTEntryMatch, but contains +// variable-length Data field. +type KernelXTEntryMatch struct { + XTEntryMatch + Data []byte +} + // XTEntryTarget holds a target for a rule. For example, it can specify that // packets matching the rule should DROP, ACCEPT, or use an extension target. // iptables-extension(8) has a list of possible targets. @@ -199,7 +275,7 @@ const SizeOfXTEntryMatch = 32 // the user data. type XTEntryTarget struct { TargetSize uint16 - Name [XT_EXTENSION_MAXNAMELEN]byte + Name ExtensionName Revision uint8 // Data is omitted here because it would cause XTEntryTarget to be an // extra byte larger (see http://www.catb.org/esr/structure-packing/). @@ -209,11 +285,14 @@ type XTEntryTarget struct { // SizeOfXTEntryTarget is the size of an XTEntryTarget. const SizeOfXTEntryTarget = 32 -// XTStandardTarget is a builtin target, one of ACCEPT, DROP, JUMP, QUEUE, or -// RETURN. It corresponds to struct xt_standard_target in +// XTStandardTarget is a built-in target, one of ACCEPT, DROP, JUMP, QUEUE, +// RETURN, or jump. It corresponds to struct xt_standard_target in // include/uapi/linux/netfilter/x_tables.h. type XTStandardTarget struct { - Target XTEntryTarget + Target XTEntryTarget + // A positive verdict indicates a jump, and is the offset from the + // start of the table to jump to. A negative value means one of the + // other built-in targets. Verdict int32 _ [4]byte } @@ -226,18 +305,64 @@ const SizeOfXTStandardTarget = 40 // ErrorName. It corresponds to struct xt_error_target in // include/uapi/linux/netfilter/x_tables.h. type XTErrorTarget struct { - Target XTEntryTarget - ErrorName [XT_FUNCTION_MAXNAMELEN]byte - _ [2]byte + Target XTEntryTarget + Name ErrorName + _ [2]byte } // SizeOfXTErrorTarget is the size of an XTErrorTarget. const SizeOfXTErrorTarget = 64 +// Flag values for NfNATIPV4Range. The values indicate whether to map +// protocol specific part(ports) or IPs. It corresponds to values in +// include/uapi/linux/netfilter/nf_nat.h. +const ( + NF_NAT_RANGE_MAP_IPS = 1 << 0 + NF_NAT_RANGE_PROTO_SPECIFIED = 1 << 1 + NF_NAT_RANGE_PROTO_RANDOM = 1 << 2 + NF_NAT_RANGE_PERSISTENT = 1 << 3 + NF_NAT_RANGE_PROTO_RANDOM_FULLY = 1 << 4 + NF_NAT_RANGE_PROTO_RANDOM_ALL = (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY) + NF_NAT_RANGE_MASK = (NF_NAT_RANGE_MAP_IPS | + NF_NAT_RANGE_PROTO_SPECIFIED | NF_NAT_RANGE_PROTO_RANDOM | + NF_NAT_RANGE_PERSISTENT | NF_NAT_RANGE_PROTO_RANDOM_FULLY) +) + +// NfNATIPV4Range corresponds to struct nf_nat_ipv4_range +// in include/uapi/linux/netfilter/nf_nat.h. The fields are in +// network byte order. +type NfNATIPV4Range struct { + Flags uint32 + MinIP [4]byte + MaxIP [4]byte + MinPort uint16 + MaxPort uint16 +} + +// NfNATIPV4MultiRangeCompat corresponds to struct +// nf_nat_ipv4_multi_range_compat in include/uapi/linux/netfilter/nf_nat.h. +type NfNATIPV4MultiRangeCompat struct { + RangeSize uint32 + RangeIPV4 NfNATIPV4Range +} + +// XTRedirectTarget triggers a redirect when reached. +// Adding 4 bytes of padding to make the struct 8 byte aligned. +type XTRedirectTarget struct { + Target XTEntryTarget + NfRange NfNATIPV4MultiRangeCompat + _ [4]byte +} + +// SizeOfXTRedirectTarget is the size of an XTRedirectTarget. +const SizeOfXTRedirectTarget = 56 + // IPTGetinfo is the argument for the IPT_SO_GET_INFO sockopt. It corresponds // to struct ipt_getinfo in include/uapi/linux/netfilter_ipv4/ip_tables.h. +// +// +marshal type IPTGetinfo struct { - Name [XT_TABLE_MAXNAMELEN]byte + Name TableName ValidHooks uint32 HookEntry [NF_INET_NUMHOOKS]uint32 Underflow [NF_INET_NUMHOOKS]uint32 @@ -248,16 +373,13 @@ type IPTGetinfo struct { // SizeOfIPTGetinfo is the size of an IPTGetinfo. const SizeOfIPTGetinfo = 84 -// TableName returns the table name. -func (info *IPTGetinfo) TableName() string { - return tableName(info.Name[:]) -} - // IPTGetEntries is the argument for the IPT_SO_GET_ENTRIES sockopt. It // corresponds to struct ipt_get_entries in // include/uapi/linux/netfilter_ipv4/ip_tables.h. +// +// +marshal type IPTGetEntries struct { - Name [XT_TABLE_MAXNAMELEN]byte + Name TableName Size uint32 _ [4]byte // Entrytable is omitted here because it would cause IPTGetEntries to @@ -266,34 +388,112 @@ type IPTGetEntries struct { // Entrytable [0]IPTEntry } -// TableName returns the entries' table name. -func (entries *IPTGetEntries) TableName() string { - return tableName(entries.Name[:]) -} - // SizeOfIPTGetEntries is the size of an IPTGetEntries. const SizeOfIPTGetEntries = 40 -// KernelIPTGetEntries is identical to IPTEntry, but includes the Elems field. -// This struct marshaled via the binary package to write an KernelIPTGetEntries -// to userspace. +// KernelIPTGetEntries is identical to IPTGetEntries, but includes the +// Entrytable field. This has been manually made marshal.Marshallable since it +// is dynamically sized. type KernelIPTGetEntries struct { - Name [XT_TABLE_MAXNAMELEN]byte - Size uint32 - _ [4]byte + IPTGetEntries Entrytable []KernelIPTEntry } -// TableName returns the entries' table name. -func (entries *KernelIPTGetEntries) TableName() string { - return tableName(entries.Name[:]) +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (ke *KernelIPTGetEntries) SizeBytes() int { + res := ke.IPTGetEntries.SizeBytes() + for _, entry := range ke.Entrytable { + res += entry.SizeBytes() + } + return res +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (ke *KernelIPTGetEntries) MarshalBytes(dst []byte) { + ke.IPTGetEntries.MarshalBytes(dst) + marshalledUntil := ke.IPTGetEntries.SizeBytes() + for i := range ke.Entrytable { + ke.Entrytable[i].MarshalBytes(dst[marshalledUntil:]) + marshalledUntil += ke.Entrytable[i].SizeBytes() + } +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (ke *KernelIPTGetEntries) UnmarshalBytes(src []byte) { + ke.IPTGetEntries.UnmarshalBytes(src) + unmarshalledUntil := ke.IPTGetEntries.SizeBytes() + for i := range ke.Entrytable { + ke.Entrytable[i].UnmarshalBytes(src[unmarshalledUntil:]) + unmarshalledUntil += ke.Entrytable[i].SizeBytes() + } +} + +// Packed implements marshal.Marshallable.Packed. +func (ke *KernelIPTGetEntries) Packed() bool { + // KernelIPTGetEntries isn't packed because the ke.Entrytable contains an + // indirection to the actual data we want to marshal (the slice data + // pointer), and the memory for KernelIPTGetEntries contains the slice + // header which we don't want to marshal. + return false +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (ke *KernelIPTGetEntries) MarshalUnsafe(dst []byte) { + // Fall back to safe Marshal because the type in not packed. + ke.MarshalBytes(dst) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (ke *KernelIPTGetEntries) UnmarshalUnsafe(src []byte) { + // Fall back to safe Unmarshal because the type in not packed. + ke.UnmarshalBytes(src) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +func (ke *KernelIPTGetEntries) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + buf := task.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay. + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Unmarshal unconditionally. If we had a short copy-in, this results in a + // partially unmarshalled struct. + ke.UnmarshalBytes(buf) // escapes: fallback. + return length, err } +// CopyOut implements marshal.Marshallable.CopyOut. +func (ke *KernelIPTGetEntries) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + // Type KernelIPTGetEntries doesn't have a packed layout in memory, fall + // back to MarshalBytes. + return task.CopyOutBytes(addr, ke.marshalAll(task)) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +func (ke *KernelIPTGetEntries) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Type KernelIPTGetEntries doesn't have a packed layout in memory, fall + // back to MarshalBytes. + return task.CopyOutBytes(addr, ke.marshalAll(task)[:limit]) +} + +func (ke *KernelIPTGetEntries) marshalAll(task marshal.Task) []byte { + buf := task.CopyScratchBuffer(ke.SizeBytes()) + ke.MarshalBytes(buf) + return buf +} + +// WriteTo implements io.WriterTo.WriteTo. +func (ke *KernelIPTGetEntries) WriteTo(w io.Writer) (int64, error) { + buf := make([]byte, ke.SizeBytes()) + ke.MarshalBytes(buf) + length, err := w.Write(buf) + return int64(length), err +} + +var _ marshal.Marshallable = (*KernelIPTGetEntries)(nil) + // IPTReplace is the argument for the IPT_SO_SET_REPLACE sockopt. It // corresponds to struct ipt_replace in // include/uapi/linux/netfilter_ipv4/ip_tables.h. type IPTReplace struct { - Name [XT_TABLE_MAXNAMELEN]byte + Name TableName ValidHooks uint32 NumEntries uint32 Size uint32 @@ -309,11 +509,172 @@ type IPTReplace struct { // SizeOfIPTReplace is the size of an IPTReplace. const SizeOfIPTReplace = 96 -func tableName(name []byte) string { - for i, c := range name { +// ExtensionName holds the name of a netfilter extension. +type ExtensionName [XT_EXTENSION_MAXNAMELEN]byte + +// String implements fmt.Stringer. +func (en ExtensionName) String() string { + return goString(en[:]) +} + +// TableName holds the name of a netfilter table. +// +// +marshal +type TableName [XT_TABLE_MAXNAMELEN]byte + +// String implements fmt.Stringer. +func (tn TableName) String() string { + return goString(tn[:]) +} + +// ErrorName holds the name of a netfilter error. These can also hold +// user-defined chains. +type ErrorName [XT_FUNCTION_MAXNAMELEN]byte + +// String implements fmt.Stringer. +func (en ErrorName) String() string { + return goString(en[:]) +} + +func goString(cstring []byte) string { + for i, c := range cstring { if c == 0 { - return string(name[:i]) + return string(cstring[:i]) } } - return string(name) + return string(cstring) +} + +// XTTCP holds data for matching TCP packets. It corresponds to struct xt_tcp +// in include/uapi/linux/netfilter/xt_tcpudp.h. +type XTTCP struct { + // SourcePortStart specifies the inclusive start of the range of source + // ports to which the matcher applies. + SourcePortStart uint16 + + // SourcePortEnd specifies the inclusive end of the range of source ports + // to which the matcher applies. + SourcePortEnd uint16 + + // DestinationPortStart specifies the start of the destination port + // range to which the matcher applies. + DestinationPortStart uint16 + + // DestinationPortEnd specifies the end of the destination port + // range to which the matcher applies. + DestinationPortEnd uint16 + + // Option specifies that a particular TCP option must be set. + Option uint8 + + // FlagMask masks TCP flags when comparing to the FlagCompare byte. It allows + // for specification of which flags are important to the matcher. + FlagMask uint8 + + // FlagCompare, in combination with FlagMask, is used to match only packets + // that have certain flags set. + FlagCompare uint8 + + // InverseFlags flips the meaning of certain fields. See the + // TX_TCP_INV_* flags. + InverseFlags uint8 } + +// SizeOfXTTCP is the size of an XTTCP. +const SizeOfXTTCP = 12 + +// Flags in XTTCP.InverseFlags. Corresponding constants are in +// include/uapi/linux/netfilter/xt_tcpudp.h. +const ( + // Invert the meaning of SourcePortStart/End. + XT_TCP_INV_SRCPT = 0x01 + // Invert the meaning of DestinationPortStart/End. + XT_TCP_INV_DSTPT = 0x02 + // Invert the meaning of FlagCompare. + XT_TCP_INV_FLAGS = 0x04 + // Invert the meaning of Option. + XT_TCP_INV_OPTION = 0x08 + // Enable all flags. + XT_TCP_INV_MASK = 0x0F +) + +// XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp +// in include/uapi/linux/netfilter/xt_tcpudp.h. +type XTUDP struct { + // SourcePortStart is the inclusive start of the range of source ports + // to which the matcher applies. + SourcePortStart uint16 + + // SourcePortEnd is the inclusive end of the range of source ports to + // which the matcher applies. + SourcePortEnd uint16 + + // DestinationPortStart is the inclusive start of the destination port + // range to which the matcher applies. + DestinationPortStart uint16 + + // DestinationPortEnd is the inclusive end of the destination port + // range to which the matcher applies. + DestinationPortEnd uint16 + + // InverseFlags flips the meaning of certain fields. See the + // TX_UDP_INV_* flags. + InverseFlags uint8 + + _ uint8 +} + +// SizeOfXTUDP is the size of an XTUDP. +const SizeOfXTUDP = 10 + +// Flags in XTUDP.InverseFlags. Corresponding constants are in +// include/uapi/linux/netfilter/xt_tcpudp.h. +const ( + // Invert the meaning of SourcePortStart/End. + XT_UDP_INV_SRCPT = 0x01 + // Invert the meaning of DestinationPortStart/End. + XT_UDP_INV_DSTPT = 0x02 + // Enable all flags. + XT_UDP_INV_MASK = 0x03 +) + +// IPTOwnerInfo holds data for matching packets with owner. It corresponds +// to struct ipt_owner_info in libxt_owner.c of iptables binary. +type IPTOwnerInfo struct { + // UID is user id which created the packet. + UID uint32 + + // GID is group id which created the packet. + GID uint32 + + // PID is process id of the process which created the packet. + PID uint32 + + // SID is session id which created the packet. + SID uint32 + + // Comm is the command name which created the packet. + Comm [16]byte + + // Match is used to match UID/GID of the socket. See the + // XT_OWNER_* flags below. + Match uint8 + + // Invert flips the meaning of Match field. + Invert uint8 +} + +// SizeOfIPTOwnerInfo is the size of an XTOwnerMatchInfo. +const SizeOfIPTOwnerInfo = 34 + +// Flags in IPTOwnerInfo.Match. Corresponding constants are in +// include/uapi/linux/netfilter/xt_owner.h. +const ( + // Match the UID of the packet. + XT_OWNER_UID = 1 << 0 + // Match the GID of the packet. + XT_OWNER_GID = 1 << 1 + // Match if the socket exists for the packet. Forwarded + // packets do not have an associated socket. + XT_OWNER_SOCKET = 1 << 2 +) diff --git a/pkg/abi/linux/netfilter_ipv6.go b/pkg/abi/linux/netfilter_ipv6.go new file mode 100644 index 000000000..9bb9efb10 --- /dev/null +++ b/pkg/abi/linux/netfilter_ipv6.go @@ -0,0 +1,310 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "io" + + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/tools/go_marshal/marshal" + "gvisor.dev/gvisor/tools/go_marshal/primitive" +) + +// This file contains structures required to support IPv6 netfilter and +// ip6tables. Some constants and structs are equal to their IPv4 analogues, and +// are only distinguished by context (e.g. whether used on an IPv4 of IPv6 +// socket). + +// Socket options for SOL_SOCLET. These correspond to values in +// include/uapi/linux/netfilter_ipv6/ip6_tables.h. +const ( + IP6T_BASE_CTL = 64 + IP6T_SO_SET_REPLACE = IPT_BASE_CTL + IP6T_SO_SET_ADD_COUNTERS = IPT_BASE_CTL + 1 + IP6T_SO_SET_MAX = IPT_SO_SET_ADD_COUNTERS + + IP6T_SO_GET_INFO = IPT_BASE_CTL + IP6T_SO_GET_ENTRIES = IPT_BASE_CTL + 1 + IP6T_SO_GET_REVISION_MATCH = IPT_BASE_CTL + 4 + IP6T_SO_GET_REVISION_TARGET = IPT_BASE_CTL + 5 + IP6T_SO_GET_MAX = IP6T_SO_GET_REVISION_TARGET +) + +// IP6T_ORIGINAL_DST is the ip6tables SOL_IPV6 socket option. Corresponds to +// the value in include/uapi/linux/netfilter_ipv6/ip6_tables.h. +// TODO(gvisor.dev/issue/3549): Support IPv6 original destination. +const IP6T_ORIGINAL_DST = 80 + +// IP6TReplace is the argument for the IP6T_SO_SET_REPLACE sockopt. It +// corresponds to struct ip6t_replace in +// include/uapi/linux/netfilter_ipv6/ip6_tables.h. +// +// +marshal +type IP6TReplace struct { + Name TableName + ValidHooks uint32 + NumEntries uint32 + Size uint32 + HookEntry [NF_INET_NUMHOOKS]uint32 + Underflow [NF_INET_NUMHOOKS]uint32 + NumCounters uint32 + Counters uint64 // This is really a *XTCounters. + // Entries is omitted here because it would cause IP6TReplace to be an + // extra byte longer (see http://www.catb.org/esr/structure-packing/). + // Entries [0]IP6TEntry +} + +// SizeOfIP6TReplace is the size of an IP6TReplace. +const SizeOfIP6TReplace = 96 + +// KernelIP6TGetEntries is identical to IP6TGetEntries, but includes the +// Entrytable field. This has been manually made marshal.Marshallable since it +// is dynamically sized. +type KernelIP6TGetEntries struct { + IPTGetEntries + Entrytable []KernelIP6TEntry +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (ke *KernelIP6TGetEntries) SizeBytes() int { + res := ke.IPTGetEntries.SizeBytes() + for _, entry := range ke.Entrytable { + res += entry.SizeBytes() + } + return res +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (ke *KernelIP6TGetEntries) MarshalBytes(dst []byte) { + ke.IPTGetEntries.MarshalBytes(dst) + marshalledUntil := ke.IPTGetEntries.SizeBytes() + for i := range ke.Entrytable { + ke.Entrytable[i].MarshalBytes(dst[marshalledUntil:]) + marshalledUntil += ke.Entrytable[i].SizeBytes() + } +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (ke *KernelIP6TGetEntries) UnmarshalBytes(src []byte) { + ke.IPTGetEntries.UnmarshalBytes(src) + unmarshalledUntil := ke.IPTGetEntries.SizeBytes() + for i := range ke.Entrytable { + ke.Entrytable[i].UnmarshalBytes(src[unmarshalledUntil:]) + unmarshalledUntil += ke.Entrytable[i].SizeBytes() + } +} + +// Packed implements marshal.Marshallable.Packed. +func (ke *KernelIP6TGetEntries) Packed() bool { + // KernelIP6TGetEntries isn't packed because the ke.Entrytable contains + // an indirection to the actual data we want to marshal (the slice data + // pointer), and the memory for KernelIP6TGetEntries contains the slice + // header which we don't want to marshal. + return false +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (ke *KernelIP6TGetEntries) MarshalUnsafe(dst []byte) { + // Fall back to safe Marshal because the type in not packed. + ke.MarshalBytes(dst) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (ke *KernelIP6TGetEntries) UnmarshalUnsafe(src []byte) { + // Fall back to safe Unmarshal because the type in not packed. + ke.UnmarshalBytes(src) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +func (ke *KernelIP6TGetEntries) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + buf := task.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay. + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Unmarshal unconditionally. If we had a short copy-in, this results + // in a partially unmarshalled struct. + ke.UnmarshalBytes(buf) // escapes: fallback. + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +func (ke *KernelIP6TGetEntries) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + // Type KernelIP6TGetEntries doesn't have a packed layout in memory, + // fall back to MarshalBytes. + return task.CopyOutBytes(addr, ke.marshalAll(task)) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +func (ke *KernelIP6TGetEntries) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Type KernelIP6TGetEntries doesn't have a packed layout in memory, fall + // back to MarshalBytes. + return task.CopyOutBytes(addr, ke.marshalAll(task)[:limit]) +} + +func (ke *KernelIP6TGetEntries) marshalAll(task marshal.Task) []byte { + buf := task.CopyScratchBuffer(ke.SizeBytes()) + ke.MarshalBytes(buf) + return buf +} + +// WriteTo implements io.WriterTo.WriteTo. +func (ke *KernelIP6TGetEntries) WriteTo(w io.Writer) (int64, error) { + buf := make([]byte, ke.SizeBytes()) + ke.MarshalBytes(buf) + length, err := w.Write(buf) + return int64(length), err +} + +var _ marshal.Marshallable = (*KernelIP6TGetEntries)(nil) + +// IP6TEntry is an iptables rule. It corresponds to struct ip6t_entry in +// include/uapi/linux/netfilter_ipv6/ip6_tables.h. +// +// +marshal +type IP6TEntry struct { + // IPv6 is used to filter packets based on the IPv6 header. + IPv6 IP6TIP + + // NFCache relates to kernel-internal caching and isn't used by + // userspace. + NFCache uint32 + + // TargetOffset is the byte offset from the beginning of this IPTEntry + // to the start of the entry's target. + TargetOffset uint16 + + // NextOffset is the byte offset from the beginning of this IPTEntry to + // the start of the next entry. It is thus also the size of the entry. + NextOffset uint16 + + // Comeback is a return pointer. It is not used by userspace. + Comeback uint32 + + _ [4]byte + + // Counters holds the packet and byte counts for this rule. + Counters XTCounters + + // Elems holds the data for all this rule's matches followed by the + // target. It is variable length -- users have to iterate over any + // matches and use TargetOffset and NextOffset to make sense of the + // data. + // + // Elems is omitted here because it would cause IPTEntry to be an extra + // byte larger (see http://www.catb.org/esr/structure-packing/). + // + // Elems [0]byte +} + +// SizeOfIP6TEntry is the size of an IP6TEntry. +const SizeOfIP6TEntry = 168 + +// KernelIP6TEntry is identical to IP6TEntry, but includes the Elems field. +// KernelIP6TEntry itself is not Marshallable but it implements some methods of +// marshal.Marshallable that help in other implementations of Marshallable. +type KernelIP6TEntry struct { + Entry IP6TEntry + + // Elems holds the data for all this rule's matches followed by the + // target. It is variable length -- users have to iterate over any + // matches and use TargetOffset and NextOffset to make sense of the + // data. + Elems primitive.ByteSlice +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (ke *KernelIP6TEntry) SizeBytes() int { + return ke.Entry.SizeBytes() + ke.Elems.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (ke *KernelIP6TEntry) MarshalBytes(dst []byte) { + ke.Entry.MarshalBytes(dst) + ke.Elems.MarshalBytes(dst[ke.Entry.SizeBytes():]) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (ke *KernelIP6TEntry) UnmarshalBytes(src []byte) { + ke.Entry.UnmarshalBytes(src) + ke.Elems.UnmarshalBytes(src[ke.Entry.SizeBytes():]) +} + +// IP6TIP contains information for matching a packet's IP header. +// It corresponds to struct ip6t_ip6 in +// include/uapi/linux/netfilter_ipv6/ip6_tables.h. +// +// +marshal +type IP6TIP struct { + // Src is the source IP address. + Src Inet6Addr + + // Dst is the destination IP address. + Dst Inet6Addr + + // SrcMask is the source IP mask. + SrcMask Inet6Addr + + // DstMask is the destination IP mask. + DstMask Inet6Addr + + // InputInterface is the input network interface. + InputInterface [IFNAMSIZ]byte + + // OutputInterface is the output network interface. + OutputInterface [IFNAMSIZ]byte + + // InputInterfaceMask is the input interface mask. + InputInterfaceMask [IFNAMSIZ]byte + + // OuputInterfaceMask is the output interface mask. + OutputInterfaceMask [IFNAMSIZ]byte + + // Protocol is the transport protocol. + Protocol uint16 + + // TOS matches TOS flags when Flags indicates filtering by TOS. + TOS uint8 + + // Flags define matching behavior for the IP header. + Flags uint8 + + // InverseFlags invert the meaning of fields in struct IPTIP. See the + // IP6T_INV_* flags. + InverseFlags uint8 + + // Linux defines in6_addr (Inet6Addr for us) as the union of a + // 16-element byte array and a 4-element 32-bit integer array, so the + // whole struct is 4-byte aligned. + _ [3]byte +} + +const SizeOfIP6TIP = 136 + +// Flags in IP6TIP.InverseFlags. Corresponding constants are in +// include/uapi/linux/netfilter_ipv6/ip6_tables.h. +const ( + // Invert the meaning of InputInterface. + IP6T_INV_VIA_IN = 0x01 + // Invert the meaning of OutputInterface. + IP6T_INV_VIA_OUT = 0x02 + // Invert the meaning of TOS. + IP6T_INV_TOS = 0x04 + // Invert the meaning of Src. + IP6T_INV_SRCIP = 0x08 + // Invert the meaning of Dst. + IP6T_INV_DSTIP = 0x10 + // Invert the meaning of the IPT_F_FRAG flag. + IP6T_INV_FRAG = 0x20 + // Enable all flags. + IP6T_INV_MASK = 0x7F +) diff --git a/pkg/abi/linux/netfilter_test.go b/pkg/abi/linux/netfilter_test.go index 21e237f92..bf73271c6 100644 --- a/pkg/abi/linux/netfilter_test.go +++ b/pkg/abi/linux/netfilter_test.go @@ -29,12 +29,16 @@ func TestSizes(t *testing.T) { {IPTGetEntries{}, SizeOfIPTGetEntries}, {IPTGetinfo{}, SizeOfIPTGetinfo}, {IPTIP{}, SizeOfIPTIP}, + {IPTOwnerInfo{}, SizeOfIPTOwnerInfo}, {IPTReplace{}, SizeOfIPTReplace}, {XTCounters{}, SizeOfXTCounters}, {XTEntryMatch{}, SizeOfXTEntryMatch}, {XTEntryTarget{}, SizeOfXTEntryTarget}, {XTErrorTarget{}, SizeOfXTErrorTarget}, {XTStandardTarget{}, SizeOfXTStandardTarget}, + {IP6TReplace{}, SizeOfIP6TReplace}, + {IP6TEntry{}, SizeOfIP6TEntry}, + {IP6TIP{}, SizeOfIP6TIP}, } for _, tc := range testCases { diff --git a/pkg/abi/linux/netlink_route.go b/pkg/abi/linux/netlink_route.go index 3898d2314..ceda0a8d3 100644 --- a/pkg/abi/linux/netlink_route.go +++ b/pkg/abi/linux/netlink_route.go @@ -187,10 +187,12 @@ const ( // Device types, from uapi/linux/if_arp.h. const ( + ARPHRD_NONE = 65534 + ARPHRD_ETHER = 1 ARPHRD_LOOPBACK = 772 ) -// RouteMessage struct rtmsg, from uapi/linux/rtnetlink.h. +// RouteMessage is struct rtmsg, from uapi/linux/rtnetlink.h. type RouteMessage struct { Family uint8 DstLen uint8 @@ -205,6 +207,9 @@ type RouteMessage struct { Flags uint32 } +// SizeOfRouteMessage is the size of RouteMessage. +const SizeOfRouteMessage = 12 + // Route types, from uapi/linux/rtnetlink.h. const ( // RTN_UNSPEC represents an unspecified route type. @@ -331,3 +336,13 @@ const ( RTF_GATEWAY = 0x2 RTF_UP = 0x1 ) + +// RtAttr is the header of optional addition route information, as a netlink +// attribute. From include/uapi/linux/rtnetlink.h. +type RtAttr struct { + Len uint16 + Type uint16 +} + +// SizeOfRtAttr is the size of RtAttr. +const SizeOfRtAttr = 4 diff --git a/pkg/abi/linux/ptrace_amd64.go b/pkg/abi/linux/ptrace_amd64.go new file mode 100644 index 000000000..ed3881e27 --- /dev/null +++ b/pkg/abi/linux/ptrace_amd64.go @@ -0,0 +1,52 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package linux + +// PtraceRegs is the set of CPU registers exposed by ptrace. Source: +// syscall.PtraceRegs. +// +// +marshal +// +stateify savable +type PtraceRegs struct { + R15 uint64 + R14 uint64 + R13 uint64 + R12 uint64 + Rbp uint64 + Rbx uint64 + R11 uint64 + R10 uint64 + R9 uint64 + R8 uint64 + Rax uint64 + Rcx uint64 + Rdx uint64 + Rsi uint64 + Rdi uint64 + Orig_rax uint64 + Rip uint64 + Cs uint64 + Eflags uint64 + Rsp uint64 + Ss uint64 + Fs_base uint64 + Gs_base uint64 + Ds uint64 + Es uint64 + Fs uint64 + Gs uint64 +} diff --git a/pkg/abi/linux/ptrace_arm64.go b/pkg/abi/linux/ptrace_arm64.go new file mode 100644 index 000000000..6147738b3 --- /dev/null +++ b/pkg/abi/linux/ptrace_arm64.go @@ -0,0 +1,29 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package linux + +// PtraceRegs is the set of CPU registers exposed by ptrace. Source: +// syscall.PtraceRegs. +// +// +marshal +// +stateify savable +type PtraceRegs struct { + Regs [31]uint64 + Sp uint64 + Pc uint64 + Pstate uint64 +} diff --git a/pkg/abi/linux/rseq.go b/pkg/abi/linux/rseq.go new file mode 100644 index 000000000..76253ba30 --- /dev/null +++ b/pkg/abi/linux/rseq.go @@ -0,0 +1,130 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// Flags passed to rseq(2). +// +// Defined in include/uapi/linux/rseq.h. +const ( + // RSEQ_FLAG_UNREGISTER unregisters the current thread. + RSEQ_FLAG_UNREGISTER = 1 << 0 +) + +// Critical section flags used in RSeqCriticalSection.Flags and RSeq.Flags. +// +// Defined in include/uapi/linux/rseq.h. +const ( + // RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT inhibits restart on preemption. + RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = 1 << 0 + + // RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL inhibits restart on signal + // delivery. + RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = 1 << 1 + + // RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE inhibits restart on CPU + // migration. + RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = 1 << 2 +) + +// RSeqCriticalSection describes a restartable sequences critical section. It +// is equivalent to struct rseq_cs, defined in include/uapi/linux/rseq.h. +// +// In userspace, this structure is always aligned to 32 bytes. +// +// +marshal +type RSeqCriticalSection struct { + // Version is the version of this structure. Version 0 is defined here. + Version uint32 + + // Flags are the critical section flags, defined above. + Flags uint32 + + // Start is the start address of the critical section. + Start uint64 + + // PostCommitOffset is the offset from Start of the first instruction + // outside of the critical section. + PostCommitOffset uint64 + + // Abort is the abort address. It must be outside the critical section, + // and the 4 bytes prior must match the abort signature. + Abort uint64 +} + +const ( + // SizeOfRSeqCriticalSection is the size of RSeqCriticalSection. + SizeOfRSeqCriticalSection = 32 + + // SizeOfRSeqSignature is the size of the signature immediately + // preceding RSeqCriticalSection.Abort. + SizeOfRSeqSignature = 4 +) + +// Special values for RSeq.CPUID, defined in include/uapi/linux/rseq.h. +const ( + // RSEQ_CPU_ID_UNINITIALIZED indicates that this thread has not + // performed rseq initialization. + RSEQ_CPU_ID_UNINITIALIZED = ^uint32(0) // -1 + + // RSEQ_CPU_ID_REGISTRATION_FAILED indicates that rseq initialization + // failed. + RSEQ_CPU_ID_REGISTRATION_FAILED = ^uint32(1) // -2 +) + +// RSeq is the thread-local restartable sequences config/status. It +// is equivalent to struct rseq, defined in include/uapi/linux/rseq.h. +// +// In userspace, this structure is always aligned to 32 bytes. +type RSeq struct { + // CPUIDStart contains the current CPU ID if rseq is initialized. + // + // This field should only be read by the thread which registered this + // structure, and must be read atomically. + CPUIDStart uint32 + + // CPUID contains the current CPU ID or one of the CPU ID special + // values defined above. + // + // This field should only be read by the thread which registered this + // structure, and must be read atomically. + CPUID uint32 + + // RSeqCriticalSection is a pointer to the current RSeqCriticalSection + // block, or NULL. It is reset to NULL by the kernel on restart or + // non-restarting preempt/signal. + // + // This field should only be written by the thread which registered + // this structure, and must be written atomically. + RSeqCriticalSection uint64 + + // Flags are the critical section flags that apply to all critical + // sections on this thread, defined above. + Flags uint32 +} + +const ( + // SizeOfRSeq is the size of RSeq. + // + // Note that RSeq is naively 24 bytes. However, it has 32-byte + // alignment, which in C increases sizeof to 32. That is the size that + // the Linux kernel uses. + SizeOfRSeq = 32 + + // AlignOfRSeq is the standard alignment of RSeq. + AlignOfRSeq = 32 + + // OffsetOfRSeqCriticalSection is the offset of RSeqCriticalSection in RSeq. + OffsetOfRSeqCriticalSection = 8 +) diff --git a/pkg/abi/linux/seccomp.go b/pkg/abi/linux/seccomp.go index 4eeb5cd7a..d0607e256 100644 --- a/pkg/abi/linux/seccomp.go +++ b/pkg/abi/linux/seccomp.go @@ -63,3 +63,10 @@ func (a BPFAction) String() string { func (a BPFAction) Data() uint16 { return uint16(a & SECCOMP_RET_DATA) } + +// SockFprog is sock_fprog taken from <linux/filter.h>. +type SockFprog struct { + Len uint16 + pad [6]byte + Filter *BPFInstruction +} diff --git a/pkg/abi/linux/signal.go b/pkg/abi/linux/signal.go index c69b04ea9..1c330e763 100644 --- a/pkg/abi/linux/signal.go +++ b/pkg/abi/linux/signal.go @@ -115,6 +115,8 @@ const ( ) // SignalSet is a signal mask with a bit corresponding to each signal. +// +// +marshal type SignalSet uint64 // SignalSetSize is the size in bytes of a SignalSet. diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index 2e2cc6be7..e37c8727d 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -83,7 +83,6 @@ const ( MSG_MORE = 0x8000 MSG_WAITFORONE = 0x10000 MSG_SENDPAGE_NOTLAST = 0x20000 - MSG_REINJECT = 0x8000000 MSG_ZEROCOPY = 0x4000000 MSG_FASTOPEN = 0x20000000 MSG_CMSG_CLOEXEC = 0x40000000 @@ -134,6 +133,15 @@ const ( SHUT_RDWR = 2 ) +// Packet types from <linux/if_packet.h> +const ( + PACKET_HOST = 0 // To us + PACKET_BROADCAST = 1 // To all + PACKET_MULTICAST = 2 // To group + PACKET_OTHERHOST = 3 // To someone else + PACKET_OUTGOING = 4 // Outgoing of any type +) + // Socket options from socket.h. const ( SO_DEBUG = 1 @@ -225,14 +233,18 @@ const ( const SockAddrMax = 128 // InetAddr is struct in_addr, from uapi/linux/in.h. +// +// +marshal type InetAddr [4]byte // SockAddrInet is struct sockaddr_in, from uapi/linux/in.h. +// +// +marshal type SockAddrInet struct { Family uint16 Port uint16 Addr InetAddr - Zero [8]uint8 // pad to sizeof(struct sockaddr). + _ [8]uint8 // pad to sizeof(struct sockaddr). } // InetMulticastRequest is struct ip_mreq, from uapi/linux/in.h. @@ -247,6 +259,11 @@ type InetMulticastRequestWithNIC struct { InterfaceIndex int32 } +// Inet6Addr is struct in6_addr, from uapi/linux/in6.h. +// +// +marshal +type Inet6Addr [16]byte + // SockAddrInet6 is struct sockaddr_in6, from uapi/linux/in6.h. type SockAddrInet6 struct { Family uint16 @@ -294,6 +311,8 @@ func (s *SockAddrUnix) implementsSockAddr() {} func (s *SockAddrNetlink) implementsSockAddr() {} // Linger is struct linger, from include/linux/socket.h. +// +// +marshal type Linger struct { OnOff int32 Linger int32 @@ -308,6 +327,8 @@ const SizeOfLinger = 8 // the end of this struct or within existing unusued space, so its size grows // over time. The current iteration is based on linux v4.17. New versions are // always backwards compatible. +// +// +marshal type TCPInfo struct { State uint8 CaState uint8 @@ -405,12 +426,23 @@ var SizeOfControlMessageHeader = int(binary.Size(ControlMessageHeader{})) // A ControlMessageCredentials is an SCM_CREDENTIALS socket control message. // // ControlMessageCredentials represents struct ucred from linux/socket.h. +// +// +marshal type ControlMessageCredentials struct { PID int32 UID uint32 GID uint32 } +// A ControlMessageIPPacketInfo is IP_PKTINFO socket control message. +// +// ControlMessageIPPacketInfo represents struct in_pktinfo from linux/in.h. +type ControlMessageIPPacketInfo struct { + NIC int32 + LocalAddr InetAddr + DestinationAddr InetAddr +} + // SizeOfControlMessageCredentials is the binary size of a // ControlMessageCredentials struct. var SizeOfControlMessageCredentials = int(binary.Size(ControlMessageCredentials{})) @@ -422,6 +454,19 @@ type ControlMessageRights []int32 // ControlMessageRights. const SizeOfControlMessageRight = 4 +// SizeOfControlMessageInq is the size of a TCP_INQ control message. +const SizeOfControlMessageInq = 4 + +// SizeOfControlMessageTOS is the size of an IP_TOS control message. +const SizeOfControlMessageTOS = 1 + +// SizeOfControlMessageTClass is the size of an IPV6_TCLASS control message. +const SizeOfControlMessageTClass = 4 + +// SizeOfControlMessageIPPacketInfo is the size of an IP_PKTINFO +// control message. +const SizeOfControlMessageIPPacketInfo = 12 + // SCM_MAX_FD is the maximum number of FDs accepted in a single sendmsg call. // From net/scm.h. const SCM_MAX_FD = 253 diff --git a/pkg/abi/linux/tcp.go b/pkg/abi/linux/tcp.go index 174d470e2..2a8d4708b 100644 --- a/pkg/abi/linux/tcp.go +++ b/pkg/abi/linux/tcp.go @@ -57,4 +57,5 @@ const ( const ( MAX_TCP_KEEPIDLE = 32767 MAX_TCP_KEEPINTVL = 32767 + MAX_TCP_KEEPCNT = 127 ) diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go index 546668bca..e6860ed49 100644 --- a/pkg/abi/linux/time.go +++ b/pkg/abi/linux/time.go @@ -101,6 +101,8 @@ func NsecToTimeT(nsec int64) TimeT { } // Timespec represents struct timespec in <time.h>. +// +// +marshal type Timespec struct { Sec int64 Nsec int64 @@ -155,6 +157,8 @@ func DurationToTimespec(dur time.Duration) Timespec { const SizeOfTimeval = 16 // Timeval represents struct timeval in <time.h>. +// +// +marshal type Timeval struct { Sec int64 Usec int64 @@ -228,12 +232,27 @@ type Tms struct { type TimerID int32 // StatxTimestamp represents struct statx_timestamp. +// +// +marshal type StatxTimestamp struct { Sec int64 Nsec uint32 _ int32 } +// ToNsec returns the nanosecond representation. +func (sxts StatxTimestamp) ToNsec() int64 { + return int64(sxts.Sec)*1e9 + int64(sxts.Nsec) +} + +// ToNsecCapped returns the safe nanosecond representation. +func (sxts StatxTimestamp) ToNsecCapped() int64 { + if sxts.Sec > maxSecInDuration { + return math.MaxInt64 + } + return sxts.ToNsec() +} + // NsecToStatxTimestamp translates nanoseconds to StatxTimestamp. func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) { return StatxTimestamp{ @@ -243,6 +262,8 @@ func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) { } // Utime represents struct utimbuf used by utimes(2). +// +// +marshal type Utime struct { Actime int64 Modtime int64 diff --git a/pkg/abi/linux/xattr.go b/pkg/abi/linux/xattr.go new file mode 100644 index 000000000..99180b208 --- /dev/null +++ b/pkg/abi/linux/xattr.go @@ -0,0 +1,28 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// Constants for extended attributes. +const ( + XATTR_NAME_MAX = 255 + XATTR_SIZE_MAX = 65536 + XATTR_LIST_MAX = 65536 + + XATTR_CREATE = 1 + XATTR_REPLACE = 2 + + XATTR_USER_PREFIX = "user." + XATTR_USER_PREFIX_LEN = len(XATTR_USER_PREFIX) +) |