diff options
Diffstat (limited to 'pkg')
105 files changed, 2011 insertions, 588 deletions
diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index cd405aa96..2061b38c0 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -13,8 +13,10 @@ go_library( "audit.go", "bpf.go", "capability.go", + "clone.go", "dev.go", "elf.go", + "epoll.go", "errors.go", "eventfd.go", "exec.go", diff --git a/pkg/abi/linux/clone.go b/pkg/abi/linux/clone.go new file mode 100644 index 000000000..c2cbfca5e --- /dev/null +++ b/pkg/abi/linux/clone.go @@ -0,0 +1,41 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// Clone constants per clone(2). +const ( + CLONE_VM = 0x100 + CLONE_FS = 0x200 + CLONE_FILES = 0x400 + CLONE_SIGHAND = 0x800 + CLONE_PARENT = 0x8000 + CLONE_PTRACE = 0x2000 + CLONE_VFORK = 0x4000 + CLONE_THREAD = 0x10000 + CLONE_NEWNS = 0x20000 + CLONE_SYSVSEM = 0x40000 + CLONE_SETTLS = 0x80000 + CLONE_PARENT_SETTID = 0x100000 + CLONE_CHILD_CLEARTID = 0x200000 + CLONE_DETACHED = 0x400000 + CLONE_UNTRACED = 0x800000 + CLONE_CHILD_SETTID = 0x1000000 + CLONE_NEWUTS = 0x4000000 + CLONE_NEWIPC = 0x8000000 + CLONE_NEWUSER = 0x10000000 + CLONE_NEWPID = 0x20000000 + CLONE_NEWNET = 0x40000000 + CLONE_IO = 0x80000000 +) diff --git a/pkg/abi/linux/epoll.go b/pkg/abi/linux/epoll.go new file mode 100644 index 000000000..72083b604 --- /dev/null +++ b/pkg/abi/linux/epoll.go @@ -0,0 +1,56 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// EpollEvent is equivalent to struct epoll_event from epoll(2). +type EpollEvent struct { + Events uint32 + Fd int32 + Data int32 +} + +// Event masks. +const ( + EPOLLIN = 0x1 + EPOLLPRI = 0x2 + EPOLLOUT = 0x4 + EPOLLERR = 0x8 + EPOLLHUP = 0x10 + EPOLLRDNORM = 0x40 + EPOLLRDBAND = 0x80 + EPOLLWRNORM = 0x100 + EPOLLWRBAND = 0x200 + EPOLLMSG = 0x400 + EPOLLRDHUP = 0x2000 +) + +// Per-file descriptor flags. +const ( + EPOLLET = 0x80000000 + EPOLLONESHOT = 0x40000000 +) + +// Operation flags. +const ( + EPOLL_CLOEXEC = 0x80000 + EPOLL_NONBLOCK = 0x800 +) + +// Control operations. +const ( + EPOLL_CTL_ADD = 0x1 + EPOLL_CTL_DEL = 0x2 + EPOLL_CTL_MOD = 0x3 +) diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go index 30902a8ca..f78315ebf 100644 --- a/pkg/abi/linux/fcntl.go +++ b/pkg/abi/linux/fcntl.go @@ -16,21 +16,39 @@ package linux // Commands from linux/fcntl.h. const ( - F_DUPFD = 0 - F_GETFD = 1 - F_GETFL = 3 - F_GETOWN = 9 - F_SETFD = 2 - F_SETFL = 4 - F_SETLK = 6 - F_SETLKW = 7 - F_SETOWN = 8 + F_DUPFD = 0x0 + F_GETFD = 0x1 + F_SETFD = 0x2 + F_GETFL = 0x3 + F_SETFL = 0x4 + F_SETLK = 0x6 + F_SETLKW = 0x7 + F_SETOWN = 0x8 + F_GETOWN = 0x9 F_DUPFD_CLOEXEC = 1024 + 6 F_SETPIPE_SZ = 1024 + 7 F_GETPIPE_SZ = 1024 + 8 ) +// Commands for F_SETLK. +const ( + F_RDLCK = 0x0 + F_WRLCK = 0x1 + F_UNLCK = 0x2 +) + // Flags for fcntl. const ( FD_CLOEXEC = 00000001 ) + +// Lock structure for F_SETLK. +type Flock struct { + Type int16 + Whence int16 + _ [4]byte + Start int64 + Len int64 + Pid int32 + _ [4]byte +} diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 426003eb7..f78ffaa82 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -161,23 +161,36 @@ const ( // Stat represents struct stat. type Stat struct { - Dev uint64 - Ino uint64 - Nlink uint64 - Mode uint32 - UID uint32 - GID uint32 - X_pad0 int32 - Rdev uint64 - Size int64 - Blksize int64 - Blocks int64 - ATime Timespec - MTime Timespec - CTime Timespec - X_unused [3]int64 + Dev uint64 + Ino uint64 + Nlink uint64 + Mode uint32 + UID uint32 + GID uint32 + _ int32 + Rdev uint64 + Size int64 + Blksize int64 + Blocks int64 + ATime Timespec + MTime Timespec + CTime Timespec + _ [3]int64 } +// File types. +const ( + DT_BLK = 0x6 + DT_CHR = 0x2 + DT_DIR = 0x4 + DT_FIFO = 0x1 + DT_LNK = 0xa + DT_REG = 0x8 + DT_SOCK = 0xc + DT_UNKNOWN = 0x0 + DT_WHT = 0xe +) + // SizeOfStat is the size of a Stat struct. var SizeOfStat = binary.Size(Stat{}) diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index c82ab9b5b..549e0fb93 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -20,6 +20,7 @@ package linux const ( ANON_INODE_FS_MAGIC = 0x09041934 DEVPTS_SUPER_MAGIC = 0x00001cd1 + EXT_SUPER_MAGIC = 0xef53 OVERLAYFS_SUPER_MAGIC = 0x794c7630 PIPEFS_MAGIC = 0x50495045 PROC_SUPER_MAGIC = 0x9fa0 diff --git a/pkg/abi/linux/ipc.go b/pkg/abi/linux/ipc.go index 2ef8d6cbb..22acd2d43 100644 --- a/pkg/abi/linux/ipc.go +++ b/pkg/abi/linux/ipc.go @@ -44,10 +44,10 @@ type IPCPerm struct { CUID uint32 CGID uint32 Mode uint16 - pad1 uint16 + _ uint16 Seq uint16 - pad2 uint16 - pad3 uint32 + _ uint16 + _ uint32 unused1 uint64 unused2 uint64 } diff --git a/pkg/abi/linux/netlink.go b/pkg/abi/linux/netlink.go index 5e718c363..e8b6544b4 100644 --- a/pkg/abi/linux/netlink.go +++ b/pkg/abi/linux/netlink.go @@ -41,10 +41,10 @@ const ( // SockAddrNetlink is struct sockaddr_nl, from uapi/linux/netlink.h. type SockAddrNetlink struct { - Family uint16 - Padding uint16 - PortID uint32 - Groups uint32 + Family uint16 + _ uint16 + PortID uint32 + Groups uint32 } // SockAddrNetlinkSize is the size of SockAddrNetlink. diff --git a/pkg/abi/linux/netlink_route.go b/pkg/abi/linux/netlink_route.go index 630dc339a..dd698e2bc 100644 --- a/pkg/abi/linux/netlink_route.go +++ b/pkg/abi/linux/netlink_route.go @@ -86,12 +86,12 @@ const ( // InterfaceInfoMessage is struct ifinfomsg, from uapi/linux/rtnetlink.h. type InterfaceInfoMessage struct { - Family uint8 - Padding uint8 - Type uint16 - Index int32 - Flags uint32 - Change uint32 + Family uint8 + _ uint8 + Type uint16 + Index int32 + Flags uint32 + Change uint32 } // Interface flags, from uapi/linux/if.h. diff --git a/pkg/abi/linux/sched.go b/pkg/abi/linux/sched.go index 193d9a242..70e820823 100644 --- a/pkg/abi/linux/sched.go +++ b/pkg/abi/linux/sched.go @@ -28,3 +28,9 @@ const ( // reverted back to SCHED_NORMAL on fork. SCHED_RESET_ON_FORK = 0x40000000 ) + +const ( + PRIO_PGRP = 0x1 + PRIO_PROCESS = 0x0 + PRIO_USER = 0x2 +) diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go index e727066d7..546668bca 100644 --- a/pkg/abi/linux/time.go +++ b/pkg/abi/linux/time.go @@ -241,3 +241,9 @@ func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) { Nsec: uint32(nsec % 1e9), } } + +// Utime represents struct utimbuf used by utimes(2). +type Utime struct { + Actime int64 + Modtime int64 +} diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index 59de615fb..80e106e6f 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -11,6 +11,7 @@ go_library( "full.go", "null.go", "random.go", + "tty.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/fs/dev", visibility = ["//pkg/sentry:internal"], diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index d4bbd9807..f739c476c 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -38,12 +38,20 @@ const ( urandomDevMinor uint32 = 9 ) -func newCharacterDevice(ctx context.Context, iops fs.InodeOperations, msrc *fs.MountSource) *fs.Inode { +// TTY major device number comes from include/uapi/linux/major.h. +const ( + ttyDevMinor = 0 + ttyDevMajor = 5 +) + +func newCharacterDevice(ctx context.Context, iops fs.InodeOperations, msrc *fs.MountSource, major uint16, minor uint32) *fs.Inode { return fs.NewInode(ctx, iops, msrc, fs.StableAttr{ - DeviceID: devDevice.DeviceID(), - InodeID: devDevice.NextIno(), - BlockSize: usermem.PageSize, - Type: fs.CharacterDevice, + DeviceID: devDevice.DeviceID(), + InodeID: devDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.CharacterDevice, + DeviceFileMajor: major, + DeviceFileMinor: minor, }) } @@ -114,6 +122,8 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode { // If no devpts is mounted, this will simply be a dangling // symlink, which is fine. "ptmx": newSymlink(ctx, "pts/ptmx", msrc), + + "tty": newCharacterDevice(ctx, newTTYDevice(ctx, fs.RootOwner, 0666), msrc, ttyDevMajor, ttyDevMinor), } iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) diff --git a/pkg/sentry/fs/dev/tty.go b/pkg/sentry/fs/dev/tty.go new file mode 100644 index 000000000..87d80e292 --- /dev/null +++ b/pkg/sentry/fs/dev/tty.go @@ -0,0 +1,67 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package dev + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/waiter" +) + +// +stateify savable +type ttyInodeOperations struct { + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopAllocate `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotOpenable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes +} + +var _ fs.InodeOperations = (*ttyInodeOperations)(nil) + +func newTTYDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *ttyInodeOperations { + return &ttyInodeOperations{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC), + } +} + +// +stateify savable +type ttyFileOperations struct { + fsutil.FileNoSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopWrite `state:"nosave"` + fsutil.FileNoopRead `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` +} + +var _ fs.FileOperations = (*ttyFileOperations)(nil) diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD new file mode 100644 index 000000000..b386f31c8 --- /dev/null +++ b/pkg/sentry/fs/ext/BUILD @@ -0,0 +1,18 @@ +package(licenses = ["notice"]) + +load("//tools/go_stateify:defs.bzl", "go_library") + +go_library( + name = "ext", + srcs = [ + "ext.go", + "utils.go", + ], + importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/fs/ext/disklayout", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/fs/ext/assets/README.md b/pkg/sentry/fs/ext/assets/README.md new file mode 100644 index 000000000..b6eac4f40 --- /dev/null +++ b/pkg/sentry/fs/ext/assets/README.md @@ -0,0 +1,28 @@ +### Tiny Ext4 Image + +The image is of size 64Kb which supports 64 1k blocks and 16 inodes. This is the +smallest size mkfs.ext4 works with. + +This image was generated using the following commands. + +```bash +fallocate -l 64K tiny.ext4 +mkfs.ext4 -j tiny.ext4 +``` + +You can mount it using: + +```bash +sudo mount -o loop tiny.ext4 $MOUNTPOINT +``` + +`file.txt`, `bigfile.txt` and `symlink.txt` were added to this image by just +mounting it and copying (while preserving links) those files to the mountpoint +directory using: + +```bash +sudo cp -P {file.txt,symlink.txt,bigfile.txt} $MOUNTPOINT/ +``` + +The files in this directory mirror the contents and organisation of the files +stored in the image. diff --git a/pkg/sentry/fs/ext/assets/bigfile.txt b/pkg/sentry/fs/ext/assets/bigfile.txt new file mode 100644 index 000000000..3857cf516 --- /dev/null +++ b/pkg/sentry/fs/ext/assets/bigfile.txt @@ -0,0 +1,41 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus faucibus eleifend orci, ut ornare nibh faucibus eu. Cras at condimentum massa. Nullam luctus, elit non porttitor congue, sapien diam feugiat sapien, sed eleifend nulla mauris non arcu. Sed lacinia mauris magna, eu mollis libero varius sit amet. Donec mollis, quam convallis commodo posuere, dolor nisi placerat nisi, in faucibus augue mi eu lorem. In pharetra consectetur faucibus. Ut euismod ex efficitur egestas tincidunt. Maecenas condimentum ut ante in rutrum. Vivamus sed arcu tempor, faucibus turpis et, lacinia diam. + +Sed in lacus vel nisl interdum bibendum in sed justo. Nunc tellus risus, molestie vitae arcu sed, molestie tempus ligula. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Nunc risus neque, volutpat et ante non, ullamcorper condimentum ante. Aliquam sed metus in urna condimentum convallis. Vivamus ut libero mauris. Proin mollis posuere consequat. Vestibulum placerat mollis est et pulvinar. + +Donec rutrum odio ac diam pharetra, id fermentum magna cursus. Pellentesque in dapibus elit, et condimentum orci. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Suspendisse euismod dapibus est, id vestibulum mauris. Nulla facilisi. Nulla cursus gravida nisi. Phasellus vestibulum rutrum lectus, a dignissim mauris hendrerit vitae. In at elementum mauris. Integer vel efficitur velit. Nullam fringilla sapien mi, quis luctus neque efficitur ac. Aenean nec quam dapibus nunc commodo pharetra. Proin sapien mi, fermentum aliquet vulputate non, aliquet porttitor diam. Quisque lacinia, urna et finibus fermentum, nunc lacus vehicula ex, sed congue metus lectus ac quam. Aliquam erat volutpat. Suspendisse sodales, dolor ut tincidunt finibus, augue erat varius tellus, a interdum erat sem at nunc. Vestibulum cursus iaculis sapien, vitae feugiat dui auctor quis. + +Pellentesque nec maximus nulla, eu blandit diam. Maecenas quis arcu ornare, congue ante at, vehicula ipsum. Praesent feugiat mauris rutrum sem fermentum, nec luctus ipsum placerat. Pellentesque placerat ipsum at dignissim fringilla. Vivamus et posuere sem, eget hendrerit felis. Aenean vulputate, augue vel mollis feugiat, justo ipsum mollis dolor, eu mollis elit neque ut ipsum. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Fusce bibendum sem quam, vulputate laoreet mi dapibus imperdiet. Sed a purus non nibh pretium aliquet. Integer eget luctus augue, vitae tincidunt magna. Ut eros enim, egestas eu nulla et, lobortis egestas arcu. Cras id ipsum ac justo lacinia rutrum. Vivamus lectus leo, ultricies sed justo at, pellentesque feugiat magna. Ut sollicitudin neque elit, vel ornare mauris commodo id. + +Duis dapibus orci et sapien finibus finibus. Mauris eleifend, lacus at vestibulum maximus, quam ligula pharetra erat, sit amet dapibus neque elit vitae neque. In bibendum sollicitudin erat, eget ultricies tortor malesuada at. Sed sit amet orci turpis. Donec feugiat ligula nibh, molestie tincidunt lectus elementum id. Donec volutpat maximus nibh, in vulputate felis posuere eu. Cras tincidunt ullamcorper lacus. Phasellus porta lorem auctor, congue magna a, commodo elit. + +Etiam auctor mi quis elit sodales, eu pulvinar arcu condimentum. Aenean imperdiet risus et dapibus tincidunt. Nullam tincidunt dictum dui, sed commodo urna rutrum id. Ut mollis libero vel elit laoreet bibendum. Quisque arcu arcu, tincidunt at ultricies id, vulputate nec metus. In tristique posuere quam sit amet volutpat. Vivamus scelerisque et nunc at dapibus. Fusce finibus libero ut ligula pretium rhoncus. Mauris non elit in arcu finibus imperdiet. Pellentesque nec massa odio. Proin rutrum mauris non sagittis efficitur. Aliquam auctor quam at dignissim faucibus. Ut eget ligula in magna posuere ultricies vitae sit amet turpis. Duis maximus odio nulla. Donec gravida sem tristique tempus scelerisque. + +Interdum et malesuada fames ac ante ipsum primis in faucibus. Fusce pharetra magna vulputate aliquet tempus. Duis id hendrerit arcu. Quisque ut ex elit. Integer velit orci, venenatis ut sapien ac, placerat porttitor dui. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc hendrerit cursus diam, hendrerit finibus ipsum scelerisque ut. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. + +Nulla non euismod neque. Phasellus vel sapien eu metus pulvinar rhoncus. Suspendisse eu mollis tellus, quis vestibulum tortor. Maecenas interdum dolor sed nulla fermentum maximus. Donec imperdiet ullamcorper condimentum. Nam quis nibh ante. Praesent quis tellus ut tortor pulvinar blandit sit amet ut sapien. Vestibulum est orci, pellentesque vitae tristique sit amet, tristique non felis. + +Vivamus sodales pellentesque varius. Sed vel tempus ligula. Nulla tristique nisl vel dui facilisis, ac sodales augue hendrerit. Proin augue nisi, vestibulum quis augue nec, sagittis tincidunt velit. Vestibulum euismod, nulla nec sodales faucibus, urna sapien vulputate magna, id varius metus sapien ut neque. Duis in mollis urna, in scelerisque enim. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Nunc condimentum dictum turpis, et egestas neque dapibus eget. Quisque fringilla, dui eu venenatis eleifend, erat nibh lacinia urna, at lacinia lacus sapien eu dui. Duis eu erat ut mi lacinia convallis a sed ex. + +Fusce elit metus, tincidunt nec eleifend a, hendrerit nec ligula. Duis placerat finibus sollicitudin. In euismod porta tellus, in luctus justo bibendum bibendum. Maecenas at magna eleifend lectus tincidunt suscipit ut a ligula. Nulla tempor accumsan felis, fermentum dapibus est eleifend vitae. Mauris urna sem, fringilla at ultricies non, ultrices in arcu. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Nam vehicula nunc at laoreet imperdiet. Nunc tristique ut risus id aliquet. Integer eleifend massa orci. + +Vestibulum sed ante sollicitudin nisi fringilla bibendum nec vel quam. Sed pretium augue eu ligula congue pulvinar. Donec vitae magna tincidunt, pharetra lacus id, convallis nulla. Cras viverra nisl nisl, varius convallis leo vulputate nec. Morbi at consequat dui, sed aliquet metus. Sed suscipit fermentum mollis. Maecenas nec mi sodales, tincidunt purus in, tristique mauris. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec interdum mi in velit efficitur, quis ultrices ex imperdiet. Sed vestibulum, magna ut tristique pretium, mi ipsum placerat tellus, non tempor enim augue et ex. Pellentesque eget felis quis ante sodales viverra ac sed lacus. Donec suscipit tempus massa, eget laoreet massa molestie at. + +Aenean fringilla dui non aliquet consectetur. Fusce cursus quam nec orci hendrerit faucibus. Donec consequat suscipit enim, non volutpat lectus auctor interdum. Proin lorem purus, maximus vel orci vitae, suscipit egestas turpis. Donec risus urna, congue a sem eu, aliquet placerat odio. Morbi gravida tristique turpis, quis efficitur enim. Nunc interdum gravida ipsum vel facilisis. Nunc congue finibus sollicitudin. Quisque euismod aliquet lectus et tincidunt. Curabitur ultrices sem ut mi fringilla fermentum. Morbi pretium, nisi sit amet dapibus congue, dolor enim consectetur risus, a interdum ligula odio sed odio. Quisque facilisis, mi at suscipit gravida, nunc sapien cursus justo, ut luctus odio nulla quis leo. Integer condimentum lobortis mauris, non egestas tellus lobortis sit amet. + +In sollicitudin velit ac ante vehicula, vitae varius tortor mollis. In hac habitasse platea dictumst. Quisque et orci lorem. Integer malesuada fringilla luctus. Pellentesque malesuada, mi non lobortis porttitor, ante ligula vulputate ante, nec dictum risus eros sit amet sapien. Nulla aliquam lorem libero, ac varius nulla tristique eget. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut pellentesque mauris orci, vel consequat mi varius a. Ut sit amet elit vulputate, lacinia metus non, fermentum nisl. Pellentesque eu nisi sed quam egestas blandit. Duis sit amet lobortis dolor. Donec consectetur sem interdum, tristique elit sit amet, sodales lacus. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Fusce id aliquam augue. Sed pretium congue risus vitae lacinia. Vestibulum non vulputate risus, ut malesuada justo. + +Sed odio elit, consectetur ac mauris quis, consequat commodo libero. Fusce sodales velit vulputate pulvinar fermentum. Donec iaculis nec nisl eget faucibus. Mauris at dictum velit. Donec fermentum lectus eu viverra volutpat. Aliquam consequat facilisis lorem, cursus consequat dui bibendum ullamcorper. Pellentesque nulla magna, imperdiet at magna et, cursus egestas enim. Nullam semper molestie lectus sit amet semper. Duis eget tincidunt est. Integer id neque risus. Integer ultricies hendrerit vestibulum. Donec blandit blandit sagittis. Nunc consectetur vitae nisi consectetur volutpat. + +Nulla id lorem fermentum, efficitur magna a, hendrerit dui. Vivamus sagittis orci gravida, bibendum quam eget, molestie est. Phasellus nec enim tincidunt, volutpat sapien non, laoreet diam. Nulla posuere enim nec porttitor lobortis. Donec auctor odio ut orci eleifend, ut eleifend purus convallis. Interdum et malesuada fames ac ante ipsum primis in faucibus. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Ut hendrerit, purus eget viverra tincidunt, sem magna imperdiet libero, et aliquam turpis neque vitae elit. Maecenas semper varius iaculis. Cras non lorem quis quam bibendum eleifend in et libero. Curabitur at purus mauris. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus porta diam sed elit eleifend gravida. + +Nulla facilisi. Ut ultricies diam vel diam consectetur, vel porta augue molestie. Fusce interdum sapien et metus facilisis pellentesque. Nulla convallis sem at nunc vehicula facilisis. Nam ac rutrum purus. Nunc bibendum, dolor sit amet tempus ullamcorper, lorem leo tempor sem, id fringilla nunc augue scelerisque augue. Nullam sit amet rutrum nisl. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Donec sed mauris gravida eros vehicula sagittis at eget orci. Cras elementum, eros at accumsan bibendum, libero neque blandit purus, vitae vestibulum libero massa ac nibh. Integer at placerat nulla. Mauris eu eleifend orci. Aliquam consequat ligula vitae erat porta lobortis. Duis fermentum elit ac aliquet ornare. + +Mauris eget cursus tellus, eget sodales purus. Aliquam malesuada, augue id vulputate finibus, nisi ex bibendum nisl, sit amet laoreet quam urna a dolor. Nullam ultricies, sapien eu laoreet consequat, erat eros dignissim diam, ultrices sodales lectus mauris et leo. Morbi lacinia eu ante at tempus. Sed iaculis finibus magna malesuada efficitur. Donec faucibus erat sit amet elementum feugiat. Praesent a placerat nisi. Etiam lacinia gravida diam, et sollicitudin sapien tincidunt ut. + +Maecenas felis quam, tincidunt vitae venenatis scelerisque, viverra vitae odio. Phasellus enim neque, ultricies suscipit malesuada sit amet, vehicula sit amet purus. Nulla placerat sit amet dui vel tincidunt. Nam quis neque vel magna commodo egestas. Vestibulum sagittis rutrum lorem ut congue. Maecenas vel ultrices tellus. Donec efficitur, urna ac consequat iaculis, lorem felis pharetra eros, eget faucibus orci lectus sit amet arcu. + +Ut a tempus nisi. Nulla facilisi. Praesent vulputate maximus mi et dapibus. Sed sit amet libero ac augue hendrerit efficitur in a sapien. Mauris placerat velit sit amet tellus sollicitudin faucibus. Donec egestas a magna ac suscipit. Duis enim sapien, mollis sed egestas et, vestibulum vel leo. + +Proin quis dapibus dui. Donec eu tincidunt nunc. Vivamus eget purus consectetur, maximus ante vitae, tincidunt elit. Aenean mattis dolor a gravida aliquam. Praesent quis tellus id sem maximus vulputate nec sed nulla. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur metus nulla, volutpat volutpat est eu, hendrerit congue erat. Aliquam sollicitudin augue ante. Sed sollicitudin, magna eu consequat elementum, mi augue ullamcorper felis, molestie imperdiet erat metus iaculis est. Proin ac tortor nisi. Pellentesque quis nisi risus. Integer enim sapien, tincidunt quis tortor id, accumsan venenatis mi. Nulla facilisi. + +Cras pretium sit amet quam congue maximus. Morbi lacus libero, imperdiet commodo massa sed, scelerisque placerat libero. Cras nisl nisi, consectetur sed bibendum eu, venenatis at enim. Proin sodales justo at quam aliquam, a consectetur mi ornare. Donec porta ac est sit amet efficitur. Suspendisse vestibulum tortor id neque imperdiet, id lacinia risus vehicula. Phasellus ac eleifend purus. Mauris vel gravida ante. Aliquam vitae lobortis risus. Sed vehicula consectetur tincidunt. Nam et justo vitae purus molestie consequat. Pellentesque ipsum ex, convallis quis blandit non, gravida et urna. Donec diam ligula amet. diff --git a/pkg/sentry/fs/ext/assets/file.txt b/pkg/sentry/fs/ext/assets/file.txt new file mode 100644 index 000000000..980a0d5f1 --- /dev/null +++ b/pkg/sentry/fs/ext/assets/file.txt @@ -0,0 +1 @@ +Hello World! diff --git a/pkg/sentry/fs/ext/assets/symlink.txt b/pkg/sentry/fs/ext/assets/symlink.txt new file mode 120000 index 000000000..4c330738c --- /dev/null +++ b/pkg/sentry/fs/ext/assets/symlink.txt @@ -0,0 +1 @@ +file.txt
\ No newline at end of file diff --git a/pkg/sentry/fs/ext/assets/tiny.ext4 b/pkg/sentry/fs/ext/assets/tiny.ext4 Binary files differnew file mode 100644 index 000000000..a6859736d --- /dev/null +++ b/pkg/sentry/fs/ext/assets/tiny.ext4 diff --git a/pkg/sentry/fs/ext4/disklayout/BUILD b/pkg/sentry/fs/ext/disklayout/BUILD index e3d2f8d71..e4cb26645 100644 --- a/pkg/sentry/fs/ext4/disklayout/BUILD +++ b/pkg/sentry/fs/ext/disklayout/BUILD @@ -8,17 +8,25 @@ go_library( "block_group.go", "block_group_32.go", "block_group_64.go", + "dirent.go", + "dirent_new.go", + "dirent_old.go", + "disklayout.go", "inode.go", + "inode_new.go", + "inode_old.go", "superblock.go", "superblock_32.go", "superblock_64.go", "superblock_old.go", "test_utils.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext4/disklayout", + importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout", + visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", ], @@ -29,7 +37,10 @@ go_test( size = "small", srcs = [ "block_group_test.go", + "dirent_test.go", + "inode_test.go", "superblock_test.go", ], embed = [":disklayout"], + deps = ["//pkg/sentry/kernel/time"], ) diff --git a/pkg/sentry/fs/ext4/disklayout/block_group.go b/pkg/sentry/fs/ext/disklayout/block_group.go index 32ea3d97d..ad6f4fef8 100644 --- a/pkg/sentry/fs/ext4/disklayout/block_group.go +++ b/pkg/sentry/fs/ext/disklayout/block_group.go @@ -18,6 +18,16 @@ package disklayout // is split into a series of block groups. This provides an access layer to // information needed to access and use a block group. // +// Location: +// - The block group descriptor table is always placed in the blocks +// immediately after the block containing the superblock. +// - The 1st block group descriptor in the original table is in the +// (sb.FirstDataBlock() + 1)th block. +// - See SuperBlock docs to see where the block group descriptor table is +// replicated. +// - sb.BgDescSize() must be used as the block group descriptor entry size +// while reading the table from disk. +// // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors. type BlockGroup interface { // InodeTable returns the absolute block number of the block containing the diff --git a/pkg/sentry/fs/ext4/disklayout/block_group_32.go b/pkg/sentry/fs/ext/disklayout/block_group_32.go index dadecb3e3..3e16c76db 100644 --- a/pkg/sentry/fs/ext4/disklayout/block_group_32.go +++ b/pkg/sentry/fs/ext/disklayout/block_group_32.go @@ -17,12 +17,6 @@ package disklayout // BlockGroup32Bit emulates the first half of struct ext4_group_desc in // fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and // 32-bit ext4 filesystems. It implements BlockGroup interface. -// -// The suffix `Lo` here stands for lower bits because this is also used in the -// 64-bit version where these fields represent the lower half of the fields. -// The suffix `Raw` has been added to indicate that the field does not have a -// counterpart in the 64-bit version and to resolve name collision with the -// interface. type BlockGroup32Bit struct { BlockBitmapLo uint32 InodeBitmapLo uint32 @@ -38,6 +32,9 @@ type BlockGroup32Bit struct { ChecksumRaw uint16 } +// Compiles only if BlockGroup32Bit implements BlockGroup. +var _ BlockGroup = (*BlockGroup32Bit)(nil) + // InodeTable implements BlockGroup.InodeTable. func (bg *BlockGroup32Bit) InodeTable() uint64 { return uint64(bg.InodeTableLo) } diff --git a/pkg/sentry/fs/ext4/disklayout/block_group_64.go b/pkg/sentry/fs/ext/disklayout/block_group_64.go index 27de3990d..9a809197a 100644 --- a/pkg/sentry/fs/ext4/disklayout/block_group_64.go +++ b/pkg/sentry/fs/ext/disklayout/block_group_64.go @@ -18,9 +18,6 @@ package disklayout // It is the block group descriptor struct for 64-bit ext4 filesystems. // It implements BlockGroup interface. It is an extension of the 32-bit // version of BlockGroup. -// -// The suffix `Hi` here stands for upper bits because they represent the upper -// half of the fields. type BlockGroup64Bit struct { // We embed the 32-bit struct here because 64-bit version is just an extension // of the 32-bit version. @@ -40,6 +37,9 @@ type BlockGroup64Bit struct { _ uint32 // Padding to 64 bytes. } +// Compiles only if BlockGroup64Bit implements BlockGroup. +var _ BlockGroup = (*BlockGroup64Bit)(nil) + // Methods to override. Checksum() and Flags() are not overridden. // InodeTable implements BlockGroup.InodeTable. diff --git a/pkg/sentry/fs/ext4/disklayout/block_group_test.go b/pkg/sentry/fs/ext/disklayout/block_group_test.go index 0ef4294c0..0ef4294c0 100644 --- a/pkg/sentry/fs/ext4/disklayout/block_group_test.go +++ b/pkg/sentry/fs/ext/disklayout/block_group_test.go diff --git a/pkg/sentry/fs/ext/disklayout/dirent.go b/pkg/sentry/fs/ext/disklayout/dirent.go new file mode 100644 index 000000000..685bf57b8 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/dirent.go @@ -0,0 +1,69 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "gvisor.dev/gvisor/pkg/sentry/fs" +) + +const ( + // MaxFileName is the maximum length of an ext fs file's name. + MaxFileName = 255 +) + +var ( + // inodeTypeByFileType maps ext4 file types to vfs inode types. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#ftype. + inodeTypeByFileType = map[uint8]fs.InodeType{ + 0: fs.Anonymous, + 1: fs.RegularFile, + 2: fs.Directory, + 3: fs.CharacterDevice, + 4: fs.BlockDevice, + 5: fs.Pipe, + 6: fs.Socket, + 7: fs.Symlink, + } +) + +// The Dirent interface should be implemented by structs representing ext +// directory entries. These are for the linear classical directories which +// just store a list of dirent structs. A directory is a series of data blocks +// where is each data block contains a linear array of dirents. The last entry +// of the block has a record size that takes it to the end of the block. The +// end of the directory is when you read dirInode.Size() bytes from the blocks. +// +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#linear-classic-directories. +type Dirent interface { + // Inode returns the absolute inode number of the underlying inode. + // Inode number 0 signifies an unused dirent. + Inode() uint32 + + // RecordSize returns the record length of this dirent on disk. The next + // dirent in the dirent list should be read after these many bytes from + // the current dirent. Must be a multiple of 4. + RecordSize() uint16 + + // FileName returns the name of the file. Can be at most 255 is length. + FileName() string + + // FileType returns the inode type of the underlying inode. This is a + // performance hack so that we do not have to read the underlying inode struct + // to know the type of inode. This will only work when the SbDirentFileType + // feature is set. If not, the second returned value will be false indicating + // that user code has to use the inode mode to extract the file type. + FileType() (fs.InodeType, bool) +} diff --git a/pkg/sentry/fs/ext/disklayout/dirent_new.go b/pkg/sentry/fs/ext/disklayout/dirent_new.go new file mode 100644 index 000000000..29ae4a5c2 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/dirent_new.go @@ -0,0 +1,61 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/fs" +) + +// DirentNew represents the ext4 directory entry struct. This emulates Linux's +// ext4_dir_entry_2 struct. The FileName can not be more than 255 bytes so we +// only need 8 bits to store the NameLength. As a result, NameLength has been +// shortened and the other 8 bits are used to encode the file type. Use the +// FileTypeRaw field only if the SbDirentFileType feature is set. +// +// Note: This struct can be of variable size on disk. The one described below +// is of maximum size and the FileName beyond NameLength bytes might contain +// garbage. +type DirentNew struct { + InodeNumber uint32 + RecordLength uint16 + NameLength uint8 + FileTypeRaw uint8 + FileNameRaw [MaxFileName]byte +} + +// Compiles only if DirentNew implements Dirent. +var _ Dirent = (*DirentNew)(nil) + +// Inode implements Dirent.Inode. +func (d *DirentNew) Inode() uint32 { return d.InodeNumber } + +// RecordSize implements Dirent.RecordSize. +func (d *DirentNew) RecordSize() uint16 { return d.RecordLength } + +// FileName implements Dirent.FileName. +func (d *DirentNew) FileName() string { + return string(d.FileNameRaw[:d.NameLength]) +} + +// FileType implements Dirent.FileType. +func (d *DirentNew) FileType() (fs.InodeType, bool) { + if inodeType, ok := inodeTypeByFileType[d.FileTypeRaw]; ok { + return inodeType, true + } + + panic(fmt.Sprintf("unknown file type %v", d.FileTypeRaw)) +} diff --git a/pkg/sentry/fs/ext/disklayout/dirent_old.go b/pkg/sentry/fs/ext/disklayout/dirent_old.go new file mode 100644 index 000000000..2e0f9c812 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/dirent_old.go @@ -0,0 +1,50 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import "gvisor.dev/gvisor/pkg/sentry/fs" + +// DirentOld represents the old directory entry struct which does not contain +// the file type. This emulates Linux's ext4_dir_entry struct. This is used in +// ext2, ext3 and sometimes in ext4. +// +// Note: This struct can be of variable size on disk. The one described below +// is of maximum size and the FileName beyond NameLength bytes might contain +// garbage. +type DirentOld struct { + InodeNumber uint32 + RecordLength uint16 + NameLength uint16 + FileNameRaw [MaxFileName]byte +} + +// Compiles only if DirentOld implements Dirent. +var _ Dirent = (*DirentOld)(nil) + +// Inode implements Dirent.Inode. +func (d *DirentOld) Inode() uint32 { return d.InodeNumber } + +// RecordSize implements Dirent.RecordSize. +func (d *DirentOld) RecordSize() uint16 { return d.RecordLength } + +// FileName implements Dirent.FileName. +func (d *DirentOld) FileName() string { + return string(d.FileNameRaw[:d.NameLength]) +} + +// FileType implements Dirent.FileType. +func (d *DirentOld) FileType() (fs.InodeType, bool) { + return fs.Anonymous, false +} diff --git a/pkg/sentry/fs/ext/disklayout/dirent_test.go b/pkg/sentry/fs/ext/disklayout/dirent_test.go new file mode 100644 index 000000000..cc6dff2c9 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/dirent_test.go @@ -0,0 +1,28 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "testing" +) + +// TestDirentSize tests that the dirent structs are of the correct +// size. +func TestDirentSize(t *testing.T) { + want := uintptr(263) + + assertSize(t, DirentOld{}, want) + assertSize(t, DirentNew{}, want) +} diff --git a/pkg/sentry/fs/ext/disklayout/disklayout.go b/pkg/sentry/fs/ext/disklayout/disklayout.go new file mode 100644 index 000000000..bdf4e2132 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/disklayout.go @@ -0,0 +1,50 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package disklayout provides Linux ext file system's disk level structures +// which can be directly read into from the underlying device. Structs aim to +// emulate structures `exactly` how they are layed out on disk. +// +// This library aims to be compatible with all ext(2/3/4) systems so it +// provides a generic interface for all major structures and various +// implementations (for different versions). The user code is responsible for +// using appropriate implementations based on the underlying device. +// +// Interfacing all major structures here serves a few purposes: +// - Abstracts away the complexity of the underlying structure from client +// code. The client only has to figure out versioning on set up and then +// can use these as black boxes and pass it higher up the stack. +// - Having pointer receivers forces the user to use pointers to these +// heavy structs. Hence, prevents the client code from unintentionally +// copying these by value while passing the interface around. +// - Version-based implementation selection is resolved on set up hence +// avoiding per call overhead of choosing implementation. +// - All interface methods are pretty light weight (do not take in any +// parameters by design). Passing pointer arguments to interface methods +// can lead to heap allocation as the compiler won't be able to perform +// escape analysis on an unknown implementation at compile time. +// +// Notes: +// - All fields in these structs are exported because binary.Read would +// panic otherwise. +// - All structures on disk are in little-endian order. Only jbd2 (journal) +// structures are in big-endian order. +// - All OS dependent fields in these structures will be interpretted using +// the Linux version of that field. +// - The suffix `Lo` in field names stands for lower bits of that field. +// - The suffix `Hi` in field names stands for upper bits of that field. +// - The suffix `Raw` has been added to indicate that the field is not split +// into Lo and Hi fields and also to resolve name collision with the +// respective interface. +package disklayout diff --git a/pkg/sentry/fs/ext4/disklayout/inode.go b/pkg/sentry/fs/ext/disklayout/inode.go index b48001910..b48001910 100644 --- a/pkg/sentry/fs/ext4/disklayout/inode.go +++ b/pkg/sentry/fs/ext/disklayout/inode.go diff --git a/pkg/sentry/fs/ext/disklayout/inode_new.go b/pkg/sentry/fs/ext/disklayout/inode_new.go new file mode 100644 index 000000000..4f5348372 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/inode_new.go @@ -0,0 +1,96 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import "gvisor.dev/gvisor/pkg/sentry/kernel/time" + +// InodeNew represents ext4 inode structure which can be bigger than +// OldInodeSize. The actual size of this struct should be determined using +// inode.ExtraInodeSize. Accessing any field here should be verified with the +// actual size. The extra space between the end of the inode struct and end of +// the inode record can be used to store extended attr. +// +// If the TimeExtra fields are in scope, the lower 2 bits of those are used +// to extend their counter part to be 34 bits wide; the rest (upper) 30 bits +// are used to provide nanoscond precision. Hence, these timestamps will now +// overflow in May 2446. +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. +type InodeNew struct { + InodeOld + + ExtraInodeSize uint16 + ChecksumHi uint16 + ChangeTimeExtra uint32 + ModificationTimeExtra uint32 + AccessTimeExtra uint32 + CreationTime uint32 + CreationTimeExtra uint32 + VersionHi uint32 + ProjectID uint32 +} + +// Compiles only if InodeNew implements Inode. +var _ Inode = (*InodeNew)(nil) + +// fromExtraTime decodes the extra time and constructs the kernel time struct +// with nanosecond precision. +func fromExtraTime(lo int32, extra uint32) time.Time { + // See description above InodeNew for format. + seconds := (int64(extra&0x3) << 32) + int64(lo) + nanoseconds := int64(extra >> 2) + return time.FromUnix(seconds, nanoseconds) +} + +// Only override methods which change due to ext4 specific fields. + +// Size implements Inode.Size. +func (in *InodeNew) Size() uint64 { + return (uint64(in.SizeHi) << 32) | uint64(in.SizeLo) +} + +// InodeSize implements Inode.InodeSize. +func (in *InodeNew) InodeSize() uint16 { + return oldInodeSize + in.ExtraInodeSize +} + +// ChangeTime implements Inode.ChangeTime. +func (in *InodeNew) ChangeTime() time.Time { + // Apply new timestamp logic if inode.ChangeTimeExtra is in scope. + if in.ExtraInodeSize >= 8 { + return fromExtraTime(in.ChangeTimeRaw, in.ChangeTimeExtra) + } + + return in.InodeOld.ChangeTime() +} + +// ModificationTime implements Inode.ModificationTime. +func (in *InodeNew) ModificationTime() time.Time { + // Apply new timestamp logic if inode.ModificationTimeExtra is in scope. + if in.ExtraInodeSize >= 12 { + return fromExtraTime(in.ModificationTimeRaw, in.ModificationTimeExtra) + } + + return in.InodeOld.ModificationTime() +} + +// AccessTime implements Inode.AccessTime. +func (in *InodeNew) AccessTime() time.Time { + // Apply new timestamp logic if inode.AccessTimeExtra is in scope. + if in.ExtraInodeSize >= 16 { + return fromExtraTime(in.AccessTimeRaw, in.AccessTimeExtra) + } + + return in.InodeOld.AccessTime() +} diff --git a/pkg/sentry/fs/ext/disklayout/inode_old.go b/pkg/sentry/fs/ext/disklayout/inode_old.go new file mode 100644 index 000000000..dc4c9d8e4 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/inode_old.go @@ -0,0 +1,117 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" +) + +const ( + // oldInodeSize is the inode size in ext2/ext3. + oldInodeSize = 128 +) + +// InodeOld implements Inode interface. It emulates ext2/ext3 inode struct. +// Inode struct size and record size are both 128 bytes for this. +// +// All fields representing time are in seconds since the epoch. Which means that +// they will overflow in January 2038. +type InodeOld struct { + ModeRaw uint16 + UIDLo uint16 + SizeLo uint32 + + // The time fields are signed integers because they could be negative to + // represent time before the epoch. + AccessTimeRaw int32 + ChangeTimeRaw int32 + ModificationTimeRaw int32 + DeletionTimeRaw int32 + + GIDLo uint16 + LinksCountRaw uint16 + BlocksCountLo uint32 + FlagsRaw uint32 + VersionLo uint32 // This is OS dependent. + BlocksRaw [60]byte + Generation uint32 + FileACLLo uint32 + SizeHi uint32 + ObsoFaddr uint32 + + // OS dependent fields have been inlined here. + BlocksCountHi uint16 + FileACLHi uint16 + UIDHi uint16 + GIDHi uint16 + ChecksumLo uint16 + _ uint16 +} + +// Compiles only if InodeOld implements Inode. +var _ Inode = (*InodeOld)(nil) + +// Mode implements Inode.Mode. +func (in *InodeOld) Mode() linux.FileMode { return linux.FileMode(in.ModeRaw) } + +// UID implements Inode.UID. +func (in *InodeOld) UID() auth.KUID { + return auth.KUID((uint32(in.UIDHi) << 16) | uint32(in.UIDLo)) +} + +// GID implements Inode.GID. +func (in *InodeOld) GID() auth.KGID { + return auth.KGID((uint32(in.GIDHi) << 16) | uint32(in.GIDLo)) +} + +// Size implements Inode.Size. +func (in *InodeOld) Size() uint64 { + // In ext2/ext3, in.SizeHi did not exist, it was instead named in.DirACL. + return uint64(in.SizeLo) +} + +// InodeSize implements Inode.InodeSize. +func (in *InodeOld) InodeSize() uint16 { return oldInodeSize } + +// AccessTime implements Inode.AccessTime. +func (in *InodeOld) AccessTime() time.Time { + return time.FromUnix(int64(in.AccessTimeRaw), 0) +} + +// ChangeTime implements Inode.ChangeTime. +func (in *InodeOld) ChangeTime() time.Time { + return time.FromUnix(int64(in.ChangeTimeRaw), 0) +} + +// ModificationTime implements Inode.ModificationTime. +func (in *InodeOld) ModificationTime() time.Time { + return time.FromUnix(int64(in.ModificationTimeRaw), 0) +} + +// DeletionTime implements Inode.DeletionTime. +func (in *InodeOld) DeletionTime() time.Time { + return time.FromUnix(int64(in.DeletionTimeRaw), 0) +} + +// LinksCount implements Inode.LinksCount. +func (in *InodeOld) LinksCount() uint16 { return in.LinksCountRaw } + +// Flags implements Inode.Flags. +func (in *InodeOld) Flags() InodeFlags { return InodeFlagsFromInt(in.FlagsRaw) } + +// Blocks implements Inode.Blocks. +func (in *InodeOld) Blocks() [60]byte { return in.BlocksRaw } diff --git a/pkg/sentry/fs/ext/disklayout/inode_test.go b/pkg/sentry/fs/ext/disklayout/inode_test.go new file mode 100644 index 000000000..9cae9e4f0 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/inode_test.go @@ -0,0 +1,222 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "fmt" + "strconv" + "testing" + + "gvisor.dev/gvisor/pkg/sentry/kernel/time" +) + +// TestInodeSize tests that the inode structs are of the correct size. +func TestInodeSize(t *testing.T) { + assertSize(t, InodeOld{}, oldInodeSize) + + // This was updated from 156 bytes to 160 bytes in Oct 2015. + assertSize(t, InodeNew{}, 160) +} + +// TestTimestampSeconds tests that the seconds part of [a/c/m] timestamps in +// ext4 inode structs are decoded correctly. +// +// These tests are derived from the table under https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. +func TestTimestampSeconds(t *testing.T) { + type timestampTest struct { + // msbSet tells if the most significant bit of InodeOld.[X]TimeRaw is set. + // If this is set then the 32-bit time is negative. + msbSet bool + + // lowerBound tells if we should take the lowest possible value of + // InodeOld.[X]TimeRaw while satisfying test.msbSet condition. If set to + // false it tells to take the highest possible value. + lowerBound bool + + // extraBits is InodeNew.[X]TimeExtra. + extraBits uint32 + + // want is the kernel time struct that is expected. + want time.Time + } + + tests := []timestampTest{ + // 1901-12-13 + { + msbSet: true, + lowerBound: true, + extraBits: 0, + want: time.FromUnix(int64(-0x80000000), 0), + }, + + // 1969-12-31 + { + msbSet: true, + lowerBound: false, + extraBits: 0, + want: time.FromUnix(int64(-1), 0), + }, + + // 1970-01-01 + { + msbSet: false, + lowerBound: true, + extraBits: 0, + want: time.FromUnix(int64(0), 0), + }, + + // 2038-01-19 + { + msbSet: false, + lowerBound: false, + extraBits: 0, + want: time.FromUnix(int64(0x7fffffff), 0), + }, + + // 2038-01-19 + { + msbSet: true, + lowerBound: true, + extraBits: 1, + want: time.FromUnix(int64(0x80000000), 0), + }, + + // 2106-02-07 + { + msbSet: true, + lowerBound: false, + extraBits: 1, + want: time.FromUnix(int64(0xffffffff), 0), + }, + + // 2106-02-07 + { + msbSet: false, + lowerBound: true, + extraBits: 1, + want: time.FromUnix(int64(0x100000000), 0), + }, + + // 2174-02-25 + { + msbSet: false, + lowerBound: false, + extraBits: 1, + want: time.FromUnix(int64(0x17fffffff), 0), + }, + + // 2174-02-25 + { + msbSet: true, + lowerBound: true, + extraBits: 2, + want: time.FromUnix(int64(0x180000000), 0), + }, + + // 2242-03-16 + { + msbSet: true, + lowerBound: false, + extraBits: 2, + want: time.FromUnix(int64(0x1ffffffff), 0), + }, + + // 2242-03-16 + { + msbSet: false, + lowerBound: true, + extraBits: 2, + want: time.FromUnix(int64(0x200000000), 0), + }, + + // 2310-04-04 + { + msbSet: false, + lowerBound: false, + extraBits: 2, + want: time.FromUnix(int64(0x27fffffff), 0), + }, + + // 2310-04-04 + { + msbSet: true, + lowerBound: true, + extraBits: 3, + want: time.FromUnix(int64(0x280000000), 0), + }, + + // 2378-04-22 + { + msbSet: true, + lowerBound: false, + extraBits: 3, + want: time.FromUnix(int64(0x2ffffffff), 0), + }, + + // 2378-04-22 + { + msbSet: false, + lowerBound: true, + extraBits: 3, + want: time.FromUnix(int64(0x300000000), 0), + }, + + // 2446-05-10 + { + msbSet: false, + lowerBound: false, + extraBits: 3, + want: time.FromUnix(int64(0x37fffffff), 0), + }, + } + + lowerMSB0 := int32(0) // binary: 00000000 00000000 00000000 00000000 + upperMSB0 := int32(0x7fffffff) // binary: 01111111 11111111 11111111 11111111 + lowerMSB1 := int32(-0x80000000) // binary: 10000000 00000000 00000000 00000000 + upperMSB1 := int32(-1) // binary: 11111111 11111111 11111111 11111111 + + get32BitTime := func(test timestampTest) int32 { + if test.msbSet { + if test.lowerBound { + return lowerMSB1 + } + + return upperMSB1 + } + + if test.lowerBound { + return lowerMSB0 + } + + return upperMSB0 + } + + getTestName := func(test timestampTest) string { + return fmt.Sprintf( + "Tests time decoding with epoch bits 0b%s and 32-bit raw time: MSB set=%t, lower bound=%t", + strconv.FormatInt(int64(test.extraBits), 2), + test.msbSet, + test.lowerBound, + ) + } + + for _, test := range tests { + t.Run(getTestName(test), func(t *testing.T) { + if got := fromExtraTime(get32BitTime(test), test.extraBits); got != test.want { + t.Errorf("Expected: %v, Got: %v", test.want, got) + } + }) + } +} diff --git a/pkg/sentry/fs/ext4/disklayout/superblock.go b/pkg/sentry/fs/ext/disklayout/superblock.go index 030c73410..7a337a5e0 100644 --- a/pkg/sentry/fs/ext4/disklayout/superblock.go +++ b/pkg/sentry/fs/ext/disklayout/superblock.go @@ -12,24 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package disklayout provides Linux ext file system's disk level structures -// which can be directly read into from the underlying device. All structures -// on disk are in little-endian order. Only jbd2 (journal) structures are in -// big-endian order. Structs aim to emulate structures `exactly` how they are -// layed out on disk. -// -// This library aims to be compatible with all ext(2/3/4) systems so it -// provides a generic interface for all major structures and various -// implementations (for different versions). The user code is responsible for -// using appropriate implementations based on the underlying device. -// -// Notes: -// - All fields in these structs are exported because binary.Read would -// panic otherwise. -// - All OS dependent fields in these structures will be interpretted using -// the Linux version of that field. package disklayout +const ( + // SbOffset is the absolute offset at which the superblock is placed. + SbOffset = 1024 +) + // SuperBlock should be implemented by structs representing the ext superblock. // The superblock holds a lot of information about the enclosing filesystem. // This interface aims to provide access methods to important information held @@ -73,8 +62,6 @@ type SuperBlock interface { // // If the filesystem has 1kb data blocks then this should return 1. For all // other configurations, this typically returns 0. - // - // The first block group descriptor is in (FirstDataBlock() + 1)th block. FirstDataBlock() uint32 // BlockSize returns the size of one data block in this filesystem. @@ -109,7 +96,7 @@ type SuperBlock interface { // - Inode disk record size = sb.s_inode_size (function return value). // = 256 (default) // - Inode struct size = 128 + inode.i_extra_isize. - // = 128 + 28 = 156 (default) + // = 128 + 32 = 160 (default) InodeSize() uint16 // InodesPerGroup returns the number of inodes in a block group. @@ -144,7 +131,7 @@ type SuperBlock interface { } // SbRevision is the type for superblock revisions. -type SbRevision int +type SbRevision uint32 // Super block revisions. const ( diff --git a/pkg/sentry/fs/ext4/disklayout/superblock_32.go b/pkg/sentry/fs/ext/disklayout/superblock_32.go index 4c3233eed..587e4afaa 100644 --- a/pkg/sentry/fs/ext4/disklayout/superblock_32.go +++ b/pkg/sentry/fs/ext/disklayout/superblock_32.go @@ -16,10 +16,6 @@ package disklayout // SuperBlock32Bit implements SuperBlock and represents the 32-bit version of // the ext4_super_block struct in fs/ext4/ext4.h. -// -// The suffix `Raw` has been added to indicate that the field does not have a -// counterpart in the 64-bit version and to resolve name collision with the -// interface. type SuperBlock32Bit struct { // We embed the old superblock struct here because the 32-bit version is just // an extension of the old version. @@ -52,6 +48,9 @@ type SuperBlock32Bit struct { JnlBlocks [17]uint32 } +// Compiles only if SuperBlock32Bit implements SuperBlock. +var _ SuperBlock = (*SuperBlock32Bit)(nil) + // Only override methods which change based on the additional fields above. // Not overriding SuperBlock.BgDescSize because it would still return 32 here. diff --git a/pkg/sentry/fs/ext4/disklayout/superblock_64.go b/pkg/sentry/fs/ext/disklayout/superblock_64.go index 2e945a7c7..a2c2278fb 100644 --- a/pkg/sentry/fs/ext4/disklayout/superblock_64.go +++ b/pkg/sentry/fs/ext/disklayout/superblock_64.go @@ -18,9 +18,6 @@ package disklayout // the ext4_super_block struct in fs/ext4/ext4.h. This sums up to be exactly // 1024 bytes (smallest possible block size) and hence the superblock always // fits in no more than one data block. -// -// The suffix `Hi` here stands for upper bits because they represent the upper -// half of the fields. type SuperBlock64Bit struct { // We embed the 32-bit struct here because 64-bit version is just an extension // of the 32-bit version. @@ -78,6 +75,9 @@ type SuperBlock64Bit struct { Checksum uint32 } +// Compiles only if SuperBlock64Bit implements SuperBlock. +var _ SuperBlock = (*SuperBlock64Bit)(nil) + // Only override methods which change based on the 64-bit feature. // BlocksCount implements SuperBlock.BlocksCount. diff --git a/pkg/sentry/fs/ext4/disklayout/superblock_old.go b/pkg/sentry/fs/ext/disklayout/superblock_old.go index 1f7425ba3..aada8b550 100644 --- a/pkg/sentry/fs/ext4/disklayout/superblock_old.go +++ b/pkg/sentry/fs/ext/disklayout/superblock_old.go @@ -16,12 +16,6 @@ package disklayout // SuperBlockOld implements SuperBlock and represents the old version of the // superblock struct in ext2 and ext3 systems. -// -// The suffix `Lo` here stands for lower bits because this is also used in the -// 64-bit version where these fields represent the lower half of the fields. -// The suffix `Raw` has been added to indicate that the field does not have a -// counterpart in the 64-bit version and to resolve name collision with the -// interface. type SuperBlockOld struct { InodesCountRaw uint32 BlocksCountLo uint32 @@ -50,6 +44,9 @@ type SuperBlockOld struct { DefResGID uint16 } +// Compiles only if SuperBlockOld implements SuperBlock. +var _ SuperBlock = (*SuperBlockOld)(nil) + // InodesCount implements SuperBlock.InodesCount. func (sb *SuperBlockOld) InodesCount() uint32 { return sb.InodesCountRaw } @@ -84,7 +81,7 @@ func (sb *SuperBlockOld) ClusterSize() uint64 { return 1 << (10 + sb.LogClusterS func (sb *SuperBlockOld) ClustersPerGroup() uint32 { return sb.ClustersPerGroupRaw } // InodeSize implements SuperBlock.InodeSize. -func (sb *SuperBlockOld) InodeSize() uint16 { return 128 } +func (sb *SuperBlockOld) InodeSize() uint16 { return oldInodeSize } // InodesPerGroup implements SuperBlock.InodesPerGroup. func (sb *SuperBlockOld) InodesPerGroup() uint32 { return sb.InodesPerGroupRaw } diff --git a/pkg/sentry/fs/ext4/disklayout/superblock_test.go b/pkg/sentry/fs/ext/disklayout/superblock_test.go index 463b5ba21..463b5ba21 100644 --- a/pkg/sentry/fs/ext4/disklayout/superblock_test.go +++ b/pkg/sentry/fs/ext/disklayout/superblock_test.go diff --git a/pkg/sentry/fs/ext4/disklayout/test_utils.go b/pkg/sentry/fs/ext/disklayout/test_utils.go index 9c63f04c0..9c63f04c0 100644 --- a/pkg/sentry/fs/ext4/disklayout/test_utils.go +++ b/pkg/sentry/fs/ext/disklayout/test_utils.go diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go new file mode 100644 index 000000000..9d33be08f --- /dev/null +++ b/pkg/sentry/fs/ext/ext.go @@ -0,0 +1,75 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package ext implements readonly ext(2/3/4) filesystems. +package ext + +import ( + "io" + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Filesystem implements vfs.FilesystemImpl. +type Filesystem struct { + // dev is the ReadSeeker for the underlying fs device and is protected by mu. + dev io.ReadSeeker + + // mu synchronizes the usage of dev. The ext filesystems take locality into + // condsideration, i.e. data blocks of a file will tend to be placed close + // together. On a spinning disk, locality reduces the amount of movement of + // the head hence speeding up IO operations. On an SSD there are no moving + // parts but locality increases the size of each transer request. Hence, + // having mutual exclusion on the read seeker while reading a file *should* + // help in achieving the intended performance gains. + // + // Note: This synchronization was not coupled with the ReadSeeker itself + // because we want to synchronize across read/seek operations for the + // performance gains mentioned above. Helps enforcing one-file-at-a-time IO. + mu sync.Mutex + + // sb represents the filesystem superblock. Immutable after initialization. + sb disklayout.SuperBlock + + // bgs represents all the block group descriptors for the filesystem. + // Immutable after initialization. + bgs []disklayout.BlockGroup +} + +// newFilesystem is the Filesystem constructor. +func newFilesystem(dev io.ReadSeeker) (*Filesystem, error) { + fs := Filesystem{dev: dev} + var err error + + fs.sb, err = readSuperBlock(dev) + if err != nil { + return nil, err + } + + if fs.sb.Magic() != linux.EXT_SUPER_MAGIC { + // mount(2) specifies that EINVAL should be returned if the superblock is + // invalid. + return nil, syserror.EINVAL + } + + fs.bgs, err = readBlockGroups(dev, fs.sb) + if err != nil { + return nil, err + } + + return &fs, nil +} diff --git a/pkg/sentry/fs/ext/utils.go b/pkg/sentry/fs/ext/utils.go new file mode 100644 index 000000000..861ef9a73 --- /dev/null +++ b/pkg/sentry/fs/ext/utils.go @@ -0,0 +1,103 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "encoding/binary" + "io" + + "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" + "gvisor.dev/gvisor/pkg/syserror" +) + +// readFromDisk performs a binary read from disk into the given struct from +// the absolute offset provided. +// +// All disk reads should use this helper so we avoid reading from stale +// previously used offsets. This function forces the offset parameter. +func readFromDisk(dev io.ReadSeeker, abOff int64, v interface{}) error { + if _, err := dev.Seek(abOff, io.SeekStart); err != nil { + return syserror.EIO + } + + if err := binary.Read(dev, binary.LittleEndian, v); err != nil { + return syserror.EIO + } + + return nil +} + +// readSuperBlock reads the SuperBlock from block group 0 in the underlying +// device. There are three versions of the superblock. This function identifies +// and returns the correct version. +func readSuperBlock(dev io.ReadSeeker) (disklayout.SuperBlock, error) { + var sb disklayout.SuperBlock = &disklayout.SuperBlockOld{} + if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { + return nil, err + } + if sb.Revision() == disklayout.OldRev { + return sb, nil + } + + sb = &disklayout.SuperBlock32Bit{} + if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { + return nil, err + } + if !sb.IncompatibleFeatures().Is64Bit { + return sb, nil + } + + sb = &disklayout.SuperBlock64Bit{} + if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { + return nil, err + } + return sb, nil +} + +// blockGroupsCount returns the number of block groups in the ext fs. +func blockGroupsCount(sb disklayout.SuperBlock) uint64 { + blocksCount := sb.BlocksCount() + blocksPerGroup := uint64(sb.BlocksPerGroup()) + + // Round up the result. float64 can compromise precision so do it manually. + bgCount := blocksCount / blocksPerGroup + if blocksCount%blocksPerGroup != 0 { + bgCount++ + } + + return bgCount +} + +// readBlockGroups reads the block group descriptor table from block group 0 in +// the underlying device. +func readBlockGroups(dev io.ReadSeeker, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) { + bgCount := blockGroupsCount(sb) + bgdSize := uint64(sb.BgDescSize()) + is64Bit := sb.IncompatibleFeatures().Is64Bit + bgds := make([]disklayout.BlockGroup, bgCount) + + for i, off := uint64(0), uint64(sb.FirstDataBlock()+1)*sb.BlockSize(); i < bgCount; i, off = i+1, off+bgdSize { + if is64Bit { + bgds[i] = &disklayout.BlockGroup64Bit{} + } else { + bgds[i] = &disklayout.BlockGroup32Bit{} + } + + if err := readFromDisk(dev, int64(off), bgds[i]); err != nil { + return nil, err + } + } + return bgds, nil +} diff --git a/pkg/sentry/fs/ext4/BUILD b/pkg/sentry/fs/ext4/BUILD deleted file mode 100644 index 9dce67635..000000000 --- a/pkg/sentry/fs/ext4/BUILD +++ /dev/null @@ -1,14 +0,0 @@ -package(licenses = ["notice"]) - -load("//tools/go_stateify:defs.bzl", "go_library") - -go_library( - name = "ext4", - srcs = ["fs.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext4", - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/sentry/context", - "//pkg/sentry/fs", - ], -) diff --git a/pkg/sentry/fs/ext4/fs.go b/pkg/sentry/fs/ext4/fs.go deleted file mode 100644 index 5c7274821..000000000 --- a/pkg/sentry/fs/ext4/fs.go +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package ext4 implements the ext4 filesystem. -package ext4 - -import ( - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" -) - -// filesystem implements fs.Filesystem for ext4. -// -// +stateify savable -type filesystem struct{} - -func init() { - fs.RegisterFilesystem(&filesystem{}) -} - -// FilesystemName is the name under which the filesystem is registered. -// Name matches fs/ext4/super.c:ext4_fs_type.name. -const FilesystemName = "ext4" - -// Name is the name of the file system. -func (*filesystem) Name() string { - return FilesystemName -} - -// AllowUserMount prohibits users from using mount(2) with this file system. -func (*filesystem) AllowUserMount() bool { - return false -} - -// AllowUserList prohibits this filesystem to be listed in /proc/filesystems. -func (*filesystem) AllowUserList() bool { - return false -} - -// Flags returns properties of the filesystem. -// -// In Linux, ext4 returns FS_REQUIRES_DEV. See fs/ext4/super.c -func (*filesystem) Flags() fs.FilesystemFlags { - return fs.FilesystemRequiresDev -} - -// Mount returns the root inode of the ext4 fs. -func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, cgroupsInt interface{}) (*fs.Inode, error) { - panic("unimplemented") -} diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 24b769cfc..e0602da17 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -339,7 +339,9 @@ func overlayRemove(ctx context.Context, o *overlayEntry, parent *Dirent, child * } } if child.Inode.overlay.lowerExists { - return overlayCreateWhiteout(o.upper, child.name) + if err := overlayCreateWhiteout(o.upper, child.name); err != nil { + return err + } } // We've removed from the directory so we must drop the cache. o.markDirectoryDirty() @@ -418,10 +420,12 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena return err } if renamed.Inode.overlay.lowerExists { - return overlayCreateWhiteout(oldParent.Inode.overlay.upper, oldName) + if err := overlayCreateWhiteout(oldParent.Inode.overlay.upper, oldName); err != nil { + return err + } } // We've changed the directory so we must drop the cache. - o.markDirectoryDirty() + oldParent.Inode.overlay.markDirectoryDirty() return nil } diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index ce7ffeed2..693ffc760 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -663,6 +663,11 @@ func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name s } defer d.DecRef() + // Check that it is a regular file. + if !IsRegular(d.Inode.StableAttr) { + continue + } + // Check whether we can read and execute the found file. if err := d.Inode.CheckPermission(ctx, PermMask{Read: true, Execute: true}); err != nil { log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err) diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 37694620c..6b839685b 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -155,37 +155,40 @@ func (n *netDev) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se contents[1] = " face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n" for _, i := range interfaces { - // TODO(b/71872867): Collect stats from each inet.Stack - // implementation (hostinet, epsocket, and rpcinet). - // Implements the same format as // net/core/net-procfs.c:dev_seq_printf_stats. - l := fmt.Sprintf("%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n", + var stats inet.StatDev + if err := n.s.Statistics(&stats, i.Name); err != nil { + log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err) + continue + } + l := fmt.Sprintf( + "%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n", i.Name, // Received - 0, // bytes - 0, // packets - 0, // errors - 0, // dropped - 0, // fifo - 0, // frame - 0, // compressed - 0, // multicast + stats[0], // bytes + stats[1], // packets + stats[2], // errors + stats[3], // dropped + stats[4], // fifo + stats[5], // frame + stats[6], // compressed + stats[7], // multicast // Transmitted - 0, // bytes - 0, // packets - 0, // errors - 0, // dropped - 0, // fifo - 0, // frame - 0, // compressed - 0) // multicast + stats[8], // bytes + stats[9], // packets + stats[10], // errors + stats[11], // dropped + stats[12], // fifo + stats[13], // frame + stats[14], // compressed + stats[15]) // multicast contents = append(contents, l) } var data []seqfile.SeqData for _, l := range contents { - data = append(data, seqfile.SeqData{Buf: []byte(l), Handle: (*ifinet6)(nil)}) + data = append(data, seqfile.SeqData{Buf: []byte(l), Handle: (*netDev)(nil)}) } return data, 0 diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index ef0ca3301..87184ec67 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -162,6 +162,11 @@ func (f *subtasksFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dentry // subtask to emit. offset := file.Offset() + tasks := f.t.ThreadGroup().MemberIDs(f.pidns) + if len(tasks) == 0 { + return offset, syserror.ENOENT + } + if offset == 0 { // Serialize "." and "..". root := fs.RootFromContext(ctx) @@ -178,12 +183,12 @@ func (f *subtasksFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dentry } // Serialize tasks. - tasks := f.t.ThreadGroup().MemberIDs(f.pidns) taskInts := make([]int, 0, len(tasks)) for _, tid := range tasks { taskInts = append(taskInts, int(tid)) } + sort.Sort(sort.IntSlice(taskInts)) // Find the task to start at. idx := sort.SearchInts(taskInts, int(offset)) if idx == len(taskInts) { diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index 7c104fd47..5b75a4a06 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -49,6 +49,9 @@ type Stack interface { // SetTCPSACKEnabled attempts to change TCP selective acknowledgement // settings. SetTCPSACKEnabled(enabled bool) error + + // Statistics reports stack statistics. + Statistics(stat interface{}, arg string) error } // Interface contains information about a network interface. @@ -102,3 +105,7 @@ type TCPBufferSize struct { // Max is the maximum size. Max int } + +// StatDev describes one line of /proc/net/dev, i.e., stats for one network +// interface. +type StatDev [16]uint64 diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go index 624371eb6..75f9e7a77 100644 --- a/pkg/sentry/inet/test_stack.go +++ b/pkg/sentry/inet/test_stack.go @@ -81,3 +81,8 @@ func (s *TestStack) SetTCPSACKEnabled(enabled bool) error { s.TCPSACKFlag = enabled return nil } + +// Statistics implements inet.Stack.Statistics. +func (s *TestStack) Statistics(stat interface{}, arg string) error { + return nil +} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 7b92f1b8d..e61d39c82 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -31,7 +31,7 @@ go_template_instance( go_template_instance( name = "seqatomic_taskgoroutineschedinfo", - out = "seqatomic_taskgoroutineschedinfo.go", + out = "seqatomic_taskgoroutineschedinfo_unsafe.go", package = "kernel", suffix = "TaskGoroutineSchedInfo", template = "//third_party/gvsync:generic_seqatomic", @@ -112,7 +112,7 @@ go_library( "ptrace_arm64.go", "rseq.go", "seccomp.go", - "seqatomic_taskgoroutineschedinfo.go", + "seqatomic_taskgoroutineschedinfo_unsafe.go", "session_list.go", "sessions.go", "signal.go", diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 42779baa9..1d00a6310 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -5,7 +5,7 @@ load("//tools/go_stateify:defs.bzl", "go_library") go_template_instance( name = "atomicptr_credentials", - out = "atomicptr_credentials.go", + out = "atomicptr_credentials_unsafe.go", package = "auth", suffix = "Credentials", template = "//third_party/gvsync:generic_atomicptr", @@ -45,7 +45,7 @@ go_template_instance( go_library( name = "auth", srcs = [ - "atomicptr_credentials.go", + "atomicptr_credentials_unsafe.go", "auth.go", "capability_set.go", "context.go", diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 1f3a57dc1..cc3f43a45 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -81,7 +81,9 @@ type FDTable struct { // mu protects below. mu sync.Mutex `state:"nosave"` - // used contains the number of non-nil entries. + // used contains the number of non-nil entries. It must be accessed + // atomically. It may be read atomically without holding mu (but not + // written). used int32 // descriptorTable holds descriptors. @@ -317,7 +319,7 @@ func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) { // GetFDs returns a list of valid fds. func (f *FDTable) GetFDs() []int32 { - fds := make([]int32, 0, f.used) + fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) f.forEach(func(fd int32, file *fs.File, flags FDFlags) { fds = append(fds, fd) }) diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index a5cf1f627..6a31dc044 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -5,7 +5,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "atomicptr_bucket", - out = "atomicptr_bucket.go", + out = "atomicptr_bucket_unsafe.go", package = "futex", suffix = "Bucket", template = "//third_party/gvsync:generic_atomicptr", @@ -29,7 +29,7 @@ go_template_instance( go_library( name = "futex", srcs = [ - "atomicptr_bucket.go", + "atomicptr_bucket_unsafe.go", "futex.go", "waiter_list.go", ], diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index a88bf3951..d60cd62c7 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -250,8 +250,20 @@ func (ns *PIDNamespace) allocateTID() (ThreadID, error) { } // Is it available? - _, ok := ns.tasks[tid] - if !ok { + tidInUse := func() bool { + if _, ok := ns.tasks[tid]; ok { + return true + } + if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok { + return true + } + if _, ok := ns.sessions[SessionID(tid)]; ok { + return true + } + return false + }() + + if !tidInUse { ns.last = tid return tid, nil } diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index 8d76e106e..405e00292 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -21,6 +21,7 @@ package kvm import ( "fmt" + "math" "sync/atomic" "syscall" "unsafe" @@ -134,7 +135,7 @@ func (c *vCPU) notify() { syscall.SYS_FUTEX, uintptr(unsafe.Pointer(&c.state)), linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG, - ^uintptr(0), // Number of waiters. + math.MaxInt32, // Number of waiters. 0, 0, 0) if errno != 0 { throw("futex wake error") diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 3577b5127..0feff8778 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -70,6 +70,14 @@ func (c *CPU) init() { c.tss.ist1Lo = uint32(stackAddr) c.tss.ist1Hi = uint32(stackAddr >> 32) + // Set the I/O bitmap base address beyond the last byte in the TSS + // to block access to the entire I/O address range. + // + // From section 18.5.2 "I/O Permission Bit Map" from Intel SDM vol1: + // I/O addresses not spanned by the map are treated as if they had set + // bits in the map. + c.tss.ioPerm = tssLimit + 1 + // Permanently set the kernel segments. c.registers.Cs = uint64(Kcode) c.registers.Ds = uint64(Kdata) diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 9d1bcfd41..69eff7373 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -867,6 +867,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return int32(v), nil + case linux.TCP_MAXSEG: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.MaxSegOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + case linux.TCP_KEEPIDLE: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument @@ -1219,6 +1231,14 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * v := usermem.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.QuickAckOption(v))) + case linux.TCP_MAXSEG: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MaxSegOption(v))) + case linux.TCP_KEEPIDLE: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go index 6d2b5d038..421f93dc4 100644 --- a/pkg/sentry/socket/epsocket/provider.go +++ b/pkg/sentry/socket/epsocket/provider.go @@ -40,42 +40,49 @@ type provider struct { } // getTransportProtocol figures out transport protocol. Currently only TCP, -// UDP, and ICMP are supported. -func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol int) (tcpip.TransportProtocolNumber, *syserr.Error) { +// UDP, and ICMP are supported. The bool return value is true when this socket +// is associated with a transport protocol. This is only false for SOCK_RAW, +// IPPROTO_IP sockets. +func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol int) (tcpip.TransportProtocolNumber, bool, *syserr.Error) { switch stype { case linux.SOCK_STREAM: if protocol != 0 && protocol != syscall.IPPROTO_TCP { - return 0, syserr.ErrInvalidArgument + return 0, true, syserr.ErrInvalidArgument } - return tcp.ProtocolNumber, nil + return tcp.ProtocolNumber, true, nil case linux.SOCK_DGRAM: switch protocol { case 0, syscall.IPPROTO_UDP: - return udp.ProtocolNumber, nil + return udp.ProtocolNumber, true, nil case syscall.IPPROTO_ICMP: - return header.ICMPv4ProtocolNumber, nil + return header.ICMPv4ProtocolNumber, true, nil case syscall.IPPROTO_ICMPV6: - return header.ICMPv6ProtocolNumber, nil + return header.ICMPv6ProtocolNumber, true, nil } case linux.SOCK_RAW: // Raw sockets require CAP_NET_RAW. creds := auth.CredentialsFromContext(ctx) if !creds.HasCapability(linux.CAP_NET_RAW) { - return 0, syserr.ErrPermissionDenied + return 0, true, syserr.ErrPermissionDenied } switch protocol { case syscall.IPPROTO_ICMP: - return header.ICMPv4ProtocolNumber, nil + return header.ICMPv4ProtocolNumber, true, nil case syscall.IPPROTO_UDP: - return header.UDPProtocolNumber, nil + return header.UDPProtocolNumber, true, nil case syscall.IPPROTO_TCP: - return header.TCPProtocolNumber, nil + return header.TCPProtocolNumber, true, nil + // IPPROTO_RAW signifies that the raw socket isn't assigned to + // a transport protocol. Users will be able to write packets' + // IP headers and won't receive anything. + case syscall.IPPROTO_RAW: + return tcpip.TransportProtocolNumber(0), false, nil } } - return 0, syserr.ErrProtocolNotSupported + return 0, true, syserr.ErrProtocolNotSupported } // Socket creates a new socket object for the AF_INET or AF_INET6 family. @@ -93,7 +100,7 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (* } // Figure out the transport protocol. - transProto, err := getTransportProtocol(t, stype, protocol) + transProto, associated, err := getTransportProtocol(t, stype, protocol) if err != nil { return nil, err } @@ -103,7 +110,7 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (* var e *tcpip.Error wq := &waiter.Queue{} if stype == linux.SOCK_RAW { - ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq) + ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated) } else { ep, e = eps.Stack.NewEndpoint(transProto, p.netProto, wq) } diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go index 1627a4f68..7eef19f74 100644 --- a/pkg/sentry/socket/epsocket/stack.go +++ b/pkg/sentry/socket/epsocket/stack.go @@ -138,3 +138,8 @@ func (s *Stack) TCPSACKEnabled() (bool, error) { func (s *Stack) SetTCPSACKEnabled(enabled bool) error { return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enabled))).ToError() } + +// Statistics implements inet.Stack.Statistics. +func (s *Stack) Statistics(stat interface{}, arg string) error { + return syserr.ErrEndpointOperation.ToError() +} diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index 11f94281c..cc1f66fa1 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -244,3 +244,8 @@ func (s *Stack) TCPSACKEnabled() (bool, error) { func (s *Stack) SetTCPSACKEnabled(enabled bool) error { return syserror.EACCES } + +// Statistics implements inet.Stack.Statistics. +func (s *Stack) Statistics(stat interface{}, arg string) error { + return syserror.EOPNOTSUPP +} diff --git a/pkg/sentry/socket/rpcinet/stack.go b/pkg/sentry/socket/rpcinet/stack.go index 3038f25a7..49bd3a220 100644 --- a/pkg/sentry/socket/rpcinet/stack.go +++ b/pkg/sentry/socket/rpcinet/stack.go @@ -133,3 +133,8 @@ func (s *Stack) TCPSACKEnabled() (bool, error) { func (s *Stack) SetTCPSACKEnabled(enabled bool) error { panic("rpcinet handles procfs directly this method should not be called") } + +// Statistics implements inet.Stack.Statistics. +func (s *Stack) Statistics(stat interface{}, arg string) error { + return syserr.ErrEndpointOperation.ToError() +} diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 637168714..eb262ecaf 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -68,6 +68,12 @@ func New(ctx context.Context, endpoint transport.Endpoint, stype linux.SockType) // NewWithDirent creates a new unix socket using an existing dirent. func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, stype linux.SockType, flags fs.FileFlags) *fs.File { + // You can create AF_UNIX, SOCK_RAW sockets. They're the same as + // SOCK_DGRAM and don't require CAP_NET_RAW. + if stype == linux.SOCK_RAW { + stype = linux.SOCK_DGRAM + } + s := SocketOperations{ ep: ep, stype: stype, @@ -639,7 +645,7 @@ func (*provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs // Create the endpoint and socket. var ep transport.Endpoint switch stype { - case linux.SOCK_DGRAM: + case linux.SOCK_DGRAM, linux.SOCK_RAW: ep = transport.NewConnectionless(t) case linux.SOCK_SEQPACKET, linux.SOCK_STREAM: ep = transport.NewConnectioned(t, stype, t.Kernel()) @@ -658,7 +664,7 @@ func (*provider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.F } switch stype { - case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET: + case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET, linux.SOCK_RAW: // Ok default: return nil, nil, syserr.ErrInvalidArgument diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go index 11850dd0f..87dcad18b 100644 --- a/pkg/sentry/syscalls/epoll.go +++ b/pkg/sentry/syscalls/epoll.go @@ -15,12 +15,12 @@ package syscalls import ( - "syscall" "time" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -44,21 +44,21 @@ func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask // Get epoll from the file descriptor. epollfile := t.GetFile(epfd) if epollfile == nil { - return syscall.EBADF + return syserror.EBADF } defer epollfile.DecRef() // Get the target file id. file := t.GetFile(fd) if file == nil { - return syscall.EBADF + return syserror.EBADF } defer file.DecRef() // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return syscall.EBADF + return syserror.EBADF } // Try to add the entry. @@ -70,21 +70,21 @@ func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, m // Get epoll from the file descriptor. epollfile := t.GetFile(epfd) if epollfile == nil { - return syscall.EBADF + return syserror.EBADF } defer epollfile.DecRef() // Get the target file id. file := t.GetFile(fd) if file == nil { - return syscall.EBADF + return syserror.EBADF } defer file.DecRef() // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return syscall.EBADF + return syserror.EBADF } // Try to update the entry. @@ -96,21 +96,21 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { // Get epoll from the file descriptor. epollfile := t.GetFile(epfd) if epollfile == nil { - return syscall.EBADF + return syserror.EBADF } defer epollfile.DecRef() // Get the target file id. file := t.GetFile(fd) if file == nil { - return syscall.EBADF + return syserror.EBADF } defer file.DecRef() // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return syscall.EBADF + return syserror.EBADF } // Try to remove the entry. @@ -122,14 +122,14 @@ func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]epoll.Event, e // Get epoll from the file descriptor. epollfile := t.GetFile(fd) if epollfile == nil { - return nil, syscall.EBADF + return nil, syserror.EBADF } defer epollfile.DecRef() // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return nil, syscall.EBADF + return nil, syserror.EBADF } // Try to read events and return right away if we got them or if the @@ -162,7 +162,7 @@ func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]epoll.Event, e } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { - if err == syscall.ETIMEDOUT { + if err == syserror.ETIMEDOUT { return nil, nil } diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index ac3905c5c..264301bfa 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -17,7 +17,6 @@ package linux import ( "io" "sync" - "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" @@ -54,7 +53,7 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // Do not consume the error and return it as EFBIG. // Simultaneously send a SIGXFSZ per setrlimit(2). t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) - return syscall.EFBIG + return syserror.EFBIG case syserror.ErrInterrupted: // The syscall was interrupted. Return nil if it completed // partially, otherwise return the error code that the syscall diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 7bbbb5008..51db2d8f7 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -16,8 +16,6 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -116,10 +114,10 @@ var AMD64 = &kernel.SyscallTable{ 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), 67: syscalls.Supported("shmdt", Shmdt), - 68: syscalls.ErrorWithEvent("msgget", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 69: syscalls.ErrorWithEvent("msgsnd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 70: syscalls.ErrorWithEvent("msgrcv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 71: syscalls.ErrorWithEvent("msgctl", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 70: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 71: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) 72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), 73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), 74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), @@ -170,8 +168,8 @@ var AMD64 = &kernel.SyscallTable{ 119: syscalls.Supported("setresgid", Setresgid), 120: syscalls.Supported("getresgid", Getresgid), 121: syscalls.Supported("getpgid", Getpgid), - 122: syscalls.ErrorWithEvent("setfsuid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 123: syscalls.ErrorWithEvent("setfsgid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 122: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 123: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) 124: syscalls.Supported("getsid", Getsid), 125: syscalls.Supported("capget", Capget), 126: syscalls.Supported("capset", Capset), @@ -182,12 +180,12 @@ var AMD64 = &kernel.SyscallTable{ 131: syscalls.Supported("sigaltstack", Sigaltstack), 132: syscalls.Supported("utime", Utime), 133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil), - 134: syscalls.Error("uselib", syscall.ENOSYS, "Obsolete", nil), - 135: syscalls.ErrorWithEvent("personality", syscall.EINVAL, "Unable to change personality.", nil), - 136: syscalls.ErrorWithEvent("ustat", syscall.ENOSYS, "Needs filesystem support.", nil), + 134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil), + 135: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), + 136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil), 137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), 138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), - 139: syscalls.ErrorWithEvent("sysfs", syscall.ENOSYS, "", []string{"gvisor.dev/issue/165"}), + 139: syscalls.ErrorWithEvent("sysfs", syserror.ENOSYS, "", []string{"gvisor.dev/issue/165"}), 140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), 141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), @@ -196,15 +194,15 @@ var AMD64 = &kernel.SyscallTable{ 145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), 146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), 147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), - 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syscall.EPERM, "", nil), + 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), 149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), - 154: syscalls.Error("modify_ldt", syscall.EPERM, "", nil), - 155: syscalls.Error("pivot_root", syscall.EPERM, "", nil), - 156: syscalls.Error("sysctl", syscall.EPERM, "Deprecated. Use /proc/sys instead.", nil), + 154: syscalls.Error("modify_ldt", syserror.EPERM, "", nil), + 155: syscalls.Error("pivot_root", syserror.EPERM, "", nil), + 156: syscalls.Error("sysctl", syserror.EPERM, "Deprecated. Use /proc/sys instead.", nil), 157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), 158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil), 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), @@ -225,50 +223,50 @@ var AMD64 = &kernel.SyscallTable{ 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil), 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), - 177: syscalls.Error("get_kernel_syms", syscall.ENOSYS, "Not supported in Linux > 2.6.", nil), - 178: syscalls.Error("query_module", syscall.ENOSYS, "Not supported in Linux > 2.6.", nil), + 177: syscalls.Error("get_kernel_syms", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), + 178: syscalls.Error("query_module", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations - 180: syscalls.Error("nfsservctl", syscall.ENOSYS, "Removed after Linux 3.1.", nil), - 181: syscalls.Error("getpmsg", syscall.ENOSYS, "Not implemented in Linux.", nil), - 182: syscalls.Error("putpmsg", syscall.ENOSYS, "Not implemented in Linux.", nil), - 183: syscalls.Error("afs_syscall", syscall.ENOSYS, "Not implemented in Linux.", nil), - 184: syscalls.Error("tuxcall", syscall.ENOSYS, "Not implemented in Linux.", nil), - 185: syscalls.Error("security", syscall.ENOSYS, "Not implemented in Linux.", nil), + 180: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), + 181: syscalls.Error("getpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), + 182: syscalls.Error("putpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), + 183: syscalls.Error("afs_syscall", syserror.ENOSYS, "Not implemented in Linux.", nil), + 184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil), + 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil), 186: syscalls.Supported("gettid", Gettid), - 187: syscalls.ErrorWithEvent("readahead", syscall.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341) - 188: syscalls.ErrorWithEvent("setxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 189: syscalls.ErrorWithEvent("lsetxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 190: syscalls.ErrorWithEvent("fsetxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 191: syscalls.ErrorWithEvent("getxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 192: syscalls.ErrorWithEvent("lgetxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 193: syscalls.ErrorWithEvent("fgetxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 194: syscalls.ErrorWithEvent("listxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 195: syscalls.ErrorWithEvent("llistxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 196: syscalls.ErrorWithEvent("flistxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 197: syscalls.ErrorWithEvent("removexattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 198: syscalls.ErrorWithEvent("lremovexattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 199: syscalls.ErrorWithEvent("fremovexattr", syscall.ENOTSUP, "Requires filesystem support.", nil), + 187: syscalls.ErrorWithEvent("readahead", syserror.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341) + 188: syscalls.Error("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 200: syscalls.Supported("tkill", Tkill), 201: syscalls.Supported("time", Time), 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), 203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), 204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), - 205: syscalls.Error("set_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 205: syscalls.Error("set_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), 206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 211: syscalls.Error("get_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 211: syscalls.Error("get_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), 213: syscalls.Supported("epoll_create", EpollCreate), - 214: syscalls.ErrorWithEvent("epoll_ctl_old", syscall.ENOSYS, "Deprecated.", nil), - 215: syscalls.ErrorWithEvent("epoll_wait_old", syscall.ENOSYS, "Deprecated.", nil), - 216: syscalls.ErrorWithEvent("remap_file_pages", syscall.ENOSYS, "Deprecated since Linux 3.16.", nil), + 214: syscalls.ErrorWithEvent("epoll_ctl_old", syserror.ENOSYS, "Deprecated.", nil), + 215: syscalls.ErrorWithEvent("epoll_wait_old", syserror.ENOSYS, "Deprecated.", nil), + 216: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), 217: syscalls.Supported("getdents64", Getdents64), 218: syscalls.Supported("set_tid_address", SetTidAddress), 219: syscalls.Supported("restart_syscall", RestartSyscall), - 220: syscalls.ErrorWithEvent("semtimedop", syscall.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920) + 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920) 221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), 222: syscalls.Supported("timer_create", TimerCreate), 223: syscalls.Supported("timer_settime", TimerSettime), @@ -284,21 +282,21 @@ var AMD64 = &kernel.SyscallTable{ 233: syscalls.Supported("epoll_ctl", EpollCtl), 234: syscalls.Supported("tgkill", Tgkill), 235: syscalls.Supported("utimes", Utimes), - 236: syscalls.Error("vserver", syscall.ENOSYS, "Not implemented by Linux", nil), + 236: syscalls.Error("vserver", syserror.ENOSYS, "Not implemented by Linux", nil), 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), 238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), 239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), - 240: syscalls.ErrorWithEvent("mq_open", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 241: syscalls.ErrorWithEvent("mq_unlink", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 242: syscalls.ErrorWithEvent("mq_timedsend", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 243: syscalls.ErrorWithEvent("mq_timedreceive", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 244: syscalls.ErrorWithEvent("mq_notify", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 245: syscalls.ErrorWithEvent("mq_getsetattr", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 240: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 241: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 242: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 243: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 244: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), 247: syscalls.Supported("waitid", Waitid), - 248: syscalls.Error("add_key", syscall.EACCES, "Not available to user.", nil), - 249: syscalls.Error("request_key", syscall.EACCES, "Not available to user.", nil), - 250: syscalls.Error("keyctl", syscall.EACCES, "Not available to user.", nil), + 248: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), + 249: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), + 250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil), @@ -321,23 +319,23 @@ var AMD64 = &kernel.SyscallTable{ 270: syscalls.Supported("pselect", Pselect), 271: syscalls.Supported("ppoll", Ppoll), 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), - 273: syscalls.Error("set_robust_list", syscall.ENOSYS, "Obsolete.", nil), - 274: syscalls.Error("get_robust_list", syscall.ENOSYS, "Obsolete.", nil), + 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), 275: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 276: syscalls.ErrorWithEvent("tee", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 276: syscalls.ErrorWithEvent("tee", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), - 278: syscalls.ErrorWithEvent("vmsplice", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) + 278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) 280: syscalls.Supported("utimensat", Utimensat), 281: syscalls.Supported("epoll_pwait", EpollPwait), - 282: syscalls.ErrorWithEvent("signalfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) + 282: syscalls.ErrorWithEvent("signalfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) 283: syscalls.Supported("timerfd_create", TimerfdCreate), 284: syscalls.Supported("eventfd", Eventfd), 285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), 286: syscalls.Supported("timerfd_settime", TimerfdSettime), 287: syscalls.Supported("timerfd_gettime", TimerfdGettime), 288: syscalls.Supported("accept4", Accept4), - 289: syscalls.ErrorWithEvent("signalfd4", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) + 289: syscalls.ErrorWithEvent("signalfd4", syserror.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) 290: syscalls.Supported("eventfd2", Eventfd2), 291: syscalls.Supported("epoll_create1", EpollCreate1), 292: syscalls.Supported("dup3", Dup3), @@ -346,37 +344,37 @@ var AMD64 = &kernel.SyscallTable{ 295: syscalls.Supported("preadv", Preadv), 296: syscalls.Supported("pwritev", Pwritev), 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), - 298: syscalls.ErrorWithEvent("perf_event_open", syscall.ENODEV, "No support for perf counters", nil), + 298: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), 299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), - 300: syscalls.ErrorWithEvent("fanotify_init", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 301: syscalls.ErrorWithEvent("fanotify_mark", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), 302: syscalls.Supported("prlimit64", Prlimit64), - 303: syscalls.Error("name_to_handle_at", syscall.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), - 304: syscalls.Error("open_by_handle_at", syscall.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 303: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 304: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), 306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), 307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), - 308: syscalls.ErrorWithEvent("setns", syscall.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) + 308: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) 309: syscalls.Supported("getcpu", Getcpu), - 310: syscalls.ErrorWithEvent("process_vm_readv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}), - 311: syscalls.ErrorWithEvent("process_vm_writev", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 311: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), - 314: syscalls.ErrorWithEvent("sched_setattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 315: syscalls.ErrorWithEvent("sched_getattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 316: syscalls.ErrorWithEvent("renameat2", syscall.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) + 314: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 315: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 316: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) 317: syscalls.Supported("seccomp", Seccomp), 318: syscalls.Supported("getrandom", GetRandom), 319: syscalls.Supported("memfd_create", MemfdCreate), 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil), 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), - 322: syscalls.ErrorWithEvent("execveat", syscall.ENOSYS, "", []string{"gvisor.dev/issue/265"}), // TODO(b/118901836) - 323: syscalls.ErrorWithEvent("userfaultfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) - 324: syscalls.ErrorWithEvent("membarrier", syscall.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(b/118904897) + 322: syscalls.ErrorWithEvent("execveat", syserror.ENOSYS, "", []string{"gvisor.dev/issue/265"}), // TODO(b/118901836) + 323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) + 324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(b/118904897) 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), // Syscalls after 325 are "backports" from versions of Linux after 4.4. - 326: syscalls.ErrorWithEvent("copy_file_range", syscall.ENOSYS, "", nil), + 326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), 327: syscalls.Supported("preadv2", Preadv2), 328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), 332: syscalls.Supported("statx", Statx), diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go index 00b7e7cf2..333013d8c 100644 --- a/pkg/sentry/syscalls/linux/sigset.go +++ b/pkg/sentry/syscalls/linux/sigset.go @@ -15,8 +15,6 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -27,7 +25,7 @@ import ( // STOP are clear. func copyInSigSet(t *kernel.Task, sigSetAddr usermem.Addr, size uint) (linux.SignalSet, error) { if size != linux.SignalSetSize { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } b := t.CopyScratchBuffer(8) if _, err := t.CopyInBytes(sigSetAddr, b); err != nil { diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index a0dd4d4e3..4a2b9f061 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -15,8 +15,7 @@ package linux import ( - "syscall" - + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" @@ -29,11 +28,11 @@ import ( // EpollCreate1 implements the epoll_create1(2) linux syscall. func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() - if flags & ^syscall.EPOLL_CLOEXEC != 0 { + if flags & ^linux.EPOLL_CLOEXEC != 0 { return 0, nil, syserror.EINVAL } - closeOnExec := flags&syscall.EPOLL_CLOEXEC != 0 + closeOnExec := flags&linux.EPOLL_CLOEXEC != 0 fd, err := syscalls.CreateEpoll(t, closeOnExec) if err != nil { return 0, nil, err @@ -69,37 +68,34 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc flags := epoll.EntryFlags(0) mask := waiter.EventMask(0) var data [2]int32 - if op != syscall.EPOLL_CTL_DEL { - var e syscall.EpollEvent + if op != linux.EPOLL_CTL_DEL { + var e linux.EpollEvent if _, err := t.CopyIn(eventAddr, &e); err != nil { return 0, nil, err } - if e.Events&syscall.EPOLLONESHOT != 0 { + if e.Events&linux.EPOLLONESHOT != 0 { flags |= epoll.OneShot } - // syscall.EPOLLET is incorrectly generated as a negative number - // in Go, see https://github.com/golang/go/issues/5328 for - // details. - if e.Events&-syscall.EPOLLET != 0 { + if e.Events&linux.EPOLLET != 0 { flags |= epoll.EdgeTriggered } mask = waiter.EventMaskFromLinux(e.Events) data[0] = e.Fd - data[1] = e.Pad + data[1] = e.Data } // Perform the requested operations. switch op { - case syscall.EPOLL_CTL_ADD: + case linux.EPOLL_CTL_ADD: // See fs/eventpoll.c. mask |= waiter.EventHUp | waiter.EventErr return 0, nil, syscalls.AddEpoll(t, epfd, fd, flags, mask, data) - case syscall.EPOLL_CTL_DEL: + case linux.EPOLL_CTL_DEL: return 0, nil, syscalls.RemoveEpoll(t, epfd, fd) - case syscall.EPOLL_CTL_MOD: + case linux.EPOLL_CTL_MOD: // Same as EPOLL_CTL_ADD. mask |= waiter.EventHUp | waiter.EventErr return 0, nil, syscalls.UpdateEpoll(t, epfd, fd, flags, mask, data) diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go index 5cfc2c3c8..8a34c4e99 100644 --- a/pkg/sentry/syscalls/linux/sys_eventfd.go +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -15,12 +15,11 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd" + "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -38,7 +37,7 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc allOps := uint(EFD_SEMAPHORE | EFD_NONBLOCK | EFD_CLOEXEC) if flags & ^allOps != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } event := eventfd.New(t, uint64(initVal), flags&EFD_SEMAPHORE != 0) diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index eb6f5648f..2e00a91ce 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -353,7 +353,8 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l // No more resolution necessary. defer resolved.DecRef() break - } else if err != fs.ErrResolveViaReadlink { + } + if err != fs.ErrResolveViaReadlink { return err } @@ -363,15 +364,17 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l } // Resolve the symlink to a path via Readlink. - path, err := found.Inode.Readlink(t) + var path string + path, err = found.Inode.Readlink(t) if err != nil { break } remainingTraversals-- // Get the new parent from the target path. + var newParent *fs.Dirent newParentPath, newName := fs.SplitLast(path) - newParent, err := t.MountNamespace().FindInode(t, root, parent, newParentPath, &remainingTraversals) + newParent, err = t.MountNamespace().FindInode(t, root, parent, newParentPath, &remainingTraversals) if err != nil { break } @@ -765,7 +768,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall defer file.DecRef() err := file.Flush(t) - return 0, nil, handleIOError(t, false /* partial */, err, syscall.EINTR, "close", file) + return 0, nil, handleIOError(t, false /* partial */, err, syserror.EINTR, "close", file) } // Dup implements linux syscall dup(2). @@ -907,7 +910,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Copy in the lock request. flockAddr := args[2].Pointer() - var flock syscall.Flock_t + var flock linux.Flock if _, err := t.CopyIn(flockAddr, &flock); err != nil { return 0, nil, err } @@ -956,11 +959,11 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // These locks don't block; execute the non-blocking operation using the inode's lock // context directly. switch flock.Type { - case syscall.F_RDLCK: + case linux.F_RDLCK: if !file.Flags().Read { return 0, nil, syserror.EBADF } - if cmd == syscall.F_SETLK { + if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) { return 0, nil, syserror.EAGAIN @@ -972,11 +975,11 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } } return 0, nil, nil - case syscall.F_WRLCK: + case linux.F_WRLCK: if !file.Flags().Write { return 0, nil, syserror.EBADF } - if cmd == syscall.F_SETLK { + if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) { return 0, nil, syserror.EAGAIN @@ -988,7 +991,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } } return 0, nil, nil - case syscall.F_UNLCK: + case linux.F_UNLCK: file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lockUniqueID, rng) return 0, nil, nil default: @@ -1473,7 +1476,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { t.SendSignal(&arch.SignalInfo{ - Signo: int32(syscall.SIGXFSZ), + Signo: int32(linux.SIGXFSZ), Code: arch.SignalInfoUser, }) return 0, nil, syserror.EFBIG @@ -1533,7 +1536,7 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { t.SendSignal(&arch.SignalInfo{ - Signo: int32(syscall.SIGXFSZ), + Signo: int32(linux.SIGXFSZ), Code: arch.SignalInfoUser, }) return 0, nil, syserror.EFBIG @@ -1823,7 +1826,7 @@ func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // No timesAddr argument will be interpreted as current system time. ts := defaultSetToSystemTimeSpec() if timesAddr != 0 { - var times syscall.Utimbuf + var times linux.Utime if _, err := t.CopyIn(timesAddr, ×); err != nil { return 0, nil, err } @@ -2015,7 +2018,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } if uint64(size) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { t.SendSignal(&arch.SignalInfo{ - Signo: int32(syscall.SIGXFSZ), + Signo: int32(linux.SIGXFSZ), Code: arch.SignalInfoUser, }) return 0, nil, syserror.EFBIG diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index 7a2beda23..63e2c5a5d 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -17,8 +17,8 @@ package linux import ( "bytes" "io" - "syscall" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -82,7 +82,7 @@ func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dir switch err := handleIOError(t, ds.Written() > 0, rerr, kernel.ERESTARTSYS, "getdents", dir); err { case nil: - dir.Dirent.InotifyEvent(syscall.IN_ACCESS, 0) + dir.Dirent.InotifyEvent(linux.IN_ACCESS, 0) return uintptr(ds.Written()), nil case io.EOF: return 0, nil @@ -146,21 +146,21 @@ func smallestDirent64(a arch.Context) uint { func toType(nodeType fs.InodeType) uint8 { switch nodeType { case fs.RegularFile, fs.SpecialFile: - return syscall.DT_REG + return linux.DT_REG case fs.Symlink: - return syscall.DT_LNK + return linux.DT_LNK case fs.Directory, fs.SpecialDirectory: - return syscall.DT_DIR + return linux.DT_DIR case fs.Pipe: - return syscall.DT_FIFO + return linux.DT_FIFO case fs.CharacterDevice: - return syscall.DT_CHR + return linux.DT_CHR case fs.BlockDevice: - return syscall.DT_BLK + return linux.DT_BLK case fs.Socket: - return syscall.DT_SOCK + return linux.DT_SOCK default: - return syscall.DT_UNKNOWN + return linux.DT_UNKNOWN } } diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go index a4f36e01f..b2c7b3444 100644 --- a/pkg/sentry/syscalls/linux/sys_inotify.go +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -15,13 +15,12 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" ) const allFlags = int(linux.IN_NONBLOCK | linux.IN_CLOEXEC) @@ -31,7 +30,7 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. flags := int(args[0].Int()) if flags&^allFlags != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } dirent := fs.NewDirent(t, anon.NewInode(t), "inotify") @@ -66,14 +65,14 @@ func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) { file := t.GetFile(fd) if file == nil { // Invalid fd. - return nil, nil, syscall.EBADF + return nil, nil, syserror.EBADF } ino, ok := file.FileOperations.(*fs.Inotify) if !ok { // Not an inotify fd. file.DecRef() - return nil, nil, syscall.EINVAL + return nil, nil, syserror.EINVAL } return ino, file, nil @@ -92,7 +91,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern // "EINVAL: The given event mask contains no valid events." // -- inotify_add_watch(2) if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } ino, file, err := fdToInotify(t, fd) @@ -109,7 +108,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern err = fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, dirent *fs.Dirent, _ uint) error { // "IN_ONLYDIR: Only watch pathname if it is a directory." -- inotify(7) if onlyDir := mask&linux.IN_ONLYDIR != 0; onlyDir && !fs.IsDir(dirent.Inode.StableAttr) { - return syscall.ENOTDIR + return syserror.ENOTDIR } // Copy out to the return frame. diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 576022dd5..418d7fa5f 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -15,20 +15,19 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" ) // pipe2 implements the actual system call with flags. func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize) @@ -49,7 +48,7 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { // The files are not closed in this case, the exact semantics // of this error case are not well defined, but they could have // already been observed by user space. - return 0, syscall.EFAULT + return 0, syserror.EFAULT } return 0, nil } diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 5329023e6..98db32d77 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -16,7 +16,6 @@ package linux import ( "fmt" - "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -24,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/syserror" ) // Prctl implements linux syscall prctl(2). @@ -36,7 +36,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_SET_PDEATHSIG: sig := linux.Signal(args[1].Int()) if sig != 0 && !sig.IsValid() { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } t.SetParentDeathSignal(sig) return 0, nil, nil @@ -67,7 +67,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall d = mm.UserDumpable default: // N.B. Userspace may not pass SUID_DUMP_ROOT. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } t.MemoryManager().SetDumpability(d) return 0, nil, nil @@ -88,7 +88,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } else if val == 1 { t.SetKeepCaps(true) } else { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, nil @@ -96,7 +96,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_SET_NAME: addr := args[1].Pointer() name, err := t.CopyInString(addr, linux.TASK_COMM_LEN-1) - if err != nil && err != syscall.ENAMETOOLONG { + if err != nil && err != syserror.ENAMETOOLONG { return 0, nil, err } t.SetName(name) @@ -116,7 +116,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_SET_MM: if !t.HasCapability(linux.CAP_SYS_RESOURCE) { - return 0, nil, syscall.EPERM + return 0, nil, syserror.EPERM } switch args[1].Int() { @@ -125,13 +125,13 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // They trying to set exe to a non-file? if !fs.IsFile(file.Dirent.Inode.StableAttr) { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } // Set the underlying executable. @@ -153,12 +153,12 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall t.Kernel().EmitUnimplementedEvent(t) fallthrough default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } case linux.PR_SET_NO_NEW_PRIVS: if args[1].Int() != 1 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // no_new_privs is assumed to always be set. See // kernel.Task.updateCredsForExec. @@ -166,14 +166,14 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_GET_NO_NEW_PRIVS: if args[1].Int() != 0 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 1, nil, nil case linux.PR_SET_SECCOMP: if args[1].Int() != linux.SECCOMP_MODE_FILTER { // Unsupported mode. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, seccomp(t, linux.SECCOMP_SET_MODE_FILTER, 0, args[2].Pointer()) @@ -184,7 +184,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_CAPBSET_READ: cp := linux.Capability(args[1].Uint64()) if !cp.Ok() { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } var rv uintptr if auth.CapabilitySetOf(cp)&t.Credentials().BoundingCaps != 0 { @@ -195,7 +195,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_CAPBSET_DROP: cp := linux.Capability(args[1].Uint64()) if !cp.Ok() { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, t.DropBoundingCapability(cp) @@ -220,7 +220,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall t.Kernel().EmitUnimplementedEvent(t) fallthrough default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go index 434bbb322..99f6993f5 100644 --- a/pkg/sentry/syscalls/linux/sys_sched.go +++ b/pkg/sentry/syscalls/linux/sys_sched.go @@ -15,11 +15,10 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -37,13 +36,13 @@ func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel pid := args[0].Int() param := args[1].Pointer() if param == 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if pid < 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { - return 0, nil, syscall.ESRCH + return 0, nil, syserror.ESRCH } r := SchedParam{schedPriority: onlyPriority} if _, err := t.CopyOut(param, r); err != nil { @@ -57,10 +56,10 @@ func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel func SchedGetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pid := args[0].Int() if pid < 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { - return 0, nil, syscall.ESRCH + return 0, nil, syserror.ESRCH } return onlyScheduler, nil, nil } @@ -71,20 +70,20 @@ func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ke policy := args[1].Int() param := args[2].Pointer() if pid < 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if policy != onlyScheduler { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { - return 0, nil, syscall.ESRCH + return 0, nil, syserror.ESRCH } var r SchedParam if _, err := t.CopyIn(param, &r); err != nil { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if r.schedPriority != onlyPriority { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go index 4885b5e40..18510ead8 100644 --- a/pkg/sentry/syscalls/linux/sys_seccomp.go +++ b/pkg/sentry/syscalls/linux/sys_seccomp.go @@ -15,13 +15,12 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" ) // userSockFprog is equivalent to Linux's struct sock_fprog on amd64. @@ -43,7 +42,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error { // We only support SECCOMP_SET_MODE_FILTER at the moment. if mode != linux.SECCOMP_SET_MODE_FILTER { // Unsupported mode. - return syscall.EINVAL + return syserror.EINVAL } tsync := flags&linux.SECCOMP_FILTER_FLAG_TSYNC != 0 @@ -51,7 +50,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error { // The only flag we support now is SECCOMP_FILTER_FLAG_TSYNC. if flags&^linux.SECCOMP_FILTER_FLAG_TSYNC != 0 { // Unsupported flag. - return syscall.EINVAL + return syserror.EINVAL } var fprog userSockFprog @@ -65,7 +64,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error { compiledFilter, err := bpf.Compile(filter) if err != nil { t.Debugf("Invalid seccomp-bpf filter: %v", err) - return syscall.EINVAL + return syserror.EINVAL } return t.AppendSyscallFilter(compiledFilter, tsync) diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index d1165a57a..195734257 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -15,7 +15,6 @@ package linux import ( - "syscall" "time" "gvisor.dev/gvisor/pkg/abi/linux" @@ -131,7 +130,7 @@ func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader // from the untrusted address space range. func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) { if addrlen > maxAddrLen { - return nil, syscall.EINVAL + return nil, syserror.EINVAL } addrBuf := make([]byte, addrlen) @@ -153,7 +152,7 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user } if int32(bufLen) < 0 { - return syscall.EINVAL + return syserror.EINVAL } // Write the length unconditionally. @@ -186,7 +185,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Create the new socket. @@ -218,7 +217,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } fileFlags := fs.SettableFileFlags{ @@ -261,14 +260,14 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Capture address and call syscall implementation. @@ -286,20 +285,20 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) { // Check that no unsupported flags are passed in. if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, syscall.EBADF + return 0, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, syscall.ENOTSOCK + return 0, syserror.ENOTSOCK } // Call the syscall implementation for this socket, then copy the @@ -314,7 +313,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f if peerRequested { // NOTE(magi): Linux does not give you an error if it can't // write the data back out so neither do we. - if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syscall.EINVAL { + if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syserror.EINVAL { return 0, err } } @@ -351,14 +350,14 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Capture address and call syscall implementation. @@ -378,14 +377,14 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Per Linux, the backlog is silently capped to reasonable values. @@ -407,21 +406,21 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Validate how, then call syscall implementation. switch how { case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR: default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, s.Shutdown(t, int(how)).ToError() @@ -438,14 +437,14 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Read the length if present. Reject negative values. @@ -456,7 +455,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } if optLen < 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } } @@ -522,21 +521,21 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } if optLen <= 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if optLen > maxOptLen { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } buf := t.CopyScratchBuffer(int(optLen)) if _, err := t.CopyIn(optValAddr, &buf); err != nil { @@ -560,14 +559,14 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Get the socket name and copy it to the caller. @@ -588,14 +587,14 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Get the socket peer name and copy it to the caller. @@ -615,25 +614,25 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if file.Flags().NonBlocking { @@ -663,25 +662,25 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } if file.Flags().NonBlocking { @@ -696,7 +695,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } if !ts.Valid() { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration()) haveDeadline = true @@ -716,7 +715,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { - return 0, nil, syscall.EFAULT + return 0, nil, syserror.EFAULT } var n uintptr if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil { @@ -726,7 +725,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Copy the received length to the caller. lp, ok := mp.AddLength(messageHeader64Len) if !ok { - return 0, nil, syscall.EFAULT + return 0, nil, syserror.EFAULT } if _, err = t.CopyOut(lp, uint32(n)); err != nil { break @@ -748,7 +747,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i } if msg.IovLen > linux.UIO_MAXIOV { - return 0, syscall.EMSGSIZE + return 0, syserror.EMSGSIZE } dst, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, @@ -759,7 +758,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i // FIXME(b/63594852): Pretend we have an empty error queue. if flags&linux.MSG_ERRQUEUE != 0 { - return 0, syscall.EAGAIN + return 0, syserror.EAGAIN } // Fast path when no control message nor name buffers are provided. @@ -784,7 +783,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i } if msg.ControlLen > maxControlLen { - return 0, syscall.ENOBUFS + return 0, syserror.ENOBUFS } n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) if e != nil { @@ -836,25 +835,25 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i // recvfrom and recv syscall handlers. func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) { if int(bufLen) < 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, syscall.EBADF + return 0, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, syscall.ENOTSOCK + return 0, syserror.ENOTSOCK } if file.Flags().NonBlocking { @@ -914,25 +913,25 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if file.Flags().NonBlocking { @@ -952,25 +951,25 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if file.Flags().NonBlocking { @@ -982,7 +981,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { - return 0, nil, syscall.EFAULT + return 0, nil, syserror.EFAULT } var n uintptr if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil { @@ -992,7 +991,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Copy the received length to the caller. lp, ok := mp.AddLength(messageHeader64Len) if !ok { - return 0, nil, syscall.EFAULT + return 0, nil, syserror.EFAULT } if _, err = t.CopyOut(lp, uint32(n)); err != nil { break @@ -1017,7 +1016,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme if msg.ControlLen > 0 { // Put an upper bound to prevent large allocations. if msg.ControlLen > maxControlLen { - return 0, syscall.ENOBUFS + return 0, syserror.ENOBUFS } controlData = make([]byte, msg.ControlLen) if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil { @@ -1037,7 +1036,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme // Read data then call the sendmsg implementation. if msg.IovLen > linux.UIO_MAXIOV { - return 0, syscall.EMSGSIZE + return 0, syserror.EMSGSIZE } src, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, @@ -1074,20 +1073,20 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) { bl := int(bufLen) if bl < 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, syscall.EBADF + return 0, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, syscall.ENOTSOCK + return 0, syserror.ENOTSOCK } if file.Flags().NonBlocking { diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 9e037bd7b..595eb9155 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -123,29 +123,29 @@ func ExitGroup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys func clone(t *kernel.Task, flags int, stack usermem.Addr, parentTID usermem.Addr, childTID usermem.Addr, tls usermem.Addr) (uintptr, *kernel.SyscallControl, error) { opts := kernel.CloneOptions{ SharingOptions: kernel.SharingOptions{ - NewAddressSpace: flags&syscall.CLONE_VM == 0, - NewSignalHandlers: flags&syscall.CLONE_SIGHAND == 0, - NewThreadGroup: flags&syscall.CLONE_THREAD == 0, + NewAddressSpace: flags&linux.CLONE_VM == 0, + NewSignalHandlers: flags&linux.CLONE_SIGHAND == 0, + NewThreadGroup: flags&linux.CLONE_THREAD == 0, TerminationSignal: linux.Signal(flags & exitSignalMask), - NewPIDNamespace: flags&syscall.CLONE_NEWPID == syscall.CLONE_NEWPID, - NewUserNamespace: flags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER, - NewNetworkNamespace: flags&syscall.CLONE_NEWNET == syscall.CLONE_NEWNET, - NewFiles: flags&syscall.CLONE_FILES == 0, - NewFSContext: flags&syscall.CLONE_FS == 0, - NewUTSNamespace: flags&syscall.CLONE_NEWUTS == syscall.CLONE_NEWUTS, - NewIPCNamespace: flags&syscall.CLONE_NEWIPC == syscall.CLONE_NEWIPC, + NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, + NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, + NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, + NewFiles: flags&linux.CLONE_FILES == 0, + NewFSContext: flags&linux.CLONE_FS == 0, + NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, + NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, }, Stack: stack, - SetTLS: flags&syscall.CLONE_SETTLS == syscall.CLONE_SETTLS, + SetTLS: flags&linux.CLONE_SETTLS == linux.CLONE_SETTLS, TLS: tls, - ChildClearTID: flags&syscall.CLONE_CHILD_CLEARTID == syscall.CLONE_CHILD_CLEARTID, - ChildSetTID: flags&syscall.CLONE_CHILD_SETTID == syscall.CLONE_CHILD_SETTID, + ChildClearTID: flags&linux.CLONE_CHILD_CLEARTID == linux.CLONE_CHILD_CLEARTID, + ChildSetTID: flags&linux.CLONE_CHILD_SETTID == linux.CLONE_CHILD_SETTID, ChildTID: childTID, - ParentSetTID: flags&syscall.CLONE_PARENT_SETTID == syscall.CLONE_PARENT_SETTID, + ParentSetTID: flags&linux.CLONE_PARENT_SETTID == linux.CLONE_PARENT_SETTID, ParentTID: parentTID, - Vfork: flags&syscall.CLONE_VFORK == syscall.CLONE_VFORK, - Untraced: flags&syscall.CLONE_UNTRACED == syscall.CLONE_UNTRACED, - InheritTracer: flags&syscall.CLONE_PTRACE == syscall.CLONE_PTRACE, + Vfork: flags&linux.CLONE_VFORK == linux.CLONE_VFORK, + Untraced: flags&linux.CLONE_UNTRACED == linux.CLONE_UNTRACED, + InheritTracer: flags&linux.CLONE_PTRACE == linux.CLONE_PTRACE, } ntid, ctrl, err := t.Clone(&opts) return uintptr(ntid), ctrl, err @@ -168,7 +168,7 @@ func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall func Fork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // "A call to fork() is equivalent to a call to clone(2) specifying flags // as just SIGCHLD." - fork(2) - return clone(t, int(syscall.SIGCHLD), 0, 0, 0, 0) + return clone(t, int(linux.SIGCHLD), 0, 0, 0, 0) } // Vfork implements Linux syscall vfork(2). @@ -178,7 +178,7 @@ func Vfork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // // CLONE_VM | CLONE_VFORK | SIGCHLD // """ - vfork(2) - return clone(t, syscall.CLONE_VM|syscall.CLONE_VFORK|int(syscall.SIGCHLD), 0, 0, 0, 0) + return clone(t, linux.CLONE_VM|linux.CLONE_VFORK|int(linux.SIGCHLD), 0, 0, 0, 0) } // parseCommonWaitOptions applies the options common to wait4 and waitid to @@ -193,7 +193,7 @@ func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error { wopts.NonCloneTasks = true wopts.CloneTasks = true default: - return syscall.EINVAL + return syserror.EINVAL } if options&linux.WCONTINUED != 0 { wopts.Events |= kernel.EventGroupContinue @@ -210,7 +210,7 @@ func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error { // wait4 waits for the given child process to exit. func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusageAddr usermem.Addr) (uintptr, error) { if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } wopts := kernel.WaitOptions{ Events: kernel.EventExit | kernel.EventTraceeStop, @@ -291,10 +291,10 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal rusageAddr := args[4].Pointer() if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } wopts := kernel.WaitOptions{ Events: kernel.EventTraceeStop, @@ -307,7 +307,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case linux.P_PGID: wopts.SpecificPGID = kernel.ProcessGroupID(id) default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if err := parseCommonWaitOptions(&wopts, options); err != nil { @@ -347,12 +347,12 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, nil } si := arch.SignalInfo{ - Signo: int32(syscall.SIGCHLD), + Signo: int32(linux.SIGCHLD), } si.SetPid(int32(wr.TID)) si.SetUid(int32(wr.UID)) // TODO(b/73541790): convert kernel.ExitStatus to functions and make - // WaitResult.Status a linux.WaitStatus + // WaitResult.Status a linux.WaitStatus. s := syscall.WaitStatus(wr.Status) switch { case s.Exited(): @@ -374,7 +374,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } case s.Continued(): si.Code = arch.CLD_CONTINUED - si.SetStatus(int32(syscall.SIGCONT)) + si.SetStatus(int32(linux.SIGCONT)) default: t.Warningf("waitid got incomprehensible wait status %d", s) } @@ -395,16 +395,16 @@ func SetTidAddress(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel func Unshare(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() opts := kernel.SharingOptions{ - NewAddressSpace: flags&syscall.CLONE_VM == syscall.CLONE_VM, - NewSignalHandlers: flags&syscall.CLONE_SIGHAND == syscall.CLONE_SIGHAND, - NewThreadGroup: flags&syscall.CLONE_THREAD == syscall.CLONE_THREAD, - NewPIDNamespace: flags&syscall.CLONE_NEWPID == syscall.CLONE_NEWPID, - NewUserNamespace: flags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER, - NewNetworkNamespace: flags&syscall.CLONE_NEWNET == syscall.CLONE_NEWNET, - NewFiles: flags&syscall.CLONE_FILES == syscall.CLONE_FILES, - NewFSContext: flags&syscall.CLONE_FS == syscall.CLONE_FS, - NewUTSNamespace: flags&syscall.CLONE_NEWUTS == syscall.CLONE_NEWUTS, - NewIPCNamespace: flags&syscall.CLONE_NEWIPC == syscall.CLONE_NEWIPC, + NewAddressSpace: flags&linux.CLONE_VM == linux.CLONE_VM, + NewSignalHandlers: flags&linux.CLONE_SIGHAND == linux.CLONE_SIGHAND, + NewThreadGroup: flags&linux.CLONE_THREAD == linux.CLONE_THREAD, + NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, + NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, + NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, + NewFiles: flags&linux.CLONE_FILES == linux.CLONE_FILES, + NewFSContext: flags&linux.CLONE_FS == linux.CLONE_FS, + NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, + NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, } // "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2) if opts.NewPIDNamespace { @@ -623,7 +623,7 @@ func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S who := kernel.ThreadID(args[1].Int()) switch which { - case syscall.PRIO_PROCESS: + case linux.PRIO_PROCESS: // Look for who, return ESRCH if not found. var task *kernel.Task if who == 0 { @@ -633,7 +633,7 @@ func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } if task == nil { - return 0, nil, syscall.ESRCH + return 0, nil, syserror.ESRCH } // From kernel/sys.c:getpriority: @@ -641,13 +641,13 @@ func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // will not return the normal nice-value, but a negated // value that has been offset by 20" return uintptr(20 - task.Niceness()), nil, nil - case syscall.PRIO_USER: + case linux.PRIO_USER: fallthrough - case syscall.PRIO_PGRP: + case linux.PRIO_PGRP: // PRIO_USER and PRIO_PGRP have no further implementation yet. return 0, nil, nil default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } } @@ -669,7 +669,7 @@ func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } switch which { - case syscall.PRIO_PROCESS: + case linux.PRIO_PROCESS: // Look for who, return ESRCH if not found. var task *kernel.Task if who == 0 { @@ -679,17 +679,17 @@ func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } if task == nil { - return 0, nil, syscall.ESRCH + return 0, nil, syserror.ESRCH } task.SetNiceness(niceval) - case syscall.PRIO_USER: + case linux.PRIO_USER: fallthrough - case syscall.PRIO_PGRP: + case linux.PRIO_PGRP: // PRIO_USER and PRIO_PGRP have no further implementation yet. return 0, nil, nil default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index ca5ccb7c3..d4134207b 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -15,13 +15,13 @@ package linux import ( - "syscall" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" ) const nsecPerSec = int64(time.Second) @@ -47,7 +47,7 @@ func copyItimerValIn(t *kernel.Task, addr usermem.Addr) (linux.ItimerVal, error) return itv, nil default: - return linux.ItimerVal{}, syscall.ENOSYS + return linux.ItimerVal{}, syserror.ENOSYS } } @@ -65,7 +65,7 @@ func copyItimerValOut(t *kernel.Task, addr usermem.Addr, itv *linux.ItimerVal) e _, err := t.CopyOut(addr, itv) return err default: - return syscall.ENOSYS + return syserror.ENOSYS } } diff --git a/pkg/sentry/syscalls/linux/sys_tls.go b/pkg/sentry/syscalls/linux/sys_tls.go index e3d1c6201..b3eb96a1c 100644 --- a/pkg/sentry/syscalls/linux/sys_tls.go +++ b/pkg/sentry/syscalls/linux/sys_tls.go @@ -17,11 +17,10 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" ) // ArchPrctl implements linux syscall arch_prctl(2). @@ -39,14 +38,14 @@ func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys case linux.ARCH_SET_FS: fsbase := args[1].Uint64() if !t.Arch().SetTLS(uintptr(fsbase)) { - return 0, nil, syscall.EPERM + return 0, nil, syserror.EPERM } case linux.ARCH_GET_GS, linux.ARCH_SET_GS: t.Kernel().EmitUnimplementedEvent(t) fallthrough default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go index 9ba0eba7a..4ff8f9234 100644 --- a/pkg/sentry/syscalls/linux/timespec.go +++ b/pkg/sentry/syscalls/linux/timespec.go @@ -15,7 +15,6 @@ package linux import ( - "syscall" "time" "gvisor.dev/gvisor/pkg/abi/linux" @@ -70,7 +69,7 @@ func copyTimevalIn(t *kernel.Task, addr usermem.Addr) (linux.Timeval, error) { tv.Usec = int64(usermem.ByteOrder.Uint64(in[8:])) return tv, nil default: - return linux.Timeval{}, syscall.ENOSYS + return linux.Timeval{}, syserror.ENOSYS } } @@ -84,7 +83,7 @@ func copyTimevalOut(t *kernel.Task, addr usermem.Addr, tv *linux.Timeval) error _, err := t.CopyOutBytes(addr, out) return err default: - return syscall.ENOSYS + return syserror.ENOSYS } } @@ -104,7 +103,7 @@ func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.D return 0, err } if !timespec.Valid() { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } timeout = time.Duration(timespec.ToNsecCapped()) } diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go index 94d52d5d5..f88055676 100644 --- a/pkg/sentry/syscalls/syscalls.go +++ b/pkg/sentry/syscalls/syscalls.go @@ -26,7 +26,6 @@ package syscalls import ( "fmt" - "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -56,7 +55,7 @@ func PartiallySupported(name string, fn kernel.SyscallFn, note string, urls []st } // Error returns a syscall handler that will always give the passed error. -func Error(name string, err syscall.Errno, note string, urls []string) kernel.Syscall { +func Error(name string, err error, note string, urls []string) kernel.Syscall { if note != "" { note = note + "; " } @@ -73,7 +72,7 @@ func Error(name string, err syscall.Errno, note string, urls []string) kernel.Sy // ErrorWithEvent gives a syscall function that sends an unimplemented // syscall event via the event channel and returns the passed error. -func ErrorWithEvent(name string, err syscall.Errno, note string, urls []string) kernel.Syscall { +func ErrorWithEvent(name string, err error, note string, urls []string) kernel.Syscall { if note != "" { note = note + "; " } diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index d2ede0353..8aa6a3017 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -6,7 +6,7 @@ load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( name = "seqatomic_parameters", - out = "seqatomic_parameters.go", + out = "seqatomic_parameters_unsafe.go", package = "time", suffix = "Parameters", template = "//third_party/gvsync:generic_seqatomic", @@ -27,7 +27,7 @@ go_library( "parameters.go", "sampler.go", "sampler_unsafe.go", - "seqatomic_parameters.go", + "seqatomic_parameters_unsafe.go", "tsc_amd64.s", "tsc_arm64.s", ], diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index 345653544..1987e89cc 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -48,6 +48,7 @@ var ( EMSGSIZE = error(syscall.EMSGSIZE) ENAMETOOLONG = error(syscall.ENAMETOOLONG) ENOATTR = ENODATA + ENOBUFS = error(syscall.ENOBUFS) ENODATA = error(syscall.ENODATA) ENODEV = error(syscall.ENODEV) ENOENT = error(syscall.ENOENT) @@ -59,6 +60,7 @@ var ( ENOSYS = error(syscall.ENOSYS) ENOTDIR = error(syscall.ENOTDIR) ENOTEMPTY = error(syscall.ENOTEMPTY) + ENOTSOCK = error(syscall.ENOTSOCK) ENOTSUP = error(syscall.ENOTSUP) ENOTTY = error(syscall.ENOTTY) ENXIO = error(syscall.ENXIO) diff --git a/pkg/tcpip/header/icmpv4.go b/pkg/tcpip/header/icmpv4.go index c081de61f..c52c0d851 100644 --- a/pkg/tcpip/header/icmpv4.go +++ b/pkg/tcpip/header/icmpv4.go @@ -24,15 +24,11 @@ import ( type ICMPv4 []byte const ( - // ICMPv4MinimumSize is the minimum size of a valid ICMP packet. - ICMPv4MinimumSize = 4 - - // ICMPv4EchoMinimumSize is the minimum size of a valid ICMP echo packet. - ICMPv4EchoMinimumSize = 6 + // ICMPv4PayloadOffset defines the start of ICMP payload. + ICMPv4PayloadOffset = 4 - // ICMPv4DstUnreachableMinimumSize is the minimum size of a valid ICMP - // destination unreachable packet. - ICMPv4DstUnreachableMinimumSize = ICMPv4MinimumSize + 4 + // ICMPv4MinimumSize is the minimum size of a valid ICMP packet. + ICMPv4MinimumSize = 8 // ICMPv4ProtocolNumber is the ICMP transport protocol number. ICMPv4ProtocolNumber tcpip.TransportProtocolNumber = 1 @@ -104,5 +100,5 @@ func (ICMPv4) SetDestinationPort(uint16) { // Payload implements Transport.Payload. func (b ICMPv4) Payload() []byte { - return b[ICMPv4MinimumSize:] + return b[ICMPv4PayloadOffset:] } diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go index 7da4c4845..94a3af289 100644 --- a/pkg/tcpip/header/ipv4.go +++ b/pkg/tcpip/header/ipv4.go @@ -85,6 +85,10 @@ const ( // units, the header cannot exceed 15*4 = 60 bytes. IPv4MaximumHeaderSize = 60 + // MinIPFragmentPayloadSize is the minimum number of payload bytes that + // the first fragment must carry when an IPv4 packet is fragmented. + MinIPFragmentPayloadSize = 8 + // IPv4AddressSize is the size, in bytes, of an IPv4 address. IPv4AddressSize = 4 @@ -268,6 +272,10 @@ func (b IPv4) IsValid(pktSize int) bool { return false } + if IPVersion(b) != IPv4Version { + return false + } + return true } diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go index 7163eaa36..95fe8bfc3 100644 --- a/pkg/tcpip/header/ipv6.go +++ b/pkg/tcpip/header/ipv6.go @@ -184,6 +184,10 @@ func (b IPv6) IsValid(pktSize int) bool { return false } + if IPVersion(b) != IPv6Version { + return false + } + return true } diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go index 1141443bb..82cfe785c 100644 --- a/pkg/tcpip/header/tcp.go +++ b/pkg/tcpip/header/tcp.go @@ -176,6 +176,21 @@ const ( // TCPProtocolNumber is TCP's transport protocol number. TCPProtocolNumber tcpip.TransportProtocolNumber = 6 + + // TCPMinimumMSS is the minimum acceptable value for MSS. This is the + // same as the value TCP_MIN_MSS defined net/tcp.h. + TCPMinimumMSS = IPv4MaximumHeaderSize + TCPHeaderMaximumSize + MinIPFragmentPayloadSize - IPv4MinimumSize - TCPMinimumSize + + // TCPMaximumMSS is the maximum acceptable value for MSS. + TCPMaximumMSS = 0xffff + + // TCPDefaultMSS is the MSS value that should be used if an MSS option + // is not received from the peer. It's also the value returned by + // TCP_MAXSEG option for a socket in an unconnected state. + // + // Per RFC 1122, page 85: "If an MSS option is not received at + // connection setup, TCP MUST assume a default send MSS of 536." + TCPDefaultMSS = 536 ) // SourcePort returns the "source port" field of the tcp header. @@ -306,7 +321,7 @@ func ParseSynOptions(opts []byte, isAck bool) TCPSynOptions { synOpts := TCPSynOptions{ // Per RFC 1122, page 85: "If an MSS option is not received at // connection setup, TCP MUST assume a default send MSS of 536." - MSS: 536, + MSS: TCPDefaultMSS, // If no window scale option is specified, WS in options is // returned as -1; this is because the absence of the option // indicates that the we cannot use window scaling on the diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index ca3d6c0bf..cb35635fc 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -83,6 +83,10 @@ func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, buffer.Prependable, buf return tcpip.ErrNotSupported } +func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error { + return tcpip.ErrNotSupported +} + func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { v := vv.First() h := header.ARP(v) diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index db65ee7cc..8ff428445 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -282,10 +282,10 @@ func TestIPv4ReceiveControl(t *testing.T) { {"Truncated (10 bytes missing)", 0, 0, header.ICMPv4FragmentationNeeded, stack.ControlPacketTooBig, mtu, 10}, {"Truncated (missing IPv4 header)", 0, 0, header.ICMPv4FragmentationNeeded, stack.ControlPacketTooBig, mtu, header.IPv4MinimumSize + 8}, {"Truncated (missing 'extra info')", 0, 0, header.ICMPv4FragmentationNeeded, stack.ControlPacketTooBig, mtu, 4 + header.IPv4MinimumSize + 8}, - {"Truncated (missing ICMP header)", 0, 0, header.ICMPv4FragmentationNeeded, stack.ControlPacketTooBig, mtu, header.ICMPv4DstUnreachableMinimumSize + header.IPv4MinimumSize + 8}, + {"Truncated (missing ICMP header)", 0, 0, header.ICMPv4FragmentationNeeded, stack.ControlPacketTooBig, mtu, header.ICMPv4MinimumSize + header.IPv4MinimumSize + 8}, {"Port unreachable", 1, 0, header.ICMPv4PortUnreachable, stack.ControlPortUnreachable, 0, 0}, {"Non-zero fragment offset", 0, 100, header.ICMPv4PortUnreachable, stack.ControlPortUnreachable, 0, 0}, - {"Zero-length packet", 0, 0, header.ICMPv4PortUnreachable, stack.ControlPortUnreachable, 0, 2*header.IPv4MinimumSize + header.ICMPv4DstUnreachableMinimumSize + 8}, + {"Zero-length packet", 0, 0, header.ICMPv4PortUnreachable, stack.ControlPortUnreachable, 0, 2*header.IPv4MinimumSize + header.ICMPv4MinimumSize + 8}, } r, err := buildIPv4Route(localIpv4Addr, "\x0a\x00\x00\xbb") if err != nil { @@ -301,7 +301,7 @@ func TestIPv4ReceiveControl(t *testing.T) { } defer ep.Close() - const dataOffset = header.IPv4MinimumSize*2 + header.ICMPv4MinimumSize + 4 + const dataOffset = header.IPv4MinimumSize*2 + header.ICMPv4MinimumSize view := buffer.NewView(dataOffset + 8) // Create the outer IPv4 header. @@ -319,10 +319,10 @@ func TestIPv4ReceiveControl(t *testing.T) { icmp := header.ICMPv4(view[header.IPv4MinimumSize:]) icmp.SetType(header.ICMPv4DstUnreachable) icmp.SetCode(c.code) - copy(view[header.IPv4MinimumSize+header.ICMPv4MinimumSize:], []byte{0xde, 0xad, 0xbe, 0xef}) + copy(view[header.IPv4MinimumSize+header.ICMPv4PayloadOffset:], []byte{0xde, 0xad, 0xbe, 0xef}) // Create the inner IPv4 header. - ip = header.IPv4(view[header.IPv4MinimumSize+header.ICMPv4MinimumSize+4:]) + ip = header.IPv4(view[header.IPv4MinimumSize+header.ICMPv4MinimumSize:]) ip.Encode(&header.IPv4Fields{ IHL: header.IPv4MinimumSize, TotalLength: 100, diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index bc7f1c42a..fbef6947d 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -68,10 +68,6 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V switch h.Type() { case header.ICMPv4Echo: received.Echo.Increment() - if len(v) < header.ICMPv4EchoMinimumSize { - received.Invalid.Increment() - return - } // Only send a reply if the checksum is valid. wantChecksum := h.Checksum() @@ -93,9 +89,9 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, netHeader, vv) vv := vv.Clone(nil) - vv.TrimFront(header.ICMPv4EchoMinimumSize) - hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv4EchoMinimumSize) - pkt := header.ICMPv4(hdr.Prepend(header.ICMPv4EchoMinimumSize)) + vv.TrimFront(header.ICMPv4MinimumSize) + hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv4MinimumSize) + pkt := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize)) copy(pkt, h) pkt.SetType(header.ICMPv4EchoReply) pkt.SetChecksum(^header.Checksum(pkt, header.ChecksumVV(vv, 0))) @@ -108,25 +104,19 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V case header.ICMPv4EchoReply: received.EchoReply.Increment() - if len(v) < header.ICMPv4EchoMinimumSize { - received.Invalid.Increment() - return - } + e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, netHeader, vv) case header.ICMPv4DstUnreachable: received.DstUnreachable.Increment() - if len(v) < header.ICMPv4DstUnreachableMinimumSize { - received.Invalid.Increment() - return - } - vv.TrimFront(header.ICMPv4DstUnreachableMinimumSize) + + vv.TrimFront(header.ICMPv4MinimumSize) switch h.Code() { case header.ICMPv4PortUnreachable: e.handleControl(stack.ControlPortUnreachable, 0, vv) case header.ICMPv4FragmentationNeeded: - mtu := uint32(binary.BigEndian.Uint16(v[header.ICMPv4DstUnreachableMinimumSize-2:])) + mtu := uint32(binary.BigEndian.Uint16(v[header.ICMPv4PayloadOffset+2:])) e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), vv) } diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index 1e3a7425a..e44a73d96 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -232,6 +232,55 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen return nil } +// WriteHeaderIncludedPacket writes a packet already containing a network +// header through the given route. +func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error { + // The packet already has an IP header, but there are a few required + // checks. + ip := header.IPv4(payload.First()) + if !ip.IsValid(payload.Size()) { + return tcpip.ErrInvalidOptionValue + } + + // Always set the total length. + ip.SetTotalLength(uint16(payload.Size())) + + // Set the source address when zero. + if ip.SourceAddress() == tcpip.Address(([]byte{0, 0, 0, 0})) { + ip.SetSourceAddress(r.LocalAddress) + } + + // Set the destination. If the packet already included a destination, + // it will be part of the route. + ip.SetDestinationAddress(r.RemoteAddress) + + // Set the packet ID when zero. + if ip.ID() == 0 { + id := uint32(0) + if payload.Size() > header.IPv4MaximumHeaderSize+8 { + // Packets of 68 bytes or less are required by RFC 791 to not be + // fragmented, so we only assign ids to larger packets. + id = atomic.AddUint32(&ids[hashRoute(r, 0 /* protocol */)%buckets], 1) + } + ip.SetID(uint16(id)) + } + + // Always set the checksum. + ip.SetChecksum(0) + ip.SetChecksum(^ip.CalculateChecksum()) + + if loop&stack.PacketLoop != 0 { + e.HandlePacket(r, payload) + } + if loop&stack.PacketOut == 0 { + return nil + } + + hdr := buffer.NewPrependableFromView(payload.ToView()) + r.Stats().IP.PacketsSent.Increment() + return e.linkEP.WritePacket(r, nil /* gso */, hdr, buffer.VectorisedView{}, ProtocolNumber) +} + // HandlePacket is called by the link layer when new ipv4 packets arrive for // this endpoint. func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index 27367d6c5..e3e8739fd 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -120,6 +120,13 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen return e.linkEP.WritePacket(r, gso, hdr, payload, ProtocolNumber) } +// WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet +// supported by IPv6. +func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error { + // TODO(b/119580726): Support IPv6 header-included packets. + return tcpip.ErrNotSupported +} + // HandlePacket is called by the link layer when new ipv6 packets arrive for // this endpoint. func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index 0ecaa0833..462265281 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -174,6 +174,10 @@ type NetworkEndpoint interface { // protocol. WritePacket(r *Route, gso *GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8, loop PacketLooping) *tcpip.Error + // WriteHeaderIncludedPacket writes a packet that includes a network + // header to the given destination address. + WriteHeaderIncludedPacket(r *Route, payload buffer.VectorisedView, loop PacketLooping) *tcpip.Error + // ID returns the network protocol endpoint ID. ID() *NetworkEndpointID @@ -357,10 +361,19 @@ type TransportProtocolFactory func() TransportProtocol // instantiate network protocols. type NetworkProtocolFactory func() NetworkProtocol +// UnassociatedEndpointFactory produces endpoints for writing packets not +// associated with a particular transport protocol. Such endpoints can be used +// to write arbitrary packets that include the IP header. +type UnassociatedEndpointFactory interface { + NewUnassociatedRawEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) +} + var ( transportProtocols = make(map[string]TransportProtocolFactory) networkProtocols = make(map[string]NetworkProtocolFactory) + unassociatedFactory UnassociatedEndpointFactory + linkEPMu sync.RWMutex nextLinkEndpointID tcpip.LinkEndpointID = 1 linkEndpoints = make(map[tcpip.LinkEndpointID]LinkEndpoint) @@ -380,6 +393,13 @@ func RegisterNetworkProtocolFactory(name string, p NetworkProtocolFactory) { networkProtocols[name] = p } +// RegisterUnassociatedFactory registers a factory to produce endpoints not +// associated with any particular transport protocol. This function is intended +// to be called by init() functions of the protocols. +func RegisterUnassociatedFactory(f UnassociatedEndpointFactory) { + unassociatedFactory = f +} + // RegisterLinkEndpoint register a link-layer protocol endpoint and returns an // ID that can be used to refer to it. func RegisterLinkEndpoint(linkEP LinkEndpoint) tcpip.LinkEndpointID { diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index 36d7b6ac7..391ab4344 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -163,6 +163,18 @@ func (r *Route) WritePacket(gso *GSO, hdr buffer.Prependable, payload buffer.Vec return err } +// WriteHeaderIncludedPacket writes a packet already containing a network +// header through the given route. +func (r *Route) WriteHeaderIncludedPacket(payload buffer.VectorisedView) *tcpip.Error { + if err := r.ref.ep.WriteHeaderIncludedPacket(r, payload, r.loop); err != nil { + r.Stats().IP.OutgoingPacketErrors.Increment() + return err + } + r.ref.nic.stats.Tx.Packets.Increment() + r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(payload.Size())) + return nil +} + // DefaultTTL returns the default TTL of the underlying network endpoint. func (r *Route) DefaultTTL() uint8 { return r.ref.ep.DefaultTTL() diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 2d7f56ca9..3e8fb2a6c 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -340,6 +340,8 @@ type Stack struct { networkProtocols map[tcpip.NetworkProtocolNumber]NetworkProtocol linkAddrResolvers map[tcpip.NetworkProtocolNumber]LinkAddressResolver + unassociatedFactory UnassociatedEndpointFactory + demux *transportDemuxer stats tcpip.Stats @@ -442,6 +444,8 @@ func New(network []string, transport []string, opts Options) *Stack { } } + s.unassociatedFactory = unassociatedFactory + // Create the global transport demuxer. s.demux = newTransportDemuxer(s) @@ -574,11 +578,15 @@ func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcp // NewRawEndpoint creates a new raw transport layer endpoint of the given // protocol. Raw endpoints receive all traffic for a given protocol regardless // of address. -func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { +func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, *tcpip.Error) { if !s.raw { return nil, tcpip.ErrNotPermitted } + if !associated { + return s.unassociatedFactory.NewUnassociatedRawEndpoint(s, network, transport, waiterQueue) + } + t, ok := s.transportProtocols[transport] if !ok { return nil, tcpip.ErrUnknownProtocol diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index 69884af03..959071dbe 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -137,6 +137,10 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr bu return f.linkEP.WritePacket(r, gso, hdr, payload, fakeNetNumber) } +func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error { + return tcpip.ErrNotSupported +} + func (*fakeNetworkEndpoint) Close() {} type fakeNetGoodOption bool diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index c61f96fb0..c4076666a 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -496,6 +496,10 @@ type AvailableCongestionControlOption string // buffer moderation. type ModerateReceiveBufferOption bool +// MaxSegOption is used by SetSockOpt/GetSockOpt to set/get the current +// Maximum Segment Size(MSS) value as specified using the TCP_MAXSEG option. +type MaxSegOption int + // MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default // TTL value for multicast messages. The default is 1. type MulticastTTLOption uint8 diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index ab9e80747..a80ceafd0 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -291,7 +291,7 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-c switch e.netProto { case header.IPv4ProtocolNumber: - err = e.send4(route, v) + err = send4(route, e.id.LocalPort, v) case header.IPv6ProtocolNumber: err = send6(route, e.id.LocalPort, v) @@ -352,20 +352,20 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { } } -func (e *endpoint) send4(r *stack.Route, data buffer.View) *tcpip.Error { - if len(data) < header.ICMPv4EchoMinimumSize { +func send4(r *stack.Route, ident uint16, data buffer.View) *tcpip.Error { + if len(data) < header.ICMPv4MinimumSize { return tcpip.ErrInvalidEndpointState } // Set the ident to the user-specified port. Sequence number should // already be set by the user. - binary.BigEndian.PutUint16(data[header.ICMPv4MinimumSize:], e.id.LocalPort) + binary.BigEndian.PutUint16(data[header.ICMPv4PayloadOffset:], ident) - hdr := buffer.NewPrependable(header.ICMPv4EchoMinimumSize + int(r.MaxHeaderLength())) + hdr := buffer.NewPrependable(header.ICMPv4MinimumSize + int(r.MaxHeaderLength())) - icmpv4 := header.ICMPv4(hdr.Prepend(header.ICMPv4EchoMinimumSize)) + icmpv4 := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize)) copy(icmpv4, data) - data = data[header.ICMPv4EchoMinimumSize:] + data = data[header.ICMPv4MinimumSize:] // Linux performs these basic checks. if icmpv4.Type() != header.ICMPv4Echo || icmpv4.Code() != 0 { diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go index c89538131..7fdba5d56 100644 --- a/pkg/tcpip/transport/icmp/protocol.go +++ b/pkg/tcpip/transport/icmp/protocol.go @@ -90,19 +90,18 @@ func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProt func (p *protocol) MinimumPacketSize() int { switch p.number { case ProtocolNumber4: - return header.ICMPv4EchoMinimumSize + return header.ICMPv4MinimumSize case ProtocolNumber6: return header.ICMPv6EchoMinimumSize } panic(fmt.Sprint("unknown protocol number: ", p.number)) } -// ParsePorts returns the source and destination ports stored in the given icmp -// packet. +// ParsePorts in case of ICMP sets src to 0, dst to ICMP ID, and err to nil. func (p *protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) { switch p.number { case ProtocolNumber4: - return 0, binary.BigEndian.Uint16(v[header.ICMPv4MinimumSize:]), nil + return 0, binary.BigEndian.Uint16(v[header.ICMPv4PayloadOffset:]), nil case ProtocolNumber6: return 0, binary.BigEndian.Uint16(v[header.ICMPv6MinimumSize:]), nil } diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD index 34a14bf7f..bc4b255b4 100644 --- a/pkg/tcpip/transport/raw/BUILD +++ b/pkg/tcpip/transport/raw/BUILD @@ -21,6 +21,7 @@ go_library( "endpoint.go", "endpoint_state.go", "packet_list.go", + "protocol.go", ], importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/raw", imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"], diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index 42aded77f..a29587658 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -67,6 +67,7 @@ type endpoint struct { netProto tcpip.NetworkProtocolNumber transProto tcpip.TransportProtocolNumber waiterQueue *waiter.Queue + associated bool // The following fields are used to manage the receive queue and are // protected by rcvMu. @@ -97,8 +98,12 @@ type endpoint struct { } // NewEndpoint returns a raw endpoint for the given protocols. -// TODO(b/129292371): IP_HDRINCL, IPPROTO_RAW, and AF_PACKET. +// TODO(b/129292371): IP_HDRINCL and AF_PACKET. func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + return newEndpoint(stack, netProto, transProto, waiterQueue, true /* associated */) +} + +func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, *tcpip.Error) { if netProto != header.IPv4ProtocolNumber { return nil, tcpip.ErrUnknownProtocol } @@ -110,6 +115,16 @@ func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, trans waiterQueue: waiterQueue, rcvBufSizeMax: 32 * 1024, sndBufSize: 32 * 1024, + associated: associated, + } + + // Unassociated endpoints are write-only and users call Write() with IP + // headers included. Because they're write-only, We don't need to + // register with the stack. + if !associated { + ep.rcvBufSizeMax = 0 + ep.waiterQueue = nil + return ep, nil } if err := ep.stack.RegisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep); err != nil { @@ -124,7 +139,7 @@ func (ep *endpoint) Close() { ep.mu.Lock() defer ep.mu.Unlock() - if ep.closed { + if ep.closed || !ep.associated { return } @@ -142,8 +157,11 @@ func (ep *endpoint) Close() { if ep.connected { ep.route.Release() + ep.connected = false } + ep.closed = true + ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) } @@ -152,6 +170,10 @@ func (ep *endpoint) ModerateRecvBuf(copied int) {} // Read implements tcpip.Endpoint.Read. func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { + if !ep.associated { + return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidOptionValue + } + ep.rcvMu.Lock() // If there's no data to read, return that read would block or that the @@ -192,6 +214,33 @@ func (ep *endpoint) Write(payload tcpip.Payload, opts tcpip.WriteOptions) (uintp return 0, nil, tcpip.ErrInvalidEndpointState } + payloadBytes, err := payload.Get(payload.Size()) + if err != nil { + ep.mu.RUnlock() + return 0, nil, err + } + + // If this is an unassociated socket and callee provided a nonzero + // destination address, route using that address. + if !ep.associated { + ip := header.IPv4(payloadBytes) + if !ip.IsValid(payload.Size()) { + ep.mu.RUnlock() + return 0, nil, tcpip.ErrInvalidOptionValue + } + dstAddr := ip.DestinationAddress() + // Update dstAddr with the address in the IP header, unless + // opts.To is set (e.g. if sendto specifies a specific + // address). + if dstAddr != tcpip.Address([]byte{0, 0, 0, 0}) && opts.To == nil { + opts.To = &tcpip.FullAddress{ + NIC: 0, // NIC is unset. + Addr: dstAddr, // The address from the payload. + Port: 0, // There are no ports here. + } + } + } + // Did the user caller provide a destination? If not, use the connected // destination. if opts.To == nil { @@ -216,12 +265,12 @@ func (ep *endpoint) Write(payload tcpip.Payload, opts tcpip.WriteOptions) (uintp return 0, nil, tcpip.ErrInvalidEndpointState } - n, ch, err := ep.finishWrite(payload, savedRoute) + n, ch, err := ep.finishWrite(payloadBytes, savedRoute) ep.mu.Unlock() return n, ch, err } - n, ch, err := ep.finishWrite(payload, &ep.route) + n, ch, err := ep.finishWrite(payloadBytes, &ep.route) ep.mu.RUnlock() return n, ch, err } @@ -248,7 +297,7 @@ func (ep *endpoint) Write(payload tcpip.Payload, opts tcpip.WriteOptions) (uintp return 0, nil, err } - n, ch, err := ep.finishWrite(payload, &route) + n, ch, err := ep.finishWrite(payloadBytes, &route) route.Release() ep.mu.RUnlock() return n, ch, err @@ -256,7 +305,7 @@ func (ep *endpoint) Write(payload tcpip.Payload, opts tcpip.WriteOptions) (uintp // finishWrite writes the payload to a route. It resolves the route if // necessary. It's really just a helper to make defer unnecessary in Write. -func (ep *endpoint) finishWrite(payload tcpip.Payload, route *stack.Route) (uintptr, <-chan struct{}, *tcpip.Error) { +func (ep *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (uintptr, <-chan struct{}, *tcpip.Error) { // We may need to resolve the route (match a link layer address to the // network address). If that requires blocking (e.g. to use ARP), // return a channel on which the caller can wait. @@ -269,13 +318,14 @@ func (ep *endpoint) finishWrite(payload tcpip.Payload, route *stack.Route) (uint } } - payloadBytes, err := payload.Get(payload.Size()) - if err != nil { - return 0, nil, err - } - switch ep.netProto { case header.IPv4ProtocolNumber: + if !ep.associated { + if err := route.WriteHeaderIncludedPacket(buffer.View(payloadBytes).ToVectorisedView()); err != nil { + return 0, nil, err + } + break + } hdr := buffer.NewPrependable(len(payloadBytes) + int(route.MaxHeaderLength())) if err := route.WritePacket(nil /* gso */, hdr, buffer.View(payloadBytes).ToVectorisedView(), ep.transProto, route.DefaultTTL()); err != nil { return 0, nil, err @@ -335,15 +385,17 @@ func (ep *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { } defer route.Release() - // Re-register the endpoint with the appropriate NIC. - if err := ep.stack.RegisterRawTransportEndpoint(addr.NIC, ep.netProto, ep.transProto, ep); err != nil { - return err + if ep.associated { + // Re-register the endpoint with the appropriate NIC. + if err := ep.stack.RegisterRawTransportEndpoint(addr.NIC, ep.netProto, ep.transProto, ep); err != nil { + return err + } + ep.stack.UnregisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep) + ep.registeredNIC = nic } - ep.stack.UnregisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep) - // Save the route and NIC we've connected via. + // Save the route we've connected via. ep.route = route.Clone() - ep.registeredNIC = nic ep.connected = true return nil @@ -386,14 +438,16 @@ func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error { return tcpip.ErrBadLocalAddress } - // Re-register the endpoint with the appropriate NIC. - if err := ep.stack.RegisterRawTransportEndpoint(addr.NIC, ep.netProto, ep.transProto, ep); err != nil { - return err + if ep.associated { + // Re-register the endpoint with the appropriate NIC. + if err := ep.stack.RegisterRawTransportEndpoint(addr.NIC, ep.netProto, ep.transProto, ep); err != nil { + return err + } + ep.stack.UnregisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep) + ep.registeredNIC = addr.NIC + ep.boundNIC = addr.NIC } - ep.stack.UnregisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep) - ep.registeredNIC = addr.NIC - ep.boundNIC = addr.NIC ep.boundAddr = addr.Addr ep.bound = true diff --git a/pkg/tcpip/transport/raw/protocol.go b/pkg/tcpip/transport/raw/protocol.go new file mode 100644 index 000000000..783c21e6b --- /dev/null +++ b/pkg/tcpip/transport/raw/protocol.go @@ -0,0 +1,32 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package raw + +import ( + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/waiter" +) + +type factory struct{} + +// NewUnassociatedRawEndpoint implements stack.UnassociatedEndpointFactory. +func (factory) NewUnassociatedRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + return newEndpoint(stack, netProto, transProto, waiterQueue, false /* associated */) +} + +func init() { + stack.RegisterUnassociatedFactory(factory{}) +} diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index cb40fea94..beb90afb5 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -117,6 +117,7 @@ const ( notifyDrain notifyReset notifyKeepaliveChanged + notifyMSSChanged ) // SACKInfo holds TCP SACK related information for a given endpoint. @@ -218,8 +219,6 @@ type endpoint struct { mu sync.RWMutex `state:"nosave"` id stack.TransportEndpointID - // state endpointState `state:".(endpointState)"` - // pState ProtocolState state EndpointState `state:".(EndpointState)"` isPortReserved bool `state:"manual"` @@ -313,6 +312,10 @@ type endpoint struct { // in SYN-RCVD state. synRcvdCount int + // userMSS if non-zero is the MSS value explicitly set by the user + // for this endpoint using the TCP_MAXSEG setsockopt. + userMSS int + // The following fields are used to manage the send buffer. When // segments are ready to be sent, they are added to sndQueue and the // protocol goroutine is signaled via sndWaker. @@ -917,6 +920,17 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { } return nil + case tcpip.MaxSegOption: + userMSS := v + if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS { + return tcpip.ErrInvalidOptionValue + } + e.mu.Lock() + e.userMSS = int(userMSS) + e.mu.Unlock() + e.notifyProtocolGoroutine(notifyMSSChanged) + return nil + case tcpip.ReceiveBufferSizeOption: // Make sure the receive buffer size is within the min and max // allowed. @@ -1096,6 +1110,14 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { e.lastErrorMu.Unlock() return err + case *tcpip.MaxSegOption: + // This is just stubbed out. Linux never returns the user_mss + // value as it either returns the defaultMSS or returns the + // actual current MSS. Netstack just returns the defaultMSS + // always for now. + *o = header.TCPDefaultMSS + return nil + case *tcpip.SendBufferSizeOption: e.sndBufMu.Lock() *o = tcpip.SendBufferSizeOption(e.sndBufSize) diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index 630dd7925..bcc0f3e28 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -271,7 +271,7 @@ func (c *Context) GetPacketNonBlocking() []byte { // SendICMPPacket builds and sends an ICMPv4 packet via the link layer endpoint. func (c *Context) SendICMPPacket(typ header.ICMPv4Type, code uint8, p1, p2 []byte, maxTotalSize int) { // Allocate a buffer data and headers. - buf := buffer.NewView(header.IPv4MinimumSize + header.ICMPv4MinimumSize + len(p1) + len(p2)) + buf := buffer.NewView(header.IPv4MinimumSize + header.ICMPv4PayloadOffset + len(p1) + len(p2)) if len(buf) > maxTotalSize { buf = buf[:maxTotalSize] } @@ -291,8 +291,8 @@ func (c *Context) SendICMPPacket(typ header.ICMPv4Type, code uint8, p1, p2 []byt icmp.SetType(typ) icmp.SetCode(code) - copy(icmp[header.ICMPv4MinimumSize:], p1) - copy(icmp[header.ICMPv4MinimumSize+len(p1):], p2) + copy(icmp[header.ICMPv4PayloadOffset:], p1) + copy(icmp[header.ICMPv4PayloadOffset+len(p1):], p2) // Inject packet. c.linkEP.Inject(ipv4.ProtocolNumber, buf.ToVectorisedView()) |