diff options
Diffstat (limited to 'pkg')
302 files changed, 12469 insertions, 3862 deletions
diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 6960add0c..2061b38c0 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -1,4 +1,4 @@ -# Package linux contains the constants and types needed to inferface with a +# Package linux contains the constants and types needed to interface with a # Linux kernel. It should be used instead of syscall or golang.org/x/sys/unix # when the host OS may not be Linux. @@ -10,13 +10,13 @@ go_library( name = "linux", srcs = [ "aio.go", - "ashmem.go", "audit.go", - "binder.go", "bpf.go", "capability.go", + "clone.go", "dev.go", "elf.go", + "epoll.go", "errors.go", "eventfd.go", "exec.go", diff --git a/pkg/abi/linux/clone.go b/pkg/abi/linux/clone.go new file mode 100644 index 000000000..c2cbfca5e --- /dev/null +++ b/pkg/abi/linux/clone.go @@ -0,0 +1,41 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// Clone constants per clone(2). +const ( + CLONE_VM = 0x100 + CLONE_FS = 0x200 + CLONE_FILES = 0x400 + CLONE_SIGHAND = 0x800 + CLONE_PARENT = 0x8000 + CLONE_PTRACE = 0x2000 + CLONE_VFORK = 0x4000 + CLONE_THREAD = 0x10000 + CLONE_NEWNS = 0x20000 + CLONE_SYSVSEM = 0x40000 + CLONE_SETTLS = 0x80000 + CLONE_PARENT_SETTID = 0x100000 + CLONE_CHILD_CLEARTID = 0x200000 + CLONE_DETACHED = 0x400000 + CLONE_UNTRACED = 0x800000 + CLONE_CHILD_SETTID = 0x1000000 + CLONE_NEWUTS = 0x4000000 + CLONE_NEWIPC = 0x8000000 + CLONE_NEWUSER = 0x10000000 + CLONE_NEWPID = 0x20000000 + CLONE_NEWNET = 0x40000000 + CLONE_IO = 0x80000000 +) diff --git a/pkg/abi/linux/epoll.go b/pkg/abi/linux/epoll.go new file mode 100644 index 000000000..72083b604 --- /dev/null +++ b/pkg/abi/linux/epoll.go @@ -0,0 +1,56 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// EpollEvent is equivalent to struct epoll_event from epoll(2). +type EpollEvent struct { + Events uint32 + Fd int32 + Data int32 +} + +// Event masks. +const ( + EPOLLIN = 0x1 + EPOLLPRI = 0x2 + EPOLLOUT = 0x4 + EPOLLERR = 0x8 + EPOLLHUP = 0x10 + EPOLLRDNORM = 0x40 + EPOLLRDBAND = 0x80 + EPOLLWRNORM = 0x100 + EPOLLWRBAND = 0x200 + EPOLLMSG = 0x400 + EPOLLRDHUP = 0x2000 +) + +// Per-file descriptor flags. +const ( + EPOLLET = 0x80000000 + EPOLLONESHOT = 0x40000000 +) + +// Operation flags. +const ( + EPOLL_CLOEXEC = 0x80000 + EPOLL_NONBLOCK = 0x800 +) + +// Control operations. +const ( + EPOLL_CTL_ADD = 0x1 + EPOLL_CTL_DEL = 0x2 + EPOLL_CTL_MOD = 0x3 +) diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go index b30350193..f78315ebf 100644 --- a/pkg/abi/linux/fcntl.go +++ b/pkg/abi/linux/fcntl.go @@ -14,23 +14,41 @@ package linux -// Comands from linux/fcntl.h. +// Commands from linux/fcntl.h. const ( - F_DUPFD = 0 - F_GETFD = 1 - F_GETFL = 3 - F_GETOWN = 9 - F_SETFD = 2 - F_SETFL = 4 - F_SETLK = 6 - F_SETLKW = 7 - F_SETOWN = 8 + F_DUPFD = 0x0 + F_GETFD = 0x1 + F_SETFD = 0x2 + F_GETFL = 0x3 + F_SETFL = 0x4 + F_SETLK = 0x6 + F_SETLKW = 0x7 + F_SETOWN = 0x8 + F_GETOWN = 0x9 F_DUPFD_CLOEXEC = 1024 + 6 F_SETPIPE_SZ = 1024 + 7 F_GETPIPE_SZ = 1024 + 8 ) +// Commands for F_SETLK. +const ( + F_RDLCK = 0x0 + F_WRLCK = 0x1 + F_UNLCK = 0x2 +) + // Flags for fcntl. const ( FD_CLOEXEC = 00000001 ) + +// Lock structure for F_SETLK. +type Flock struct { + Type int16 + Whence int16 + _ [4]byte + Start int64 + Len int64 + Pid int32 + _ [4]byte +} diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 426003eb7..4b0ea33dc 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -24,24 +24,27 @@ import ( // Constants for open(2). const ( - O_ACCMODE = 00000003 - O_RDONLY = 00000000 - O_WRONLY = 00000001 - O_RDWR = 00000002 - O_CREAT = 00000100 - O_EXCL = 00000200 - O_NOCTTY = 00000400 - O_TRUNC = 00001000 - O_APPEND = 00002000 - O_NONBLOCK = 00004000 - O_ASYNC = 00020000 - O_DIRECT = 00040000 - O_LARGEFILE = 00100000 - O_DIRECTORY = 00200000 - O_NOFOLLOW = 00400000 - O_CLOEXEC = 02000000 - O_SYNC = 04010000 + O_ACCMODE = 000000003 + O_RDONLY = 000000000 + O_WRONLY = 000000001 + O_RDWR = 000000002 + O_CREAT = 000000100 + O_EXCL = 000000200 + O_NOCTTY = 000000400 + O_TRUNC = 000001000 + O_APPEND = 000002000 + O_NONBLOCK = 000004000 + O_DSYNC = 000010000 + O_ASYNC = 000020000 + O_DIRECT = 000040000 + O_LARGEFILE = 000100000 + O_DIRECTORY = 000200000 + O_NOFOLLOW = 000400000 + O_NOATIME = 001000000 + O_CLOEXEC = 002000000 + O_SYNC = 004000000 // __O_SYNC in Linux O_PATH = 010000000 + O_TMPFILE = 020000000 // __O_TMPFILE in Linux ) // Constants for fstatat(2). @@ -123,14 +126,23 @@ const ( // Values for mode_t. const ( - FileTypeMask = 0170000 - ModeSocket = 0140000 - ModeSymlink = 0120000 - ModeRegular = 0100000 - ModeBlockDevice = 060000 - ModeDirectory = 040000 - ModeCharacterDevice = 020000 - ModeNamedPipe = 010000 + S_IFMT = 0170000 + S_IFSOCK = 0140000 + S_IFLNK = 0120000 + S_IFREG = 0100000 + S_IFBLK = 060000 + S_IFDIR = 040000 + S_IFCHR = 020000 + S_IFIFO = 010000 + + FileTypeMask = S_IFMT + ModeSocket = S_IFSOCK + ModeSymlink = S_IFLNK + ModeRegular = S_IFREG + ModeBlockDevice = S_IFBLK + ModeDirectory = S_IFDIR + ModeCharacterDevice = S_IFCHR + ModeNamedPipe = S_IFIFO ModeSetUID = 04000 ModeSetGID = 02000 @@ -151,6 +163,19 @@ const ( PermissionsMask = 0777 ) +// Values for linux_dirent64.d_type. +const ( + DT_UNKNOWN = 0 + DT_FIFO = 1 + DT_CHR = 2 + DT_DIR = 4 + DT_BLK = 6 + DT_REG = 8 + DT_LNK = 10 + DT_SOCK = 12 + DT_WHT = 14 +) + // Values for preadv2/pwritev2. const ( RWF_HIPRI = 0x00000001 @@ -161,21 +186,21 @@ const ( // Stat represents struct stat. type Stat struct { - Dev uint64 - Ino uint64 - Nlink uint64 - Mode uint32 - UID uint32 - GID uint32 - X_pad0 int32 - Rdev uint64 - Size int64 - Blksize int64 - Blocks int64 - ATime Timespec - MTime Timespec - CTime Timespec - X_unused [3]int64 + Dev uint64 + Ino uint64 + Nlink uint64 + Mode uint32 + UID uint32 + GID uint32 + _ int32 + Rdev uint64 + Size int64 + Blksize int64 + Blocks int64 + ATime Timespec + MTime Timespec + CTime Timespec + _ [3]int64 } // SizeOfStat is the size of a Stat struct. @@ -208,6 +233,17 @@ const ( STATX__RESERVED = 0x80000000 ) +// Bitmasks for Statx.Attributes and Statx.AttributesMask, from +// include/uapi/linux/stat.h. +const ( + STATX_ATTR_COMPRESSED = 0x00000004 + STATX_ATTR_IMMUTABLE = 0x00000010 + STATX_ATTR_APPEND = 0x00000020 + STATX_ATTR_NODUMP = 0x00000040 + STATX_ATTR_ENCRYPTED = 0x00000800 + STATX_ATTR_AUTOMOUNT = 0x00001000 +) + // Statx represents struct statx. type Statx struct { Mask uint32 @@ -217,7 +253,6 @@ type Statx struct { UID uint32 GID uint32 Mode uint16 - _ uint16 Ino uint64 Size uint64 Blocks uint64 diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index c82ab9b5b..b416e3472 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -20,6 +20,7 @@ package linux const ( ANON_INODE_FS_MAGIC = 0x09041934 DEVPTS_SUPER_MAGIC = 0x00001cd1 + EXT_SUPER_MAGIC = 0xef53 OVERLAYFS_SUPER_MAGIC = 0x794c7630 PIPEFS_MAGIC = 0x50495045 PROC_SUPER_MAGIC = 0x9fa0 @@ -76,6 +77,15 @@ type Statfs struct { Spare [4]uint64 } +// Whence argument to lseek(2), from include/uapi/linux/fs.h. +const ( + SEEK_SET = 0 + SEEK_CUR = 1 + SEEK_END = 2 + SEEK_DATA = 3 + SEEK_HOLE = 4 +) + // Sync_file_range flags, from include/uapi/linux/fs.h const ( SYNC_FILE_RANGE_WAIT_BEFORE = 1 diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go index 04bb767dc..0e18db9ef 100644 --- a/pkg/abi/linux/ioctl.go +++ b/pkg/abi/linux/ioctl.go @@ -72,28 +72,3 @@ const ( SIOCGMIIPHY = 0x8947 SIOCGMIIREG = 0x8948 ) - -// ioctl(2) requests provided by uapi/linux/android/binder.h -const ( - BinderWriteReadIoctl = 0xc0306201 - BinderSetIdleTimeoutIoctl = 0x40086203 - BinderSetMaxThreadsIoctl = 0x40046205 - BinderSetIdlePriorityIoctl = 0x40046206 - BinderSetContextMgrIoctl = 0x40046207 - BinderThreadExitIoctl = 0x40046208 - BinderVersionIoctl = 0xc0046209 -) - -// ioctl(2) requests provided by drivers/staging/android/uapi/ashmem.h -const ( - AshmemSetNameIoctl = 0x41007701 - AshmemGetNameIoctl = 0x81007702 - AshmemSetSizeIoctl = 0x40087703 - AshmemGetSizeIoctl = 0x00007704 - AshmemSetProtMaskIoctl = 0x40087705 - AshmemGetProtMaskIoctl = 0x00007706 - AshmemPinIoctl = 0x40087707 - AshmemUnpinIoctl = 0x40087708 - AshmemGetPinStatusIoctl = 0x00007709 - AshmemPurgeAllCachesIoctl = 0x0000770a -) diff --git a/pkg/abi/linux/ipc.go b/pkg/abi/linux/ipc.go index 2ef8d6cbb..22acd2d43 100644 --- a/pkg/abi/linux/ipc.go +++ b/pkg/abi/linux/ipc.go @@ -44,10 +44,10 @@ type IPCPerm struct { CUID uint32 CGID uint32 Mode uint16 - pad1 uint16 + _ uint16 Seq uint16 - pad2 uint16 - pad3 uint32 + _ uint16 + _ uint32 unused1 uint64 unused2 uint64 } diff --git a/pkg/abi/linux/linux.go b/pkg/abi/linux/linux.go index 8a8f831cd..281acdbde 100644 --- a/pkg/abi/linux/linux.go +++ b/pkg/abi/linux/linux.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package linux contains the constants and types needed to inferface with a Linux kernel. +// Package linux contains the constants and types needed to interface with a Linux kernel. package linux // NumSoftIRQ is the number of software IRQs, exposed via /proc/stat. diff --git a/pkg/abi/linux/netlink.go b/pkg/abi/linux/netlink.go index 5e718c363..e8b6544b4 100644 --- a/pkg/abi/linux/netlink.go +++ b/pkg/abi/linux/netlink.go @@ -41,10 +41,10 @@ const ( // SockAddrNetlink is struct sockaddr_nl, from uapi/linux/netlink.h. type SockAddrNetlink struct { - Family uint16 - Padding uint16 - PortID uint32 - Groups uint32 + Family uint16 + _ uint16 + PortID uint32 + Groups uint32 } // SockAddrNetlinkSize is the size of SockAddrNetlink. diff --git a/pkg/abi/linux/netlink_route.go b/pkg/abi/linux/netlink_route.go index 630dc339a..dd698e2bc 100644 --- a/pkg/abi/linux/netlink_route.go +++ b/pkg/abi/linux/netlink_route.go @@ -86,12 +86,12 @@ const ( // InterfaceInfoMessage is struct ifinfomsg, from uapi/linux/rtnetlink.h. type InterfaceInfoMessage struct { - Family uint8 - Padding uint8 - Type uint16 - Index int32 - Flags uint32 - Change uint32 + Family uint8 + _ uint8 + Type uint16 + Index int32 + Flags uint32 + Change uint32 } // Interface flags, from uapi/linux/if.h. diff --git a/pkg/abi/linux/sched.go b/pkg/abi/linux/sched.go index 193d9a242..70e820823 100644 --- a/pkg/abi/linux/sched.go +++ b/pkg/abi/linux/sched.go @@ -28,3 +28,9 @@ const ( // reverted back to SCHED_NORMAL on fork. SCHED_RESET_ON_FORK = 0x40000000 ) + +const ( + PRIO_PGRP = 0x1 + PRIO_PROCESS = 0x0 + PRIO_USER = 0x2 +) diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go index e727066d7..546668bca 100644 --- a/pkg/abi/linux/time.go +++ b/pkg/abi/linux/time.go @@ -241,3 +241,9 @@ func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) { Nsec: uint32(nsec % 1e9), } } + +// Utime represents struct utimbuf used by utimes(2). +type Utime struct { + Actime int64 + Modtime int64 +} diff --git a/pkg/amutex/amutex.go b/pkg/amutex/amutex.go index 4f7759b87..1c4fd1784 100644 --- a/pkg/amutex/amutex.go +++ b/pkg/amutex/amutex.go @@ -21,13 +21,13 @@ import ( ) // Sleeper must be implemented by users of the abortable mutex to allow for -// cancelation of waits. +// cancellation of waits. type Sleeper interface { // SleepStart is called by the AbortableMutex.Lock() function when the // mutex is contended and the goroutine is about to sleep. // // A channel can be returned that causes the sleep to be canceled if - // it's readable. If no cancelation is desired, nil can be returned. + // it's readable. If no cancellation is desired, nil can be returned. SleepStart() <-chan struct{} // SleepFinish is called by AbortableMutex.Lock() once a contended mutex diff --git a/pkg/amutex/amutex_test.go b/pkg/amutex/amutex_test.go index 211bdda4b..1d7f45641 100644 --- a/pkg/amutex/amutex_test.go +++ b/pkg/amutex/amutex_test.go @@ -47,7 +47,7 @@ func TestMutualExclusion(t *testing.T) { // goroutines ran concurrently within the critical section. // // If one of the goroutines doesn't complete, it's likely a bug that - // causes to to wait forever. + // causes it to wait forever. const gr = 1000 const iters = 100000 v := 0 diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go index 18a10d389..3fabaf445 100644 --- a/pkg/cpuid/cpuid.go +++ b/pkg/cpuid/cpuid.go @@ -620,7 +620,7 @@ func vendorIDFromRegs(bx, cx, dx uint32) string { // state includes floating point registers, and other cpu state that's not // associated with the normal task context. // -// Note: We can save some space here with an optimiazation where we use a +// Note: We can save some space here with an optimization where we use a // smaller chunk of memory depending on features that are actually enabled. // Currently we just use the largest possible size for simplicity (which is // about 2.5K worst case, with avx512). diff --git a/pkg/eventchannel/event.go b/pkg/eventchannel/event.go index 8de0fb9b6..f6d26532b 100644 --- a/pkg/eventchannel/event.go +++ b/pkg/eventchannel/event.go @@ -144,7 +144,7 @@ type debugEmitter struct { inner Emitter } -// DebugEmitterFrom creates a new event channel emitter by wraping an existing +// DebugEmitterFrom creates a new event channel emitter by wrapping an existing // raw emitter. func DebugEmitterFrom(inner Emitter) Emitter { return &debugEmitter{ diff --git a/pkg/fd/fd.go b/pkg/fd/fd.go index 2785243a2..83bcfe220 100644 --- a/pkg/fd/fd.go +++ b/pkg/fd/fd.go @@ -167,7 +167,7 @@ func NewFromFile(file *os.File) (*FD, error) { return New(fd), nil } -// Open is equivallent to open(2). +// Open is equivalent to open(2). func Open(path string, openmode int, perm uint32) (*FD, error) { f, err := syscall.Open(path, openmode|syscall.O_LARGEFILE, perm) if err != nil { @@ -176,7 +176,7 @@ func Open(path string, openmode int, perm uint32) (*FD, error) { return New(f), nil } -// OpenAt is equivallent to openat(2). +// OpenAt is equivalent to openat(2). func OpenAt(dir *FD, path string, flags int, mode uint32) (*FD, error) { f, err := syscall.Openat(dir.FD(), path, flags, mode) if err != nil { diff --git a/pkg/fdchannel/BUILD b/pkg/fdchannel/BUILD new file mode 100644 index 000000000..e54e7371c --- /dev/null +++ b/pkg/fdchannel/BUILD @@ -0,0 +1,17 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "fdchannel", + srcs = ["fdchannel_unsafe.go"], + importpath = "gvisor.dev/gvisor/pkg/fdchannel", + visibility = ["//visibility:public"], +) + +go_test( + name = "fdchannel_test", + size = "small", + srcs = ["fdchannel_test.go"], + embed = [":fdchannel"], +) diff --git a/pkg/fdchannel/fdchannel_test.go b/pkg/fdchannel/fdchannel_test.go new file mode 100644 index 000000000..5d01dc636 --- /dev/null +++ b/pkg/fdchannel/fdchannel_test.go @@ -0,0 +1,131 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fdchannel + +import ( + "io/ioutil" + "os" + "sync" + "syscall" + "testing" + "time" +) + +func TestSendRecvFD(t *testing.T) { + sendFile, err := ioutil.TempFile("", "fdchannel_test_") + if err != nil { + t.Fatalf("failed to create temporary file: %v", err) + } + defer sendFile.Close() + + chanFDs, err := NewConnectedSockets() + if err != nil { + t.Fatalf("failed to create fdchannel sockets: %v", err) + } + sendEP := NewEndpoint(chanFDs[0]) + defer sendEP.Destroy() + recvEP := NewEndpoint(chanFDs[1]) + defer recvEP.Destroy() + + recvFD, err := recvEP.RecvFDNonblock() + if err != syscall.EAGAIN && err != syscall.EWOULDBLOCK { + t.Errorf("RecvFDNonblock before SendFD: got (%d, %v), wanted (<unspecified>, EAGAIN or EWOULDBLOCK", recvFD, err) + } + + if err := sendEP.SendFD(int(sendFile.Fd())); err != nil { + t.Fatalf("SendFD failed: %v", err) + } + recvFD, err = recvEP.RecvFD() + if err != nil { + t.Fatalf("RecvFD failed: %v", err) + } + recvFile := os.NewFile(uintptr(recvFD), "received file") + defer recvFile.Close() + + sendInfo, err := sendFile.Stat() + if err != nil { + t.Fatalf("failed to stat sent file: %v", err) + } + sendInfoSys := sendInfo.Sys() + sendStat, ok := sendInfoSys.(*syscall.Stat_t) + if !ok { + t.Fatalf("sent file's FileInfo is backed by unknown type %T", sendInfoSys) + } + + recvInfo, err := recvFile.Stat() + if err != nil { + t.Fatalf("failed to stat received file: %v", err) + } + recvInfoSys := recvInfo.Sys() + recvStat, ok := recvInfoSys.(*syscall.Stat_t) + if !ok { + t.Fatalf("received file's FileInfo is backed by unknown type %T", recvInfoSys) + } + + if sendStat.Dev != recvStat.Dev || sendStat.Ino != recvStat.Ino { + t.Errorf("sent file (dev=%d, ino=%d) does not match received file (dev=%d, ino=%d)", sendStat.Dev, sendStat.Ino, recvStat.Dev, recvStat.Ino) + } +} + +func TestShutdownThenRecvFD(t *testing.T) { + sendFile, err := ioutil.TempFile("", "fdchannel_test_") + if err != nil { + t.Fatalf("failed to create temporary file: %v", err) + } + defer sendFile.Close() + + chanFDs, err := NewConnectedSockets() + if err != nil { + t.Fatalf("failed to create fdchannel sockets: %v", err) + } + sendEP := NewEndpoint(chanFDs[0]) + defer sendEP.Destroy() + recvEP := NewEndpoint(chanFDs[1]) + defer recvEP.Destroy() + + recvEP.Shutdown() + if _, err := recvEP.RecvFD(); err == nil { + t.Error("RecvFD succeeded unexpectedly") + } +} + +func TestRecvFDThenShutdown(t *testing.T) { + sendFile, err := ioutil.TempFile("", "fdchannel_test_") + if err != nil { + t.Fatalf("failed to create temporary file: %v", err) + } + defer sendFile.Close() + + chanFDs, err := NewConnectedSockets() + if err != nil { + t.Fatalf("failed to create fdchannel sockets: %v", err) + } + sendEP := NewEndpoint(chanFDs[0]) + defer sendEP.Destroy() + recvEP := NewEndpoint(chanFDs[1]) + defer recvEP.Destroy() + + var receiverWG sync.WaitGroup + receiverWG.Add(1) + go func() { + defer receiverWG.Done() + if _, err := recvEP.RecvFD(); err == nil { + t.Error("RecvFD succeeded unexpectedly") + } + }() + defer receiverWG.Wait() + time.Sleep(time.Second) // to ensure recvEP.RecvFD() has blocked + recvEP.Shutdown() +} diff --git a/pkg/fdchannel/fdchannel_unsafe.go b/pkg/fdchannel/fdchannel_unsafe.go new file mode 100644 index 000000000..367235be5 --- /dev/null +++ b/pkg/fdchannel/fdchannel_unsafe.go @@ -0,0 +1,146 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris + +// Package fdchannel implements passing file descriptors between processes over +// Unix domain sockets. +package fdchannel + +import ( + "fmt" + "reflect" + "sync/atomic" + "syscall" + "unsafe" +) + +// int32 is the real type of a file descriptor. +const sizeofInt32 = int(unsafe.Sizeof(int32(0))) + +// NewConnectedSockets returns a pair of file descriptors, owned by the caller, +// representing connected sockets that may be passed to separate calls to +// NewEndpoint to create connected Endpoints. +func NewConnectedSockets() ([2]int, error) { + return syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC, 0) +} + +// Endpoint sends file descriptors to, and receives them from, another +// connected Endpoint. +// +// Endpoint is not copyable or movable by value. +type Endpoint struct { + sockfd int32 // accessed using atomic memory operations + msghdr syscall.Msghdr + cmsg *syscall.Cmsghdr // followed by sizeofInt32 bytes of data +} + +// Init must be called on zero-value Endpoints before first use. sockfd must be +// a blocking AF_UNIX SOCK_SEQPACKET socket. +func (ep *Endpoint) Init(sockfd int) { + // "Datagram sockets in various domains (e.g., the UNIX and Internet + // domains) permit zero-length datagrams." - recv(2). Experimentally, + // sendmsg+recvmsg for a zero-length datagram is slightly faster than + // sendmsg+recvmsg for a single byte over a stream socket. + cmsgSlice := make([]byte, syscall.CmsgSpace(sizeofInt32)) + cmsgReflect := (*reflect.SliceHeader)((unsafe.Pointer)(&cmsgSlice)) + ep.sockfd = int32(sockfd) + ep.msghdr.Control = (*byte)((unsafe.Pointer)(cmsgReflect.Data)) + ep.cmsg = (*syscall.Cmsghdr)((unsafe.Pointer)(cmsgReflect.Data)) + // ep.msghdr.Controllen and ep.cmsg.* are mutated by recvmsg(2), so they're + // set before calling sendmsg/recvmsg. +} + +// NewEndpoint is a convenience function that returns an initialized Endpoint +// allocated on the heap. +func NewEndpoint(sockfd int) *Endpoint { + ep := &Endpoint{} + ep.Init(sockfd) + return ep +} + +// Destroy releases resources owned by ep. No other Endpoint methods may be +// called after Destroy. +func (ep *Endpoint) Destroy() { + // These need not use sync/atomic since there must not be any concurrent + // calls to Endpoint methods. + if ep.sockfd >= 0 { + syscall.Close(int(ep.sockfd)) + ep.sockfd = -1 + } +} + +// Shutdown causes concurrent and future calls to ep.SendFD(), ep.RecvFD(), and +// ep.RecvFDNonblock(), as well as the same calls in the connected Endpoint, to +// unblock and return errors. It does not wait for concurrent calls to return. +// +// Shutdown is the only Endpoint method that may be called concurrently with +// other methods. +func (ep *Endpoint) Shutdown() { + if sockfd := int(atomic.SwapInt32(&ep.sockfd, -1)); sockfd >= 0 { + syscall.Shutdown(sockfd, syscall.SHUT_RDWR) + syscall.Close(sockfd) + } +} + +// SendFD sends the open file description represented by the given file +// descriptor to the connected Endpoint. +func (ep *Endpoint) SendFD(fd int) error { + cmsgLen := syscall.CmsgLen(sizeofInt32) + ep.cmsg.Level = syscall.SOL_SOCKET + ep.cmsg.Type = syscall.SCM_RIGHTS + ep.cmsg.SetLen(cmsgLen) + *ep.cmsgData() = int32(fd) + ep.msghdr.SetControllen(cmsgLen) + _, _, e := syscall.Syscall(syscall.SYS_SENDMSG, uintptr(atomic.LoadInt32(&ep.sockfd)), uintptr((unsafe.Pointer)(&ep.msghdr)), 0) + if e != 0 { + return e + } + return nil +} + +// RecvFD receives an open file description from the connected Endpoint and +// returns a file descriptor representing it, owned by the caller. +func (ep *Endpoint) RecvFD() (int, error) { + return ep.recvFD(0) +} + +// RecvFDNonblock receives an open file description from the connected Endpoint +// and returns a file descriptor representing it, owned by the caller. If there +// are no pending receivable open file descriptions, RecvFDNonblock returns +// (<unspecified>, EAGAIN or EWOULDBLOCK). +func (ep *Endpoint) RecvFDNonblock() (int, error) { + return ep.recvFD(syscall.MSG_DONTWAIT) +} + +func (ep *Endpoint) recvFD(flags uintptr) (int, error) { + cmsgLen := syscall.CmsgLen(sizeofInt32) + ep.msghdr.SetControllen(cmsgLen) + _, _, e := syscall.Syscall(syscall.SYS_RECVMSG, uintptr(atomic.LoadInt32(&ep.sockfd)), uintptr((unsafe.Pointer)(&ep.msghdr)), flags|syscall.MSG_TRUNC) + if e != 0 { + return -1, e + } + if int(ep.msghdr.Controllen) != cmsgLen { + return -1, fmt.Errorf("received control message has incorrect length: got %d, wanted %d", ep.msghdr.Controllen, cmsgLen) + } + if ep.cmsg.Level != syscall.SOL_SOCKET || ep.cmsg.Type != syscall.SCM_RIGHTS { + return -1, fmt.Errorf("received control message has incorrect (level, type): got (%v, %v), wanted (%v, %v)", ep.cmsg.Level, ep.cmsg.Type, syscall.SOL_SOCKET, syscall.SCM_RIGHTS) + } + return int(*ep.cmsgData()), nil +} + +func (ep *Endpoint) cmsgData() *int32 { + // syscall.CmsgLen(0) == syscall.cmsgAlignOf(syscall.SizeofCmsghdr) + return (*int32)((unsafe.Pointer)(uintptr((unsafe.Pointer)(ep.cmsg)) + uintptr(syscall.CmsgLen(0)))) +} diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD new file mode 100644 index 000000000..11716af81 --- /dev/null +++ b/pkg/fspath/BUILD @@ -0,0 +1,28 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], +) + +go_library( + name = "fspath", + srcs = [ + "builder.go", + "builder_unsafe.go", + "fspath.go", + ], + importpath = "gvisor.dev/gvisor/pkg/fspath", + deps = ["//pkg/syserror"], +) + +go_test( + name = "fspath_test", + size = "small", + srcs = [ + "builder_test.go", + "fspath_test.go", + ], + embed = [":fspath"], + deps = ["//pkg/syserror"], +) diff --git a/pkg/fspath/builder.go b/pkg/fspath/builder.go new file mode 100644 index 000000000..7ddb36826 --- /dev/null +++ b/pkg/fspath/builder.go @@ -0,0 +1,104 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fspath + +import ( + "fmt" +) + +// Builder is similar to strings.Builder, but is used to produce pathnames +// given path components in reverse order (from leaf to root). This is useful +// in the common case where a filesystem is represented by a tree of named +// nodes, and the path to a given node must be produced by walking upward from +// that node to a given root. +type Builder struct { + buf []byte + start int + needSep bool +} + +// Reset resets the Builder to be empty. +func (b *Builder) Reset() { + b.start = len(b.buf) + b.needSep = false +} + +// Len returns the number of accumulated bytes. +func (b *Builder) Len() int { + return len(b.buf) - b.start +} + +func (b *Builder) needToGrow(n int) bool { + return b.start < n +} + +func (b *Builder) grow(n int) { + newLen := b.Len() + n + var newCap int + if len(b.buf) == 0 { + newCap = 64 // arbitrary + } else { + newCap = 2 * len(b.buf) + } + for newCap < newLen { + newCap *= 2 + if newCap == 0 { + panic(fmt.Sprintf("required length (%d) causes buffer size to overflow", newLen)) + } + } + newBuf := make([]byte, newCap) + copy(newBuf[newCap-b.Len():], b.buf[b.start:]) + b.start += newCap - len(b.buf) + b.buf = newBuf +} + +// PrependComponent prepends the given path component to b's buffer. A path +// separator is automatically inserted if appropriate. +func (b *Builder) PrependComponent(pc string) { + if b.needSep { + b.PrependByte('/') + } + b.PrependString(pc) + b.needSep = true +} + +// PrependString prepends the given string to b's buffer. +func (b *Builder) PrependString(str string) { + if b.needToGrow(len(str)) { + b.grow(len(str)) + } + b.start -= len(str) + copy(b.buf[b.start:], str) +} + +// PrependByte prepends the given byte to b's buffer. +func (b *Builder) PrependByte(c byte) { + if b.needToGrow(1) { + b.grow(1) + } + b.start-- + b.buf[b.start] = c +} + +// AppendString appends the given string to b's buffer. +func (b *Builder) AppendString(str string) { + if b.needToGrow(len(str)) { + b.grow(len(str)) + } + oldStart := b.start + b.start -= len(str) + copy(b.buf[b.start:], b.buf[oldStart:]) + copy(b.buf[len(b.buf)-len(str):], str) +} diff --git a/pkg/fspath/builder_test.go b/pkg/fspath/builder_test.go new file mode 100644 index 000000000..22f890273 --- /dev/null +++ b/pkg/fspath/builder_test.go @@ -0,0 +1,58 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fspath + +import ( + "testing" +) + +func TestBuilder(t *testing.T) { + type testCase struct { + pcs []string // path components in reverse order + after string + want string + } + tests := []testCase{ + { + // Empty case. + }, + { + pcs: []string{"foo"}, + want: "foo", + }, + { + pcs: []string{"foo", "bar", "baz"}, + want: "baz/bar/foo", + }, + { + pcs: []string{"foo", "bar"}, + after: " (deleted)", + want: "bar/foo (deleted)", + }, + } + + for _, test := range tests { + t.Run(test.want, func(t *testing.T) { + var b Builder + for _, pc := range test.pcs { + b.PrependComponent(pc) + } + b.AppendString(test.after) + if got := b.String(); got != test.want { + t.Errorf("got %q, wanted %q", got, test.want) + } + }) + } +} diff --git a/pkg/abi/linux/ashmem.go b/pkg/fspath/builder_unsafe.go index 2a722abe0..75606808d 100644 --- a/pkg/abi/linux/ashmem.go +++ b/pkg/fspath/builder_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,18 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -package linux +package fspath -// Constants used by ashmem in pin-related ioctls. -const ( - AshmemNotPurged = 0 - AshmemWasPurged = 1 - AshmemIsUnpinned = 0 - AshmemIsPinned = 1 +import ( + "unsafe" ) -// AshmemPin structure is used for pin-related ioctls. -type AshmemPin struct { - Offset uint32 - Len uint32 +// String returns the accumulated string. No other methods should be called +// after String. +func (b *Builder) String() string { + bs := b.buf[b.start:] + // Compare strings.Builder.String(). + return *(*string)(unsafe.Pointer(&bs)) } diff --git a/pkg/fspath/fspath.go b/pkg/fspath/fspath.go new file mode 100644 index 000000000..f68752560 --- /dev/null +++ b/pkg/fspath/fspath.go @@ -0,0 +1,182 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fspath provides efficient tools for working with file paths in +// Linux-compatible filesystem implementations. +package fspath + +import ( + "strings" + + "gvisor.dev/gvisor/pkg/syserror" +) + +const pathSep = '/' + +// Parse parses a pathname as described by path_resolution(7). +func Parse(pathname string) (Path, error) { + if len(pathname) == 0 { + // "... POSIX decrees that an empty pathname must not be resolved + // successfully. Linux returns ENOENT in this case." - + // path_resolution(7) + return Path{}, syserror.ENOENT + } + // Skip leading path separators. + i := 0 + for pathname[i] == pathSep { + i++ + if i == len(pathname) { + // pathname consists entirely of path separators. + return Path{ + Absolute: true, + Dir: true, + }, nil + } + } + // Skip trailing path separators. This is required by Iterator.Next. This + // loop is guaranteed to terminate with j >= 0 because otherwise the + // pathname would consist entirely of path separators, so we would have + // returned above. + j := len(pathname) - 1 + for pathname[j] == pathSep { + j-- + } + // Find the end of the first path component. + firstEnd := i + 1 + for firstEnd != len(pathname) && pathname[firstEnd] != pathSep { + firstEnd++ + } + return Path{ + Begin: Iterator{ + partialPathname: pathname[i : j+1], + end: firstEnd - i, + }, + Absolute: i != 0, + Dir: j != len(pathname)-1, + }, nil +} + +// Path contains the information contained in a pathname string. +// +// Path is copyable by value. +type Path struct { + // Begin is an iterator to the first path component in the relative part of + // the path. + // + // Path doesn't store information about path components after the first + // since this would require allocation. + Begin Iterator + + // If true, the path is absolute, such that lookup should begin at the + // filesystem root. If false, the path is relative, such that where lookup + // begins is unspecified. + Absolute bool + + // If true, the pathname contains trailing path separators, so the last + // path component must exist and resolve to a directory. + Dir bool +} + +// String returns a pathname string equivalent to p. Note that the returned +// string is not necessarily equal to the string p was parsed from; in +// particular, redundant path separators will not be present. +func (p Path) String() string { + var b strings.Builder + if p.Absolute { + b.WriteByte(pathSep) + } + sep := false + for pit := p.Begin; pit.Ok(); pit = pit.Next() { + if sep { + b.WriteByte(pathSep) + } + b.WriteString(pit.String()) + sep = true + } + // Don't return "//" for Parse("/"). + if p.Dir && p.Begin.Ok() { + b.WriteByte(pathSep) + } + return b.String() +} + +// An Iterator represents either a path component in a Path or a terminal +// iterator indicating that the end of the path has been reached. +// +// Iterator is immutable and copyable by value. The zero value of Iterator is +// valid, and represents a terminal iterator. +type Iterator struct { + // partialPathname is a substring of the original pathname beginning at the + // start of the represented path component and ending immediately after the + // end of the last path component in the pathname. If partialPathname is + // empty, the PathnameIterator is terminal. + // + // See TestParseIteratorPartialPathnames in fspath_test.go for a worked + // example. + partialPathname string + + // end is the offset into partialPathname of the first byte after the end + // of the represented path component. + end int +} + +// Ok returns true if it is not terminal. +func (it Iterator) Ok() bool { + return len(it.partialPathname) != 0 +} + +// String returns the path component represented by it. +// +// Preconditions: it.Ok(). +func (it Iterator) String() string { + return it.partialPathname[:it.end] +} + +// Next returns an iterator to the path component after it. If it is the last +// component in the path, Next returns a terminal iterator. +// +// Preconditions: it.Ok(). +func (it Iterator) Next() Iterator { + if it.end == len(it.partialPathname) { + // End of the path. + return Iterator{} + } + // Skip path separators. Since Parse trims trailing path separators, if we + // aren't at the end of the path, there is definitely another path + // component. + i := it.end + 1 + for { + if it.partialPathname[i] != pathSep { + break + } + i++ + } + nextPartialPathname := it.partialPathname[i:] + // Find the end of this path component. + nextEnd := 1 + for nextEnd < len(nextPartialPathname) && nextPartialPathname[nextEnd] != pathSep { + nextEnd++ + } + return Iterator{ + partialPathname: nextPartialPathname, + end: nextEnd, + } +} + +// NextOk is equivalent to it.Next().Ok(), but is faster. +// +// Preconditions: it.Ok(). +func (it Iterator) NextOk() bool { + return it.end != len(it.partialPathname) +} diff --git a/pkg/fspath/fspath_test.go b/pkg/fspath/fspath_test.go new file mode 100644 index 000000000..215b35622 --- /dev/null +++ b/pkg/fspath/fspath_test.go @@ -0,0 +1,143 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fspath + +import ( + "reflect" + "strings" + "testing" + + "gvisor.dev/gvisor/pkg/syserror" +) + +func TestParseIteratorPartialPathnames(t *testing.T) { + path, err := Parse("/foo//bar///baz////") + if err != nil { + t.Fatalf("Parse failed: %v", err) + } + // Parse strips leading slashes, and records their presence as + // Path.Absolute. + if !path.Absolute { + t.Errorf("Path.Absolute: got false, wanted true") + } + // Parse strips trailing slashes, and records their presence as Path.Dir. + if !path.Dir { + t.Errorf("Path.Dir: got false, wanted true") + } + // The first Iterator.partialPathname is the input pathname, with leading + // and trailing slashes stripped. + it := path.Begin + if want := "foo//bar///baz"; it.partialPathname != want { + t.Errorf("first Iterator.partialPathname: got %q, wanted %q", it.partialPathname, want) + } + // Successive Iterator.partialPathnames remove the leading path component + // and following slashes, until we run out of path components and get a + // terminal Iterator. + it = it.Next() + if want := "bar///baz"; it.partialPathname != want { + t.Errorf("second Iterator.partialPathname: got %q, wanted %q", it.partialPathname, want) + } + it = it.Next() + if want := "baz"; it.partialPathname != want { + t.Errorf("third Iterator.partialPathname: got %q, wanted %q", it.partialPathname, want) + } + it = it.Next() + if want := ""; it.partialPathname != want { + t.Errorf("fourth Iterator.partialPathname: got %q, wanted %q", it.partialPathname, want) + } + if it.Ok() { + t.Errorf("fourth Iterator.Ok(): got true, wanted false") + } +} + +func TestParse(t *testing.T) { + type testCase struct { + pathname string + relpath []string + abs bool + dir bool + } + tests := []testCase{ + { + pathname: "/", + relpath: []string{}, + abs: true, + dir: true, + }, + { + pathname: "//", + relpath: []string{}, + abs: true, + dir: true, + }, + } + for _, sep := range []string{"/", "//"} { + for _, abs := range []bool{false, true} { + for _, dir := range []bool{false, true} { + for _, pcs := range [][]string{ + // single path component + {"foo"}, + // multiple path components, including non-UTF-8 + {".", "foo", "..", "\xe6", "bar"}, + } { + prefix := "" + if abs { + prefix = sep + } + suffix := "" + if dir { + suffix = sep + } + tests = append(tests, testCase{ + pathname: prefix + strings.Join(pcs, sep) + suffix, + relpath: pcs, + abs: abs, + dir: dir, + }) + } + } + } + } + + for _, test := range tests { + t.Run(test.pathname, func(t *testing.T) { + p, err := Parse(test.pathname) + if err != nil { + t.Fatalf("failed to parse pathname %q: %v", test.pathname, err) + } + t.Logf("pathname %q => path %q", test.pathname, p) + if p.Absolute != test.abs { + t.Errorf("path absoluteness: got %v, wanted %v", p.Absolute, test.abs) + } + if p.Dir != test.dir { + t.Errorf("path must resolve to a directory: got %v, wanted %v", p.Dir, test.dir) + } + pcs := []string{} + for pit := p.Begin; pit.Ok(); pit = pit.Next() { + pcs = append(pcs, pit.String()) + } + if !reflect.DeepEqual(pcs, test.relpath) { + t.Errorf("relative path: got %v, wanted %v", pcs, test.relpath) + } + }) + } +} + +func TestParseEmptyPathname(t *testing.T) { + p, err := Parse("") + if err != syserror.ENOENT { + t.Errorf("parsing empty pathname: got (%v, %v), wanted (<unspecified>, ENOENT)", p, err) + } +} diff --git a/pkg/log/log.go b/pkg/log/log.go index ab9ad01ef..9387586e6 100644 --- a/pkg/log/log.go +++ b/pkg/log/log.go @@ -222,7 +222,7 @@ var logMu sync.Mutex // log is the default logger. var log atomic.Value -// Log retieves the global logger. +// Log retrieves the global logger. func Log() *BasicLogger { return log.Load().(*BasicLogger) } diff --git a/pkg/p9/local_server/BUILD b/pkg/p9/local_server/BUILD deleted file mode 100644 index aa6db186c..000000000 --- a/pkg/p9/local_server/BUILD +++ /dev/null @@ -1,14 +0,0 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") - -package(licenses = ["notice"]) - -go_binary( - name = "local_server", - srcs = ["local_server.go"], - deps = [ - "//pkg/fd", - "//pkg/log", - "//pkg/p9", - "//pkg/unet", - ], -) diff --git a/pkg/p9/local_server/local_server.go b/pkg/p9/local_server/local_server.go deleted file mode 100644 index 905188aaa..000000000 --- a/pkg/p9/local_server/local_server.go +++ /dev/null @@ -1,353 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Binary local_server provides a local 9P2000.L server for the p9 package. -// -// To use, first start the server: -// local_server /tmp/my_bind_addr -// -// Then, connect using the Linux 9P filesystem: -// mount -t 9p -o trans=unix /tmp/my_bind_addr /mnt -// -// This package also serves as an examplar. -package main - -import ( - "os" - "path" - "syscall" - - "gvisor.dev/gvisor/pkg/fd" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/unet" -) - -// local wraps a local file. -type local struct { - p9.DefaultWalkGetAttr - - path string - file *os.File -} - -// info constructs a QID for this file. -func (l *local) info() (p9.QID, os.FileInfo, error) { - var ( - qid p9.QID - fi os.FileInfo - err error - ) - - // Stat the file. - if l.file != nil { - fi, err = l.file.Stat() - } else { - fi, err = os.Lstat(l.path) - } - if err != nil { - log.Warningf("error stating %#v: %v", l, err) - return qid, nil, err - } - - // Construct the QID type. - qid.Type = p9.ModeFromOS(fi.Mode()).QIDType() - - // Save the path from the Ino. - qid.Path = fi.Sys().(*syscall.Stat_t).Ino - return qid, fi, nil -} - -// Attach implements p9.Attacher.Attach. -func (l *local) Attach() (p9.File, error) { - return &local{path: "/"}, nil -} - -// Walk implements p9.File.Walk. -func (l *local) Walk(names []string) ([]p9.QID, p9.File, error) { - var qids []p9.QID - last := &local{path: l.path} - for _, name := range names { - c := &local{path: path.Join(last.path, name)} - qid, _, err := c.info() - if err != nil { - return nil, nil, err - } - qids = append(qids, qid) - last = c - } - return qids, last, nil -} - -// StatFS implements p9.File.StatFS. -// -// Not implemented. -func (l *local) StatFS() (p9.FSStat, error) { - return p9.FSStat{}, syscall.ENOSYS -} - -// FSync implements p9.File.FSync. -func (l *local) FSync() error { - return l.file.Sync() -} - -// GetAttr implements p9.File.GetAttr. -// -// Not fully implemented. -func (l *local) GetAttr(req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { - qid, fi, err := l.info() - if err != nil { - return qid, p9.AttrMask{}, p9.Attr{}, err - } - - stat := fi.Sys().(*syscall.Stat_t) - attr := p9.Attr{ - Mode: p9.FileMode(stat.Mode), - UID: p9.UID(stat.Uid), - GID: p9.GID(stat.Gid), - NLink: stat.Nlink, - RDev: stat.Rdev, - Size: uint64(stat.Size), - BlockSize: uint64(stat.Blksize), - Blocks: uint64(stat.Blocks), - ATimeSeconds: uint64(stat.Atim.Sec), - ATimeNanoSeconds: uint64(stat.Atim.Nsec), - MTimeSeconds: uint64(stat.Mtim.Sec), - MTimeNanoSeconds: uint64(stat.Mtim.Nsec), - CTimeSeconds: uint64(stat.Ctim.Sec), - CTimeNanoSeconds: uint64(stat.Ctim.Nsec), - } - valid := p9.AttrMask{ - Mode: true, - UID: true, - GID: true, - NLink: true, - RDev: true, - Size: true, - Blocks: true, - ATime: true, - MTime: true, - CTime: true, - } - - return qid, valid, attr, nil -} - -// SetAttr implements p9.File.SetAttr. -// -// Not implemented. -func (l *local) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { - return syscall.ENOSYS -} - -// Remove implements p9.File.Remove. -// -// Not implemented. -func (l *local) Remove() error { - return syscall.ENOSYS -} - -// Rename implements p9.File.Rename. -// -// Not implemented. -func (l *local) Rename(directory p9.File, name string) error { - return syscall.ENOSYS -} - -// Close implements p9.File.Close. -func (l *local) Close() error { - if l.file != nil { - return l.file.Close() - } - return nil -} - -// Open implements p9.File.Open. -func (l *local) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { - qid, _, err := l.info() - if err != nil { - return nil, qid, 0, err - } - - // Do the actual open. - f, err := os.OpenFile(l.path, int(mode), 0) - if err != nil { - return nil, qid, 0, err - } - l.file = f - - // Note: we don't send the local file for this server. - return nil, qid, 4096, nil -} - -// Read implements p9.File.Read. -func (l *local) ReadAt(p []byte, offset uint64) (int, error) { - return l.file.ReadAt(p, int64(offset)) -} - -// Write implements p9.File.Write. -func (l *local) WriteAt(p []byte, offset uint64) (int, error) { - return l.file.WriteAt(p, int64(offset)) -} - -// Create implements p9.File.Create. -func (l *local) Create(name string, mode p9.OpenFlags, permissions p9.FileMode, _ p9.UID, _ p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) { - f, err := os.OpenFile(l.path, int(mode)|syscall.O_CREAT|syscall.O_EXCL, os.FileMode(permissions)) - if err != nil { - return nil, nil, p9.QID{}, 0, err - } - - l2 := &local{path: path.Join(l.path, name), file: f} - qid, _, err := l2.info() - if err != nil { - l2.Close() - return nil, nil, p9.QID{}, 0, err - } - - return nil, l2, qid, 4096, nil -} - -// Mkdir implements p9.File.Mkdir. -// -// Not properly implemented. -func (l *local) Mkdir(name string, permissions p9.FileMode, _ p9.UID, _ p9.GID) (p9.QID, error) { - if err := os.Mkdir(path.Join(l.path, name), os.FileMode(permissions)); err != nil { - return p9.QID{}, err - } - - // Blank QID. - return p9.QID{}, nil -} - -// Symlink implements p9.File.Symlink. -// -// Not properly implemented. -func (l *local) Symlink(oldname string, newname string, _ p9.UID, _ p9.GID) (p9.QID, error) { - if err := os.Symlink(oldname, path.Join(l.path, newname)); err != nil { - return p9.QID{}, err - } - - // Blank QID. - return p9.QID{}, nil -} - -// Link implements p9.File.Link. -// -// Not properly implemented. -func (l *local) Link(target p9.File, newname string) error { - return os.Link(target.(*local).path, path.Join(l.path, newname)) -} - -// Mknod implements p9.File.Mknod. -// -// Not implemented. -func (l *local) Mknod(name string, mode p9.FileMode, major uint32, minor uint32, _ p9.UID, _ p9.GID) (p9.QID, error) { - return p9.QID{}, syscall.ENOSYS -} - -// RenameAt implements p9.File.RenameAt. -// -// Not implemented. -func (l *local) RenameAt(oldname string, newdir p9.File, newname string) error { - return syscall.ENOSYS -} - -// UnlinkAt implements p9.File.UnlinkAt. -// -// Not implemented. -func (l *local) UnlinkAt(name string, flags uint32) error { - return syscall.ENOSYS -} - -// Readdir implements p9.File.Readdir. -func (l *local) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { - // We only do *all* dirents in single shot. - const maxDirentBuffer = 1024 * 1024 - buf := make([]byte, maxDirentBuffer) - n, err := syscall.ReadDirent(int(l.file.Fd()), buf) - if err != nil { - // Return zero entries. - return nil, nil - } - - // Parse the entries; note that we read up to offset+count here. - _, newCount, newNames := syscall.ParseDirent(buf[:n], int(offset)+int(count), nil) - var dirents []p9.Dirent - for i := int(offset); i >= 0 && i < newCount; i++ { - entry := local{path: path.Join(l.path, newNames[i])} - qid, _, err := entry.info() - if err != nil { - continue - } - dirents = append(dirents, p9.Dirent{ - QID: qid, - Type: qid.Type, - Name: newNames[i], - Offset: uint64(i + 1), - }) - } - - return dirents, nil -} - -// Readlink implements p9.File.Readlink. -// -// Not properly implemented. -func (l *local) Readlink() (string, error) { - return os.Readlink(l.path) -} - -// Flush implements p9.File.Flush. -func (l *local) Flush() error { - return nil -} - -// Connect implements p9.File.Connect. -func (l *local) Connect(p9.ConnectFlags) (*fd.FD, error) { - return nil, syscall.ECONNREFUSED -} - -// Renamed implements p9.File.Renamed. -func (l *local) Renamed(parent p9.File, newName string) { - l.path = path.Join(parent.(*local).path, newName) -} - -// Allocate implements p9.File.Allocate. -func (l *local) Allocate(mode p9.AllocateMode, offset, length uint64) error { - return syscall.Fallocate(int(l.file.Fd()), mode.ToLinux(), int64(offset), int64(length)) -} - -func main() { - log.SetLevel(log.Debug) - - if len(os.Args) != 2 { - log.Warningf("usage: %s <bind-addr>", os.Args[0]) - os.Exit(1) - } - - // Bind and listen on the socket. - serverSocket, err := unet.BindAndListen(os.Args[1], false) - if err != nil { - log.Warningf("err binding: %v", err) - os.Exit(1) - } - - // Run the server. - s := p9.NewServer(&local{}) - s.Serve(serverSocket) -} - -var ( - _ p9.File = &local{} -) diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go index 4039862e6..e12831dbd 100644 --- a/pkg/p9/p9.go +++ b/pkg/p9/p9.go @@ -92,7 +92,7 @@ func (o OpenFlags) String() string { } } -// Tag is a messsage tag. +// Tag is a message tag. type Tag uint16 // FID is a file identifier. diff --git a/pkg/p9/p9test/client_test.go b/pkg/p9/p9test/client_test.go index 907afdf1b..fe649c2e8 100644 --- a/pkg/p9/p9test/client_test.go +++ b/pkg/p9/p9test/client_test.go @@ -389,7 +389,7 @@ func checkDeleted(h *Harness, file p9.File) { _, newFile, err := file.Walk(nil) if err == syscall.EBUSY { // We can't walk from here because this reference is open - // aleady. Okay, we will also have unopened cases through + // already. Okay, we will also have unopened cases through // TestUnlink, just skip the remove operation for now. return } else if err != nil { diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD index 0652fbbe6..9c08452fc 100644 --- a/pkg/refs/BUILD +++ b/pkg/refs/BUILD @@ -24,6 +24,9 @@ go_library( ], importpath = "gvisor.dev/gvisor/pkg/refs", visibility = ["//:sandbox"], + deps = [ + "//pkg/log", + ], ) go_test( diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index 20f515391..6ecbace9e 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -17,9 +17,14 @@ package refs import ( + "bytes" + "fmt" "reflect" + "runtime" "sync" "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" ) // RefCounter is the interface to be implemented by objects that are reference @@ -189,6 +194,16 @@ type AtomicRefCount struct { // how these fields are used. refCount int64 + // name is the name of the type which owns this ref count. + // + // name is immutable after EnableLeakCheck is called. + name string + + // stack optionally records the caller of EnableLeakCheck. + // + // stack is immutable after EnableLeakCheck is called. + stack []uintptr + // mu protects the list below. mu sync.Mutex `state:"nosave"` @@ -196,6 +211,148 @@ type AtomicRefCount struct { weakRefs weakRefList `state:"nosave"` } +// LeakMode configures the leak checker. +type LeakMode uint32 + +const ( + // uninitializedLeakChecking indicates that the leak checker has not yet been initialized. + uninitializedLeakChecking LeakMode = iota + + // NoLeakChecking indicates that no effort should be made to check for + // leaks. + NoLeakChecking + + // LeaksLogWarning indicates that a warning should be logged when leaks + // are found. + LeaksLogWarning + + // LeaksLogTraces indicates that a trace collected during allocation + // should be logged when leaks are found. + LeaksLogTraces +) + +// leakMode stores the current mode for the reference leak checker. +// +// Values must be one of the LeakMode values. +// +// leakMode must be accessed atomically. +var leakMode uint32 + +// SetLeakMode configures the reference leak checker. +func SetLeakMode(mode LeakMode) { + atomic.StoreUint32(&leakMode, uint32(mode)) +} + +const maxStackFrames = 40 + +type fileLine struct { + file string + line int +} + +// A stackKey is a representation of a stack frame for use as a map key. +// +// The fileLine type is used as PC values seem to vary across collections, even +// for the same call stack. +type stackKey [maxStackFrames]fileLine + +var stackCache = struct { + sync.Mutex + entries map[stackKey][]uintptr +}{entries: map[stackKey][]uintptr{}} + +func makeStackKey(pcs []uintptr) stackKey { + frames := runtime.CallersFrames(pcs) + var key stackKey + keySlice := key[:0] + for { + frame, more := frames.Next() + keySlice = append(keySlice, fileLine{frame.File, frame.Line}) + + if !more || len(keySlice) == len(key) { + break + } + } + return key +} + +func recordStack() []uintptr { + pcs := make([]uintptr, maxStackFrames) + n := runtime.Callers(1, pcs) + if n == 0 { + // No pcs available. Stop now. + // + // This can happen if the first argument to runtime.Callers + // is large. + return nil + } + pcs = pcs[:n] + key := makeStackKey(pcs) + stackCache.Lock() + v, ok := stackCache.entries[key] + if !ok { + // Reallocate to prevent pcs from escaping. + v = append([]uintptr(nil), pcs...) + stackCache.entries[key] = v + } + stackCache.Unlock() + return v +} + +func formatStack(pcs []uintptr) string { + frames := runtime.CallersFrames(pcs) + var trace bytes.Buffer + for { + frame, more := frames.Next() + fmt.Fprintf(&trace, "%s:%d: %s\n", frame.File, frame.Line, frame.Function) + + if !more { + break + } + } + return trace.String() +} + +func (r *AtomicRefCount) finalize() { + var note string + switch LeakMode(atomic.LoadUint32(&leakMode)) { + case NoLeakChecking: + return + case uninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + msg := fmt.Sprintf("%sAtomicRefCount %p owned by %q garbage collected with ref count of %d (want 0)", note, r, r.name, n) + if len(r.stack) != 0 { + msg += ":\nCaller:\n" + formatStack(r.stack) + } + log.Warningf(msg) + } +} + +// EnableLeakCheck checks for reference leaks when the AtomicRefCount gets +// garbage collected. +// +// This function adds a finalizer to the AtomicRefCount, so the AtomicRefCount +// must be at the beginning of its parent. +// +// name is a friendly name that will be listed as the owner of the +// AtomicRefCount in logs. It should be the name of the parent type, including +// package. +func (r *AtomicRefCount) EnableLeakCheck(name string) { + if name == "" { + panic("invalid name") + } + switch LeakMode(atomic.LoadUint32(&leakMode)) { + case NoLeakChecking: + return + case LeaksLogTraces: + r.stack = recordStack() + } + r.name = name + runtime.SetFinalizer(r, (*AtomicRefCount).finalize) +} + // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *AtomicRefCount) ReadRefs() int64 { @@ -208,6 +365,8 @@ func (r *AtomicRefCount) ReadRefs() int64 { // // The sanity check here is limited to real references, since if they have // dropped beneath zero then the object should have been destroyed. +// +//go:nosplit func (r *AtomicRefCount) IncRef() { if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { panic("Incrementing non-positive ref count") @@ -222,6 +381,8 @@ func (r *AtomicRefCount) IncRef() { // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to // distinguish other TryIncRef calls from genuine references held. +// +//go:nosplit func (r *AtomicRefCount) TryIncRef() bool { const speculativeRef = 1 << 32 v := atomic.AddInt64(&r.refCount, speculativeRef) @@ -263,6 +424,7 @@ func (r *AtomicRefCount) dropWeakRef(w *WeakRef) { // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // +//go:nosplit func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { switch v := atomic.AddInt64(&r.refCount, -1); { case v < -1: @@ -298,6 +460,8 @@ func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { } // DecRef decrements this object's reference count. +// +//go:nosplit func (r *AtomicRefCount) DecRef() { r.DecRefWithDestructor(nil) } diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index c298e9821..c7503f2cc 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -244,7 +244,7 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action linux.BPFAc return nil } -// buildBSTProgram converts a binary tree started in 'root' into BPF code. The ouline of the code +// buildBSTProgram converts a binary tree started in 'root' into BPF code. The outline of the code // is as follows: // // // SYS_PIPE(22), root diff --git a/pkg/segment/set.go b/pkg/segment/set.go index 982eb3fdd..03e4f258f 100644 --- a/pkg/segment/set.go +++ b/pkg/segment/set.go @@ -1288,7 +1288,7 @@ func (s *Set) String() string { return s.root.String() } -// String stringifes a node (and all of its children) for debugging. +// String stringifies a node (and all of its children) for debugging. func (n *node) String() string { var buf bytes.Buffer n.writeDebugString(&buf, "") diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index 5dccb8e3c..bf802d1b6 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -23,7 +23,6 @@ go_library( "//pkg/sentry/fs/host", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/state", diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 66a506584..6ae60c5cb 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" @@ -123,9 +122,8 @@ func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID // TTYFileOperations that wraps the TTY is also returned. func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, error) { // Import file descriptors. - l := limits.NewLimitSet() - fdm := proc.Kernel.NewFDMap() - defer fdm.DecRef() + fdTable := proc.Kernel.NewFDTable() + defer fdTable.DecRef() // No matter what happens, we should close all files in the FilePayload // before returning. Any files that are imported will be duped. @@ -149,9 +147,9 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI WorkingDirectory: args.WorkingDirectory, Root: args.Root, Credentials: creds, - FDMap: fdm, + FDTable: fdTable, Umask: 0022, - Limits: l, + Limits: limits.NewLimitSet(), MaxSymlinkTraversals: linux.MaxSymlinkTraversals, UTSNamespace: proc.Kernel.RootUTSNamespace(), IPCNamespace: proc.Kernel.RootIPCNamespace(), @@ -212,7 +210,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI } // Add the file to the FD map. - if err := fdm.NewFDAt(kdefs.FD(appFD), appFile, kernel.FDFlags{}, l); err != nil { + if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { return nil, 0, nil, err } } diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index e5b9b8bc2..d7259b47b 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -69,6 +69,7 @@ go_library( "//pkg/state", "//pkg/syserror", "//pkg/waiter", + "//third_party/gvsync", ], ) diff --git a/pkg/sentry/fs/README.md b/pkg/sentry/fs/README.md index f53ed3eaa..db4a1b730 100644 --- a/pkg/sentry/fs/README.md +++ b/pkg/sentry/fs/README.md @@ -69,7 +69,7 @@ Specifically this state is: - An `fs.MountManager` containing mount points. -- A `kernel.FDMap` containing pointers to open files. +- A `kernel.FDTable` containing pointers to open files. Anything else managed by the VFS that can be easily loaded into memory from a filesystem is synced back to those filesystems and is not saved. Examples are @@ -126,7 +126,7 @@ A mount point is restored in two steps: - Second, during state.Load, each `fs.MountedFilesystem` optionally searches for a mount in the `fs.RestoreEnvironment` that matches its saved device - name. The `fs.MountedFilesystem` then restablishes a pointer to the root of + name. The `fs.MountedFilesystem` then reestablishes a pointer to the root of the mounted filesystem. For example, the mount specification provides the network connection for a mounted remote filesystem client to communicate with its remote file server. The `fs.MountedFilesystem` also trivially loads @@ -158,7 +158,7 @@ Otherwise an `fs.File` restores flags, an offset, and a unique identifier (only used internally). It may use the `fs.Inode`, which it indirectly holds a reference on through the -`fs.Dirent`, to restablish an open file handle on the backing filesystem (e.g. +`fs.Dirent`, to reestablish an open file handle on the backing filesystem (e.g. to continue reading and writing). ## Overlay diff --git a/pkg/sentry/fs/ashmem/BUILD b/pkg/sentry/fs/ashmem/BUILD deleted file mode 100644 index 5b755cec5..000000000 --- a/pkg/sentry/fs/ashmem/BUILD +++ /dev/null @@ -1,65 +0,0 @@ -package(licenses = ["notice"]) - -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -load("//tools/go_generics:defs.bzl", "go_template_instance") - -go_library( - name = "ashmem", - srcs = [ - "area.go", - "device.go", - "pin_board.go", - "uint64_range.go", - "uint64_set.go", - ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ashmem", - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/sentry/arch", - "//pkg/sentry/context", - "//pkg/sentry/fs", - "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fs/tmpfs", - "//pkg/sentry/kernel/time", - "//pkg/sentry/memmap", - "//pkg/sentry/usage", - "//pkg/sentry/usermem", - "//pkg/syserror", - "//pkg/waiter", - ], -) - -go_test( - name = "ashmem_test", - size = "small", - srcs = ["pin_board_test.go"], - embed = [":ashmem"], - deps = [ - "//pkg/abi/linux", - "//pkg/sentry/usermem", - ], -) - -go_template_instance( - name = "uint64_range", - out = "uint64_range.go", - package = "ashmem", - template = "//pkg/segment:generic_range", - types = { - "T": "uint64", - }, -) - -go_template_instance( - name = "uint64_set", - out = "uint64_set.go", - package = "ashmem", - template = "//pkg/segment:generic_set", - types = { - "Key": "uint64", - "Range": "Range", - "Value": "noValue", - "Functions": "setFunctions", - }, -) diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go deleted file mode 100644 index e1971f91c..000000000 --- a/pkg/sentry/fs/ashmem/area.go +++ /dev/null @@ -1,308 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ashmem - -import ( - "sync" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" - "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/waiter" -) - -const ( - // namePrefix is the name prefix assumed and forced by the Linux implementation. - namePrefix = "dev/ashmem" - - // nameLen is the maximum name length. - nameLen = 256 -) - -// Area implements fs.FileOperations. -// -// +stateify savable -type Area struct { - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoSplice `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileUseInodeUnstableAttr `state:"nosave"` - waiter.AlwaysReady `state:"nosave"` - - ad *Device - - // mu protects fields below. - mu sync.Mutex `state:"nosave"` - tmpfsFile *fs.File - name string - size uint64 - perms usermem.AccessType - pb *PinBoard -} - -// Release implements fs.FileOperations.Release. -func (a *Area) Release() { - a.mu.Lock() - defer a.mu.Unlock() - if a.tmpfsFile != nil { - a.tmpfsFile.DecRef() - a.tmpfsFile = nil - } -} - -// Seek implements fs.FileOperations.Seek. -func (a *Area) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { - a.mu.Lock() - defer a.mu.Unlock() - if a.size == 0 { - return 0, syserror.EINVAL - } - if a.tmpfsFile == nil { - return 0, syserror.EBADF - } - return a.tmpfsFile.FileOperations.Seek(ctx, file, whence, offset) -} - -// Read implements fs.FileOperations.Read. -func (a *Area) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { - a.mu.Lock() - defer a.mu.Unlock() - if a.size == 0 { - return 0, nil - } - if a.tmpfsFile == nil { - return 0, syserror.EBADF - } - return a.tmpfsFile.FileOperations.Read(ctx, file, dst, offset) -} - -// Write implements fs.FileOperations.Write. -func (a *Area) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { - return 0, syserror.ENOSYS -} - -// ConfigureMMap implements fs.FileOperations.ConfigureMMap. -func (a *Area) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { - a.mu.Lock() - defer a.mu.Unlock() - if a.size == 0 { - return syserror.EINVAL - } - - if !a.perms.SupersetOf(opts.Perms) { - return syserror.EPERM - } - opts.MaxPerms = opts.MaxPerms.Intersect(a.perms) - - if a.tmpfsFile == nil { - tmpfsInodeOps := tmpfs.NewInMemoryFile(ctx, usage.Tmpfs, fs.UnstableAttr{}) - tmpfsInode := fs.NewInode(ctx, tmpfsInodeOps, fs.NewPseudoMountSource(ctx), fs.StableAttr{}) - dirent := fs.NewDirent(ctx, tmpfsInode, namePrefix+"/"+a.name) - tmpfsFile, err := tmpfsInode.GetFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}) - // Drop the extra reference on the Dirent. - dirent.DecRef() - - if err != nil { - return err - } - - // Truncate to the size set by ASHMEM_SET_SIZE ioctl. - err = tmpfsInodeOps.Truncate(ctx, tmpfsInode, int64(a.size)) - if err != nil { - return err - } - a.tmpfsFile = tmpfsFile - a.pb = NewPinBoard() - } - - return a.tmpfsFile.ConfigureMMap(ctx, opts) -} - -// Ioctl implements fs.FileOperations.Ioctl. -func (a *Area) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - // Switch on ioctl request. - switch args[1].Uint() { - case linux.AshmemSetNameIoctl: - name, err := usermem.CopyStringIn(ctx, io, args[2].Pointer(), nameLen-1, usermem.IOOpts{ - AddressSpaceActive: true, - }) - if err != nil { - return 0, err - } - - a.mu.Lock() - defer a.mu.Unlock() - - // Cannot set name for already mapped ashmem. - if a.tmpfsFile != nil { - return 0, syserror.EINVAL - } - a.name = name - return 0, nil - - case linux.AshmemGetNameIoctl: - a.mu.Lock() - var local []byte - if a.name != "" { - nameLen := len([]byte(a.name)) - local = make([]byte, nameLen, nameLen+1) - copy(local, []byte(a.name)) - local = append(local, 0) - } else { - nameLen := len([]byte(namePrefix)) - local = make([]byte, nameLen, nameLen+1) - copy(local, []byte(namePrefix)) - local = append(local, 0) - } - a.mu.Unlock() - - if _, err := io.CopyOut(ctx, args[2].Pointer(), local, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { - return 0, syserror.EFAULT - } - return 0, nil - - case linux.AshmemSetSizeIoctl: - a.mu.Lock() - defer a.mu.Unlock() - - // Cannot set size for already mapped ashmem. - if a.tmpfsFile != nil { - return 0, syserror.EINVAL - } - a.size = uint64(args[2].SizeT()) - return 0, nil - - case linux.AshmemGetSizeIoctl: - return uintptr(a.size), nil - - case linux.AshmemPinIoctl, linux.AshmemUnpinIoctl, linux.AshmemGetPinStatusIoctl: - // Locking and unlocking is ok since once tmpfsFile is set, it won't be nil again - // even after unmapping! Unlocking is needed in order to avoid a deadlock on - // usermem.CopyObjectIn. - - // Cannot execute pin-related ioctls before mapping. - a.mu.Lock() - if a.tmpfsFile == nil { - a.mu.Unlock() - return 0, syserror.EINVAL - } - a.mu.Unlock() - - var pin linux.AshmemPin - _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pin, usermem.IOOpts{ - AddressSpaceActive: true, - }) - if err != nil { - return 0, syserror.EFAULT - } - - a.mu.Lock() - defer a.mu.Unlock() - return a.pinOperation(pin, args[1].Uint()) - - case linux.AshmemPurgeAllCachesIoctl: - return 0, nil - - case linux.AshmemSetProtMaskIoctl: - prot := uint64(args[2].ModeT()) - perms := usermem.AccessType{ - Read: prot&linux.PROT_READ != 0, - Write: prot&linux.PROT_WRITE != 0, - Execute: prot&linux.PROT_EXEC != 0, - } - - a.mu.Lock() - defer a.mu.Unlock() - - // Can only narrow prot mask. - if !a.perms.SupersetOf(perms) { - return 0, syserror.EINVAL - } - - // TODO(b/30946773,gvisor.dev/issue/153): If personality flag - // READ_IMPLIES_EXEC is set, set PROT_EXEC if PORT_READ is set. - - a.perms = perms - return 0, nil - - case linux.AshmemGetProtMaskIoctl: - return uintptr(a.perms.Prot()), nil - default: - // Ioctls irrelevant to Ashmem. - return 0, syserror.EINVAL - } -} - -// pinOperation should only be called while holding a.mu. -func (a *Area) pinOperation(pin linux.AshmemPin, op uint32) (uintptr, error) { - // Page-align a.size for checks. - pageAlignedSize, ok := usermem.Addr(a.size).RoundUp() - if !ok { - return 0, syserror.EINVAL - } - // Len 0 means everything onward. - if pin.Len == 0 { - pin.Len = uint32(pageAlignedSize) - pin.Offset - } - // Both Offset and Len have to be page-aligned. - if pin.Offset%uint32(usermem.PageSize) != 0 { - return 0, syserror.EINVAL - } - if pin.Len%uint32(usermem.PageSize) != 0 { - return 0, syserror.EINVAL - } - // Adding Offset and Len must not cause an uint32 overflow. - if end := pin.Offset + pin.Len; end < pin.Offset { - return 0, syserror.EINVAL - } - // Pin range must not exceed a's size. - if uint32(pageAlignedSize) < pin.Offset+pin.Len { - return 0, syserror.EINVAL - } - // Handle each operation. - r := RangeFromAshmemPin(pin) - switch op { - case linux.AshmemPinIoctl: - if a.pb.PinRange(r) { - return linux.AshmemWasPurged, nil - } - return linux.AshmemNotPurged, nil - - case linux.AshmemUnpinIoctl: - // TODO(b/30946773): Implement purge on unpin. - a.pb.UnpinRange(r) - return 0, nil - - case linux.AshmemGetPinStatusIoctl: - if a.pb.RangePinnedStatus(r) { - return linux.AshmemIsPinned, nil - } - return linux.AshmemIsUnpinned, nil - - default: - panic("unreachable") - } - -} diff --git a/pkg/sentry/fs/ashmem/pin_board.go b/pkg/sentry/fs/ashmem/pin_board.go deleted file mode 100644 index c5400dd94..000000000 --- a/pkg/sentry/fs/ashmem/pin_board.go +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ashmem - -import "gvisor.dev/gvisor/pkg/abi/linux" - -const maxUint64 = ^uint64(0) - -// setFunctions implements segment.Functions generated from segment.Functions for -// uint64 Key and noValue Value. For more information, see the build file and -// segment set implementation at pkg/segment/set.go. -type setFunctions struct{} - -// noValue is a type of range attached value, which is irrelevant here. -type noValue struct{} - -// MinKey implements segment.Functions.MinKey. -func (setFunctions) MinKey() uint64 { - return 0 -} - -// MaxKey implements segment.Functions.MaxKey. -func (setFunctions) MaxKey() uint64 { - return maxUint64 -} - -// ClearValue implements segment.Functions.ClearValue. -func (setFunctions) ClearValue(*noValue) { - return -} - -// Merge implements segment.Functions.Merge. -func (setFunctions) Merge(Range, noValue, Range, noValue) (noValue, bool) { - return noValue{}, true -} - -// Split implements segment.Functions.Split. -func (setFunctions) Split(Range, noValue, uint64) (noValue, noValue) { - return noValue{}, noValue{} -} - -// PinBoard represents a set of pinned ranges in ashmem. -// -// segment.Set is used for implementation where segments represent -// ranges of pinned bytes, while gaps represent ranges of unpinned -// bytes. All ranges are page-aligned. -// -// +stateify savable -type PinBoard struct { - Set -} - -// NewPinBoard creates a new pin board with all pages pinned. -func NewPinBoard() *PinBoard { - var pb PinBoard - pb.PinRange(Range{0, maxUint64}) - return &pb -} - -// PinRange pins all pages in the specified range and returns true -// if there are any newly pinned pages. -func (pb *PinBoard) PinRange(r Range) bool { - pinnedPages := false - for gap := pb.LowerBoundGap(r.Start); gap.Ok() && gap.Start() < r.End; { - common := gap.Range().Intersect(r) - if common.Length() == 0 { - gap = gap.NextGap() - continue - } - pinnedPages = true - gap = pb.Insert(gap, common, noValue{}).NextGap() - } - return pinnedPages -} - -// UnpinRange unpins all pages in the specified range. -func (pb *PinBoard) UnpinRange(r Range) { - for seg := pb.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; { - common := seg.Range().Intersect(r) - if common.Length() == 0 { - seg = seg.NextSegment() - continue - } - seg = pb.RemoveRange(common).NextSegment() - } -} - -// RangePinnedStatus returns false if there's at least one unpinned page in the -// specified range. -func (pb *PinBoard) RangePinnedStatus(r Range) bool { - for gap := pb.LowerBoundGap(r.Start); gap.Ok() && gap.Start() < r.End; { - common := gap.Range().Intersect(r) - if common.Length() == 0 { - gap = gap.NextGap() - continue - } - return false - } - return true -} - -// RangeFromAshmemPin converts ashmem's original pin structure -// to Range. -func RangeFromAshmemPin(ap linux.AshmemPin) Range { - if ap.Len == 0 { - return Range{ - uint64(ap.Offset), - maxUint64, - } - } - return Range{ - uint64(ap.Offset), - uint64(ap.Offset) + uint64(ap.Len), - } -} diff --git a/pkg/sentry/fs/ashmem/pin_board_test.go b/pkg/sentry/fs/ashmem/pin_board_test.go deleted file mode 100644 index 1ccd38f56..000000000 --- a/pkg/sentry/fs/ashmem/pin_board_test.go +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ashmem - -import ( - "testing" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" -) - -func TestPinBoard(t *testing.T) { - pb := NewPinBoard() - - // Confirm that all pages are pinned. - if !pb.RangePinnedStatus(RangeFromAshmemPin(linux.AshmemPin{0, 0})) { - t.Errorf("RangePinnedStatus(all pages) returned false (unpinned) at start.") - } - - // Unpin pages [1, 11) (counting from 0) - pb.UnpinRange(RangeFromAshmemPin(linux.AshmemPin{ - usermem.PageSize, - usermem.PageSize * 10, - })) - - // Confirm that pages [1, 11) are unpinned and that page 0 and pages - // larger than 10 are pinned. - pinned := []linux.AshmemPin{ - { - 0, - usermem.PageSize, - }, { - usermem.PageSize * 11, - 0, - }, - } - - for _, pin := range pinned { - if !pb.RangePinnedStatus(RangeFromAshmemPin(pin)) { - t.Errorf("RangePinnedStatus(AshmemPin{offset (pages): %v, len (pages): %v}) returned false (unpinned).", - pin.Offset, pin.Len) - } - } - - unpinned := []linux.AshmemPin{ - { - usermem.PageSize, - usermem.PageSize * 10, - }, - } - - for _, pin := range unpinned { - if pb.RangePinnedStatus(RangeFromAshmemPin(pin)) { - t.Errorf("RangePinnedStatus(AshmemPin{offset (pages): %v, len (pages): %v}) returned true (pinned).", - pin.Offset, pin.Len) - } - } - - // Pin pages [2, 6). - pb.PinRange(RangeFromAshmemPin(linux.AshmemPin{ - usermem.PageSize * 2, - usermem.PageSize * 4, - })) - - // Confirm that pages 0, [2, 6) and pages larger than 10 are pinned - // while others remain unpinned. - pinned = []linux.AshmemPin{ - { - 0, - usermem.PageSize, - }, - { - usermem.PageSize * 2, - usermem.PageSize * 4, - }, - { - usermem.PageSize * 11, - 0, - }, - } - - for _, pin := range pinned { - if !pb.RangePinnedStatus(RangeFromAshmemPin(pin)) { - t.Errorf("RangePinnedStatus(AshmemPin{offset (pages): %v, len (pages): %v}) returned false (unpinned).", - pin.Offset, pin.Len) - } - } - - unpinned = []linux.AshmemPin{ - { - usermem.PageSize, - usermem.PageSize, - }, { - usermem.PageSize * 6, - usermem.PageSize * 5, - }, - } - - for _, pin := range unpinned { - if pb.RangePinnedStatus(RangeFromAshmemPin(pin)) { - t.Errorf("RangePinnedStatus(AshmemPin{offset (pages): %v, len (pages): %v}) returned true (pinned).", - pin.Offset, pin.Len) - } - } - - // Status of a partially pinned range is unpinned. - if pb.RangePinnedStatus(RangeFromAshmemPin(linux.AshmemPin{0, 0})) { - t.Errorf("RangePinnedStatus(all pages) returned true (pinned).") - } - - // Pin the whole range again. - pb.PinRange(RangeFromAshmemPin(linux.AshmemPin{0, 0})) - - // Confirm that all pages are pinned. - if !pb.RangePinnedStatus(RangeFromAshmemPin(linux.AshmemPin{0, 0})) { - t.Errorf("RangePinnedStatus(all pages) returned false (unpinned) at start.") - } -} diff --git a/pkg/sentry/fs/binder/BUILD b/pkg/sentry/fs/binder/BUILD deleted file mode 100644 index 639a5603a..000000000 --- a/pkg/sentry/fs/binder/BUILD +++ /dev/null @@ -1,27 +0,0 @@ -package(licenses = ["notice"]) - -load("//tools/go_stateify:defs.bzl", "go_library") - -go_library( - name = "binder", - srcs = [ - "binder.go", - ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/binder", - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/sentry/arch", - "//pkg/sentry/context", - "//pkg/sentry/fs", - "//pkg/sentry/fs/fsutil", - "//pkg/sentry/kernel", - "//pkg/sentry/memmap", - "//pkg/sentry/pgalloc", - "//pkg/sentry/platform", - "//pkg/sentry/usage", - "//pkg/sentry/usermem", - "//pkg/syserror", - "//pkg/waiter", - ], -) diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go deleted file mode 100644 index 66f6d32df..000000000 --- a/pkg/sentry/fs/binder/binder.go +++ /dev/null @@ -1,260 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package binder implements Android Binder IPC module. -package binder - -import ( - "sync" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/waiter" -) - -const ( - currentProtocolVersion = 8 - - // mmapSizeLimit is the upper limit for mapped memory size in Binder. - mmapSizeLimit = 4 * 1024 * 1024 // 4MB -) - -// Device implements fs.InodeOperations. -// -// +stateify savable -type Device struct { - fsutil.InodeGenericChecker `state:"nosave"` - fsutil.InodeNoExtendedAttributes `state:"nosave"` - fsutil.InodeNoopAllocate `state:"nosave"` - fsutil.InodeNoopRelease `state:"nosave"` - fsutil.InodeNoopTruncate `state:"nosave"` - fsutil.InodeNoopWriteOut `state:"nosave"` - fsutil.InodeNotDirectory `state:"nosave"` - fsutil.InodeNotMappable `state:"nosave"` - fsutil.InodeNotSocket `state:"nosave"` - fsutil.InodeNotSymlink `state:"nosave"` - fsutil.InodeVirtual `state:"nosave"` - - fsutil.InodeSimpleAttributes -} - -var _ fs.InodeOperations = (*Device)(nil) - -// NewDevice creates and intializes a Device structure. -func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) *Device { - return &Device{ - InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fp, 0), - } -} - -// GetFile implements fs.InodeOperations.GetFile. -// -// TODO(b/30946773): Add functionality to GetFile: Additional fields will be -// needed in the Device structure, initialize them here. Also, Device will need -// to keep track of the created Procs in order to implement BINDER_READ_WRITE -// ioctl. -func (bd *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - return fs.NewFile(ctx, d, flags, &Proc{ - bd: bd, - task: kernel.TaskFromContext(ctx), - mfp: pgalloc.MemoryFileProviderFromContext(ctx), - }), nil -} - -// Proc implements fs.FileOperations and fs.IoctlGetter. -// -// +stateify savable -type Proc struct { - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoSplice `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileUseInodeUnstableAttr `state:"nosave"` - waiter.AlwaysReady `state:"nosave"` - - bd *Device - task *kernel.Task - mfp pgalloc.MemoryFileProvider - - // mu protects fr. - mu sync.Mutex `state:"nosave"` - - // mapped is memory allocated from mfp.MemoryFile() by AddMapping. - mapped platform.FileRange -} - -// Release implements fs.FileOperations.Release. -func (bp *Proc) Release() { - bp.mu.Lock() - defer bp.mu.Unlock() - if bp.mapped.Length() != 0 { - bp.mfp.MemoryFile().DecRef(bp.mapped) - } -} - -// Seek implements fs.FileOperations.Seek. -// -// Binder doesn't support seek operation (unless in debug mode). -func (bp *Proc) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { - return offset, syserror.EOPNOTSUPP -} - -// Read implements fs.FileOperations.Read. -// -// Binder doesn't support read operation (unless in debug mode). -func (bp *Proc) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { - return 0, syserror.EOPNOTSUPP -} - -// Write implements fs.FileOperations.Write. -// -// Binder doesn't support write operation. -func (bp *Proc) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { - return 0, syserror.EOPNOTSUPP -} - -// Flush implements fs.FileOperations.Flush. -// -// TODO(b/30946773): Implement. -func (bp *Proc) Flush(ctx context.Context, file *fs.File) error { - return nil -} - -// ConfigureMMap implements fs.FileOperations.ConfigureMMap. -func (bp *Proc) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { - // Compare drivers/android/binder.c:binder_mmap(). - if caller := kernel.TaskFromContext(ctx); caller != bp.task { - return syserror.EINVAL - } - if opts.Length > mmapSizeLimit { - opts.Length = mmapSizeLimit - } - opts.MaxPerms.Write = false - - // TODO(b/30946773): Binder sets VM_DONTCOPY, preventing the created vma - // from being copied across fork(), but we don't support this yet. As - // a result, MMs containing a Binder mapping cannot be forked (MM.Fork will - // fail when AddMapping returns EBUSY). - - return fsutil.GenericConfigureMMap(file, bp, opts) -} - -// Ioctl implements fs.FileOperations.Ioctl. -// -// TODO(b/30946773): Implement. -func (bp *Proc) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - // Switch on ioctl request. - switch uint32(args[1].Int()) { - case linux.BinderVersionIoctl: - ver := &linux.BinderVersion{ - ProtocolVersion: currentProtocolVersion, - } - // Copy result to user-space. - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ver, usermem.IOOpts{ - AddressSpaceActive: true, - }) - return 0, err - case linux.BinderWriteReadIoctl: - // TODO(b/30946773): Implement. - fallthrough - case linux.BinderSetIdleTimeoutIoctl: - // TODO(b/30946773): Implement. - fallthrough - case linux.BinderSetMaxThreadsIoctl: - // TODO(b/30946773): Implement. - fallthrough - case linux.BinderSetIdlePriorityIoctl: - // TODO(b/30946773): Implement. - fallthrough - case linux.BinderSetContextMgrIoctl: - // TODO(b/30946773): Implement. - fallthrough - case linux.BinderThreadExitIoctl: - // TODO(b/30946773): Implement. - return 0, syserror.ENOSYS - default: - // Ioctls irrelevant to Binder. - return 0, syserror.EINVAL - } -} - -// AddMapping implements memmap.Mappable.AddMapping. -func (bp *Proc) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, _ bool) error { - bp.mu.Lock() - defer bp.mu.Unlock() - if bp.mapped.Length() != 0 { - // mmap has been called before, which binder_mmap() doesn't like. - return syserror.EBUSY - } - // Binder only allocates and maps a single page up-front - // (drivers/android/binder.c:binder_mmap() => binder_update_page_range()). - fr, err := bp.mfp.MemoryFile().Allocate(usermem.PageSize, usage.Anonymous) - if err != nil { - return err - } - bp.mapped = fr - return nil -} - -// RemoveMapping implements memmap.Mappable.RemoveMapping. -func (*Proc) RemoveMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) { - // Nothing to do. Notably, we don't free bp.mapped to allow another mmap. -} - -// CopyMapping implements memmap.Mappable.CopyMapping. -func (bp *Proc) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, _ bool) error { - // Nothing to do. Notably, this is one case where CopyMapping isn't - // equivalent to AddMapping, as AddMapping would return EBUSY. - return nil -} - -// Translate implements memmap.Mappable.Translate. -func (bp *Proc) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { - // TODO(b/30946773): In addition to the page initially allocated and mapped - // in AddMapping (Linux: binder_mmap), Binder allocates and maps pages for - // each transaction (Linux: binder_ioctl => binder_ioctl_write_read => - // binder_thread_write => binder_transaction => binder_alloc_buf => - // binder_update_page_range). Since we don't actually implement - // BinderWriteReadIoctl (Linux: BINDER_WRITE_READ), we only ever have the - // first page. - var err error - if required.End > usermem.PageSize { - err = &memmap.BusError{syserror.EFAULT} - } - if required.Start == 0 { - return []memmap.Translation{ - { - Source: memmap.MappableRange{0, usermem.PageSize}, - File: bp.mfp.MemoryFile(), - Offset: bp.mapped.Start, - Perms: usermem.AnyAccess, - }, - }, err - } - return nil, err -} - -// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. -func (bp *Proc) InvalidateUnsavable(ctx context.Context) error { - return nil -} diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index a9b03d172..80e106e6f 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -11,6 +11,7 @@ go_library( "full.go", "null.go", "random.go", + "tty.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/fs/dev", visibility = ["//pkg/sentry:internal"], @@ -20,8 +21,6 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", - "//pkg/sentry/fs/ashmem", - "//pkg/sentry/fs/binder", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/ramfs", "//pkg/sentry/fs/tmpfs", diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index fb6c30ff0..f739c476c 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -20,8 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/ashmem" - "gvisor.dev/gvisor/pkg/sentry/fs/binder" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -40,12 +38,20 @@ const ( urandomDevMinor uint32 = 9 ) -func newCharacterDevice(ctx context.Context, iops fs.InodeOperations, msrc *fs.MountSource) *fs.Inode { +// TTY major device number comes from include/uapi/linux/major.h. +const ( + ttyDevMinor = 0 + ttyDevMajor = 5 +) + +func newCharacterDevice(ctx context.Context, iops fs.InodeOperations, msrc *fs.MountSource, major uint16, minor uint32) *fs.Inode { return fs.NewInode(ctx, iops, msrc, fs.StableAttr{ - DeviceID: devDevice.DeviceID(), - InodeID: devDevice.NextIno(), - BlockSize: usermem.PageSize, - Type: fs.CharacterDevice, + DeviceID: devDevice.DeviceID(), + InodeID: devDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.CharacterDevice, + DeviceFileMajor: major, + DeviceFileMinor: minor, }) } @@ -81,7 +87,7 @@ func newSymlink(ctx context.Context, target string, msrc *fs.MountSource) *fs.In } // New returns the root node of a device filesystem. -func New(ctx context.Context, msrc *fs.MountSource, binderEnabled bool, ashmemEnabled bool) *fs.Inode { +func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode { contents := map[string]*fs.Inode{ "fd": newSymlink(ctx, "/proc/self/fd", msrc), "stdin": newSymlink(ctx, "/proc/self/fd/0", msrc), @@ -116,16 +122,8 @@ func New(ctx context.Context, msrc *fs.MountSource, binderEnabled bool, ashmemEn // If no devpts is mounted, this will simply be a dangling // symlink, which is fine. "ptmx": newSymlink(ctx, "pts/ptmx", msrc), - } - - if binderEnabled { - binder := binder.NewDevice(ctx, fs.RootOwner, fs.FilePermsFromMode(0666)) - contents["binder"] = newCharacterDevice(ctx, binder, msrc) - } - if ashmemEnabled { - ashmem := ashmem.NewDevice(ctx, fs.RootOwner, fs.FilePermsFromMode(0666)) - contents["ashmem"] = newCharacterDevice(ctx, ashmem, msrc) + "tty": newCharacterDevice(ctx, newTTYDevice(ctx, fs.RootOwner, 0666), msrc, ttyDevMajor, ttyDevMinor), } iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go index 6dbc8c382..55f8af704 100644 --- a/pkg/sentry/fs/dev/fs.go +++ b/pkg/sentry/fs/dev/fs.go @@ -15,19 +15,10 @@ package dev import ( - "strconv" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/syserror" ) -// Optional key containing boolean flag which specifies if Android Binder IPC should be enabled. -const binderEnabledKey = "binder_enabled" - -// Optional key containing boolean flag which specifies if Android ashmem should be enabled. -const ashmemEnabledKey = "ashmem_enabled" - // filesystem is a devtmpfs. // // +stateify savable @@ -39,7 +30,7 @@ func init() { fs.RegisterFilesystem(&filesystem{}) } -// FilesystemName is the name underwhich the filesystem is registered. +// FilesystemName is the name under which the filesystem is registered. // Name matches drivers/base/devtmpfs.c:dev_fs_type.name. const FilesystemName = "devtmpfs" @@ -67,33 +58,7 @@ func (*filesystem) Flags() fs.FilesystemFlags { // Mount returns a devtmpfs root that can be positioned in the vfs. func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { - // device is always ignored. // devtmpfs backed by ramfs ignores bad options. See fs/ramfs/inode.c:ramfs_parse_options. // -> we should consider parsing the mode and backing devtmpfs by this. - - // Parse generic comma-separated key=value options. - options := fs.GenericMountSourceOptions(data) - - // binerEnabledKey is optional and binder is disabled by default. - binderEnabled := false - if beStr, exists := options[binderEnabledKey]; exists { - var err error - binderEnabled, err = strconv.ParseBool(beStr) - if err != nil { - return nil, syserror.EINVAL - } - } - - // ashmemEnabledKey is optional and ashmem is disabled by default. - ashmemEnabled := false - if aeStr, exists := options[ashmemEnabledKey]; exists { - var err error - ashmemEnabled, err = strconv.ParseBool(aeStr) - if err != nil { - return nil, syserror.EINVAL - } - } - - // Construct the devtmpfs root. - return New(ctx, fs.NewNonCachingMountSource(ctx, f, flags), binderEnabled, ashmemEnabled), nil + return New(ctx, fs.NewNonCachingMountSource(ctx, f, flags)), nil } diff --git a/pkg/sentry/fs/ashmem/device.go b/pkg/sentry/fs/dev/tty.go index 594039367..87d80e292 100644 --- a/pkg/sentry/fs/ashmem/device.go +++ b/pkg/sentry/fs/dev/tty.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,21 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package ashmem implements Android ashmem module (Anonymus Shared Memory). -package ashmem +package dev import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/waiter" ) -// Device implements fs.InodeOperations. -// // +stateify savable -type Device struct { +type ttyInodeOperations struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` fsutil.InodeNoopAllocate `state:"nosave"` @@ -35,6 +32,7 @@ type Device struct { fsutil.InodeNoopWriteOut `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotOpenable `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` fsutil.InodeNotSymlink `state:"nosave"` fsutil.InodeVirtual `state:"nosave"` @@ -42,20 +40,28 @@ type Device struct { fsutil.InodeSimpleAttributes } -var _ fs.InodeOperations = (*Device)(nil) +var _ fs.InodeOperations = (*ttyInodeOperations)(nil) -// NewDevice creates and intializes a Device structure. -func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) *Device { - return &Device{ - InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fp, linux.ANON_INODE_FS_MAGIC), +func newTTYDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *ttyInodeOperations { + return &ttyInodeOperations{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC), } } -// GetFile implements fs.InodeOperations.GetFile. -func (ad *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - return fs.NewFile(ctx, d, flags, &Area{ - ad: ad, - tmpfsFile: nil, - perms: usermem.AnyAccess, - }), nil +// +stateify savable +type ttyFileOperations struct { + fsutil.FileNoSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopWrite `state:"nosave"` + fsutil.FileNoopRead `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` } + +var _ fs.FileOperations = (*ttyFileOperations)(nil) diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 3cba259d9..fbca06761 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -229,11 +229,13 @@ func newDirent(inode *Inode, name string) *Dirent { if inode != nil { inode.MountSource.IncDirentRefs() } - return &Dirent{ + d := Dirent{ Inode: inode, name: name, children: make(map[string]*refs.WeakRef), } + d.EnableLeakCheck("fs.Dirent") + return &d } // NewNegativeDirent returns a new root negative Dirent. Otherwise same as NewDirent. @@ -918,7 +920,7 @@ type DirIterator interface { // calls, and must start with the given offset. // // The caller must ensure that this operation is permitted. - IterateDir(ctx context.Context, dirCtx *DirCtx, offset int) (int, error) + IterateDir(ctx context.Context, d *Dirent, dirCtx *DirCtx, offset int) (int, error) } // DirentReaddir serializes the directory entries of d including "." and "..". @@ -988,7 +990,7 @@ func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, // it.IterateDir should be passed an offset that does not include the // initial dot elements. We will add them back later. offset -= 2 - newOffset, err := it.IterateDir(ctx, dirCtx, int(offset)) + newOffset, err := it.IterateDir(ctx, d, dirCtx, int(offset)) if int64(newOffset) < offset { panic(fmt.Sprintf("node.Readdir returned offset %v less than input offset %v", newOffset, offset)) } diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index 71f2d11de..60a15a275 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -146,7 +146,7 @@ func (c *DirentCache) contains(d *Dirent) bool { return c.list.Front() == d } -// Invalidate removes all Dirents from the cache, caling DecRef on each. +// Invalidate removes all Dirents from the cache, calling DecRef on each. func (c *DirentCache) Invalidate() { if c == nil { return @@ -159,7 +159,7 @@ func (c *DirentCache) Invalidate() { } // setMaxSize sets cache max size. If current size is larger than max size, the -// cache shrinks to acommodate the new max. +// cache shrinks to accommodate the new max. func (c *DirentCache) setMaxSize(max uint64) { c.mu.Lock() c.maxSize = max diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD new file mode 100644 index 000000000..b386f31c8 --- /dev/null +++ b/pkg/sentry/fs/ext/BUILD @@ -0,0 +1,18 @@ +package(licenses = ["notice"]) + +load("//tools/go_stateify:defs.bzl", "go_library") + +go_library( + name = "ext", + srcs = [ + "ext.go", + "utils.go", + ], + importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/fs/ext/disklayout", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/fs/ext/assets/README.md b/pkg/sentry/fs/ext/assets/README.md new file mode 100644 index 000000000..b6eac4f40 --- /dev/null +++ b/pkg/sentry/fs/ext/assets/README.md @@ -0,0 +1,28 @@ +### Tiny Ext4 Image + +The image is of size 64Kb which supports 64 1k blocks and 16 inodes. This is the +smallest size mkfs.ext4 works with. + +This image was generated using the following commands. + +```bash +fallocate -l 64K tiny.ext4 +mkfs.ext4 -j tiny.ext4 +``` + +You can mount it using: + +```bash +sudo mount -o loop tiny.ext4 $MOUNTPOINT +``` + +`file.txt`, `bigfile.txt` and `symlink.txt` were added to this image by just +mounting it and copying (while preserving links) those files to the mountpoint +directory using: + +```bash +sudo cp -P {file.txt,symlink.txt,bigfile.txt} $MOUNTPOINT/ +``` + +The files in this directory mirror the contents and organisation of the files +stored in the image. diff --git a/pkg/sentry/fs/ext/assets/bigfile.txt b/pkg/sentry/fs/ext/assets/bigfile.txt new file mode 100644 index 000000000..3857cf516 --- /dev/null +++ b/pkg/sentry/fs/ext/assets/bigfile.txt @@ -0,0 +1,41 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus faucibus eleifend orci, ut ornare nibh faucibus eu. Cras at condimentum massa. Nullam luctus, elit non porttitor congue, sapien diam feugiat sapien, sed eleifend nulla mauris non arcu. Sed lacinia mauris magna, eu mollis libero varius sit amet. Donec mollis, quam convallis commodo posuere, dolor nisi placerat nisi, in faucibus augue mi eu lorem. In pharetra consectetur faucibus. Ut euismod ex efficitur egestas tincidunt. Maecenas condimentum ut ante in rutrum. Vivamus sed arcu tempor, faucibus turpis et, lacinia diam. + +Sed in lacus vel nisl interdum bibendum in sed justo. Nunc tellus risus, molestie vitae arcu sed, molestie tempus ligula. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Nunc risus neque, volutpat et ante non, ullamcorper condimentum ante. Aliquam sed metus in urna condimentum convallis. Vivamus ut libero mauris. Proin mollis posuere consequat. Vestibulum placerat mollis est et pulvinar. + +Donec rutrum odio ac diam pharetra, id fermentum magna cursus. Pellentesque in dapibus elit, et condimentum orci. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Suspendisse euismod dapibus est, id vestibulum mauris. Nulla facilisi. Nulla cursus gravida nisi. Phasellus vestibulum rutrum lectus, a dignissim mauris hendrerit vitae. In at elementum mauris. Integer vel efficitur velit. Nullam fringilla sapien mi, quis luctus neque efficitur ac. Aenean nec quam dapibus nunc commodo pharetra. Proin sapien mi, fermentum aliquet vulputate non, aliquet porttitor diam. Quisque lacinia, urna et finibus fermentum, nunc lacus vehicula ex, sed congue metus lectus ac quam. Aliquam erat volutpat. Suspendisse sodales, dolor ut tincidunt finibus, augue erat varius tellus, a interdum erat sem at nunc. Vestibulum cursus iaculis sapien, vitae feugiat dui auctor quis. + +Pellentesque nec maximus nulla, eu blandit diam. Maecenas quis arcu ornare, congue ante at, vehicula ipsum. Praesent feugiat mauris rutrum sem fermentum, nec luctus ipsum placerat. Pellentesque placerat ipsum at dignissim fringilla. Vivamus et posuere sem, eget hendrerit felis. Aenean vulputate, augue vel mollis feugiat, justo ipsum mollis dolor, eu mollis elit neque ut ipsum. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Fusce bibendum sem quam, vulputate laoreet mi dapibus imperdiet. Sed a purus non nibh pretium aliquet. Integer eget luctus augue, vitae tincidunt magna. Ut eros enim, egestas eu nulla et, lobortis egestas arcu. Cras id ipsum ac justo lacinia rutrum. Vivamus lectus leo, ultricies sed justo at, pellentesque feugiat magna. Ut sollicitudin neque elit, vel ornare mauris commodo id. + +Duis dapibus orci et sapien finibus finibus. Mauris eleifend, lacus at vestibulum maximus, quam ligula pharetra erat, sit amet dapibus neque elit vitae neque. In bibendum sollicitudin erat, eget ultricies tortor malesuada at. Sed sit amet orci turpis. Donec feugiat ligula nibh, molestie tincidunt lectus elementum id. Donec volutpat maximus nibh, in vulputate felis posuere eu. Cras tincidunt ullamcorper lacus. Phasellus porta lorem auctor, congue magna a, commodo elit. + +Etiam auctor mi quis elit sodales, eu pulvinar arcu condimentum. Aenean imperdiet risus et dapibus tincidunt. Nullam tincidunt dictum dui, sed commodo urna rutrum id. Ut mollis libero vel elit laoreet bibendum. Quisque arcu arcu, tincidunt at ultricies id, vulputate nec metus. In tristique posuere quam sit amet volutpat. Vivamus scelerisque et nunc at dapibus. Fusce finibus libero ut ligula pretium rhoncus. Mauris non elit in arcu finibus imperdiet. Pellentesque nec massa odio. Proin rutrum mauris non sagittis efficitur. Aliquam auctor quam at dignissim faucibus. Ut eget ligula in magna posuere ultricies vitae sit amet turpis. Duis maximus odio nulla. Donec gravida sem tristique tempus scelerisque. + +Interdum et malesuada fames ac ante ipsum primis in faucibus. Fusce pharetra magna vulputate aliquet tempus. Duis id hendrerit arcu. Quisque ut ex elit. Integer velit orci, venenatis ut sapien ac, placerat porttitor dui. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc hendrerit cursus diam, hendrerit finibus ipsum scelerisque ut. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. + +Nulla non euismod neque. Phasellus vel sapien eu metus pulvinar rhoncus. Suspendisse eu mollis tellus, quis vestibulum tortor. Maecenas interdum dolor sed nulla fermentum maximus. Donec imperdiet ullamcorper condimentum. Nam quis nibh ante. Praesent quis tellus ut tortor pulvinar blandit sit amet ut sapien. Vestibulum est orci, pellentesque vitae tristique sit amet, tristique non felis. + +Vivamus sodales pellentesque varius. Sed vel tempus ligula. Nulla tristique nisl vel dui facilisis, ac sodales augue hendrerit. Proin augue nisi, vestibulum quis augue nec, sagittis tincidunt velit. Vestibulum euismod, nulla nec sodales faucibus, urna sapien vulputate magna, id varius metus sapien ut neque. Duis in mollis urna, in scelerisque enim. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Nunc condimentum dictum turpis, et egestas neque dapibus eget. Quisque fringilla, dui eu venenatis eleifend, erat nibh lacinia urna, at lacinia lacus sapien eu dui. Duis eu erat ut mi lacinia convallis a sed ex. + +Fusce elit metus, tincidunt nec eleifend a, hendrerit nec ligula. Duis placerat finibus sollicitudin. In euismod porta tellus, in luctus justo bibendum bibendum. Maecenas at magna eleifend lectus tincidunt suscipit ut a ligula. Nulla tempor accumsan felis, fermentum dapibus est eleifend vitae. Mauris urna sem, fringilla at ultricies non, ultrices in arcu. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Nam vehicula nunc at laoreet imperdiet. Nunc tristique ut risus id aliquet. Integer eleifend massa orci. + +Vestibulum sed ante sollicitudin nisi fringilla bibendum nec vel quam. Sed pretium augue eu ligula congue pulvinar. Donec vitae magna tincidunt, pharetra lacus id, convallis nulla. Cras viverra nisl nisl, varius convallis leo vulputate nec. Morbi at consequat dui, sed aliquet metus. Sed suscipit fermentum mollis. Maecenas nec mi sodales, tincidunt purus in, tristique mauris. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec interdum mi in velit efficitur, quis ultrices ex imperdiet. Sed vestibulum, magna ut tristique pretium, mi ipsum placerat tellus, non tempor enim augue et ex. Pellentesque eget felis quis ante sodales viverra ac sed lacus. Donec suscipit tempus massa, eget laoreet massa molestie at. + +Aenean fringilla dui non aliquet consectetur. Fusce cursus quam nec orci hendrerit faucibus. Donec consequat suscipit enim, non volutpat lectus auctor interdum. Proin lorem purus, maximus vel orci vitae, suscipit egestas turpis. Donec risus urna, congue a sem eu, aliquet placerat odio. Morbi gravida tristique turpis, quis efficitur enim. Nunc interdum gravida ipsum vel facilisis. Nunc congue finibus sollicitudin. Quisque euismod aliquet lectus et tincidunt. Curabitur ultrices sem ut mi fringilla fermentum. Morbi pretium, nisi sit amet dapibus congue, dolor enim consectetur risus, a interdum ligula odio sed odio. Quisque facilisis, mi at suscipit gravida, nunc sapien cursus justo, ut luctus odio nulla quis leo. Integer condimentum lobortis mauris, non egestas tellus lobortis sit amet. + +In sollicitudin velit ac ante vehicula, vitae varius tortor mollis. In hac habitasse platea dictumst. Quisque et orci lorem. Integer malesuada fringilla luctus. Pellentesque malesuada, mi non lobortis porttitor, ante ligula vulputate ante, nec dictum risus eros sit amet sapien. Nulla aliquam lorem libero, ac varius nulla tristique eget. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut pellentesque mauris orci, vel consequat mi varius a. Ut sit amet elit vulputate, lacinia metus non, fermentum nisl. Pellentesque eu nisi sed quam egestas blandit. Duis sit amet lobortis dolor. Donec consectetur sem interdum, tristique elit sit amet, sodales lacus. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Fusce id aliquam augue. Sed pretium congue risus vitae lacinia. Vestibulum non vulputate risus, ut malesuada justo. + +Sed odio elit, consectetur ac mauris quis, consequat commodo libero. Fusce sodales velit vulputate pulvinar fermentum. Donec iaculis nec nisl eget faucibus. Mauris at dictum velit. Donec fermentum lectus eu viverra volutpat. Aliquam consequat facilisis lorem, cursus consequat dui bibendum ullamcorper. Pellentesque nulla magna, imperdiet at magna et, cursus egestas enim. Nullam semper molestie lectus sit amet semper. Duis eget tincidunt est. Integer id neque risus. Integer ultricies hendrerit vestibulum. Donec blandit blandit sagittis. Nunc consectetur vitae nisi consectetur volutpat. + +Nulla id lorem fermentum, efficitur magna a, hendrerit dui. Vivamus sagittis orci gravida, bibendum quam eget, molestie est. Phasellus nec enim tincidunt, volutpat sapien non, laoreet diam. Nulla posuere enim nec porttitor lobortis. Donec auctor odio ut orci eleifend, ut eleifend purus convallis. Interdum et malesuada fames ac ante ipsum primis in faucibus. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Ut hendrerit, purus eget viverra tincidunt, sem magna imperdiet libero, et aliquam turpis neque vitae elit. Maecenas semper varius iaculis. Cras non lorem quis quam bibendum eleifend in et libero. Curabitur at purus mauris. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus porta diam sed elit eleifend gravida. + +Nulla facilisi. Ut ultricies diam vel diam consectetur, vel porta augue molestie. Fusce interdum sapien et metus facilisis pellentesque. Nulla convallis sem at nunc vehicula facilisis. Nam ac rutrum purus. Nunc bibendum, dolor sit amet tempus ullamcorper, lorem leo tempor sem, id fringilla nunc augue scelerisque augue. Nullam sit amet rutrum nisl. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Donec sed mauris gravida eros vehicula sagittis at eget orci. Cras elementum, eros at accumsan bibendum, libero neque blandit purus, vitae vestibulum libero massa ac nibh. Integer at placerat nulla. Mauris eu eleifend orci. Aliquam consequat ligula vitae erat porta lobortis. Duis fermentum elit ac aliquet ornare. + +Mauris eget cursus tellus, eget sodales purus. Aliquam malesuada, augue id vulputate finibus, nisi ex bibendum nisl, sit amet laoreet quam urna a dolor. Nullam ultricies, sapien eu laoreet consequat, erat eros dignissim diam, ultrices sodales lectus mauris et leo. Morbi lacinia eu ante at tempus. Sed iaculis finibus magna malesuada efficitur. Donec faucibus erat sit amet elementum feugiat. Praesent a placerat nisi. Etiam lacinia gravida diam, et sollicitudin sapien tincidunt ut. + +Maecenas felis quam, tincidunt vitae venenatis scelerisque, viverra vitae odio. Phasellus enim neque, ultricies suscipit malesuada sit amet, vehicula sit amet purus. Nulla placerat sit amet dui vel tincidunt. Nam quis neque vel magna commodo egestas. Vestibulum sagittis rutrum lorem ut congue. Maecenas vel ultrices tellus. Donec efficitur, urna ac consequat iaculis, lorem felis pharetra eros, eget faucibus orci lectus sit amet arcu. + +Ut a tempus nisi. Nulla facilisi. Praesent vulputate maximus mi et dapibus. Sed sit amet libero ac augue hendrerit efficitur in a sapien. Mauris placerat velit sit amet tellus sollicitudin faucibus. Donec egestas a magna ac suscipit. Duis enim sapien, mollis sed egestas et, vestibulum vel leo. + +Proin quis dapibus dui. Donec eu tincidunt nunc. Vivamus eget purus consectetur, maximus ante vitae, tincidunt elit. Aenean mattis dolor a gravida aliquam. Praesent quis tellus id sem maximus vulputate nec sed nulla. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur metus nulla, volutpat volutpat est eu, hendrerit congue erat. Aliquam sollicitudin augue ante. Sed sollicitudin, magna eu consequat elementum, mi augue ullamcorper felis, molestie imperdiet erat metus iaculis est. Proin ac tortor nisi. Pellentesque quis nisi risus. Integer enim sapien, tincidunt quis tortor id, accumsan venenatis mi. Nulla facilisi. + +Cras pretium sit amet quam congue maximus. Morbi lacus libero, imperdiet commodo massa sed, scelerisque placerat libero. Cras nisl nisi, consectetur sed bibendum eu, venenatis at enim. Proin sodales justo at quam aliquam, a consectetur mi ornare. Donec porta ac est sit amet efficitur. Suspendisse vestibulum tortor id neque imperdiet, id lacinia risus vehicula. Phasellus ac eleifend purus. Mauris vel gravida ante. Aliquam vitae lobortis risus. Sed vehicula consectetur tincidunt. Nam et justo vitae purus molestie consequat. Pellentesque ipsum ex, convallis quis blandit non, gravida et urna. Donec diam ligula amet. diff --git a/pkg/sentry/fs/ext/assets/file.txt b/pkg/sentry/fs/ext/assets/file.txt new file mode 100644 index 000000000..980a0d5f1 --- /dev/null +++ b/pkg/sentry/fs/ext/assets/file.txt @@ -0,0 +1 @@ +Hello World! diff --git a/pkg/sentry/fs/ext/assets/symlink.txt b/pkg/sentry/fs/ext/assets/symlink.txt new file mode 120000 index 000000000..4c330738c --- /dev/null +++ b/pkg/sentry/fs/ext/assets/symlink.txt @@ -0,0 +1 @@ +file.txt
\ No newline at end of file diff --git a/pkg/sentry/fs/ext/assets/tiny.ext4 b/pkg/sentry/fs/ext/assets/tiny.ext4 Binary files differnew file mode 100644 index 000000000..a6859736d --- /dev/null +++ b/pkg/sentry/fs/ext/assets/tiny.ext4 diff --git a/pkg/sentry/fs/ext/disklayout/BUILD b/pkg/sentry/fs/ext/disklayout/BUILD new file mode 100644 index 000000000..dde15110d --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/BUILD @@ -0,0 +1,48 @@ +package(licenses = ["notice"]) + +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + +go_library( + name = "disklayout", + srcs = [ + "block_group.go", + "block_group_32.go", + "block_group_64.go", + "dirent.go", + "dirent_new.go", + "dirent_old.go", + "disklayout.go", + "extent.go", + "inode.go", + "inode_new.go", + "inode_old.go", + "superblock.go", + "superblock_32.go", + "superblock_64.go", + "superblock_old.go", + "test_utils.go", + ], + importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + ], +) + +go_test( + name = "disklayout_test", + size = "small", + srcs = [ + "block_group_test.go", + "dirent_test.go", + "extent_test.go", + "inode_test.go", + "superblock_test.go", + ], + embed = [":disklayout"], + deps = ["//pkg/sentry/kernel/time"], +) diff --git a/pkg/sentry/fs/ext4/disklayout/block_group.go b/pkg/sentry/fs/ext/disklayout/block_group.go index 7359df5b9..ad6f4fef8 100644 --- a/pkg/sentry/fs/ext4/disklayout/block_group.go +++ b/pkg/sentry/fs/ext/disklayout/block_group.go @@ -12,26 +12,28 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package disklayout provides ext4 disk level structures which can be directly -// filled with bytes from the underlying device. All structures on disk are in -// little-endian order. Only jbd2 (journal) structures are in big-endian order. -// Structs aim to emulate structures `exactly` how they are layed out on disk. -// -// Note: All fields in these structs are exported because binary.Read would -// panic otherwise. package disklayout -// BlockGroup represents Linux struct ext4_group_desc which is internally -// called a block group descriptor. An ext4 file system is split into a series -// of block groups. This provides an access layer to information needed to -// access and use a block group. +// BlockGroup represents a Linux ext block group descriptor. An ext file system +// is split into a series of block groups. This provides an access layer to +// information needed to access and use a block group. +// +// Location: +// - The block group descriptor table is always placed in the blocks +// immediately after the block containing the superblock. +// - The 1st block group descriptor in the original table is in the +// (sb.FirstDataBlock() + 1)th block. +// - See SuperBlock docs to see where the block group descriptor table is +// replicated. +// - sb.BgDescSize() must be used as the block group descriptor entry size +// while reading the table from disk. // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors. type BlockGroup interface { // InodeTable returns the absolute block number of the block containing the // inode table. This points to an array of Inode structs. Inode tables are // statically allocated at mkfs time. The superblock records the number of - // inodes per group (length of this table). + // inodes per group (length of this table) and the size of each inode struct. InodeTable() uint64 // BlockBitmap returns the absolute block number of the block containing the @@ -73,15 +75,15 @@ type BlockGroup interface { // Checksum returns this block group's checksum. // - // If RO_COMPAT_METADATA_CSUM feature is set: + // If SbMetadataCsum feature is set: // - checksum is crc32c(FS UUID + group number + group descriptor // structure) & 0xFFFF. // - // If RO_COMPAT_GDT_CSUM feature is set: + // If SbGdtCsum feature is set: // - checksum is crc16(FS UUID + group number + group descriptor // structure). // - // RO_COMPAT_METADATA_CSUM and RO_COMPAT_GDT_CSUM should not be both set. + // SbMetadataCsum and SbGdtCsum should not be both set. // If they are, Linux warns and asks to run fsck. Checksum() uint16 @@ -128,8 +130,8 @@ func (f BGFlags) ToInt() uint16 { // BGFlagsFromInt converts the 16-bit flag representation to a BGFlags struct. func BGFlagsFromInt(flags uint16) BGFlags { return BGFlags{ - InodeUninit: (flags & BgInodeUninit) > 0, - BlockUninit: (flags & BgBlockUninit) > 0, - InodeZeroed: (flags & BgInodeZeroed) > 0, + InodeUninit: flags&BgInodeUninit > 0, + BlockUninit: flags&BgBlockUninit > 0, + InodeZeroed: flags&BgInodeZeroed > 0, } } diff --git a/pkg/sentry/fs/ext4/disklayout/block_group_32.go b/pkg/sentry/fs/ext/disklayout/block_group_32.go index 087f1fb4a..3e16c76db 100644 --- a/pkg/sentry/fs/ext4/disklayout/block_group_32.go +++ b/pkg/sentry/fs/ext/disklayout/block_group_32.go @@ -15,14 +15,8 @@ package disklayout // BlockGroup32Bit emulates the first half of struct ext4_group_desc in -// fs/ext4/ext4.h. It is the block group descriptor struct for 32-bit ext4 -// filesystems. It implements BlockGroup interface. -// -// The suffix `Lo` here stands for lower bits because this is also used in the -// 64-bit version where these fields represent the lower half of the fields. -// The suffix `Raw` has been added to indicate that the field does not have a -// counterpart in the 64-bit version and to resolve name collision with the -// interface. +// fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and +// 32-bit ext4 filesystems. It implements BlockGroup interface. type BlockGroup32Bit struct { BlockBitmapLo uint32 InodeBitmapLo uint32 @@ -38,6 +32,9 @@ type BlockGroup32Bit struct { ChecksumRaw uint16 } +// Compiles only if BlockGroup32Bit implements BlockGroup. +var _ BlockGroup = (*BlockGroup32Bit)(nil) + // InodeTable implements BlockGroup.InodeTable. func (bg *BlockGroup32Bit) InodeTable() uint64 { return uint64(bg.InodeTableLo) } diff --git a/pkg/sentry/fs/ext4/disklayout/block_group_64.go b/pkg/sentry/fs/ext/disklayout/block_group_64.go index 27de3990d..9a809197a 100644 --- a/pkg/sentry/fs/ext4/disklayout/block_group_64.go +++ b/pkg/sentry/fs/ext/disklayout/block_group_64.go @@ -18,9 +18,6 @@ package disklayout // It is the block group descriptor struct for 64-bit ext4 filesystems. // It implements BlockGroup interface. It is an extension of the 32-bit // version of BlockGroup. -// -// The suffix `Hi` here stands for upper bits because they represent the upper -// half of the fields. type BlockGroup64Bit struct { // We embed the 32-bit struct here because 64-bit version is just an extension // of the 32-bit version. @@ -40,6 +37,9 @@ type BlockGroup64Bit struct { _ uint32 // Padding to 64 bytes. } +// Compiles only if BlockGroup64Bit implements BlockGroup. +var _ BlockGroup = (*BlockGroup64Bit)(nil) + // Methods to override. Checksum() and Flags() are not overridden. // InodeTable implements BlockGroup.InodeTable. diff --git a/pkg/sentry/fs/ext4/disklayout/block_group_test.go b/pkg/sentry/fs/ext/disklayout/block_group_test.go index 794ae99cc..0ef4294c0 100644 --- a/pkg/sentry/fs/ext4/disklayout/block_group_test.go +++ b/pkg/sentry/fs/ext/disklayout/block_group_test.go @@ -16,19 +16,11 @@ package disklayout import ( "testing" - - "gvisor.dev/gvisor/pkg/binary" ) -// TestBlockGroupSize tests the fact that the block group struct for -// 32-bit ext filesystems should be exactly 32 bytes big and for 64-bit fs it -// should be 64 bytes. +// TestBlockGroupSize tests that the block group descriptor structs are of the +// correct size. func TestBlockGroupSize(t *testing.T) { - if got, want := int(binary.Size(BlockGroup32Bit{})), 32; got != want { - t.Errorf("BlockGroup32Bit should be exactly 32 bytes but is %d bytes", got) - } - - if got, want := int(binary.Size(BlockGroup64Bit{})), 64; got != want { - t.Errorf("BlockGroup64Bit should be exactly 64 bytes but is %d bytes", got) - } + assertSize(t, BlockGroup32Bit{}, 32) + assertSize(t, BlockGroup64Bit{}, 64) } diff --git a/pkg/sentry/fs/ext/disklayout/dirent.go b/pkg/sentry/fs/ext/disklayout/dirent.go new file mode 100644 index 000000000..685bf57b8 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/dirent.go @@ -0,0 +1,69 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "gvisor.dev/gvisor/pkg/sentry/fs" +) + +const ( + // MaxFileName is the maximum length of an ext fs file's name. + MaxFileName = 255 +) + +var ( + // inodeTypeByFileType maps ext4 file types to vfs inode types. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#ftype. + inodeTypeByFileType = map[uint8]fs.InodeType{ + 0: fs.Anonymous, + 1: fs.RegularFile, + 2: fs.Directory, + 3: fs.CharacterDevice, + 4: fs.BlockDevice, + 5: fs.Pipe, + 6: fs.Socket, + 7: fs.Symlink, + } +) + +// The Dirent interface should be implemented by structs representing ext +// directory entries. These are for the linear classical directories which +// just store a list of dirent structs. A directory is a series of data blocks +// where is each data block contains a linear array of dirents. The last entry +// of the block has a record size that takes it to the end of the block. The +// end of the directory is when you read dirInode.Size() bytes from the blocks. +// +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#linear-classic-directories. +type Dirent interface { + // Inode returns the absolute inode number of the underlying inode. + // Inode number 0 signifies an unused dirent. + Inode() uint32 + + // RecordSize returns the record length of this dirent on disk. The next + // dirent in the dirent list should be read after these many bytes from + // the current dirent. Must be a multiple of 4. + RecordSize() uint16 + + // FileName returns the name of the file. Can be at most 255 is length. + FileName() string + + // FileType returns the inode type of the underlying inode. This is a + // performance hack so that we do not have to read the underlying inode struct + // to know the type of inode. This will only work when the SbDirentFileType + // feature is set. If not, the second returned value will be false indicating + // that user code has to use the inode mode to extract the file type. + FileType() (fs.InodeType, bool) +} diff --git a/pkg/sentry/fs/ext/disklayout/dirent_new.go b/pkg/sentry/fs/ext/disklayout/dirent_new.go new file mode 100644 index 000000000..29ae4a5c2 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/dirent_new.go @@ -0,0 +1,61 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/fs" +) + +// DirentNew represents the ext4 directory entry struct. This emulates Linux's +// ext4_dir_entry_2 struct. The FileName can not be more than 255 bytes so we +// only need 8 bits to store the NameLength. As a result, NameLength has been +// shortened and the other 8 bits are used to encode the file type. Use the +// FileTypeRaw field only if the SbDirentFileType feature is set. +// +// Note: This struct can be of variable size on disk. The one described below +// is of maximum size and the FileName beyond NameLength bytes might contain +// garbage. +type DirentNew struct { + InodeNumber uint32 + RecordLength uint16 + NameLength uint8 + FileTypeRaw uint8 + FileNameRaw [MaxFileName]byte +} + +// Compiles only if DirentNew implements Dirent. +var _ Dirent = (*DirentNew)(nil) + +// Inode implements Dirent.Inode. +func (d *DirentNew) Inode() uint32 { return d.InodeNumber } + +// RecordSize implements Dirent.RecordSize. +func (d *DirentNew) RecordSize() uint16 { return d.RecordLength } + +// FileName implements Dirent.FileName. +func (d *DirentNew) FileName() string { + return string(d.FileNameRaw[:d.NameLength]) +} + +// FileType implements Dirent.FileType. +func (d *DirentNew) FileType() (fs.InodeType, bool) { + if inodeType, ok := inodeTypeByFileType[d.FileTypeRaw]; ok { + return inodeType, true + } + + panic(fmt.Sprintf("unknown file type %v", d.FileTypeRaw)) +} diff --git a/pkg/sentry/fs/ext/disklayout/dirent_old.go b/pkg/sentry/fs/ext/disklayout/dirent_old.go new file mode 100644 index 000000000..2e0f9c812 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/dirent_old.go @@ -0,0 +1,50 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import "gvisor.dev/gvisor/pkg/sentry/fs" + +// DirentOld represents the old directory entry struct which does not contain +// the file type. This emulates Linux's ext4_dir_entry struct. This is used in +// ext2, ext3 and sometimes in ext4. +// +// Note: This struct can be of variable size on disk. The one described below +// is of maximum size and the FileName beyond NameLength bytes might contain +// garbage. +type DirentOld struct { + InodeNumber uint32 + RecordLength uint16 + NameLength uint16 + FileNameRaw [MaxFileName]byte +} + +// Compiles only if DirentOld implements Dirent. +var _ Dirent = (*DirentOld)(nil) + +// Inode implements Dirent.Inode. +func (d *DirentOld) Inode() uint32 { return d.InodeNumber } + +// RecordSize implements Dirent.RecordSize. +func (d *DirentOld) RecordSize() uint16 { return d.RecordLength } + +// FileName implements Dirent.FileName. +func (d *DirentOld) FileName() string { + return string(d.FileNameRaw[:d.NameLength]) +} + +// FileType implements Dirent.FileType. +func (d *DirentOld) FileType() (fs.InodeType, bool) { + return fs.Anonymous, false +} diff --git a/pkg/sentry/kernel/kdefs/kdefs.go b/pkg/sentry/fs/ext/disklayout/dirent_test.go index 304da2032..cc6dff2c9 100644 --- a/pkg/sentry/kernel/kdefs/kdefs.go +++ b/pkg/sentry/fs/ext/disklayout/dirent_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,9 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package kdefs defines common kernel definitions. -// -package kdefs +package disklayout + +import ( + "testing" +) + +// TestDirentSize tests that the dirent structs are of the correct +// size. +func TestDirentSize(t *testing.T) { + want := uintptr(263) -// FD is a File Descriptor. -type FD int32 + assertSize(t, DirentOld{}, want) + assertSize(t, DirentNew{}, want) +} diff --git a/pkg/sentry/fs/ext/disklayout/disklayout.go b/pkg/sentry/fs/ext/disklayout/disklayout.go new file mode 100644 index 000000000..bdf4e2132 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/disklayout.go @@ -0,0 +1,50 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package disklayout provides Linux ext file system's disk level structures +// which can be directly read into from the underlying device. Structs aim to +// emulate structures `exactly` how they are layed out on disk. +// +// This library aims to be compatible with all ext(2/3/4) systems so it +// provides a generic interface for all major structures and various +// implementations (for different versions). The user code is responsible for +// using appropriate implementations based on the underlying device. +// +// Interfacing all major structures here serves a few purposes: +// - Abstracts away the complexity of the underlying structure from client +// code. The client only has to figure out versioning on set up and then +// can use these as black boxes and pass it higher up the stack. +// - Having pointer receivers forces the user to use pointers to these +// heavy structs. Hence, prevents the client code from unintentionally +// copying these by value while passing the interface around. +// - Version-based implementation selection is resolved on set up hence +// avoiding per call overhead of choosing implementation. +// - All interface methods are pretty light weight (do not take in any +// parameters by design). Passing pointer arguments to interface methods +// can lead to heap allocation as the compiler won't be able to perform +// escape analysis on an unknown implementation at compile time. +// +// Notes: +// - All fields in these structs are exported because binary.Read would +// panic otherwise. +// - All structures on disk are in little-endian order. Only jbd2 (journal) +// structures are in big-endian order. +// - All OS dependent fields in these structures will be interpretted using +// the Linux version of that field. +// - The suffix `Lo` in field names stands for lower bits of that field. +// - The suffix `Hi` in field names stands for upper bits of that field. +// - The suffix `Raw` has been added to indicate that the field is not split +// into Lo and Hi fields and also to resolve name collision with the +// respective interface. +package disklayout diff --git a/pkg/sentry/fs/ext/disklayout/extent.go b/pkg/sentry/fs/ext/disklayout/extent.go new file mode 100644 index 000000000..567523d32 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/extent.go @@ -0,0 +1,139 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// Extents were introduced in ext4 and provide huge performance gains in terms +// data locality and reduced metadata block usage. Extents are organized in +// extent trees. The root node is contained in inode.BlocksRaw. +// +// Terminology: +// - Physical Block: +// Filesystem data block which is addressed normally wrt the entire +// filesystem (addressed with 48 bits). +// +// - File Block: +// Data block containing *only* file data and addressed wrt to the file +// with only 32 bits. The (i)th file block contains file data from +// byte (i * sb.BlockSize()) to ((i+1) * sb.BlockSize()). + +const ( + // ExtentStructsSize is the size of all the three extent on-disk structs. + ExtentStructsSize = 12 + + // ExtentMagic is the magic number which must be present in the header. + ExtentMagic = 0xf30a +) + +// ExtentEntryPair couples an in-memory ExtendNode with the ExtentEntry that +// points to it. We want to cache these structs in memory to avoid repeated +// disk reads. +// +// Note: This struct itself does not represent an on-disk struct. +type ExtentEntryPair struct { + // Entry points to the child node on disk. + Entry ExtentEntry + // Node points to child node in memory. Is nil if the current node is a leaf. + Node *ExtentNode +} + +// ExtentNode represents an extent tree node. For internal nodes, all Entries +// will be ExtendIdxs. For leaf nodes, they will all be Extents. +// +// Note: This struct itself does not represent an on-disk struct. +type ExtentNode struct { + Header ExtentHeader + Entries []ExtentEntryPair +} + +// ExtentEntry reprsents an extent tree node entry. The entry can either be +// an ExtentIdx or Extent itself. This exists to simplify navigation logic. +type ExtentEntry interface { + // FileBlock returns the first file block number covered by this entry. + FileBlock() uint32 + + // PhysicalBlock returns the child physical block that this entry points to. + PhysicalBlock() uint64 +} + +// ExtentHeader emulates the ext4_extent_header struct in ext4. Each extent +// tree node begins with this and is followed by `NumEntries` number of: +// - Extent if `Depth` == 0 +// - ExtentIdx otherwise +type ExtentHeader struct { + // Magic in the extent magic number, must be 0xf30a. + Magic uint16 + + // NumEntries indicates the number of valid entries following the header. + NumEntries uint16 + + // MaxEntries that could follow the header. Used while adding entries. + MaxEntries uint16 + + // Height represents the distance of this node from the farthest leaf. Please + // note that Linux incorrectly calls this `Depth` (which means the distance + // of the node from the root). + Height uint16 + _ uint32 +} + +// ExtentIdx emulates the ext4_extent_idx struct in ext4. Only present in +// internal nodes. Sorted in ascending order based on FirstFileBlock since +// Linux does a binary search on this. This points to a block containing the +// child node. +type ExtentIdx struct { + FirstFileBlock uint32 + ChildBlockLo uint32 + ChildBlockHi uint16 + _ uint16 +} + +// Compiles only if ExtentIdx implements ExtentEntry. +var _ ExtentEntry = (*ExtentIdx)(nil) + +// FileBlock implements ExtentEntry.FileBlock. +func (ei *ExtentIdx) FileBlock() uint32 { + return ei.FirstFileBlock +} + +// PhysicalBlock implements ExtentEntry.PhysicalBlock. It returns the +// physical block number of the child block. +func (ei *ExtentIdx) PhysicalBlock() uint64 { + return (uint64(ei.ChildBlockHi) << 32) | uint64(ei.ChildBlockLo) +} + +// Extent represents the ext4_extent struct in ext4. Only present in leaf +// nodes. Sorted in ascending order based on FirstFileBlock since Linux does a +// binary search on this. This points to an array of data blocks containing the +// file data. It covers `Length` data blocks starting from `StartBlock`. +type Extent struct { + FirstFileBlock uint32 + Length uint16 + StartBlockHi uint16 + StartBlockLo uint32 +} + +// Compiles only if Extent implements ExtentEntry. +var _ ExtentEntry = (*Extent)(nil) + +// FileBlock implements ExtentEntry.FileBlock. +func (e *Extent) FileBlock() uint32 { + return e.FirstFileBlock +} + +// PhysicalBlock implements ExtentEntry.PhysicalBlock. It returns the +// physical block number of the first data block this extent covers. +func (e *Extent) PhysicalBlock() uint64 { + return (uint64(e.StartBlockHi) << 32) | uint64(e.StartBlockLo) +} diff --git a/pkg/sentry/fs/ext/disklayout/extent_test.go b/pkg/sentry/fs/ext/disklayout/extent_test.go new file mode 100644 index 000000000..b0fad9b71 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/extent_test.go @@ -0,0 +1,27 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "testing" +) + +// TestExtentSize tests that the extent structs are of the correct +// size. +func TestExtentSize(t *testing.T) { + assertSize(t, ExtentHeader{}, ExtentStructsSize) + assertSize(t, ExtentIdx{}, ExtentStructsSize) + assertSize(t, Extent{}, ExtentStructsSize) +} diff --git a/pkg/sentry/fs/ext/disklayout/inode.go b/pkg/sentry/fs/ext/disklayout/inode.go new file mode 100644 index 000000000..b48001910 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/inode.go @@ -0,0 +1,267 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" +) + +// The Inode interface must be implemented by structs representing ext inodes. +// The inode stores all the metadata pertaining to the file (except for the +// file name which is held by the directory entry). It does NOT expose all +// fields and should be extended if need be. +// +// Some file systems (e.g. FAT) use the directory entry to store all this +// information. Ext file systems do not so that they can support hard links. +// However, ext4 cheats a little bit and duplicates the file type in the +// directory entry for performance gains. +// +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes. +type Inode interface { + // Mode returns the linux file mode which is majorly used to extract + // information like: + // - File permissions (read/write/execute by user/group/others). + // - Sticky, set UID and GID bits. + // - File type. + // + // Masks to extract this information are provided in pkg/abi/linux/file.go. + Mode() linux.FileMode + + // UID returns the owner UID. + UID() auth.KUID + + // GID returns the owner GID. + GID() auth.KGID + + // Size returns the size of the file in bytes. + Size() uint64 + + // InodeSize returns the size of this inode struct in bytes. + // In ext2 and ext3, the inode struct and inode disk record size was fixed at + // 128 bytes. Ext4 makes it possible for the inode struct to be bigger. + // However, accessing any field beyond the 128 bytes marker must be verified + // using this method. + InodeSize() uint16 + + // AccessTime returns the last access time. Shows when the file was last read. + // + // If InExtendedAttr is set, then this should NOT be used because the + // underlying field is used to store the extended attribute value checksum. + AccessTime() time.Time + + // ChangeTime returns the last change time. Shows when the file meta data + // (like permissions) was last changed. + // + // If InExtendedAttr is set, then this should NOT be used because the + // underlying field is used to store the lower 32 bits of the attribute + // value’s reference count. + ChangeTime() time.Time + + // ModificationTime returns the last modification time. Shows when the file + // content was last modified. + // + // If InExtendedAttr is set, then this should NOT be used because + // the underlying field contains the number of the inode that owns the + // extended attribute. + ModificationTime() time.Time + + // DeletionTime returns the deletion time. Inodes are marked as deleted by + // writing to the underlying field. FS tools can restore files until they are + // actually overwritten. + DeletionTime() time.Time + + // LinksCount returns the number of hard links to this inode. + // + // Normally there is an upper limit on the number of hard links: + // - ext2/ext3 = 32,000 + // - ext4 = 65,000 + // + // This implies that an ext4 directory cannot have more than 64,998 + // subdirectories because each subdirectory will have a hard link to the + // directory via the `..` entry. The directory has hard link via the `.` entry + // of its own. And finally the inode is initiated with 1 hard link (itself). + // + // The underlying value is reset to 1 if all the following hold: + // - Inode is a directory. + // - SbDirNlink is enabled. + // - Number of hard links is incremented past 64,999. + // Hard link value of 1 for a directory would indicate that the number of hard + // links is unknown because a directory can have minimum 2 hard links (itself + // and `.` entry). + LinksCount() uint16 + + // Flags returns InodeFlags which represents the inode flags. + Flags() InodeFlags + + // Blocks returns the underlying inode.i_block array. This field is special + // and is used to store various kinds of things depending on the filesystem + // version and inode type. + // - In ext2/ext3, it contains the block map. + // - In ext4, it contains the extent tree. + // - For inline files, it contains the file contents. + // - For symlinks, it contains the link path (if it fits here). + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#the-contents-of-inode-i-block. + Blocks() [60]byte +} + +// Inode flags. This is not comprehensive and flags which were not used in +// the Linux kernel have been excluded. +const ( + // InSync indicates that all writes to the file must be synchronous. + InSync = 0x8 + + // InImmutable indicates that this file is immutable. + InImmutable = 0x10 + + // InAppend indicates that this file can only be appended to. + InAppend = 0x20 + + // InNoDump indicates that teh dump(1) utility should not dump this file. + InNoDump = 0x40 + + // InNoAccessTime indicates that the access time of this inode must not be + // updated. + InNoAccessTime = 0x80 + + // InIndex indicates that this directory has hashed indexes. + InIndex = 0x1000 + + // InJournalData indicates that file data must always be written through a + // journal device. + InJournalData = 0x4000 + + // InDirSync indicates that all the directory entiry data must be written + // synchronously. + InDirSync = 0x10000 + + // InTopDir indicates that this inode is at the top of the directory hierarchy. + InTopDir = 0x20000 + + // InHugeFile indicates that this is a huge file. + InHugeFile = 0x40000 + + // InExtents indicates that this inode uses extents. + InExtents = 0x80000 + + // InExtendedAttr indicates that this inode stores a large extended attribute + // value in its data blocks. + InExtendedAttr = 0x200000 + + // InInline indicates that this inode has inline data. + InInline = 0x10000000 + + // InReserved indicates that this inode is reserved for the ext4 library. + InReserved = 0x80000000 +) + +// InodeFlags represents all possible combinations of inode flags. It aims to +// cover the bit masks and provide a more user-friendly interface. +type InodeFlags struct { + Sync bool + Immutable bool + Append bool + NoDump bool + NoAccessTime bool + Index bool + JournalData bool + DirSync bool + TopDir bool + HugeFile bool + Extents bool + ExtendedAttr bool + Inline bool + Reserved bool +} + +// ToInt converts inode flags back to its 32-bit rep. +func (f InodeFlags) ToInt() uint32 { + var res uint32 + + if f.Sync { + res |= InSync + } + if f.Immutable { + res |= InImmutable + } + if f.Append { + res |= InAppend + } + if f.NoDump { + res |= InNoDump + } + if f.NoAccessTime { + res |= InNoAccessTime + } + if f.Index { + res |= InIndex + } + if f.JournalData { + res |= InJournalData + } + if f.DirSync { + res |= InDirSync + } + if f.TopDir { + res |= InTopDir + } + if f.HugeFile { + res |= InHugeFile + } + if f.Extents { + res |= InExtents + } + if f.ExtendedAttr { + res |= InExtendedAttr + } + if f.Inline { + res |= InInline + } + if f.Reserved { + res |= InReserved + } + + return res +} + +// InodeFlagsFromInt converts the integer representation of inode flags to +// a InodeFlags struct. +func InodeFlagsFromInt(f uint32) InodeFlags { + return InodeFlags{ + Sync: f&InSync > 0, + Immutable: f&InImmutable > 0, + Append: f&InAppend > 0, + NoDump: f&InNoDump > 0, + NoAccessTime: f&InNoAccessTime > 0, + Index: f&InIndex > 0, + JournalData: f&InJournalData > 0, + DirSync: f&InDirSync > 0, + TopDir: f&InTopDir > 0, + HugeFile: f&InHugeFile > 0, + Extents: f&InExtents > 0, + ExtendedAttr: f&InExtendedAttr > 0, + Inline: f&InInline > 0, + Reserved: f&InReserved > 0, + } +} + +// These masks define how users can view/modify inode flags. The rest of the +// flags are for internal kernel usage only. +const ( + InUserReadFlagMask = 0x4BDFFF + InUserWriteFlagMask = 0x4B80FF +) diff --git a/pkg/sentry/fs/ext/disklayout/inode_new.go b/pkg/sentry/fs/ext/disklayout/inode_new.go new file mode 100644 index 000000000..4f5348372 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/inode_new.go @@ -0,0 +1,96 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import "gvisor.dev/gvisor/pkg/sentry/kernel/time" + +// InodeNew represents ext4 inode structure which can be bigger than +// OldInodeSize. The actual size of this struct should be determined using +// inode.ExtraInodeSize. Accessing any field here should be verified with the +// actual size. The extra space between the end of the inode struct and end of +// the inode record can be used to store extended attr. +// +// If the TimeExtra fields are in scope, the lower 2 bits of those are used +// to extend their counter part to be 34 bits wide; the rest (upper) 30 bits +// are used to provide nanoscond precision. Hence, these timestamps will now +// overflow in May 2446. +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. +type InodeNew struct { + InodeOld + + ExtraInodeSize uint16 + ChecksumHi uint16 + ChangeTimeExtra uint32 + ModificationTimeExtra uint32 + AccessTimeExtra uint32 + CreationTime uint32 + CreationTimeExtra uint32 + VersionHi uint32 + ProjectID uint32 +} + +// Compiles only if InodeNew implements Inode. +var _ Inode = (*InodeNew)(nil) + +// fromExtraTime decodes the extra time and constructs the kernel time struct +// with nanosecond precision. +func fromExtraTime(lo int32, extra uint32) time.Time { + // See description above InodeNew for format. + seconds := (int64(extra&0x3) << 32) + int64(lo) + nanoseconds := int64(extra >> 2) + return time.FromUnix(seconds, nanoseconds) +} + +// Only override methods which change due to ext4 specific fields. + +// Size implements Inode.Size. +func (in *InodeNew) Size() uint64 { + return (uint64(in.SizeHi) << 32) | uint64(in.SizeLo) +} + +// InodeSize implements Inode.InodeSize. +func (in *InodeNew) InodeSize() uint16 { + return oldInodeSize + in.ExtraInodeSize +} + +// ChangeTime implements Inode.ChangeTime. +func (in *InodeNew) ChangeTime() time.Time { + // Apply new timestamp logic if inode.ChangeTimeExtra is in scope. + if in.ExtraInodeSize >= 8 { + return fromExtraTime(in.ChangeTimeRaw, in.ChangeTimeExtra) + } + + return in.InodeOld.ChangeTime() +} + +// ModificationTime implements Inode.ModificationTime. +func (in *InodeNew) ModificationTime() time.Time { + // Apply new timestamp logic if inode.ModificationTimeExtra is in scope. + if in.ExtraInodeSize >= 12 { + return fromExtraTime(in.ModificationTimeRaw, in.ModificationTimeExtra) + } + + return in.InodeOld.ModificationTime() +} + +// AccessTime implements Inode.AccessTime. +func (in *InodeNew) AccessTime() time.Time { + // Apply new timestamp logic if inode.AccessTimeExtra is in scope. + if in.ExtraInodeSize >= 16 { + return fromExtraTime(in.AccessTimeRaw, in.AccessTimeExtra) + } + + return in.InodeOld.AccessTime() +} diff --git a/pkg/sentry/fs/ext/disklayout/inode_old.go b/pkg/sentry/fs/ext/disklayout/inode_old.go new file mode 100644 index 000000000..dc4c9d8e4 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/inode_old.go @@ -0,0 +1,117 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" +) + +const ( + // oldInodeSize is the inode size in ext2/ext3. + oldInodeSize = 128 +) + +// InodeOld implements Inode interface. It emulates ext2/ext3 inode struct. +// Inode struct size and record size are both 128 bytes for this. +// +// All fields representing time are in seconds since the epoch. Which means that +// they will overflow in January 2038. +type InodeOld struct { + ModeRaw uint16 + UIDLo uint16 + SizeLo uint32 + + // The time fields are signed integers because they could be negative to + // represent time before the epoch. + AccessTimeRaw int32 + ChangeTimeRaw int32 + ModificationTimeRaw int32 + DeletionTimeRaw int32 + + GIDLo uint16 + LinksCountRaw uint16 + BlocksCountLo uint32 + FlagsRaw uint32 + VersionLo uint32 // This is OS dependent. + BlocksRaw [60]byte + Generation uint32 + FileACLLo uint32 + SizeHi uint32 + ObsoFaddr uint32 + + // OS dependent fields have been inlined here. + BlocksCountHi uint16 + FileACLHi uint16 + UIDHi uint16 + GIDHi uint16 + ChecksumLo uint16 + _ uint16 +} + +// Compiles only if InodeOld implements Inode. +var _ Inode = (*InodeOld)(nil) + +// Mode implements Inode.Mode. +func (in *InodeOld) Mode() linux.FileMode { return linux.FileMode(in.ModeRaw) } + +// UID implements Inode.UID. +func (in *InodeOld) UID() auth.KUID { + return auth.KUID((uint32(in.UIDHi) << 16) | uint32(in.UIDLo)) +} + +// GID implements Inode.GID. +func (in *InodeOld) GID() auth.KGID { + return auth.KGID((uint32(in.GIDHi) << 16) | uint32(in.GIDLo)) +} + +// Size implements Inode.Size. +func (in *InodeOld) Size() uint64 { + // In ext2/ext3, in.SizeHi did not exist, it was instead named in.DirACL. + return uint64(in.SizeLo) +} + +// InodeSize implements Inode.InodeSize. +func (in *InodeOld) InodeSize() uint16 { return oldInodeSize } + +// AccessTime implements Inode.AccessTime. +func (in *InodeOld) AccessTime() time.Time { + return time.FromUnix(int64(in.AccessTimeRaw), 0) +} + +// ChangeTime implements Inode.ChangeTime. +func (in *InodeOld) ChangeTime() time.Time { + return time.FromUnix(int64(in.ChangeTimeRaw), 0) +} + +// ModificationTime implements Inode.ModificationTime. +func (in *InodeOld) ModificationTime() time.Time { + return time.FromUnix(int64(in.ModificationTimeRaw), 0) +} + +// DeletionTime implements Inode.DeletionTime. +func (in *InodeOld) DeletionTime() time.Time { + return time.FromUnix(int64(in.DeletionTimeRaw), 0) +} + +// LinksCount implements Inode.LinksCount. +func (in *InodeOld) LinksCount() uint16 { return in.LinksCountRaw } + +// Flags implements Inode.Flags. +func (in *InodeOld) Flags() InodeFlags { return InodeFlagsFromInt(in.FlagsRaw) } + +// Blocks implements Inode.Blocks. +func (in *InodeOld) Blocks() [60]byte { return in.BlocksRaw } diff --git a/pkg/sentry/fs/ext/disklayout/inode_test.go b/pkg/sentry/fs/ext/disklayout/inode_test.go new file mode 100644 index 000000000..9cae9e4f0 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/inode_test.go @@ -0,0 +1,222 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "fmt" + "strconv" + "testing" + + "gvisor.dev/gvisor/pkg/sentry/kernel/time" +) + +// TestInodeSize tests that the inode structs are of the correct size. +func TestInodeSize(t *testing.T) { + assertSize(t, InodeOld{}, oldInodeSize) + + // This was updated from 156 bytes to 160 bytes in Oct 2015. + assertSize(t, InodeNew{}, 160) +} + +// TestTimestampSeconds tests that the seconds part of [a/c/m] timestamps in +// ext4 inode structs are decoded correctly. +// +// These tests are derived from the table under https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. +func TestTimestampSeconds(t *testing.T) { + type timestampTest struct { + // msbSet tells if the most significant bit of InodeOld.[X]TimeRaw is set. + // If this is set then the 32-bit time is negative. + msbSet bool + + // lowerBound tells if we should take the lowest possible value of + // InodeOld.[X]TimeRaw while satisfying test.msbSet condition. If set to + // false it tells to take the highest possible value. + lowerBound bool + + // extraBits is InodeNew.[X]TimeExtra. + extraBits uint32 + + // want is the kernel time struct that is expected. + want time.Time + } + + tests := []timestampTest{ + // 1901-12-13 + { + msbSet: true, + lowerBound: true, + extraBits: 0, + want: time.FromUnix(int64(-0x80000000), 0), + }, + + // 1969-12-31 + { + msbSet: true, + lowerBound: false, + extraBits: 0, + want: time.FromUnix(int64(-1), 0), + }, + + // 1970-01-01 + { + msbSet: false, + lowerBound: true, + extraBits: 0, + want: time.FromUnix(int64(0), 0), + }, + + // 2038-01-19 + { + msbSet: false, + lowerBound: false, + extraBits: 0, + want: time.FromUnix(int64(0x7fffffff), 0), + }, + + // 2038-01-19 + { + msbSet: true, + lowerBound: true, + extraBits: 1, + want: time.FromUnix(int64(0x80000000), 0), + }, + + // 2106-02-07 + { + msbSet: true, + lowerBound: false, + extraBits: 1, + want: time.FromUnix(int64(0xffffffff), 0), + }, + + // 2106-02-07 + { + msbSet: false, + lowerBound: true, + extraBits: 1, + want: time.FromUnix(int64(0x100000000), 0), + }, + + // 2174-02-25 + { + msbSet: false, + lowerBound: false, + extraBits: 1, + want: time.FromUnix(int64(0x17fffffff), 0), + }, + + // 2174-02-25 + { + msbSet: true, + lowerBound: true, + extraBits: 2, + want: time.FromUnix(int64(0x180000000), 0), + }, + + // 2242-03-16 + { + msbSet: true, + lowerBound: false, + extraBits: 2, + want: time.FromUnix(int64(0x1ffffffff), 0), + }, + + // 2242-03-16 + { + msbSet: false, + lowerBound: true, + extraBits: 2, + want: time.FromUnix(int64(0x200000000), 0), + }, + + // 2310-04-04 + { + msbSet: false, + lowerBound: false, + extraBits: 2, + want: time.FromUnix(int64(0x27fffffff), 0), + }, + + // 2310-04-04 + { + msbSet: true, + lowerBound: true, + extraBits: 3, + want: time.FromUnix(int64(0x280000000), 0), + }, + + // 2378-04-22 + { + msbSet: true, + lowerBound: false, + extraBits: 3, + want: time.FromUnix(int64(0x2ffffffff), 0), + }, + + // 2378-04-22 + { + msbSet: false, + lowerBound: true, + extraBits: 3, + want: time.FromUnix(int64(0x300000000), 0), + }, + + // 2446-05-10 + { + msbSet: false, + lowerBound: false, + extraBits: 3, + want: time.FromUnix(int64(0x37fffffff), 0), + }, + } + + lowerMSB0 := int32(0) // binary: 00000000 00000000 00000000 00000000 + upperMSB0 := int32(0x7fffffff) // binary: 01111111 11111111 11111111 11111111 + lowerMSB1 := int32(-0x80000000) // binary: 10000000 00000000 00000000 00000000 + upperMSB1 := int32(-1) // binary: 11111111 11111111 11111111 11111111 + + get32BitTime := func(test timestampTest) int32 { + if test.msbSet { + if test.lowerBound { + return lowerMSB1 + } + + return upperMSB1 + } + + if test.lowerBound { + return lowerMSB0 + } + + return upperMSB0 + } + + getTestName := func(test timestampTest) string { + return fmt.Sprintf( + "Tests time decoding with epoch bits 0b%s and 32-bit raw time: MSB set=%t, lower bound=%t", + strconv.FormatInt(int64(test.extraBits), 2), + test.msbSet, + test.lowerBound, + ) + } + + for _, test := range tests { + t.Run(getTestName(test), func(t *testing.T) { + if got := fromExtraTime(get32BitTime(test), test.extraBits); got != test.want { + t.Errorf("Expected: %v, Got: %v", test.want, got) + } + }) + } +} diff --git a/pkg/sentry/fs/ext/disklayout/superblock.go b/pkg/sentry/fs/ext/disklayout/superblock.go new file mode 100644 index 000000000..7a337a5e0 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/superblock.go @@ -0,0 +1,471 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +const ( + // SbOffset is the absolute offset at which the superblock is placed. + SbOffset = 1024 +) + +// SuperBlock should be implemented by structs representing the ext superblock. +// The superblock holds a lot of information about the enclosing filesystem. +// This interface aims to provide access methods to important information held +// by the superblock. It does NOT expose all fields of the superblock, only the +// ones necessary. This can be expanded when need be. +// +// Location and replication: +// - The superblock is located at offset 1024 in block group 0. +// - Redundant copies of the superblock and group descriptors are kept in +// all groups if SbSparse feature flag is NOT set. If it is set, the +// replicas only exist in groups whose group number is either 0 or a +// power of 3, 5, or 7. +// - There is also a sparse superblock feature v2 in which there are just +// two replicas saved in the block groups pointed by sb.s_backup_bgs. +// +// Replicas should eventually be updated if the superblock is updated. +// +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#super-block. +type SuperBlock interface { + // InodesCount returns the total number of inodes in this filesystem. + InodesCount() uint32 + + // BlocksCount returns the total number of data blocks in this filesystem. + BlocksCount() uint64 + + // FreeBlocksCount returns the number of free blocks in this filesystem. + FreeBlocksCount() uint64 + + // FreeInodesCount returns the number of free inodes in this filesystem. + FreeInodesCount() uint32 + + // MountCount returns the number of mounts since the last fsck. + MountCount() uint16 + + // MaxMountCount returns the number of mounts allowed beyond which a fsck is + // needed. + MaxMountCount() uint16 + + // FirstDataBlock returns the absolute block number of the first data block, + // which contains the super block itself. + // + // If the filesystem has 1kb data blocks then this should return 1. For all + // other configurations, this typically returns 0. + FirstDataBlock() uint32 + + // BlockSize returns the size of one data block in this filesystem. + // This can be calculated by 2^(10 + sb.s_log_block_size). This ensures that + // the smallest block size is 1kb. + BlockSize() uint64 + + // BlocksPerGroup returns the number of data blocks in a block group. + BlocksPerGroup() uint32 + + // ClusterSize returns block cluster size (set during mkfs time by admin). + // This can be calculated by 2^(10 + sb.s_log_cluster_size). This ensures that + // the smallest cluster size is 1kb. + // + // sb.s_log_cluster_size must equal sb.s_log_block_size if bigalloc feature + // is NOT set and consequently BlockSize() = ClusterSize() in that case. + ClusterSize() uint64 + + // ClustersPerGroup returns: + // - number of clusters per group if bigalloc is enabled. + // - BlocksPerGroup() otherwise. + ClustersPerGroup() uint32 + + // InodeSize returns the size of the inode disk record size in bytes. Use this + // to iterate over inode arrays on disk. + // + // In ext2 and ext3: + // - Each inode had a disk record of 128 bytes. + // - The inode struct size was fixed at 128 bytes. + // + // In ext4 its possible to allocate larger on-disk inodes: + // - Inode disk record size = sb.s_inode_size (function return value). + // = 256 (default) + // - Inode struct size = 128 + inode.i_extra_isize. + // = 128 + 32 = 160 (default) + InodeSize() uint16 + + // InodesPerGroup returns the number of inodes in a block group. + InodesPerGroup() uint32 + + // BgDescSize returns the size of the block group descriptor struct. + // + // In ext2, ext3, ext4 (without 64-bit feature), the block group descriptor + // is only 32 bytes long. + // In ext4 with 64-bit feature, the block group descriptor expands to AT LEAST + // 64 bytes. It might be bigger than that. + BgDescSize() uint16 + + // CompatibleFeatures returns the CompatFeatures struct which holds all the + // compatible features this fs supports. + CompatibleFeatures() CompatFeatures + + // IncompatibleFeatures returns the CompatFeatures struct which holds all the + // incompatible features this fs supports. + IncompatibleFeatures() IncompatFeatures + + // ReadOnlyCompatibleFeatures returns the CompatFeatures struct which holds all the + // readonly compatible features this fs supports. + ReadOnlyCompatibleFeatures() RoCompatFeatures + + // Magic() returns the magic signature which must be 0xef53. + Magic() uint16 + + // Revision returns the superblock revision. Superblock struct fields from + // offset 0x54 till 0x150 should only be used if superblock has DynamicRev. + Revision() SbRevision +} + +// SbRevision is the type for superblock revisions. +type SbRevision uint32 + +// Super block revisions. +const ( + // OldRev is the good old (original) format. + OldRev SbRevision = 0 + + // DynamicRev is v2 format w/ dynamic inode sizes. + DynamicRev SbRevision = 1 +) + +// Superblock compatible features. +// This is not exhaustive, unused features are not listed. +const ( + // SbDirPrealloc indicates directory preallocation. + SbDirPrealloc = 0x1 + + // SbHasJournal indicates the presence of a journal. jbd2 should only work + // with this being set. + SbHasJournal = 0x4 + + // SbExtAttr indicates extended attributes support. + SbExtAttr = 0x8 + + // SbResizeInode indicates that the fs has reserved GDT blocks (right after + // group descriptors) for fs expansion. + SbResizeInode = 0x10 + + // SbDirIndex indicates that the fs has directory indices. + SbDirIndex = 0x20 + + // SbSparseV2 stands for Sparse superblock version 2. + SbSparseV2 = 0x200 +) + +// CompatFeatures represents a superblock's compatible feature set. If the +// kernel does not understand any of these feature, it can still read/write +// to this fs. +type CompatFeatures struct { + DirPrealloc bool + HasJournal bool + ExtAttr bool + ResizeInode bool + DirIndex bool + SparseV2 bool +} + +// ToInt converts superblock compatible features back to its 32-bit rep. +func (f CompatFeatures) ToInt() uint32 { + var res uint32 + + if f.DirPrealloc { + res |= SbDirPrealloc + } + if f.HasJournal { + res |= SbHasJournal + } + if f.ExtAttr { + res |= SbExtAttr + } + if f.ResizeInode { + res |= SbResizeInode + } + if f.DirIndex { + res |= SbDirIndex + } + if f.SparseV2 { + res |= SbSparseV2 + } + + return res +} + +// CompatFeaturesFromInt converts the integer representation of superblock +// compatible features to CompatFeatures struct. +func CompatFeaturesFromInt(f uint32) CompatFeatures { + return CompatFeatures{ + DirPrealloc: f&SbDirPrealloc > 0, + HasJournal: f&SbHasJournal > 0, + ExtAttr: f&SbExtAttr > 0, + ResizeInode: f&SbResizeInode > 0, + DirIndex: f&SbDirIndex > 0, + SparseV2: f&SbSparseV2 > 0, + } +} + +// Superblock incompatible features. +// This is not exhaustive, unused features are not listed. +const ( + // SbDirentFileType indicates that directory entries record the file type. + // We should use struct ext4_dir_entry_2 for dirents then. + SbDirentFileType = 0x2 + + // SbRecovery indicates that the filesystem needs recovery. + SbRecovery = 0x4 + + // SbJournalDev indicates that the filesystem has a separate journal device. + SbJournalDev = 0x8 + + // SbMetaBG indicates that the filesystem is using Meta block groups. Moves + // the group descriptors from the congested first block group into the first + // group of each metablock group to increase the maximum block groups limit + // and hence support much larger filesystems. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#meta-block-groups. + SbMetaBG = 0x10 + + // SbExtents indicates that the filesystem uses extents. Must be set in ext4 + // filesystems. + SbExtents = 0x40 + + // SbIs64Bit indicates that this filesystem addresses blocks with 64-bits. + // Hence can support 2^64 data blocks. + SbIs64Bit = 0x80 + + // SbMMP indicates that this filesystem has multiple mount protection. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#multiple-mount-protection. + SbMMP = 0x100 + + // SbFlexBg indicates that this filesystem has flexible block groups. Several + // block groups are tied into one logical block group so that all the metadata + // for the block groups (bitmaps and inode tables) are close together for + // faster loading. Consequently, large files will be continuous on disk. + // However, this does not affect the placement of redundant superblocks and + // group descriptors. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#flexible-block-groups. + SbFlexBg = 0x200 + + // SbLargeDir shows that large directory enabled. Directory htree can be 3 + // levels deep. Directory htrees are allowed to be 2 levels deep otherwise. + SbLargeDir = 0x4000 + + // SbInlineData allows inline data in inodes for really small files. + SbInlineData = 0x8000 + + // SbEncrypted indicates that this fs contains encrypted inodes. + SbEncrypted = 0x10000 +) + +// IncompatFeatures represents a superblock's incompatible feature set. If the +// kernel does not understand any of these feature, it should refuse to mount. +type IncompatFeatures struct { + DirentFileType bool + Recovery bool + JournalDev bool + MetaBG bool + Extents bool + Is64Bit bool + MMP bool + FlexBg bool + LargeDir bool + InlineData bool + Encrypted bool +} + +// ToInt converts superblock incompatible features back to its 32-bit rep. +func (f IncompatFeatures) ToInt() uint32 { + var res uint32 + + if f.DirentFileType { + res |= SbDirentFileType + } + if f.Recovery { + res |= SbRecovery + } + if f.JournalDev { + res |= SbJournalDev + } + if f.MetaBG { + res |= SbMetaBG + } + if f.Extents { + res |= SbExtents + } + if f.Is64Bit { + res |= SbIs64Bit + } + if f.MMP { + res |= SbMMP + } + if f.FlexBg { + res |= SbFlexBg + } + if f.LargeDir { + res |= SbLargeDir + } + if f.InlineData { + res |= SbInlineData + } + if f.Encrypted { + res |= SbEncrypted + } + + return res +} + +// IncompatFeaturesFromInt converts the integer representation of superblock +// incompatible features to IncompatFeatures struct. +func IncompatFeaturesFromInt(f uint32) IncompatFeatures { + return IncompatFeatures{ + DirentFileType: f&SbDirentFileType > 0, + Recovery: f&SbRecovery > 0, + JournalDev: f&SbJournalDev > 0, + MetaBG: f&SbMetaBG > 0, + Extents: f&SbExtents > 0, + Is64Bit: f&SbIs64Bit > 0, + MMP: f&SbMMP > 0, + FlexBg: f&SbFlexBg > 0, + LargeDir: f&SbLargeDir > 0, + InlineData: f&SbInlineData > 0, + Encrypted: f&SbEncrypted > 0, + } +} + +// Superblock readonly compatible features. +// This is not exhaustive, unused features are not listed. +const ( + // SbSparse indicates sparse superblocks. Only groups with number either 0 or + // a power of 3, 5, or 7 will have redundant copies of the superblock and + // block descriptors. + SbSparse = 0x1 + + // SbLargeFile indicates that this fs has been used to store a file >= 2GiB. + SbLargeFile = 0x2 + + // SbHugeFile indicates that this fs contains files whose sizes are + // represented in units of logicals blocks, not 512-byte sectors. + SbHugeFile = 0x8 + + // SbGdtCsum indicates that group descriptors have checksums. + SbGdtCsum = 0x10 + + // SbDirNlink indicates that the new subdirectory limit is 64,999. Ext3 has a + // 32,000 subdirectory limit. + SbDirNlink = 0x20 + + // SbExtraIsize indicates that large inodes exist on this filesystem. + SbExtraIsize = 0x40 + + // SbHasSnapshot indicates the existence of a snapshot. + SbHasSnapshot = 0x80 + + // SbQuota enables usage tracking for all quota types. + SbQuota = 0x100 + + // SbBigalloc maps to the bigalloc feature. When set, the minimum allocation + // unit becomes a cluster rather than a data block. Then block bitmaps track + // clusters, not data blocks. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#bigalloc. + SbBigalloc = 0x200 + + // SbMetadataCsum indicates that the fs supports metadata checksumming. + SbMetadataCsum = 0x400 + + // SbReadOnly marks this filesystem as readonly. Should refuse to mount in + // read/write mode. + SbReadOnly = 0x1000 +) + +// RoCompatFeatures represents a superblock's readonly compatible feature set. +// If the kernel does not understand any of these feature, it can still mount +// readonly. But if the user wants to mount read/write, the kernel should +// refuse to mount. +type RoCompatFeatures struct { + Sparse bool + LargeFile bool + HugeFile bool + GdtCsum bool + DirNlink bool + ExtraIsize bool + HasSnapshot bool + Quota bool + Bigalloc bool + MetadataCsum bool + ReadOnly bool +} + +// ToInt converts superblock readonly compatible features to its 32-bit rep. +func (f RoCompatFeatures) ToInt() uint32 { + var res uint32 + + if f.Sparse { + res |= SbSparse + } + if f.LargeFile { + res |= SbLargeFile + } + if f.HugeFile { + res |= SbHugeFile + } + if f.GdtCsum { + res |= SbGdtCsum + } + if f.DirNlink { + res |= SbDirNlink + } + if f.ExtraIsize { + res |= SbExtraIsize + } + if f.HasSnapshot { + res |= SbHasSnapshot + } + if f.Quota { + res |= SbQuota + } + if f.Bigalloc { + res |= SbBigalloc + } + if f.MetadataCsum { + res |= SbMetadataCsum + } + if f.ReadOnly { + res |= SbReadOnly + } + + return res +} + +// RoCompatFeaturesFromInt converts the integer representation of superblock +// readonly compatible features to RoCompatFeatures struct. +func RoCompatFeaturesFromInt(f uint32) RoCompatFeatures { + return RoCompatFeatures{ + Sparse: f&SbSparse > 0, + LargeFile: f&SbLargeFile > 0, + HugeFile: f&SbHugeFile > 0, + GdtCsum: f&SbGdtCsum > 0, + DirNlink: f&SbDirNlink > 0, + ExtraIsize: f&SbExtraIsize > 0, + HasSnapshot: f&SbHasSnapshot > 0, + Quota: f&SbQuota > 0, + Bigalloc: f&SbBigalloc > 0, + MetadataCsum: f&SbMetadataCsum > 0, + ReadOnly: f&SbReadOnly > 0, + } +} diff --git a/pkg/sentry/fs/ext/disklayout/superblock_32.go b/pkg/sentry/fs/ext/disklayout/superblock_32.go new file mode 100644 index 000000000..587e4afaa --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/superblock_32.go @@ -0,0 +1,75 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// SuperBlock32Bit implements SuperBlock and represents the 32-bit version of +// the ext4_super_block struct in fs/ext4/ext4.h. +type SuperBlock32Bit struct { + // We embed the old superblock struct here because the 32-bit version is just + // an extension of the old version. + SuperBlockOld + + FirstInode uint32 + InodeSizeRaw uint16 + BlockGroupNumber uint16 + FeatureCompat uint32 + FeatureIncompat uint32 + FeatureRoCompat uint32 + UUID [16]byte + VolumeName [16]byte + LastMounted [64]byte + AlgoUsageBitmap uint32 + PreallocBlocks uint8 + PreallocDirBlocks uint8 + ReservedGdtBlocks uint16 + JournalUUID [16]byte + JournalInum uint32 + JournalDev uint32 + LastOrphan uint32 + HashSeed [4]uint32 + DefaultHashVersion uint8 + JnlBackupType uint8 + BgDescSizeRaw uint16 + DefaultMountOpts uint32 + FirstMetaBg uint32 + MkfsTime uint32 + JnlBlocks [17]uint32 +} + +// Compiles only if SuperBlock32Bit implements SuperBlock. +var _ SuperBlock = (*SuperBlock32Bit)(nil) + +// Only override methods which change based on the additional fields above. +// Not overriding SuperBlock.BgDescSize because it would still return 32 here. + +// InodeSize implements SuperBlock.InodeSize. +func (sb *SuperBlock32Bit) InodeSize() uint16 { + return sb.InodeSizeRaw +} + +// CompatibleFeatures implements SuperBlock.CompatibleFeatures. +func (sb *SuperBlock32Bit) CompatibleFeatures() CompatFeatures { + return CompatFeaturesFromInt(sb.FeatureCompat) +} + +// IncompatibleFeatures implements SuperBlock.IncompatibleFeatures. +func (sb *SuperBlock32Bit) IncompatibleFeatures() IncompatFeatures { + return IncompatFeaturesFromInt(sb.FeatureIncompat) +} + +// ReadOnlyCompatibleFeatures implements SuperBlock.ReadOnlyCompatibleFeatures. +func (sb *SuperBlock32Bit) ReadOnlyCompatibleFeatures() RoCompatFeatures { + return RoCompatFeaturesFromInt(sb.FeatureRoCompat) +} diff --git a/pkg/sentry/fs/ext/disklayout/superblock_64.go b/pkg/sentry/fs/ext/disklayout/superblock_64.go new file mode 100644 index 000000000..a2c2278fb --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/superblock_64.go @@ -0,0 +1,94 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// SuperBlock64Bit implements SuperBlock and represents the 64-bit version of +// the ext4_super_block struct in fs/ext4/ext4.h. This sums up to be exactly +// 1024 bytes (smallest possible block size) and hence the superblock always +// fits in no more than one data block. +type SuperBlock64Bit struct { + // We embed the 32-bit struct here because 64-bit version is just an extension + // of the 32-bit version. + SuperBlock32Bit + + BlocksCountHi uint32 + ReservedBlocksCountHi uint32 + FreeBlocksCountHi uint32 + MinInodeSize uint16 + WantInodeSize uint16 + Flags uint32 + RaidStride uint16 + MmpInterval uint16 + MmpBlock uint64 + RaidStripeWidth uint32 + LogGroupsPerFlex uint8 + ChecksumType uint8 + _ uint16 + KbytesWritten uint64 + SnapshotInum uint32 + SnapshotID uint32 + SnapshotRsrvBlocksCount uint64 + SnapshotList uint32 + ErrorCount uint32 + FirstErrorTime uint32 + FirstErrorInode uint32 + FirstErrorBlock uint64 + FirstErrorFunction [32]byte + FirstErrorLine uint32 + LastErrorTime uint32 + LastErrorInode uint32 + LastErrorLine uint32 + LastErrorBlock uint64 + LastErrorFunction [32]byte + MountOpts [64]byte + UserQuotaInum uint32 + GroupQuotaInum uint32 + OverheadBlocks uint32 + BackupBgs [2]uint32 + EncryptAlgos [4]uint8 + EncryptPwSalt [16]uint8 + LostFoundInode uint32 + ProjectQuotaInode uint32 + ChecksumSeed uint32 + WtimeHi uint8 + MtimeHi uint8 + MkfsTimeHi uint8 + LastCheckHi uint8 + FirstErrorTimeHi uint8 + LastErrorTimeHi uint8 + _ [2]uint8 + Encoding uint16 + EncodingFlags uint16 + _ [95]uint32 + Checksum uint32 +} + +// Compiles only if SuperBlock64Bit implements SuperBlock. +var _ SuperBlock = (*SuperBlock64Bit)(nil) + +// Only override methods which change based on the 64-bit feature. + +// BlocksCount implements SuperBlock.BlocksCount. +func (sb *SuperBlock64Bit) BlocksCount() uint64 { + return (uint64(sb.BlocksCountHi) << 32) | uint64(sb.BlocksCountLo) +} + +// FreeBlocksCount implements SuperBlock.FreeBlocksCount. +func (sb *SuperBlock64Bit) FreeBlocksCount() uint64 { + return (uint64(sb.FreeBlocksCountHi) << 32) | uint64(sb.FreeBlocksCountLo) +} + +// BgDescSize implements SuperBlock.BgDescSize. +func (sb *SuperBlock64Bit) BgDescSize() uint16 { return sb.BgDescSizeRaw } diff --git a/pkg/sentry/fs/ext/disklayout/superblock_old.go b/pkg/sentry/fs/ext/disklayout/superblock_old.go new file mode 100644 index 000000000..aada8b550 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/superblock_old.go @@ -0,0 +1,105 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// SuperBlockOld implements SuperBlock and represents the old version of the +// superblock struct in ext2 and ext3 systems. +type SuperBlockOld struct { + InodesCountRaw uint32 + BlocksCountLo uint32 + ReservedBlocksCount uint32 + FreeBlocksCountLo uint32 + FreeInodesCountRaw uint32 + FirstDataBlockRaw uint32 + LogBlockSize uint32 + LogClusterSize uint32 + BlocksPerGroupRaw uint32 + ClustersPerGroupRaw uint32 + InodesPerGroupRaw uint32 + Mtime uint32 + Wtime uint32 + MountCountRaw uint16 + MaxMountCountRaw uint16 + MagicRaw uint16 + State uint16 + Errors uint16 + MinorRevLevel uint16 + LastCheck uint32 + CheckInterval uint32 + CreatorOS uint32 + RevLevel uint32 + DefResUID uint16 + DefResGID uint16 +} + +// Compiles only if SuperBlockOld implements SuperBlock. +var _ SuperBlock = (*SuperBlockOld)(nil) + +// InodesCount implements SuperBlock.InodesCount. +func (sb *SuperBlockOld) InodesCount() uint32 { return sb.InodesCountRaw } + +// BlocksCount implements SuperBlock.BlocksCount. +func (sb *SuperBlockOld) BlocksCount() uint64 { return uint64(sb.BlocksCountLo) } + +// FreeBlocksCount implements SuperBlock.FreeBlocksCount. +func (sb *SuperBlockOld) FreeBlocksCount() uint64 { return uint64(sb.FreeBlocksCountLo) } + +// FreeInodesCount implements SuperBlock.FreeInodesCount. +func (sb *SuperBlockOld) FreeInodesCount() uint32 { return sb.FreeInodesCountRaw } + +// MountCount implements SuperBlock.MountCount. +func (sb *SuperBlockOld) MountCount() uint16 { return sb.MountCountRaw } + +// MaxMountCount implements SuperBlock.MaxMountCount. +func (sb *SuperBlockOld) MaxMountCount() uint16 { return sb.MaxMountCountRaw } + +// FirstDataBlock implements SuperBlock.FirstDataBlock. +func (sb *SuperBlockOld) FirstDataBlock() uint32 { return sb.FirstDataBlockRaw } + +// BlockSize implements SuperBlock.BlockSize. +func (sb *SuperBlockOld) BlockSize() uint64 { return 1 << (10 + sb.LogBlockSize) } + +// BlocksPerGroup implements SuperBlock.BlocksPerGroup. +func (sb *SuperBlockOld) BlocksPerGroup() uint32 { return sb.BlocksPerGroupRaw } + +// ClusterSize implements SuperBlock.ClusterSize. +func (sb *SuperBlockOld) ClusterSize() uint64 { return 1 << (10 + sb.LogClusterSize) } + +// ClustersPerGroup implements SuperBlock.ClustersPerGroup. +func (sb *SuperBlockOld) ClustersPerGroup() uint32 { return sb.ClustersPerGroupRaw } + +// InodeSize implements SuperBlock.InodeSize. +func (sb *SuperBlockOld) InodeSize() uint16 { return oldInodeSize } + +// InodesPerGroup implements SuperBlock.InodesPerGroup. +func (sb *SuperBlockOld) InodesPerGroup() uint32 { return sb.InodesPerGroupRaw } + +// BgDescSize implements SuperBlock.BgDescSize. +func (sb *SuperBlockOld) BgDescSize() uint16 { return 32 } + +// CompatibleFeatures implements SuperBlock.CompatibleFeatures. +func (sb *SuperBlockOld) CompatibleFeatures() CompatFeatures { return CompatFeatures{} } + +// IncompatibleFeatures implements SuperBlock.IncompatibleFeatures. +func (sb *SuperBlockOld) IncompatibleFeatures() IncompatFeatures { return IncompatFeatures{} } + +// ReadOnlyCompatibleFeatures implements SuperBlock.ReadOnlyCompatibleFeatures. +func (sb *SuperBlockOld) ReadOnlyCompatibleFeatures() RoCompatFeatures { return RoCompatFeatures{} } + +// Magic implements SuperBlock.Magic. +func (sb *SuperBlockOld) Magic() uint16 { return sb.MagicRaw } + +// Revision implements SuperBlock.Revision. +func (sb *SuperBlockOld) Revision() SbRevision { return SbRevision(sb.RevLevel) } diff --git a/pkg/sentry/fs/ext/disklayout/superblock_test.go b/pkg/sentry/fs/ext/disklayout/superblock_test.go new file mode 100644 index 000000000..463b5ba21 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/superblock_test.go @@ -0,0 +1,27 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "testing" +) + +// TestSuperBlockSize tests that the superblock structs are of the correct +// size. +func TestSuperBlockSize(t *testing.T) { + assertSize(t, SuperBlockOld{}, 84) + assertSize(t, SuperBlock32Bit{}, 336) + assertSize(t, SuperBlock64Bit{}, 1024) +} diff --git a/pkg/sentry/fs/ext/disklayout/test_utils.go b/pkg/sentry/fs/ext/disklayout/test_utils.go new file mode 100644 index 000000000..9c63f04c0 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/test_utils.go @@ -0,0 +1,30 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "reflect" + "testing" + + "gvisor.dev/gvisor/pkg/binary" +) + +func assertSize(t *testing.T, v interface{}, want uintptr) { + t.Helper() + + if got := binary.Size(v); got != want { + t.Errorf("struct %s should be exactly %d bytes but is %d bytes", reflect.TypeOf(v).Name(), want, got) + } +} diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go new file mode 100644 index 000000000..9d33be08f --- /dev/null +++ b/pkg/sentry/fs/ext/ext.go @@ -0,0 +1,75 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package ext implements readonly ext(2/3/4) filesystems. +package ext + +import ( + "io" + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Filesystem implements vfs.FilesystemImpl. +type Filesystem struct { + // dev is the ReadSeeker for the underlying fs device and is protected by mu. + dev io.ReadSeeker + + // mu synchronizes the usage of dev. The ext filesystems take locality into + // condsideration, i.e. data blocks of a file will tend to be placed close + // together. On a spinning disk, locality reduces the amount of movement of + // the head hence speeding up IO operations. On an SSD there are no moving + // parts but locality increases the size of each transer request. Hence, + // having mutual exclusion on the read seeker while reading a file *should* + // help in achieving the intended performance gains. + // + // Note: This synchronization was not coupled with the ReadSeeker itself + // because we want to synchronize across read/seek operations for the + // performance gains mentioned above. Helps enforcing one-file-at-a-time IO. + mu sync.Mutex + + // sb represents the filesystem superblock. Immutable after initialization. + sb disklayout.SuperBlock + + // bgs represents all the block group descriptors for the filesystem. + // Immutable after initialization. + bgs []disklayout.BlockGroup +} + +// newFilesystem is the Filesystem constructor. +func newFilesystem(dev io.ReadSeeker) (*Filesystem, error) { + fs := Filesystem{dev: dev} + var err error + + fs.sb, err = readSuperBlock(dev) + if err != nil { + return nil, err + } + + if fs.sb.Magic() != linux.EXT_SUPER_MAGIC { + // mount(2) specifies that EINVAL should be returned if the superblock is + // invalid. + return nil, syserror.EINVAL + } + + fs.bgs, err = readBlockGroups(dev, fs.sb) + if err != nil { + return nil, err + } + + return &fs, nil +} diff --git a/pkg/sentry/fs/ext/utils.go b/pkg/sentry/fs/ext/utils.go new file mode 100644 index 000000000..861ef9a73 --- /dev/null +++ b/pkg/sentry/fs/ext/utils.go @@ -0,0 +1,103 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "encoding/binary" + "io" + + "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" + "gvisor.dev/gvisor/pkg/syserror" +) + +// readFromDisk performs a binary read from disk into the given struct from +// the absolute offset provided. +// +// All disk reads should use this helper so we avoid reading from stale +// previously used offsets. This function forces the offset parameter. +func readFromDisk(dev io.ReadSeeker, abOff int64, v interface{}) error { + if _, err := dev.Seek(abOff, io.SeekStart); err != nil { + return syserror.EIO + } + + if err := binary.Read(dev, binary.LittleEndian, v); err != nil { + return syserror.EIO + } + + return nil +} + +// readSuperBlock reads the SuperBlock from block group 0 in the underlying +// device. There are three versions of the superblock. This function identifies +// and returns the correct version. +func readSuperBlock(dev io.ReadSeeker) (disklayout.SuperBlock, error) { + var sb disklayout.SuperBlock = &disklayout.SuperBlockOld{} + if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { + return nil, err + } + if sb.Revision() == disklayout.OldRev { + return sb, nil + } + + sb = &disklayout.SuperBlock32Bit{} + if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { + return nil, err + } + if !sb.IncompatibleFeatures().Is64Bit { + return sb, nil + } + + sb = &disklayout.SuperBlock64Bit{} + if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { + return nil, err + } + return sb, nil +} + +// blockGroupsCount returns the number of block groups in the ext fs. +func blockGroupsCount(sb disklayout.SuperBlock) uint64 { + blocksCount := sb.BlocksCount() + blocksPerGroup := uint64(sb.BlocksPerGroup()) + + // Round up the result. float64 can compromise precision so do it manually. + bgCount := blocksCount / blocksPerGroup + if blocksCount%blocksPerGroup != 0 { + bgCount++ + } + + return bgCount +} + +// readBlockGroups reads the block group descriptor table from block group 0 in +// the underlying device. +func readBlockGroups(dev io.ReadSeeker, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) { + bgCount := blockGroupsCount(sb) + bgdSize := uint64(sb.BgDescSize()) + is64Bit := sb.IncompatibleFeatures().Is64Bit + bgds := make([]disklayout.BlockGroup, bgCount) + + for i, off := uint64(0), uint64(sb.FirstDataBlock()+1)*sb.BlockSize(); i < bgCount; i, off = i+1, off+bgdSize { + if is64Bit { + bgds[i] = &disklayout.BlockGroup64Bit{} + } else { + bgds[i] = &disklayout.BlockGroup32Bit{} + } + + if err := readFromDisk(dev, int64(off), bgds[i]); err != nil { + return nil, err + } + } + return bgds, nil +} diff --git a/pkg/sentry/fs/ext4/BUILD b/pkg/sentry/fs/ext4/BUILD deleted file mode 100644 index 9dce67635..000000000 --- a/pkg/sentry/fs/ext4/BUILD +++ /dev/null @@ -1,14 +0,0 @@ -package(licenses = ["notice"]) - -load("//tools/go_stateify:defs.bzl", "go_library") - -go_library( - name = "ext4", - srcs = ["fs.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext4", - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/sentry/context", - "//pkg/sentry/fs", - ], -) diff --git a/pkg/sentry/fs/ext4/disklayout/BUILD b/pkg/sentry/fs/ext4/disklayout/BUILD deleted file mode 100644 index 2fb1d5b37..000000000 --- a/pkg/sentry/fs/ext4/disklayout/BUILD +++ /dev/null @@ -1,21 +0,0 @@ -package(licenses = ["notice"]) - -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") - -go_library( - name = "disklayout", - srcs = [ - "block_group.go", - "block_group_32.go", - "block_group_64.go", - ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext4/disklayout", -) - -go_test( - name = "disklayout_test", - size = "small", - srcs = ["block_group_test.go"], - embed = [":disklayout"], - deps = ["//pkg/binary"], -) diff --git a/pkg/sentry/fs/ext4/fs.go b/pkg/sentry/fs/ext4/fs.go deleted file mode 100644 index 5c7274821..000000000 --- a/pkg/sentry/fs/ext4/fs.go +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package ext4 implements the ext4 filesystem. -package ext4 - -import ( - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" -) - -// filesystem implements fs.Filesystem for ext4. -// -// +stateify savable -type filesystem struct{} - -func init() { - fs.RegisterFilesystem(&filesystem{}) -} - -// FilesystemName is the name under which the filesystem is registered. -// Name matches fs/ext4/super.c:ext4_fs_type.name. -const FilesystemName = "ext4" - -// Name is the name of the file system. -func (*filesystem) Name() string { - return FilesystemName -} - -// AllowUserMount prohibits users from using mount(2) with this file system. -func (*filesystem) AllowUserMount() bool { - return false -} - -// AllowUserList prohibits this filesystem to be listed in /proc/filesystems. -func (*filesystem) AllowUserList() bool { - return false -} - -// Flags returns properties of the filesystem. -// -// In Linux, ext4 returns FS_REQUIRES_DEV. See fs/ext4/super.c -func (*filesystem) Flags() fs.FilesystemFlags { - return fs.FilesystemRequiresDev -} - -// Mount returns the root inode of the ext4 fs. -func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, cgroupsInt interface{}) (*fs.Inode, error) { - panic("unimplemented") -} diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 8e1f5674d..bb8117f89 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -130,14 +130,15 @@ type File struct { // to false respectively. func NewFile(ctx context.Context, dirent *Dirent, flags FileFlags, fops FileOperations) *File { dirent.IncRef() - f := &File{ + f := File{ UniqueID: uniqueid.GlobalFromContext(ctx), Dirent: dirent, FileOperations: fops, flags: flags, } f.mu.Init() - return f + f.EnableLeakCheck("fs.File") + return &f } // DecRef destroys the File when it is no longer referenced. diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index 2c5bd1b19..d86f5bf45 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -155,5 +155,16 @@ type FileOperations interface { // refer. // // Preconditions: The AddressSpace (if any) that io refers to is activated. - Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) + Ioctl(ctx context.Context, file *File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) +} + +// FifoSizer is an interface for setting and getting the size of a pipe. +type FifoSizer interface { + // FifoSize returns the pipe capacity in bytes. + FifoSize(ctx context.Context, file *File) (int64, error) + + // SetFifoSize sets the new pipe capacity in bytes. + // + // The new size is returned (which may be capped). + SetFifoSize(size int64) (int64, error) } diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 5fb5fbe97..9820f0b13 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -85,12 +85,6 @@ type overlayFileOperations struct { // protected by File.mu of the owning file, which is held during // Readdir and Seek calls. dirCursor string - - // dirCacheMu protects dirCache. - dirCacheMu sync.RWMutex `state:"nosave"` - - // dirCache is cache of DentAttrs from upper and lower Inodes. - dirCache *SortedDentryMap } // Release implements FileOperations.Release. @@ -171,53 +165,68 @@ func (f *overlayFileOperations) Readdir(ctx context.Context, file *File, seriali if root != nil { defer root.DecRef() } + dirCtx := &DirCtx{ Serializer: serializer, DirCursor: &f.dirCursor, } + return DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset()) +} - // If the directory dirent is frozen, then DirentReaddir will calculate - // the children based off the frozen dirent tree. There is no need to - // call readdir on the upper/lower layers. - if file.Dirent.frozen { - return DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset()) +// IterateDir implements DirIterator.IterateDir. +func (f *overlayFileOperations) IterateDir(ctx context.Context, d *Dirent, dirCtx *DirCtx, offset int) (int, error) { + o := d.Inode.overlay + + if !d.Inode.MountSource.CacheReaddir() { + // Can't use the dirCache. Simply read the entries. + entries, err := readdirEntries(ctx, o) + if err != nil { + return offset, err + } + n, err := GenericReaddir(dirCtx, entries) + return offset + n, err } - // Otherwise proceed with usual overlay readdir. - o := file.Dirent.Inode.overlay + // Otherwise, use or create cached entries. + + o.dirCacheMu.RLock() + if o.dirCache != nil { + n, err := GenericReaddir(dirCtx, o.dirCache) + o.dirCacheMu.RUnlock() + return offset + n, err + } + o.dirCacheMu.RUnlock() // readdirEntries holds o.copyUpMu to ensure that copy-up does not - // occur while calculating the readir results. + // occur while calculating the readdir results. // // However, it is possible for a copy-up to occur after the call to - // readdirEntries, but before setting f.dirCache. This is OK, since - // copy-up only does not change the children in a way that would affect - // the children returned in dirCache. Copy-up only moves - // files/directories between layers in the overlay. + // readdirEntries, but before setting o.dirCache. This is OK, since + // copy-up does not change the children in a way that would affect the + // children returned in dirCache. Copy-up only moves files/directories + // between layers in the overlay. // - // It is also possible for Readdir to race with a Create operation - // (which may trigger a copy-up during it's execution). Depending on - // whether the Create happens before or after the readdirEntries call, - // the newly created file may or may not appear in the readdir results. - // But this can only be caused by a real race between readdir and - // create syscalls, so it's also OK. - dirCache, err := readdirEntries(ctx, o) - if err != nil { - return file.Offset(), err + // We must hold dirCacheMu around both readdirEntries and setting + // o.dirCache to synchronize with dirCache invalidations done by + // Create, Remove, Rename. + o.dirCacheMu.Lock() + + // We expect dirCache to be nil (we just checked above), but there is a + // chance that a racing call managed to just set it, in which case we + // can use that new value. + if o.dirCache == nil { + dirCache, err := readdirEntries(ctx, o) + if err != nil { + o.dirCacheMu.Unlock() + return offset, err + } + o.dirCache = dirCache } - f.dirCacheMu.Lock() - f.dirCache = dirCache - f.dirCacheMu.Unlock() + o.dirCacheMu.DowngradeLock() + n, err := GenericReaddir(dirCtx, o.dirCache) + o.dirCacheMu.RUnlock() - return DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset()) -} - -// IterateDir implements DirIterator.IterateDir. -func (f *overlayFileOperations) IterateDir(ctx context.Context, dirCtx *DirCtx, offset int) (int, error) { - f.dirCacheMu.RLock() - n, err := GenericReaddir(dirCtx, f.dirCache) - f.dirCacheMu.RUnlock() return offset + n, err } @@ -338,13 +347,14 @@ func (*overlayFileOperations) ConfigureMMap(ctx context.Context, file *File, opt // preventing us from saving a proper inode mapping for the // file. file.IncRef() - id := &overlayMappingIdentity{ + id := overlayMappingIdentity{ id: opts.MappingIdentity, overlayFile: file, } + id.EnableLeakCheck("fs.overlayMappingIdentity") // Swap out the old MappingIdentity for the wrapped one. - opts.MappingIdentity = id + opts.MappingIdentity = &id return nil } @@ -388,9 +398,49 @@ func (f *overlayFileOperations) UnstableAttr(ctx context.Context, file *File) (U return f.lower.UnstableAttr(ctx) } -// Ioctl implements fs.FileOperations.Ioctl and always returns ENOTTY. -func (*overlayFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - return 0, syserror.ENOTTY +// Ioctl implements fs.FileOperations.Ioctl. +func (f *overlayFileOperations) Ioctl(ctx context.Context, overlayFile *File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + f.upperMu.Lock() + defer f.upperMu.Unlock() + + if f.upper == nil { + // It's possible that ioctl changes the file. Since we don't know all + // possible ioctls, only allow them to propagate to the upper. Triggering a + // copy up on any ioctl would be too drastic. In the future, it can have a + // list of ioctls that are safe to send to lower and a list that triggers a + // copy up. + return 0, syserror.ENOTTY + } + return f.upper.FileOperations.Ioctl(ctx, f.upper, io, args) +} + +// FifoSize implements FifoSizer.FifoSize. +func (f *overlayFileOperations) FifoSize(ctx context.Context, overlayFile *File) (rv int64, err error) { + err = f.onTop(ctx, overlayFile, func(file *File, ops FileOperations) error { + sz, ok := ops.(FifoSizer) + if !ok { + return syserror.EINVAL + } + rv, err = sz.FifoSize(ctx, file) + return err + }) + return +} + +// SetFifoSize implements FifoSizer.SetFifoSize. +func (f *overlayFileOperations) SetFifoSize(size int64) (rv int64, err error) { + f.upperMu.Lock() + defer f.upperMu.Unlock() + + if f.upper == nil { + // Named pipes cannot be copied up and changes to the lower are prohibited. + return 0, syserror.EINVAL + } + sz, ok := f.upper.FileOperations.(FifoSizer) + if !ok { + return 0, syserror.EINVAL + } + return sz.SetFifoSize(size) } // readdirEntries returns a sorted map of directory entries from the diff --git a/pkg/sentry/fs/flags.go b/pkg/sentry/fs/flags.go index 1278f9c78..0fab876a9 100644 --- a/pkg/sentry/fs/flags.go +++ b/pkg/sentry/fs/flags.go @@ -28,7 +28,11 @@ type FileFlags struct { // NonBlocking indicates that I/O should not block. NonBlocking bool - // Sync indicates that any writes should be synchronous. + // DSync indicates that each write will flush data and metadata required to + // read the file's contents. + DSync bool + + // Sync indicates that each write will flush data and all file metadata. Sync bool // Append indicates this file is append only. @@ -96,6 +100,9 @@ func (f FileFlags) ToLinux() (mask uint) { if f.NonBlocking { mask |= linux.O_NONBLOCK } + if f.DSync { + mask |= linux.O_DSYNC + } if f.Sync { mask |= linux.O_SYNC } diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index cca540b6c..626b9126a 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -219,7 +219,7 @@ func GenericConfigureMMap(file *fs.File, m memmap.Mappable, opts *memmap.MMapOpt type FileNoIoctl struct{} // Ioctl implements fs.FileOperations.Ioctl. -func (FileNoIoctl) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (FileNoIoctl) Ioctl(context.Context, *fs.File, usermem.IO, arch.SyscallArguments) (uintptr, error) { return 0, syserror.ENOTTY } @@ -285,7 +285,7 @@ func NewStaticDirFileOperations(dentries *fs.SortedDentryMap) *StaticDirFileOper } // IterateDir implements DirIterator.IterateDir. -func (sdfo *StaticDirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { +func (sdfo *StaticDirFileOperations) IterateDir(ctx context.Context, d *fs.Dirent, dirCtx *fs.DirCtx, offset int) (int, error) { n, err := fs.GenericReaddir(dirCtx, sdfo.dentryMap) return offset + n, err } diff --git a/pkg/sentry/fs/g3doc/inotify.md b/pkg/sentry/fs/g3doc/inotify.md index e1630553b..71a577d9d 100644 --- a/pkg/sentry/fs/g3doc/inotify.md +++ b/pkg/sentry/fs/g3doc/inotify.md @@ -9,7 +9,7 @@ For the most part, the sentry implementation of inotify mirrors the Linux architecture. Inotify instances (i.e. the fd returned by inotify_init(2)) are backed by a pseudo-filesystem. Events are generated from various places in the sentry, including the [syscall layer][syscall_dir], the [vfs layer][dirent] and -the [process fd table][fd_map]. Watches are stored in inodes and generated +the [process fd table][fd_table]. Watches are stored in inodes and generated events are queued to the inotify instance owning the watches for delivery to the user. @@ -114,7 +114,7 @@ ordering. [dirent]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/dirent.go [event]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify_event.go -[fd_map]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/kernel/fd_map.go +[fd_table]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/kernel/fd_table.go [inode]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode.go [inode_watches]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode_inotify.go [inotify]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify.go diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index ebbca162b..9e2e412cd 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -137,7 +137,7 @@ func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer } // IterateDir implements fs.DirIterator.IterateDir. -func (f *fileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { +func (f *fileOperations) IterateDir(ctx context.Context, d *fs.Dirent, dirCtx *fs.DirCtx, offset int) (int, error) { f.inodeOperations.readdirMu.Lock() defer f.inodeOperations.readdirMu.Unlock() diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index b87c4f150..27eeae3d9 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -79,11 +79,12 @@ func newHandles(ctx context.Context, file contextFile, flags fs.FileFlags) (*han newFile.close(ctx) return nil, err } - h := &handles{ + h := handles{ File: newFile, Host: hostFile, } - return h, nil + h.EnableLeakCheck("gofer.handles") + return &h, nil } type handleReadWriter struct { diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index b91386909..8c17603f8 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -145,16 +145,17 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string defer d.DecRef() // Construct the new file, caching the handles if allowed. - h := &handles{ + h := handles{ File: newFile, Host: hostFile, } + h.EnableLeakCheck("gofer.handles") if iops.fileState.canShareHandles() { iops.fileState.handlesMu.Lock() - iops.fileState.setSharedHandlesLocked(flags, h) + iops.fileState.setSharedHandlesLocked(flags, &h) iops.fileState.handlesMu.Unlock() } - return NewFile(ctx, d, name, flags, iops, h), nil + return NewFile(ctx, d, name, flags, iops, &h), nil } // CreateLink uses Create to create a symlink between oldname and newname. diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 7ad0d8bc5..69d08a627 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -29,7 +29,7 @@ import ( ) // DefaultDirentCacheSize is the default dirent cache size for 9P mounts. It can -// be adjusted independentely from the other dirent caches. +// be adjusted independently from the other dirent caches. var DefaultDirentCacheSize uint64 = fs.DefaultDirentCacheSize // +stateify savable @@ -145,16 +145,21 @@ func (s *session) Destroy() { s.client.Close() } -// Revalidate implements MountSource.Revalidate. +// Revalidate implements MountSourceOperations.Revalidate. func (s *session) Revalidate(ctx context.Context, name string, parent, child *fs.Inode) bool { return s.cachePolicy.revalidate(ctx, name, parent, child) } -// Keep implements MountSource.Keep. +// Keep implements MountSourceOperations.Keep. func (s *session) Keep(d *fs.Dirent) bool { return s.cachePolicy.keep(d) } +// CacheReaddir implements MountSourceOperations.CacheReaddir. +func (s *session) CacheReaddir() bool { + return s.cachePolicy.cacheReaddir() +} + // ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings. func (s *session) ResetInodeMappings() { s.inodeMappings = make(map[uint64]string) @@ -236,7 +241,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF } // Construct the session. - s := &session{ + s := session{ connID: dev, msize: o.msize, version: o.version, @@ -245,13 +250,14 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF superBlockFlags: superBlockFlags, mounter: mounter, } + s.EnableLeakCheck("gofer.session") if o.privateunixsocket { s.endpoints = newEndpointMaps() } // Construct the MountSource with the session and superBlockFlags. - m := fs.NewMountSource(ctx, s, filesystem, superBlockFlags) + m := fs.NewMountSource(ctx, &s, filesystem, superBlockFlags) // Given that gofer files can consume host FDs, restrict the number // of files that can be held by the cache. @@ -285,7 +291,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF return nil, err } - sattr, iops := newInodeOperations(ctx, s, s.attach, qid, valid, attr, false) + sattr, iops := newInodeOperations(ctx, &s, s.attach, qid, valid, attr, false) return fs.NewInode(ctx, iops, m, sattr), nil } diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 29a79441e..d045e04ff 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -111,5 +111,4 @@ func (s *session) afterLoad() { panic("failed to restore endpoint maps: " + err.Error()) } } - } diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index bd665a6b1..f6c626f2c 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -179,7 +179,7 @@ func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer } // IterateDir implements fs.DirIterator.IterateDir. -func (f *fileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { +func (f *fileOperations) IterateDir(ctx context.Context, d *fs.Dirent, dirCtx *fs.DirCtx, offset int) (int, error) { if f.dirinfo == nil { f.dirinfo = new(dirInfo) f.dirinfo.buf = make([]byte, usermem.PageSize) diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go index 18994c456..2d959f10d 100644 --- a/pkg/sentry/fs/host/inode_test.go +++ b/pkg/sentry/fs/host/inode_test.go @@ -64,12 +64,12 @@ func TestMultipleReaddir(t *testing.T) { defer openFile.DecRef() c1 := &fs.DirCtx{DirCursor: new(string)} - if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, c1, 0); err != nil { + if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, dirent, c1, 0); err != nil { t.Fatalf("First Readdir failed: %v", err) } c2 := &fs.DirCtx{DirCursor: new(string)} - if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, c2, 0); err != nil { + if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, dirent, c2, 0); err != nil { t.Errorf("Second Readdir failed: %v", err) } diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 7fedc88bc..44c4ee5f2 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -47,12 +47,12 @@ const maxSendBufferSize = 8 << 20 // // +stateify savable type ConnectedEndpoint struct { - queue *waiter.Queue - path string - // ref keeps track of references to a connectedEndpoint. ref refs.AtomicRefCount + queue *waiter.Queue + path string + // If srfd >= 0, it is the host FD that file was imported from. srfd int `state:"wait"` @@ -133,6 +133,8 @@ func NewConnectedEndpoint(ctx context.Context, file *fd.FD, queue *waiter.Queue, // AtomicRefCounters start off with a single reference. We need two. e.ref.IncRef() + e.ref.EnableLeakCheck("host.ConnectedEndpoint") + return &e, nil } diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index 0ceedb200..2526412a4 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -114,7 +114,7 @@ func (t *TTYFileOperations) Release() { } // Ioctl implements fs.FileOperations.Ioctl. -func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { // Ignore arg[0]. This is the real FD: fd := t.fileOperations.iops.fileState.FD() ioctl := args[1].Uint64() diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index e4aae1135..f4ddfa406 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -86,12 +86,14 @@ type LockCtx struct { // NewInode takes a reference on msrc. func NewInode(ctx context.Context, iops InodeOperations, msrc *MountSource, sattr StableAttr) *Inode { msrc.IncRef() - return &Inode{ + i := Inode{ InodeOperations: iops, StableAttr: sattr, Watches: newWatches(), MountSource: msrc, } + i.EnableLeakCheck("fs.Inode") + return &i } // DecRef drops a reference on the Inode. diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 57b8b14e3..e0602da17 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -217,6 +217,9 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st return nil, err } + // We've added to the directory so we must drop the cache. + o.markDirectoryDirty() + // Take another reference on the upper file's inode, which will be // owned by the overlay entry. upperFile.Dirent.Inode.IncRef() @@ -265,7 +268,12 @@ func overlayCreateDirectory(ctx context.Context, o *overlayEntry, parent *Dirent if err := copyUpLockedForRename(ctx, parent); err != nil { return err } - return o.upper.InodeOperations.CreateDirectory(ctx, o.upper, name, perm) + if err := o.upper.InodeOperations.CreateDirectory(ctx, o.upper, name, perm); err != nil { + return err + } + // We've added to the directory so we must drop the cache. + o.markDirectoryDirty() + return nil } func overlayCreateLink(ctx context.Context, o *overlayEntry, parent *Dirent, oldname string, newname string) error { @@ -273,7 +281,12 @@ func overlayCreateLink(ctx context.Context, o *overlayEntry, parent *Dirent, old if err := copyUpLockedForRename(ctx, parent); err != nil { return err } - return o.upper.InodeOperations.CreateLink(ctx, o.upper, oldname, newname) + if err := o.upper.InodeOperations.CreateLink(ctx, o.upper, oldname, newname); err != nil { + return err + } + // We've added to the directory so we must drop the cache. + o.markDirectoryDirty() + return nil } func overlayCreateHardLink(ctx context.Context, o *overlayEntry, parent *Dirent, target *Dirent, name string) error { @@ -285,7 +298,12 @@ func overlayCreateHardLink(ctx context.Context, o *overlayEntry, parent *Dirent, if err := copyUpLockedForRename(ctx, target); err != nil { return err } - return o.upper.InodeOperations.CreateHardLink(ctx, o.upper, target.Inode.overlay.upper, name) + if err := o.upper.InodeOperations.CreateHardLink(ctx, o.upper, target.Inode.overlay.upper, name); err != nil { + return err + } + // We've added to the directory so we must drop the cache. + o.markDirectoryDirty() + return nil } func overlayCreateFifo(ctx context.Context, o *overlayEntry, parent *Dirent, name string, perm FilePermissions) error { @@ -293,7 +311,12 @@ func overlayCreateFifo(ctx context.Context, o *overlayEntry, parent *Dirent, nam if err := copyUpLockedForRename(ctx, parent); err != nil { return err } - return o.upper.InodeOperations.CreateFifo(ctx, o.upper, name, perm) + if err := o.upper.InodeOperations.CreateFifo(ctx, o.upper, name, perm); err != nil { + return err + } + // We've added to the directory so we must drop the cache. + o.markDirectoryDirty() + return nil } func overlayRemove(ctx context.Context, o *overlayEntry, parent *Dirent, child *Dirent) error { @@ -316,8 +339,12 @@ func overlayRemove(ctx context.Context, o *overlayEntry, parent *Dirent, child * } } if child.Inode.overlay.lowerExists { - return overlayCreateWhiteout(o.upper, child.name) + if err := overlayCreateWhiteout(o.upper, child.name); err != nil { + return err + } } + // We've removed from the directory so we must drop the cache. + o.markDirectoryDirty() return nil } @@ -393,8 +420,12 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena return err } if renamed.Inode.overlay.lowerExists { - return overlayCreateWhiteout(oldParent.Inode.overlay.upper, oldName) + if err := overlayCreateWhiteout(oldParent.Inode.overlay.upper, oldName); err != nil { + return err + } } + // We've changed the directory so we must drop the cache. + oldParent.Inode.overlay.markDirectoryDirty() return nil } @@ -411,6 +442,9 @@ func overlayBind(ctx context.Context, o *overlayEntry, parent *Dirent, name stri return nil, err } + // We've added to the directory so we must drop the cache. + o.markDirectoryDirty() + // Grab the inode and drop the dirent, we don't need it. inode := d.Inode inode.IncRef() diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 41238f3fa..c7f4e2d13 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -202,7 +202,7 @@ func (i *Inotify) UnstableAttr(ctx context.Context, file *File) (UnstableAttr, e } // Ioctl implements fs.FileOperations.Ioctl. -func (i *Inotify) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (i *Inotify) Ioctl(ctx context.Context, _ *File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { switch args[1].Int() { case linux.FIONREAD: i.evMu.Lock() diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index a074ecab0..636484424 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -134,7 +134,7 @@ const ( // LockRegion attempts to acquire a typed lock for the uid on a region // of a file. Returns true if successful in locking the region. If false // is returned, the caller should normally interpret this as "try again later" if -// accquiring the lock in a non-blocking mode or "interrupted" if in a blocking mode. +// acquiring the lock in a non-blocking mode or "interrupted" if in a blocking mode. // Blocker is the interface used to provide blocking behavior, passing a nil Blocker // will result in non-blocking behavior. func (l *Locks) LockRegion(uid UniqueID, t LockType, r LockRange, block Blocker) bool { diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index b3c4ad80b..7a24c6f1b 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -75,6 +75,12 @@ func (n *MockMountSourceOps) Keep(dirent *Dirent) bool { return n.keep } +// CacheReaddir implements fs.MountSourceOperations.CacheReaddir. +func (n *MockMountSourceOps) CacheReaddir() bool { + // Common case: cache readdir results if there is a dirent cache. + return n.keep +} + // WriteOut implements fs.InodeOperations.WriteOut. func (n *MockInodeOperations) WriteOut(context.Context, *Inode) error { return nil diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 2eb6a9d82..7a9692800 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/context" ) -// DirentOperations provide file systems greater control over how long a Dirent stays pinned -// in core. Implementations must not take Dirent.mu. +// DirentOperations provide file systems greater control over how long a Dirent +// stays pinned in core. Implementations must not take Dirent.mu. type DirentOperations interface { // Revalidate is called during lookup each time we encounter a Dirent // in the cache. Implementations may update stale properties of the @@ -37,6 +37,12 @@ type DirentOperations interface { // Keep returns true if the Dirent should be kept in memory for as long // as possible beyond any active references. Keep(dirent *Dirent) bool + + // CacheReaddir returns true if directory entries returned by + // FileOperations.Readdir may be cached for future use. + // + // Postconditions: This method must always return the same value. + CacheReaddir() bool } // MountSourceOperations contains filesystem specific operations. @@ -132,12 +138,14 @@ func NewMountSource(ctx context.Context, mops MountSourceOperations, filesystem if filesystem != nil { fsType = filesystem.Name() } - return &MountSource{ + msrc := MountSource{ MountSourceOperations: mops, Flags: flags, FilesystemType: fsType, fscache: NewDirentCache(DefaultDirentCacheSize), } + msrc.EnableLeakCheck("fs.MountSource") + return &msrc } // DirentRefs returns the current mount direntRefs. @@ -190,25 +198,28 @@ func (msrc *MountSource) SetDirentCacheLimiter(l *DirentCacheLimiter) { // aggressively. func NewCachingMountSource(ctx context.Context, filesystem Filesystem, flags MountSourceFlags) *MountSource { return NewMountSource(ctx, &SimpleMountSourceOperations{ - keep: true, - revalidate: false, + keep: true, + revalidate: false, + cacheReaddir: true, }, filesystem, flags) } // NewNonCachingMountSource returns a generic mount that will never cache dirents. func NewNonCachingMountSource(ctx context.Context, filesystem Filesystem, flags MountSourceFlags) *MountSource { return NewMountSource(ctx, &SimpleMountSourceOperations{ - keep: false, - revalidate: false, + keep: false, + revalidate: false, + cacheReaddir: false, }, filesystem, flags) } // NewRevalidatingMountSource returns a generic mount that will cache dirents, -// but will revalidate them on each lookup. +// but will revalidate them on each lookup and always perform uncached readdir. func NewRevalidatingMountSource(ctx context.Context, filesystem Filesystem, flags MountSourceFlags) *MountSource { return NewMountSource(ctx, &SimpleMountSourceOperations{ - keep: true, - revalidate: true, + keep: true, + revalidate: true, + cacheReaddir: false, }, filesystem, flags) } @@ -216,8 +227,9 @@ func NewRevalidatingMountSource(ctx context.Context, filesystem Filesystem, flag // an actual filesystem. It is always non-caching. func NewPseudoMountSource(ctx context.Context) *MountSource { return NewMountSource(ctx, &SimpleMountSourceOperations{ - keep: false, - revalidate: false, + keep: false, + revalidate: false, + cacheReaddir: false, }, nil, MountSourceFlags{}) } @@ -225,8 +237,9 @@ func NewPseudoMountSource(ctx context.Context) *MountSource { // // +stateify savable type SimpleMountSourceOperations struct { - keep bool - revalidate bool + keep bool + revalidate bool + cacheReaddir bool } // Revalidate implements MountSourceOperations.Revalidate. @@ -239,6 +252,11 @@ func (smo *SimpleMountSourceOperations) Keep(*Dirent) bool { return smo.keep } +// CacheReaddir implements MountSourceOperations.CacheReaddir. +func (smo *SimpleMountSourceOperations) CacheReaddir() bool { + return smo.cacheReaddir +} + // ResetInodeMappings implements MountSourceOperations.ResetInodeMappings. func (*SimpleMountSourceOperations) ResetInodeMappings() {} diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index bc0ce7adc..299712cd7 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -66,13 +66,17 @@ func (o *overlayMountSourceOperations) Revalidate(ctx context.Context, name stri panic("an overlay cannot revalidate file objects from the lower fs") } - // Do we have anything to revalidate? - if child.overlay.upper == nil { - return false + var revalidate bool + child.overlay.copyMu.RLock() + if child.overlay.upper != nil { + // Does the upper require revalidation? + revalidate = o.upper.Revalidate(ctx, name, parent.overlay.upper, child.overlay.upper) + } else { + // Nothing to revalidate. + revalidate = false } - - // Does the upper require revalidation? - return o.upper.Revalidate(ctx, name, parent.overlay.upper, child.overlay.upper) + child.overlay.copyMu.RUnlock() + return revalidate } // Keep implements MountSourceOperations by delegating to the upper @@ -81,6 +85,17 @@ func (o *overlayMountSourceOperations) Keep(dirent *Dirent) bool { return o.upper.Keep(dirent) } +// CacheReaddir implements MountSourceOperations.CacheReaddir for an overlay by +// performing the logical AND of the upper and lower filesystems' CacheReaddir +// methods. +// +// N.B. This is fs-global instead of inode-specific because it must always +// return the same value. If it was inode-specific, we couldn't guarantee that +// property across copy up. +func (o *overlayMountSourceOperations) CacheReaddir() bool { + return o.lower.CacheReaddir() && o.upper.CacheReaddir() +} + // ResetInodeMappings propagates the call to both upper and lower MountSource. func (o *overlayMountSourceOperations) ResetInodeMappings() { o.upper.ResetInodeMappings() diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 281364dfc..693ffc760 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -181,12 +181,14 @@ func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error d: newRootMount(1, d), } - return &MountNamespace{ + mns := MountNamespace{ userns: creds.UserNamespace, root: d, mounts: mnts, mountID: 2, - }, nil + } + mns.EnableLeakCheck("fs.MountNamespace") + return &mns, nil } // UserNamespace returns the user namespace associated with this mount manager. @@ -661,6 +663,11 @@ func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name s } defer d.DecRef() + // Check that it is a regular file. + if !IsRegular(d.Inode.StableAttr) { + continue + } + // Check whether we can read and execute the found file. if err := d.Inode.CheckPermission(ctx, PermMask{Read: true, Execute: true}); err != nil { log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err) diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index c87f24997..1d3ff39e0 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/third_party/gvsync" ) // The virtual filesystem implements an overlay configuration. For a high-level @@ -196,6 +197,12 @@ type overlayEntry struct { // these locks is sufficient to read upper; holding all three for writing // is required to mutate it. upper *Inode + + // dirCacheMu protects dirCache. + dirCacheMu gvsync.DowngradableRWMutex `state:"nosave"` + + // dirCache is cache of DentAttrs from upper and lower Inodes. + dirCache *SortedDentryMap } // newOverlayEntry returns a new overlayEntry. @@ -258,6 +265,17 @@ func (o *overlayEntry) isMappableLocked() bool { return o.inodeLocked().Mappable() != nil } +// markDirectoryDirty marks any cached data dirty for this directory. This is +// necessary in order to ensure that this node does not retain stale state +// throughout its lifetime across multiple open directory handles. +// +// Currently this means invalidating any readdir caches. +func (o *overlayEntry) markDirectoryDirty() { + o.dirCacheMu.Lock() + o.dirCache = nil + o.dirCacheMu.Unlock() +} + // AddMapping implements memmap.Mappable.AddMapping. func (o *overlayEntry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { o.mapsMu.Lock() diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index da41a10ab..70ed854a8 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -42,7 +42,6 @@ go_library( "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/mm", diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index ea7aded9a..bee421d76 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -42,8 +41,8 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF var file *fs.File var fdFlags kernel.FDFlags t.WithMuLocked(func(t *kernel.Task) { - if fdm := t.FDMap(); fdm != nil { - file, fdFlags = fdm.GetDescriptor(kdefs.FD(n)) + if fdTable := t.FDTable(); fdTable != nil { + file, fdFlags = fdTable.Get(int32(n)) } }) if file == nil { @@ -56,36 +55,31 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF // toDentAttr callback for each to get a DentAttr, which it then emits. This is // a helper for implementing fs.InodeOperations.Readdir. func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) { - var fds kernel.FDs + var fds []int32 t.WithMuLocked(func(t *kernel.Task) { - if fdm := t.FDMap(); fdm != nil { - fds = fdm.GetFDs() + if fdTable := t.FDTable(); fdTable != nil { + fds = fdTable.GetFDs() } }) - fdInts := make([]int, 0, len(fds)) - for _, fd := range fds { - fdInts = append(fdInts, int(fd)) - } - - // Find the fd to start at. - idx := sort.SearchInts(fdInts, int(offset)) - if idx == len(fdInts) { + // Find the appropriate starting point. + idx := sort.Search(len(fds), func(i int) bool { return fds[i] >= int32(offset) }) + if idx == len(fds) { return offset, nil } - fdInts = fdInts[idx:] + fds = fds[idx:] - var fd int - for _, fd = range fdInts { + // Serialize all FDs. + for _, fd := range fds { name := strconv.FormatUint(uint64(fd), 10) - if err := c.DirEmit(name, toDentAttr(fd)); err != nil { + if err := c.DirEmit(name, toDentAttr(int(fd))); err != nil { // Returned offset is the next fd to serialize. return int64(fd), err } } // We serialized them all. Next offset should be higher than last // serialized fd. - return int64(fd + 1), nil + return int64(fds[len(fds)-1] + 1), nil } // fd implements fs.InodeOperations for a file in /proc/TID/fd/. @@ -154,9 +148,9 @@ func (f *fd) Close() error { type fdDir struct { ramfs.Dir - // We hold a reference on the task's fdmap but only keep an indirect - // task pointer to avoid Dirent loading circularity caused by fdmap's - // potential back pointers into the dirent tree. + // We hold a reference on the task's FDTable but only keep an indirect + // task pointer to avoid Dirent loading circularity caused by the + // table's back pointers into the dirent tree. t *kernel.Task } diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go index 715591ae5..f14833805 100644 --- a/pkg/sentry/fs/proc/fs.go +++ b/pkg/sentry/fs/proc/fs.go @@ -30,7 +30,7 @@ func init() { fs.RegisterFilesystem(&filesystem{}) } -// FilesystemName is the name underwhich the filesystem is registered. +// FilesystemName is the name under which the filesystem is registered. // Name matches fs/proc/root.c:proc_fs_type.name. const FilesystemName = "proc" diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 37694620c..6b839685b 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -155,37 +155,40 @@ func (n *netDev) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se contents[1] = " face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n" for _, i := range interfaces { - // TODO(b/71872867): Collect stats from each inet.Stack - // implementation (hostinet, epsocket, and rpcinet). - // Implements the same format as // net/core/net-procfs.c:dev_seq_printf_stats. - l := fmt.Sprintf("%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n", + var stats inet.StatDev + if err := n.s.Statistics(&stats, i.Name); err != nil { + log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err) + continue + } + l := fmt.Sprintf( + "%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n", i.Name, // Received - 0, // bytes - 0, // packets - 0, // errors - 0, // dropped - 0, // fifo - 0, // frame - 0, // compressed - 0, // multicast + stats[0], // bytes + stats[1], // packets + stats[2], // errors + stats[3], // dropped + stats[4], // fifo + stats[5], // frame + stats[6], // compressed + stats[7], // multicast // Transmitted - 0, // bytes - 0, // packets - 0, // errors - 0, // dropped - 0, // fifo - 0, // frame - 0, // compressed - 0) // multicast + stats[8], // bytes + stats[9], // packets + stats[10], // errors + stats[11], // dropped + stats[12], // fifo + stats[13], // frame + stats[14], // compressed + stats[15]) // multicast contents = append(contents, l) } var data []seqfile.SeqData for _, l := range contents { - data = append(data, seqfile.SeqData{Buf: []byte(l), Handle: (*ifinet6)(nil)}) + data = append(data, seqfile.SeqData{Buf: []byte(l), Handle: (*netDev)(nil)}) } return data, 0 diff --git a/pkg/sentry/fs/proc/rpcinet_proc.go b/pkg/sentry/fs/proc/rpcinet_proc.go index c1405a746..01ac97530 100644 --- a/pkg/sentry/fs/proc/rpcinet_proc.go +++ b/pkg/sentry/fs/proc/rpcinet_proc.go @@ -29,7 +29,7 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// rpcInetInode implments fs.InodeOperations. +// rpcInetInode implements fs.InodeOperations. type rpcInetInode struct { fsutil.SimpleFileInode diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index b2e36aeee..87184ec67 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -162,6 +162,11 @@ func (f *subtasksFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dentry // subtask to emit. offset := file.Offset() + tasks := f.t.ThreadGroup().MemberIDs(f.pidns) + if len(tasks) == 0 { + return offset, syserror.ENOENT + } + if offset == 0 { // Serialize "." and "..". root := fs.RootFromContext(ctx) @@ -178,12 +183,12 @@ func (f *subtasksFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dentry } // Serialize tasks. - tasks := f.t.ThreadGroup().MemberIDs(f.pidns) taskInts := make([]int, 0, len(tasks)) for _, tid := range tasks { taskInts = append(taskInts, int(tid)) } + sort.Sort(sort.IntSlice(taskInts)) // Find the task to start at. idx := sort.SearchInts(taskInts, int(offset)) if idx == len(taskInts) { @@ -580,8 +585,8 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ( var fds int var vss, rss, data uint64 s.t.WithMuLocked(func(t *kernel.Task) { - if fdm := t.FDMap(); fdm != nil { - fds = fdm.Size() + if fdTable := t.FDTable(); fdTable != nil { + fds = fdTable.Size() } if mm := t.MemoryManager(); mm != nil { vss = mm.VirtualMemorySize() diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 0c9e6ec09..f3e984c24 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -431,7 +431,7 @@ func (dfo *dirFileOperations) Seek(ctx context.Context, file *fs.File, whence fs } // IterateDir implements DirIterator.IterateDir. -func (dfo *dirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { +func (dfo *dirFileOperations) IterateDir(ctx context.Context, d *fs.Dirent, dirCtx *fs.DirCtx, offset int) (int, error) { dfo.dir.mu.Lock() defer dfo.dir.mu.Unlock() diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go index b31c46bf9..e60b63e75 100644 --- a/pkg/sentry/fs/sys/fs.go +++ b/pkg/sentry/fs/sys/fs.go @@ -30,7 +30,7 @@ func init() { fs.RegisterFilesystem(&filesystem{}) } -// FilesystemName is the name underwhich the filesystem is registered. +// FilesystemName is the name under which the filesystem is registered. // Name matches fs/sysfs/mount.c:sysfs_fs_type.name. const FilesystemName = "sysfs" diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index a5fcdf969..be98ad751 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -65,7 +65,7 @@ func init() { fs.RegisterFilesystem(&Filesystem{}) } -// FilesystemName is the name underwhich the filesystem is registered. +// FilesystemName is the name under which the filesystem is registered. // Name matches mm/shmem.c:shmem_fs_type.name. const FilesystemName = "tmpfs" @@ -133,6 +133,9 @@ func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSou } // Construct a mount which will follow the cache options provided. + // + // TODO(gvisor.dev/issue/179): There should be no reason to disable + // caching once bind mounts are properly supported. var msrc *fs.MountSource switch options[cacheKey] { case "", cacheAll: diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 58a26742c..1d128532b 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -307,7 +307,7 @@ type dirFileOperations struct { var _ fs.FileOperations = (*dirFileOperations)(nil) // IterateDir implements DirIterator.IterateDir. -func (df *dirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { +func (df *dirFileOperations) IterateDir(ctx context.Context, d *fs.Dirent, dirCtx *fs.DirCtx, offset int) (int, error) { df.di.mu.Lock() defer df.di.mu.Unlock() diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index 1bbfbb90a..edee56c12 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -94,6 +94,13 @@ func (superOperations) Keep(*fs.Dirent) bool { return false } +// CacheReaddir implements fs.DirentOperations.CacheReaddir. +// +// CacheReaddir returns false because entries change on master operations. +func (superOperations) CacheReaddir() bool { + return false +} + // ResetInodeMappings implements MountSourceOperations.ResetInodeMappings. func (superOperations) ResetInodeMappings() {} diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index 54dd3d6b7..92ec1ca18 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -144,7 +144,7 @@ func (mf *masterFileOperations) Write(ctx context.Context, _ *fs.File, src userm } // Ioctl implements fs.FileOperations.Ioctl. -func (mf *masterFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (mf *masterFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ // Get the number of bytes in the output queue read buffer. diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 2d68978dd..e30266404 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -128,7 +128,7 @@ func (sf *slaveFileOperations) Write(ctx context.Context, _ *fs.File, src userme } // Ioctl implements fs.FileOperations.Ioctl. -func (sf *slaveFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ // Get the number of bytes in the input queue read buffer. diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index 8290f2530..b7cecb2ed 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -38,9 +38,11 @@ type Terminal struct { func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal { termios := linux.DefaultSlaveTermios - return &Terminal{ + t := Terminal{ d: d, n: n, ld: newLineDiscipline(termios), } + t.EnableLeakCheck("tty.Terminal") + return &t } diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD new file mode 100644 index 000000000..d5d4f68df --- /dev/null +++ b/pkg/sentry/fsimpl/memfs/BUILD @@ -0,0 +1,55 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +load("//tools/go_generics:defs.bzl", "go_template_instance") + +go_template_instance( + name = "dentry_list", + out = "dentry_list.go", + package = "memfs", + prefix = "dentry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Dentry", + "Linker": "*Dentry", + }, +) + +go_library( + name = "memfs", + srcs = [ + "dentry_list.go", + "directory.go", + "filesystem.go", + "memfs.go", + "regular_file.go", + "symlink.go", + ], + importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs", + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/context", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/usermem", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) + +go_test( + name = "benchmark_test", + size = "small", + srcs = ["benchmark_test.go"], + deps = [ + ":memfs", + "//pkg/abi/linux", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go new file mode 100644 index 000000000..a94b17db6 --- /dev/null +++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go @@ -0,0 +1,464 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package benchmark_test + +import ( + "fmt" + "runtime" + "strings" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Differences from stat_benchmark: +// +// - Syscall interception, CopyInPath, copyOutStat, and overlayfs overheads are +// not included. +// +// - *MountStat benchmarks use a tmpfs root mount and a tmpfs submount at /tmp. +// Non-MountStat benchmarks use a tmpfs root mount and no submounts. +// stat_benchmark uses a varying root mount, a tmpfs submount at /tmp, and a +// subdirectory /tmp/<top_dir> (assuming TEST_TMPDIR == "/tmp"). Thus +// stat_benchmark at depth 1 does a comparable amount of work to *MountStat +// benchmarks at depth 2, and non-MountStat benchmarks at depth 3. +var depths = []int{1, 2, 3, 8, 64, 100} + +const ( + mountPointName = "tmp" + filename = "gvisor_test_temp_0_1557494568" +) + +// This is copied from syscalls/linux/sys_file.go, with the dependency on +// kernel.Task stripped out. +func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error { + var ( + d *fs.Dirent // The file. + rel *fs.Dirent // The relative directory for search (if required.) + err error + ) + + // Extract the working directory (maybe). + if len(path) > 0 && path[0] == '/' { + // Absolute path; rel can be nil. + } else if dirFD == linux.AT_FDCWD { + // Need to reference the working directory. + rel = wd + } else { + // Need to extract the given FD. + return syserror.EBADF + } + + // Lookup the node. + remainingTraversals := uint(linux.MaxSymlinkTraversals) + if resolve { + d, err = mntns.FindInode(ctx, root, rel, path, &remainingTraversals) + } else { + d, err = mntns.FindLink(ctx, root, rel, path, &remainingTraversals) + } + if err != nil { + return err + } + + err = fn(root, d) + d.DecRef() + return err +} + +func BenchmarkVFS1TmpfsStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + ctx := contexttest.Context(b) + + // Create VFS. + tmpfsFS, ok := fs.FindFilesystem("tmpfs") + if !ok { + b.Fatalf("failed to find tmpfs filesystem type") + } + rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) + if err != nil { + b.Fatalf("failed to create tmpfs root mount: %v", err) + } + mntns, err := fs.NewMountNamespace(ctx, rootInode) + if err != nil { + b.Fatalf("failed to create mount namespace: %v", err) + } + defer mntns.DecRef() + + var filePathBuilder strings.Builder + filePathBuilder.WriteByte('/') + + // Create nested directories with given depth. + root := mntns.Root() + defer root.DecRef() + d := root + d.IncRef() + defer d.DecRef() + for i := depth; i > 0; i-- { + name := fmt.Sprintf("%d", i) + if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { + b.Fatalf("failed to create directory %q: %v", name, err) + } + next, err := d.Walk(ctx, root, name) + if err != nil { + b.Fatalf("failed to walk to directory %q: %v", name, err) + } + d.DecRef() + d = next + filePathBuilder.WriteString(name) + filePathBuilder.WriteByte('/') + } + + // Create the file that will be stat'd. + file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644)) + if err != nil { + b.Fatalf("failed to create file %q: %v", filename, err) + } + file.DecRef() + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + dirPath := false + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + uattr, err := d.Inode.UnstableAttr(ctx) + if err != nil { + return err + } + // Sanity check. + if uattr.Perms.User.Execute { + b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode()) + } + return nil + }) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + } + }) + } +} + +func BenchmarkVFS2MemfsStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + ctx := contexttest.Context(b) + creds := auth.CredentialsFromContext(ctx) + + // Create VFS. + vfsObj := vfs.New() + vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{}) + if err != nil { + b.Fatalf("failed to create tmpfs root mount: %v", err) + } + + var filePathBuilder strings.Builder + filePathBuilder.WriteByte('/') + + // Create nested directories with given depth. + root := mntns.Root() + defer root.DecRef() + vd := root + vd.IncRef() + defer vd.DecRef() + for i := depth; i > 0; i-- { + name := fmt.Sprintf("%d", i) + pop := vfs.PathOperation{ + Root: root, + Start: vd, + Pathname: name, + } + if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + b.Fatalf("failed to create directory %q: %v", name, err) + } + nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + b.Fatalf("failed to walk to directory %q: %v", name, err) + } + vd.DecRef() + vd = nextVD + filePathBuilder.WriteString(name) + filePathBuilder.WriteByte('/') + } + + // Create the file that will be stat'd. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: vd, + Pathname: filename, + FollowFinalSymlink: true, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, + Mode: 0644, + }) + if err != nil { + b.Fatalf("failed to create file %q: %v", filename, err) + } + defer fd.DecRef() + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Pathname: filePath, + FollowFinalSymlink: true, + }, &vfs.StatOptions{}) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + // Sanity check. + if stat.Mode&^linux.S_IFMT != 0644 { + b.Fatalf("got wrong permissions (%0o)", stat.Mode) + } + } + }) + } +} + +func BenchmarkVFS1TmpfsMountStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + ctx := contexttest.Context(b) + + // Create VFS. + tmpfsFS, ok := fs.FindFilesystem("tmpfs") + if !ok { + b.Fatalf("failed to find tmpfs filesystem type") + } + rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) + if err != nil { + b.Fatalf("failed to create tmpfs root mount: %v", err) + } + mntns, err := fs.NewMountNamespace(ctx, rootInode) + if err != nil { + b.Fatalf("failed to create mount namespace: %v", err) + } + defer mntns.DecRef() + + var filePathBuilder strings.Builder + filePathBuilder.WriteByte('/') + + // Create and mount the submount. + root := mntns.Root() + defer root.DecRef() + if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil { + b.Fatalf("failed to create mount point: %v", err) + } + mountPoint, err := root.Walk(ctx, root, mountPointName) + if err != nil { + b.Fatalf("failed to walk to mount point: %v", err) + } + defer mountPoint.DecRef() + submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) + if err != nil { + b.Fatalf("failed to create tmpfs submount: %v", err) + } + if err := mntns.Mount(ctx, mountPoint, submountInode); err != nil { + b.Fatalf("failed to mount tmpfs submount: %v", err) + } + filePathBuilder.WriteString(mountPointName) + filePathBuilder.WriteByte('/') + + // Create nested directories with given depth. + d, err := root.Walk(ctx, root, mountPointName) + if err != nil { + b.Fatalf("failed to walk to mount root: %v", err) + } + defer d.DecRef() + for i := depth; i > 0; i-- { + name := fmt.Sprintf("%d", i) + if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { + b.Fatalf("failed to create directory %q: %v", name, err) + } + next, err := d.Walk(ctx, root, name) + if err != nil { + b.Fatalf("failed to walk to directory %q: %v", name, err) + } + d.DecRef() + d = next + filePathBuilder.WriteString(name) + filePathBuilder.WriteByte('/') + } + + // Create the file that will be stat'd. + file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644)) + if err != nil { + b.Fatalf("failed to create file %q: %v", filename, err) + } + file.DecRef() + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + dirPath := false + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + uattr, err := d.Inode.UnstableAttr(ctx) + if err != nil { + return err + } + // Sanity check. + if uattr.Perms.User.Execute { + b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode()) + } + return nil + }) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + } + }) + } +} + +func BenchmarkVFS2MemfsMountStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + ctx := contexttest.Context(b) + creds := auth.CredentialsFromContext(ctx) + + // Create VFS. + vfsObj := vfs.New() + vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{}) + if err != nil { + b.Fatalf("failed to create tmpfs root mount: %v", err) + } + + var filePathBuilder strings.Builder + filePathBuilder.WriteByte('/') + + // Create the mount point. + root := mntns.Root() + defer root.DecRef() + pop := vfs.PathOperation{ + Root: root, + Start: root, + Pathname: mountPointName, + } + if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + b.Fatalf("failed to create mount point: %v", err) + } + // Save the mount point for later use. + mountPoint, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + b.Fatalf("failed to walk to mount point: %v", err) + } + defer mountPoint.DecRef() + // Create and mount the submount. + if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.NewFilesystemOptions{}); err != nil { + b.Fatalf("failed to mount tmpfs submount: %v", err) + } + filePathBuilder.WriteString(mountPointName) + filePathBuilder.WriteByte('/') + + // Create nested directories with given depth. + vd, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + b.Fatalf("failed to walk to mount root: %v", err) + } + defer vd.DecRef() + for i := depth; i > 0; i-- { + name := fmt.Sprintf("%d", i) + pop := vfs.PathOperation{ + Root: root, + Start: vd, + Pathname: name, + } + if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + b.Fatalf("failed to create directory %q: %v", name, err) + } + nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + b.Fatalf("failed to walk to directory %q: %v", name, err) + } + vd.DecRef() + vd = nextVD + filePathBuilder.WriteString(name) + filePathBuilder.WriteByte('/') + } + + // Verify that we didn't create any directories under the mount + // point (i.e. they were all created on the submount). + firstDirName := fmt.Sprintf("%d", depth) + if child := mountPoint.Dentry().Child(firstDirName); child != nil { + b.Fatalf("created directory %q under root mount, not submount", firstDirName) + } + + // Create the file that will be stat'd. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: vd, + Pathname: filename, + FollowFinalSymlink: true, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, + Mode: 0644, + }) + if err != nil { + b.Fatalf("failed to create file %q: %v", filename, err) + } + fd.DecRef() + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Pathname: filePath, + FollowFinalSymlink: true, + }, &vfs.StatOptions{}) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + // Sanity check. + if stat.Mode&^linux.S_IFMT != 0644 { + b.Fatalf("got wrong permissions (%0o)", stat.Mode) + } + } + }) + } +} diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/memfs/directory.go new file mode 100644 index 000000000..b0c3ea39a --- /dev/null +++ b/pkg/sentry/fsimpl/memfs/directory.go @@ -0,0 +1,178 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +type directory struct { + inode Inode + + // childList is a list containing (1) child Dentries and (2) fake Dentries + // (with inode == nil) that represent the iteration position of + // directoryFDs. childList is used to support directoryFD.IterDirents() + // efficiently. childList is protected by Filesystem.mu. + childList dentryList +} + +func (fs *Filesystem) newDirectory(creds *auth.Credentials, mode uint16) *Inode { + dir := &directory{} + dir.inode.init(dir, fs, creds, mode) + dir.inode.nlink = 2 // from "." and parent directory or ".." for root + return &dir.inode +} + +func (i *Inode) isDir() bool { + _, ok := i.impl.(*directory) + return ok +} + +type directoryFD struct { + fileDescription + vfs.DirectoryFileDescriptionDefaultImpl + + // Protected by Filesystem.mu. + iter *Dentry + off int64 +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *directoryFD) Release() { + if fd.iter != nil { + fs := fd.filesystem() + dir := fd.inode().impl.(*directory) + fs.mu.Lock() + dir.childList.Remove(fd.iter) + fs.mu.Unlock() + fd.iter = nil + } +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + fs := fd.filesystem() + d := fd.vfsfd.VirtualDentry().Dentry() + + fs.mu.Lock() + defer fs.mu.Unlock() + + if fd.off == 0 { + if !cb.Handle(vfs.Dirent{ + Name: ".", + Type: linux.DT_DIR, + Ino: d.Impl().(*Dentry).inode.ino, + Off: 0, + }) { + return nil + } + fd.off++ + } + if fd.off == 1 { + parentInode := d.ParentOrSelf().Impl().(*Dentry).inode + if !cb.Handle(vfs.Dirent{ + Name: "..", + Type: parentInode.direntType(), + Ino: parentInode.ino, + Off: 1, + }) { + return nil + } + fd.off++ + } + + dir := d.Impl().(*Dentry).inode.impl.(*directory) + var child *Dentry + if fd.iter == nil { + // Start iteration at the beginning of dir. + child = dir.childList.Front() + fd.iter = &Dentry{} + } else { + // Continue iteration from where we left off. + child = fd.iter.Next() + dir.childList.Remove(fd.iter) + } + for child != nil { + // Skip other directoryFD iterators. + if child.inode != nil { + if !cb.Handle(vfs.Dirent{ + Name: child.vfsd.Name(), + Type: child.inode.direntType(), + Ino: child.inode.ino, + Off: fd.off, + }) { + dir.childList.InsertBefore(child, fd.iter) + return nil + } + fd.off++ + } + child = child.Next() + } + dir.childList.PushBack(fd.iter) + return nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + if whence != linux.SEEK_SET { + // TODO: Linux also allows SEEK_CUR. + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + + fd.off = offset + // Compensate for "." and "..". + var remChildren int64 + if offset < 2 { + remChildren = 0 + } else { + remChildren = offset - 2 + } + + fs := fd.filesystem() + dir := fd.inode().impl.(*directory) + + fs.mu.Lock() + defer fs.mu.Unlock() + + // Ensure that fd.iter exists and is not linked into dir.childList. + if fd.iter == nil { + fd.iter = &Dentry{} + } else { + dir.childList.Remove(fd.iter) + } + // Insert fd.iter before the remChildren'th child, or at the end of the + // list if remChildren >= number of children. + child := dir.childList.Front() + for child != nil { + // Skip other directoryFD iterators. + if child.inode != nil { + if remChildren == 0 { + dir.childList.InsertBefore(child, fd.iter) + return offset, nil + } + remChildren-- + } + child = child.Next() + } + dir.childList.PushBack(fd.iter) + return offset, nil +} diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go new file mode 100644 index 000000000..4d989eeaf --- /dev/null +++ b/pkg/sentry/fsimpl/memfs/filesystem.go @@ -0,0 +1,542 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memfs + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// stepLocked resolves rp.Component() in parent directory vfsd. +// +// stepLocked is loosely analogous to fs/namei.c:walk_component(). +// +// Preconditions: Filesystem.mu must be locked. !rp.Done(). inode == +// vfsd.Impl().(*Dentry).inode. +func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *Inode) (*vfs.Dentry, *Inode, error) { + if !inode.isDir() { + return nil, nil, syserror.ENOTDIR + } + if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, nil, err + } +afterSymlink: + nextVFSD, err := rp.ResolveComponent(vfsd) + if err != nil { + return nil, nil, err + } + if nextVFSD == nil { + // Since the Dentry tree is the sole source of truth for memfs, if it's + // not in the Dentry tree, it doesn't exist. + return nil, nil, syserror.ENOENT + } + nextInode := nextVFSD.Impl().(*Dentry).inode + if symlink, ok := nextInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { + // TODO: symlink traversals update access time + if err := rp.HandleSymlink(symlink.target); err != nil { + return nil, nil, err + } + goto afterSymlink // don't check the current directory again + } + rp.Advance() + return nextVFSD, nextInode, nil +} + +// walkExistingLocked resolves rp to an existing file. +// +// walkExistingLocked is loosely analogous to Linux's +// fs/namei.c:path_lookupat(). +// +// Preconditions: Filesystem.mu must be locked. +func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) { + vfsd := rp.Start() + inode := vfsd.Impl().(*Dentry).inode + for !rp.Done() { + var err error + vfsd, inode, err = stepLocked(rp, vfsd, inode) + if err != nil { + return nil, nil, err + } + } + if rp.MustBeDir() && !inode.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, inode, nil +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory. It does not check that the returned directory is +// searchable by the provider of rp. +// +// walkParentDirLocked is loosely analogous to Linux's +// fs/namei.c:path_parentat(). +// +// Preconditions: Filesystem.mu must be locked. !rp.Done(). +func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) { + vfsd := rp.Start() + inode := vfsd.Impl().(*Dentry).inode + for !rp.Final() { + var err error + vfsd, inode, err = stepLocked(rp, vfsd, inode) + if err != nil { + return nil, nil, err + } + } + if !inode.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, inode, nil +} + +// checkCreateLocked checks that a file named rp.Component() may be created in +// directory parentVFSD, then returns rp.Component(). +// +// Preconditions: Filesystem.mu must be locked. parentInode == +// parentVFSD.Impl().(*Dentry).inode. parentInode.isDir() == true. +func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *Inode) (string, error) { + if err := parentInode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + return "", err + } + pc := rp.Component() + if pc == "." || pc == ".." { + return "", syserror.EEXIST + } + childVFSD, err := rp.ResolveChild(parentVFSD, pc) + if err != nil { + return "", err + } + if childVFSD != nil { + return "", syserror.EEXIST + } + if parentVFSD.IsDisowned() { + return "", syserror.ENOENT + } + return pc, nil +} + +// checkDeleteLocked checks that the file represented by vfsd may be deleted. +func checkDeleteLocked(vfsd *vfs.Dentry) error { + parentVFSD := vfsd.Parent() + if parentVFSD == nil { + return syserror.EBUSY + } + if parentVFSD.IsDisowned() { + return syserror.ENOENT + } + return nil +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + vfsd, inode, err := walkExistingLocked(rp) + if err != nil { + return nil, err + } + if opts.CheckSearchable { + if !inode.isDir() { + return nil, syserror.ENOTDIR + } + if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err + } + } + inode.incRef() // vfsd.IncRef(&fs.vfsfs) + return vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := walkParentDirLocked(rp) + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if rp.Mount() != vd.Mount() { + return syserror.EXDEV + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + d := vd.Dentry().Impl().(*Dentry) + if d.inode.isDir() { + return syserror.EPERM + } + d.inode.incLinksLocked() + child := fs.newDentry(d.inode) + parentVFSD.InsertChild(&child.vfsd, pc) + parentInode.impl.(*directory).childList.PushBack(child) + return nil +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := walkParentDirLocked(rp) + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode)) + parentVFSD.InsertChild(&child.vfsd, pc) + parentInode.impl.(*directory).childList.PushBack(child) + parentInode.incLinksLocked() // from child's ".." + return nil +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. +func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := walkParentDirLocked(rp) + if err != nil { + return err + } + _, err = checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + // TODO: actually implement mknod + return syserror.EPERM +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // Filter out flags that are not supported by memfs. O_DIRECTORY and + // O_NOFOLLOW have no effect here (they're handled by VFS by setting + // appropriate bits in rp), but are returned by + // FileDescriptionImpl.StatusFlags(). + opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW + + if opts.Flags&linux.O_CREAT == 0 { + fs.mu.RLock() + defer fs.mu.RUnlock() + vfsd, inode, err := walkExistingLocked(rp) + if err != nil { + return nil, err + } + return inode.open(rp, vfsd, opts.Flags, false) + } + + mustCreate := opts.Flags&linux.O_EXCL != 0 + vfsd := rp.Start() + inode := vfsd.Impl().(*Dentry).inode + fs.mu.Lock() + defer fs.mu.Unlock() + if rp.Done() { + // FIXME: ??? + if rp.MustBeDir() { + return nil, syserror.EISDIR + } + if mustCreate { + return nil, syserror.EEXIST + } + return inode.open(rp, vfsd, opts.Flags, false) + } +afterTrailingSymlink: + // Walk to the parent directory of the last path component. + for !rp.Final() { + var err error + vfsd, inode, err = stepLocked(rp, vfsd, inode) + if err != nil { + return nil, err + } + } + if !inode.isDir() { + return nil, syserror.ENOTDIR + } + // Check for search permission in the parent directory. + if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err + } + // Reject attempts to open directories with O_CREAT. + if rp.MustBeDir() { + return nil, syserror.EISDIR + } + pc := rp.Component() + if pc == "." || pc == ".." { + return nil, syserror.EISDIR + } + // Determine whether or not we need to create a file. + childVFSD, err := rp.ResolveChild(vfsd, pc) + if err != nil { + return nil, err + } + if childVFSD == nil { + // Already checked for searchability above; now check for writability. + if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + return nil, err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return nil, err + } + defer rp.Mount().EndWrite() + // Create and open the child. + childInode := fs.newRegularFile(rp.Credentials(), opts.Mode) + child := fs.newDentry(childInode) + vfsd.InsertChild(&child.vfsd, pc) + inode.impl.(*directory).childList.PushBack(child) + return childInode.open(rp, &child.vfsd, opts.Flags, true) + } + // Open existing file or follow symlink. + if mustCreate { + return nil, syserror.EEXIST + } + childInode := childVFSD.Impl().(*Dentry).inode + if symlink, ok := childInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { + // TODO: symlink traversals update access time + if err := rp.HandleSymlink(symlink.target); err != nil { + return nil, err + } + // rp.Final() may no longer be true since we now need to resolve the + // symlink target. + goto afterTrailingSymlink + } + return childInode.open(rp, childVFSD, opts.Flags, false) +} + +func (i *Inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(flags) + if !afterCreate { + if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil { + return nil, err + } + } + switch impl := i.impl.(type) { + case *regularFile: + var fd regularFileFD + fd.flags = flags + fd.readable = vfs.MayReadFileWithOpenFlags(flags) + fd.writable = vfs.MayWriteFileWithOpenFlags(flags) + if fd.writable { + if err := rp.Mount().CheckBeginWrite(); err != nil { + return nil, err + } + // Mount.EndWrite() is called by regularFileFD.Release(). + } + fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + if flags&linux.O_TRUNC != 0 { + impl.mu.Lock() + impl.data = impl.data[:0] + atomic.StoreInt64(&impl.dataLen, 0) + impl.mu.Unlock() + } + return &fd.vfsfd, nil + case *directory: + // Can't open directories writably. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EISDIR + } + var fd directoryFD + fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + fd.flags = flags + return &fd.vfsfd, nil + case *symlink: + // Can't open symlinks without O_PATH (which is unimplemented). + return nil, syserror.ELOOP + default: + panic(fmt.Sprintf("unknown inode type: %T", i.impl)) + } +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + fs.mu.RLock() + _, inode, err := walkExistingLocked(rp) + fs.mu.RUnlock() + if err != nil { + return "", err + } + symlink, ok := inode.impl.(*symlink) + if !ok { + return "", syserror.EINVAL + } + return symlink.target, nil +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error { + if rp.Done() { + // FIXME + return syserror.ENOENT + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := walkParentDirLocked(rp) + if err != nil { + return err + } + _, err = checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + // TODO: actually implement RenameAt + return syserror.EPERM +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + vfsd, inode, err := walkExistingLocked(rp) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + if err := checkDeleteLocked(vfsd); err != nil { + return err + } + if !inode.isDir() { + return syserror.ENOTDIR + } + if vfsd.HasChildren() { + return syserror.ENOTEMPTY + } + if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + return err + } + inode.decRef() + return nil +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + fs.mu.RLock() + _, _, err := walkExistingLocked(rp) + fs.mu.RUnlock() + if err != nil { + return err + } + if opts.Stat.Mask == 0 { + return nil + } + // TODO: implement Inode.setStat + return syserror.EPERM +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + fs.mu.RLock() + _, inode, err := walkExistingLocked(rp) + fs.mu.RUnlock() + if err != nil { + return linux.Statx{}, err + } + var stat linux.Statx + inode.statTo(&stat) + return stat, nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + fs.mu.RLock() + _, _, err := walkExistingLocked(rp) + fs.mu.RUnlock() + if err != nil { + return linux.Statfs{}, err + } + // TODO: actually implement statfs + return linux.Statfs{}, syserror.ENOSYS +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := walkParentDirLocked(rp) + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + child := fs.newDentry(fs.newSymlink(rp.Credentials(), target)) + parentVFSD.InsertChild(&child.vfsd, pc) + parentInode.impl.(*directory).childList.PushBack(child) + return nil +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + vfsd, inode, err := walkExistingLocked(rp) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + if err := checkDeleteLocked(vfsd); err != nil { + return err + } + if inode.isDir() { + return syserror.EISDIR + } + if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + return err + } + inode.decLinksLocked() + return nil +} diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go new file mode 100644 index 000000000..f381e1a88 --- /dev/null +++ b/pkg/sentry/fsimpl/memfs/memfs.go @@ -0,0 +1,299 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package memfs provides a filesystem implementation that behaves like tmpfs: +// the Dentry tree is the sole source of truth for the state of the filesystem. +// +// memfs is intended primarily to demonstrate filesystem implementation +// patterns. Real uses cases for an in-memory filesystem should use tmpfs +// instead. +// +// Lock order: +// +// Filesystem.mu +// regularFileFD.offMu +// regularFile.mu +// Inode.mu +package memfs + +import ( + "fmt" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// Filesystem implements vfs.FilesystemImpl. +type Filesystem struct { + vfsfs vfs.Filesystem + + // mu serializes changes to the Dentry tree. + mu sync.RWMutex + + nextInoMinusOne uint64 // accessed using atomic memory operations +} + +// NewFilesystem implements vfs.FilesystemType.NewFilesystem. +func (fstype FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + var fs Filesystem + fs.vfsfs.Init(&fs) + root := fs.newDentry(fs.newDirectory(creds, 01777)) + return &fs.vfsfs, &root.vfsd, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *Filesystem) Release() { +} + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *Filesystem) Sync(ctx context.Context) error { + // All filesystem state is in-memory. + return nil +} + +// Dentry implements vfs.DentryImpl. +type Dentry struct { + vfsd vfs.Dentry + + // inode is the inode represented by this Dentry. Multiple Dentries may + // share a single non-directory Inode (with hard links). inode is + // immutable. + inode *Inode + + // memfs doesn't count references on Dentries; because the Dentry tree is + // the sole source of truth, it is by definition always consistent with the + // state of the filesystem. However, it does count references on Inodes, + // because Inode resources are released when all references are dropped. + // (memfs doesn't really have resources to release, but we implement + // reference counting because tmpfs regular files will.) + + // dentryEntry (ugh) links Dentries into their parent directory.childList. + dentryEntry +} + +func (fs *Filesystem) newDentry(inode *Inode) *Dentry { + d := &Dentry{ + inode: inode, + } + d.vfsd.Init(d) + return d +} + +// IncRef implements vfs.DentryImpl.IncRef. +func (d *Dentry) IncRef(vfsfs *vfs.Filesystem) { + d.inode.incRef() +} + +// TryIncRef implements vfs.DentryImpl.TryIncRef. +func (d *Dentry) TryIncRef(vfsfs *vfs.Filesystem) bool { + return d.inode.tryIncRef() +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *Dentry) DecRef(vfsfs *vfs.Filesystem) { + d.inode.decRef() +} + +// Inode represents a filesystem object. +type Inode struct { + // refs is a reference count. refs is accessed using atomic memory + // operations. + // + // A reference is held on all Inodes that are reachable in the filesystem + // tree. For non-directories (which may have multiple hard links), this + // means that a reference is dropped when nlink reaches 0. For directories, + // nlink never reaches 0 due to the "." entry; instead, + // Filesystem.RmdirAt() drops the reference. + refs int64 + + // Inode metadata; protected by mu and accessed using atomic memory + // operations unless otherwise specified. + mu sync.RWMutex + mode uint32 // excluding file type bits, which are based on impl + nlink uint32 // protected by Filesystem.mu instead of Inode.mu + uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic + gid uint32 // auth.KGID, but ... + ino uint64 // immutable + + impl interface{} // immutable +} + +func (i *Inode) init(impl interface{}, fs *Filesystem, creds *auth.Credentials, mode uint16) { + i.refs = 1 + i.mode = uint32(mode) + i.uid = uint32(creds.EffectiveKUID) + i.gid = uint32(creds.EffectiveKGID) + i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) + // i.nlink initialized by caller + i.impl = impl +} + +// Preconditions: Filesystem.mu must be locked for writing. +func (i *Inode) incLinksLocked() { + if atomic.AddUint32(&i.nlink, 1) <= 1 { + panic("memfs.Inode.incLinksLocked() called with no existing links") + } +} + +// Preconditions: Filesystem.mu must be locked for writing. +func (i *Inode) decLinksLocked() { + if nlink := atomic.AddUint32(&i.nlink, ^uint32(0)); nlink == 0 { + i.decRef() + } else if nlink == ^uint32(0) { // negative overflow + panic("memfs.Inode.decLinksLocked() called with no existing links") + } +} + +func (i *Inode) incRef() { + if atomic.AddInt64(&i.refs, 1) <= 1 { + panic("memfs.Inode.incRef() called without holding a reference") + } +} + +func (i *Inode) tryIncRef() bool { + for { + refs := atomic.LoadInt64(&i.refs) + if refs == 0 { + return false + } + if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) { + return true + } + } +} + +func (i *Inode) decRef() { + if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { + // This is unnecessary; it's mostly to simulate what tmpfs would do. + if regfile, ok := i.impl.(*regularFile); ok { + regfile.mu.Lock() + regfile.data = nil + atomic.StoreInt64(®file.dataLen, 0) + regfile.mu.Unlock() + } + } else if refs < 0 { + panic("memfs.Inode.decRef() called without holding a reference") + } +} + +func (i *Inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { + return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))) +} + +// Go won't inline this function, and returning linux.Statx (which is quite +// big) means spending a lot of time in runtime.duffcopy(), so instead it's an +// output parameter. +func (i *Inode) statTo(stat *linux.Statx) { + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO + stat.Blksize = 1 // usermem.PageSize in tmpfs + stat.Nlink = atomic.LoadUint32(&i.nlink) + stat.UID = atomic.LoadUint32(&i.uid) + stat.GID = atomic.LoadUint32(&i.gid) + stat.Mode = uint16(atomic.LoadUint32(&i.mode)) + stat.Ino = i.ino + // TODO: device number + switch impl := i.impl.(type) { + case *regularFile: + stat.Mode |= linux.S_IFREG + stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS + stat.Size = uint64(atomic.LoadInt64(&impl.dataLen)) + // In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in + // a uint64 accessed using atomic memory operations to avoid taking + // locks). + stat.Blocks = allocatedBlocksForSize(stat.Size) + case *directory: + stat.Mode |= linux.S_IFDIR + case *symlink: + stat.Mode |= linux.S_IFLNK + stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS + stat.Size = uint64(len(impl.target)) + stat.Blocks = allocatedBlocksForSize(stat.Size) + default: + panic(fmt.Sprintf("unknown inode type: %T", i.impl)) + } +} + +// allocatedBlocksForSize returns the number of 512B blocks needed to +// accommodate the given size in bytes, as appropriate for struct +// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block +// size is independent of the "preferred block size for I/O", struct +// stat::st_blksize and struct statx::stx_blksize.) +func allocatedBlocksForSize(size uint64) uint64 { + return (size + 511) / 512 +} + +func (i *Inode) direntType() uint8 { + switch i.impl.(type) { + case *regularFile: + return linux.DT_REG + case *directory: + return linux.DT_DIR + case *symlink: + return linux.DT_LNK + default: + panic(fmt.Sprintf("unknown inode type: %T", i.impl)) + } +} + +// fileDescription is embedded by memfs implementations of +// vfs.FileDescriptionImpl. +type fileDescription struct { + vfsfd vfs.FileDescription + + flags uint32 // status flags; immutable +} + +func (fd *fileDescription) filesystem() *Filesystem { + return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*Filesystem) +} + +func (fd *fileDescription) inode() *Inode { + return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode +} + +// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. +func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) { + return fd.flags, nil +} + +// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. +func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { + // None of the flags settable by fcntl(F_SETFL) are supported, so this is a + // no-op. + return nil +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + var stat linux.Statx + fd.inode().statTo(&stat) + return stat, nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + if opts.Stat.Mask == 0 { + return nil + } + // TODO: implement Inode.setStat + return syserror.EPERM +} diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go new file mode 100644 index 000000000..4a3603cc8 --- /dev/null +++ b/pkg/sentry/fsimpl/memfs/regular_file.go @@ -0,0 +1,155 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memfs + +import ( + "io" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +type regularFile struct { + inode Inode + + mu sync.RWMutex + data []byte + // dataLen is len(data), but accessed using atomic memory operations to + // avoid locking in Inode.stat(). + dataLen int64 +} + +func (fs *Filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *Inode { + file := ®ularFile{} + file.inode.init(file, fs, creds, mode) + file.inode.nlink = 1 // from parent directory + return &file.inode +} + +type regularFileFD struct { + fileDescription + vfs.FileDescriptionDefaultImpl + + // These are immutable. + readable bool + writable bool + + // off is the file offset. off is accessed using atomic memory operations. + // offMu serializes operations that may mutate off. + off int64 + offMu sync.Mutex +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *regularFileFD) Release() { + if fd.writable { + fd.vfsfd.VirtualDentry().Mount().EndWrite() + } +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + if !fd.readable { + return 0, syserror.EINVAL + } + f := fd.inode().impl.(*regularFile) + f.mu.RLock() + if offset >= int64(len(f.data)) { + f.mu.RUnlock() + return 0, io.EOF + } + n, err := dst.CopyOut(ctx, f.data[offset:]) + f.mu.RUnlock() + return int64(n), err +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + fd.offMu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + if !fd.writable { + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + srclen := src.NumBytes() + if srclen == 0 { + return 0, nil + } + f := fd.inode().impl.(*regularFile) + f.mu.Lock() + end := offset + srclen + if end < offset { + // Overflow. + f.mu.Unlock() + return 0, syserror.EFBIG + } + if end > f.dataLen { + f.data = append(f.data, make([]byte, end-f.dataLen)...) + atomic.StoreInt64(&f.dataLen, end) + } + n, err := src.CopyIn(ctx, f.data[offset:end]) + f.mu.Unlock() + return int64(n), err +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.offMu.Lock() + n, err := fd.PWrite(ctx, src, fd.off, opts) + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.offMu.Lock() + defer fd.offMu.Unlock() + switch whence { + case linux.SEEK_SET: + // use offset as specified + case linux.SEEK_CUR: + offset += fd.off + case linux.SEEK_END: + offset += atomic.LoadInt64(&fd.inode().impl.(*regularFile).dataLen) + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *regularFileFD) Sync(ctx context.Context) error { + return nil +} diff --git a/pkg/sentry/fsimpl/memfs/symlink.go b/pkg/sentry/fsimpl/memfs/symlink.go new file mode 100644 index 000000000..e002d1727 --- /dev/null +++ b/pkg/sentry/fsimpl/memfs/symlink.go @@ -0,0 +1,36 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memfs + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +type symlink struct { + inode Inode + target string // immutable +} + +func (fs *Filesystem) newSymlink(creds *auth.Credentials, target string) *Inode { + link := &symlink{ + target: target, + } + link.inode.init(link, fs, creds, 0777) + link.inode.nlink = 1 // from parent directory + return &link.inode +} + +// O_PATH is unimplemented, so there's no way to get a FileDescription +// representing a symlink yet. diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index 7c104fd47..5b75a4a06 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -49,6 +49,9 @@ type Stack interface { // SetTCPSACKEnabled attempts to change TCP selective acknowledgement // settings. SetTCPSACKEnabled(enabled bool) error + + // Statistics reports stack statistics. + Statistics(stat interface{}, arg string) error } // Interface contains information about a network interface. @@ -102,3 +105,7 @@ type TCPBufferSize struct { // Max is the maximum size. Max int } + +// StatDev describes one line of /proc/net/dev, i.e., stats for one network +// interface. +type StatDev [16]uint64 diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go index 624371eb6..75f9e7a77 100644 --- a/pkg/sentry/inet/test_stack.go +++ b/pkg/sentry/inet/test_stack.go @@ -81,3 +81,8 @@ func (s *TestStack) SetTCPSACKEnabled(enabled bool) error { s.TCPSACKFlag = enabled return nil } + +// Statistics implements inet.Stack.Statistics. +func (s *TestStack) Statistics(stat interface{}, arg string) error { + return nil +} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index c172d399e..e61d39c82 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -31,7 +31,7 @@ go_template_instance( go_template_instance( name = "seqatomic_taskgoroutineschedinfo", - out = "seqatomic_taskgoroutineschedinfo.go", + out = "seqatomic_taskgoroutineschedinfo_unsafe.go", package = "kernel", suffix = "TaskGoroutineSchedInfo", template = "//third_party/gvsync:generic_seqatomic", @@ -96,7 +96,8 @@ go_library( srcs = [ "abstract_socket_namespace.go", "context.go", - "fd_map.go", + "fd_table.go", + "fd_table_unsafe.go", "fs_context.go", "ipc_namespace.go", "kernel.go", @@ -111,7 +112,7 @@ go_library( "ptrace_arm64.go", "rseq.go", "seccomp.go", - "seqatomic_taskgoroutineschedinfo.go", + "seqatomic_taskgoroutineschedinfo_unsafe.go", "session_list.go", "sessions.go", "signal.go", @@ -179,7 +180,6 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/epoll", "//pkg/sentry/kernel/futex", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/sched", "//pkg/sentry/kernel/semaphore", "//pkg/sentry/kernel/shm", @@ -214,7 +214,7 @@ go_test( name = "kernel_test", size = "small", srcs = [ - "fd_map_test.go", + "fd_table_test.go", "table_test.go", "task_test.go", "timekeeper_test.go", @@ -223,9 +223,10 @@ go_test( deps = [ "//pkg/abi", "//pkg/sentry/arch", + "//pkg/sentry/context", "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", "//pkg/sentry/fs/filetest", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/sched", "//pkg/sentry/limits", "//pkg/sentry/pgalloc", diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 37cb8c8b9..1d00a6310 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -4,6 +4,17 @@ load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library") go_template_instance( + name = "atomicptr_credentials", + out = "atomicptr_credentials_unsafe.go", + package = "auth", + suffix = "Credentials", + template = "//third_party/gvsync:generic_atomicptr", + types = { + "Value": "Credentials", + }, +) + +go_template_instance( name = "id_map_range", out = "id_map_range.go", package = "auth", @@ -34,6 +45,7 @@ go_template_instance( go_library( name = "auth", srcs = [ + "atomicptr_credentials_unsafe.go", "auth.go", "capability_set.go", "context.go", diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index fb99cfc8f..f46c43128 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -30,7 +30,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 33c7dccae..9c0a4e1b4 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -61,7 +60,7 @@ const ( // +stateify savable type FileIdentifier struct { File *fs.File `state:"wait"` - Fd kdefs.FD + Fd int32 } // pollEntry holds all the state associated with an event poll entry, that is, diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go deleted file mode 100644 index 786936a7d..000000000 --- a/pkg/sentry/kernel/fd_map.go +++ /dev/null @@ -1,364 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package kernel - -import ( - "bytes" - "fmt" - "sort" - "sync" - "sync/atomic" - "syscall" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/lock" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" - "gvisor.dev/gvisor/pkg/sentry/limits" -) - -// FDs is an ordering of FD's that can be made stable. -type FDs []kdefs.FD - -func (f FDs) Len() int { - return len(f) -} - -func (f FDs) Swap(i, j int) { - f[i], f[j] = f[j], f[i] -} - -func (f FDs) Less(i, j int) bool { - return f[i] < f[j] -} - -// FDFlags define flags for an individual descriptor. -// -// +stateify savable -type FDFlags struct { - // CloseOnExec indicates the descriptor should be closed on exec. - CloseOnExec bool -} - -// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags -// representation. -func (f FDFlags) ToLinuxFileFlags() (mask uint) { - if f.CloseOnExec { - mask |= linux.O_CLOEXEC - } - return -} - -// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags -// representation. -func (f FDFlags) ToLinuxFDFlags() (mask uint) { - if f.CloseOnExec { - mask |= linux.FD_CLOEXEC - } - return -} - -// descriptor holds the details about a file descriptor, namely a pointer the -// file itself and the descriptor flags. -// -// +stateify savable -type descriptor struct { - file *fs.File - flags FDFlags -} - -// FDMap is used to manage File references and flags. -// -// +stateify savable -type FDMap struct { - refs.AtomicRefCount - k *Kernel - files map[kdefs.FD]descriptor - mu sync.RWMutex `state:"nosave"` - uid uint64 -} - -// ID returns a unique identifier for this FDMap. -func (f *FDMap) ID() uint64 { - return f.uid -} - -// NewFDMap allocates a new FDMap that may be used by tasks in k. -func (k *Kernel) NewFDMap() *FDMap { - return &FDMap{ - k: k, - files: make(map[kdefs.FD]descriptor), - uid: atomic.AddUint64(&k.fdMapUids, 1), - } -} - -// destroy removes all of the file descriptors from the map. -func (f *FDMap) destroy() { - f.RemoveIf(func(*fs.File, FDFlags) bool { - return true - }) -} - -// DecRef implements RefCounter.DecRef with destructor f.destroy. -func (f *FDMap) DecRef() { - f.DecRefWithDestructor(f.destroy) -} - -// Size returns the number of file descriptor slots currently allocated. -func (f *FDMap) Size() int { - f.mu.RLock() - defer f.mu.RUnlock() - - return len(f.files) -} - -// String is a stringer for FDMap. -func (f *FDMap) String() string { - f.mu.RLock() - defer f.mu.RUnlock() - - var b bytes.Buffer - for k, v := range f.files { - n, _ := v.file.Dirent.FullName(nil /* root */) - b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", k, n)) - } - return b.String() -} - -// NewFDFrom allocates a new FD guaranteed to be the lowest number available -// greater than or equal to from. This property is important as Unix programs -// tend to count on this allocation order. -func (f *FDMap) NewFDFrom(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) (kdefs.FD, error) { - if fd < 0 { - // Don't accept negative FDs. - return 0, syscall.EINVAL - } - - f.mu.Lock() - defer f.mu.Unlock() - - // Finds the lowest fd not in the handles map. - lim := limitSet.Get(limits.NumberOfFiles) - for i := fd; lim.Cur == limits.Infinity || i < kdefs.FD(lim.Cur); i++ { - if _, ok := f.files[i]; !ok { - file.IncRef() - f.files[i] = descriptor{file, flags} - return i, nil - } - } - - return -1, syscall.EMFILE -} - -// NewFDAt sets the file reference for the given FD. If there is an -// active reference for that FD, the ref count for that existing reference -// is decremented. -func (f *FDMap) NewFDAt(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) error { - if fd < 0 { - // Don't accept negative FDs. - return syscall.EBADF - } - - // In this one case we do not do a defer of the Unlock. The - // reason is that we must have done all the work needed for - // discarding any old open file before we return to the - // caller. In other words, the DecRef(), below, must have - // completed by the time we return to the caller to ensure - // side effects are, in fact, effected. A classic example is - // dup2(fd1, fd2); if fd2 was already open, it must be closed, - // and we don't want to resume the caller until it is; we have - // to block on the DecRef(). Hence we can not just do a 'go - // oldfile.DecRef()', since there would be no guarantee that - // it would be done before we the caller resumed. Since we - // must wait for the DecRef() to finish, and that could take - // time, it's best to first call f.muUnlock beore so we are - // not blocking other uses of this FDMap on the DecRef() call. - f.mu.Lock() - oldDesc, oldExists := f.files[fd] - lim := limitSet.Get(limits.NumberOfFiles).Cur - // if we're closing one then the effective limit is one - // more than the actual limit. - if oldExists && lim != limits.Infinity { - lim++ - } - if lim != limits.Infinity && fd >= kdefs.FD(lim) { - f.mu.Unlock() - return syscall.EMFILE - } - - file.IncRef() - f.files[fd] = descriptor{file, flags} - f.mu.Unlock() - - if oldExists { - oldDesc.file.DecRef() - } - return nil -} - -// SetFlags sets the flags for the given file descriptor, if it is valid. -func (f *FDMap) SetFlags(fd kdefs.FD, flags FDFlags) { - f.mu.Lock() - defer f.mu.Unlock() - - desc, ok := f.files[fd] - if !ok { - return - } - - f.files[fd] = descriptor{desc.file, flags} -} - -// GetDescriptor returns a reference to the file and the flags for the FD. It -// bumps its reference count as well. It returns nil if there is no File -// for the FD, i.e. if the FD is invalid. The caller must use DecRef -// when they are done. -func (f *FDMap) GetDescriptor(fd kdefs.FD) (*fs.File, FDFlags) { - f.mu.RLock() - defer f.mu.RUnlock() - - if desc, ok := f.files[fd]; ok { - desc.file.IncRef() - return desc.file, desc.flags - } - return nil, FDFlags{} -} - -// GetFile returns a reference to the File for the FD and bumps -// its reference count as well. It returns nil if there is no File -// for the FD, i.e. if the FD is invalid. The caller must use DecRef -// when they are done. -func (f *FDMap) GetFile(fd kdefs.FD) *fs.File { - f.mu.RLock() - if desc, ok := f.files[fd]; ok { - desc.file.IncRef() - f.mu.RUnlock() - return desc.file - } - f.mu.RUnlock() - return nil -} - -// fds returns an ordering of FDs. -func (f *FDMap) fds() FDs { - fds := make(FDs, 0, len(f.files)) - for fd := range f.files { - fds = append(fds, fd) - } - sort.Sort(fds) - return fds -} - -// GetFDs returns a list of valid fds. -func (f *FDMap) GetFDs() FDs { - f.mu.RLock() - defer f.mu.RUnlock() - return f.fds() -} - -// GetRefs returns a stable slice of references to all files and bumps the -// reference count on each. The caller must use DecRef on each reference when -// they're done using the slice. -func (f *FDMap) GetRefs() []*fs.File { - f.mu.RLock() - defer f.mu.RUnlock() - - fds := f.fds() - fs := make([]*fs.File, 0, len(fds)) - for _, fd := range fds { - desc := f.files[fd] - desc.file.IncRef() - fs = append(fs, desc.file) - } - return fs -} - -// Fork returns an independent FDMap pointing to the same descriptors. -func (f *FDMap) Fork() *FDMap { - f.mu.RLock() - defer f.mu.RUnlock() - - clone := f.k.NewFDMap() - - // Grab a extra reference for every file. - for fd, desc := range f.files { - desc.file.IncRef() - clone.files[fd] = desc - } - - // That's it! - return clone -} - -// unlock releases all file locks held by this FDMap's uid. Must only be -// called on a non-nil *fs.File. -func (f *FDMap) unlock(file *fs.File) { - id := lock.UniqueID(f.ID()) - file.Dirent.Inode.LockCtx.Posix.UnlockRegion(id, lock.LockRange{0, lock.LockEOF}) -} - -// inotifyFileClose generates the appropriate inotify events for f being closed. -func inotifyFileClose(f *fs.File) { - var ev uint32 - d := f.Dirent - - if fs.IsDir(d.Inode.StableAttr) { - ev |= linux.IN_ISDIR - } - - if f.Flags().Write { - ev |= linux.IN_CLOSE_WRITE - } else { - ev |= linux.IN_CLOSE_NOWRITE - } - - d.InotifyEvent(ev, 0) -} - -// Remove removes an FD from the FDMap, and returns (File, true) if a File -// one was found. Callers are expected to decrement the reference count on -// the File. Otherwise returns (nil, false). -func (f *FDMap) Remove(fd kdefs.FD) (*fs.File, bool) { - f.mu.Lock() - desc := f.files[fd] - delete(f.files, fd) - f.mu.Unlock() - if desc.file != nil { - f.unlock(desc.file) - inotifyFileClose(desc.file) - return desc.file, true - } - return nil, false -} - -// RemoveIf removes all FDs where cond is true. -func (f *FDMap) RemoveIf(cond func(*fs.File, FDFlags) bool) { - var removed []*fs.File - f.mu.Lock() - for fd, desc := range f.files { - if desc.file != nil && cond(desc.file, desc.flags) { - delete(f.files, fd) - removed = append(removed, desc.file) - } - } - f.mu.Unlock() - - for _, file := range removed { - f.unlock(file) - inotifyFileClose(file) - file.DecRef() - } -} diff --git a/pkg/sentry/kernel/fd_map_test.go b/pkg/sentry/kernel/fd_map_test.go deleted file mode 100644 index 8571dbe59..000000000 --- a/pkg/sentry/kernel/fd_map_test.go +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package kernel - -import ( - "testing" - - "gvisor.dev/gvisor/pkg/sentry/fs/filetest" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" - "gvisor.dev/gvisor/pkg/sentry/limits" -) - -const ( - // maxFD is the maximum FD to try to create in the map. - // This number of open files has been seen in the wild. - maxFD = 2 * 1024 -) - -func newTestFDMap() *FDMap { - return &FDMap{ - files: make(map[kdefs.FD]descriptor), - } -} - -// TestFDMapMany allocates maxFD FDs, i.e. maxes out the FDMap, -// until there is no room, then makes sure that NewFDAt works -// and also that if we remove one and add one that works too. -func TestFDMapMany(t *testing.T) { - file := filetest.NewTestFile(t) - limitSet := limits.NewLimitSet() - limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true /* privileged */) - - f := newTestFDMap() - for i := 0; i < maxFD; i++ { - if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { - t.Fatalf("Allocated %v FDs but wanted to allocate %v", i, maxFD) - } - } - - if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil { - t.Fatalf("f.NewFDFrom(0, r) in full map: got nil, wanted error") - } - - if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil { - t.Fatalf("f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) - } -} - -// TestFDMap does a set of simple tests to make sure simple adds, -// removes, GetRefs, and DecRefs work. The ordering is just weird -// enough that a table-driven approach seemed clumsy. -func TestFDMap(t *testing.T) { - file := filetest.NewTestFile(t) - limitSet := limits.NewLimitSet() - limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD}, true /* privileged */) - - f := newTestFDMap() - if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { - t.Fatalf("Adding an FD to an empty 1-size map: got %v, want nil", err) - } - - if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil { - t.Fatalf("Adding an FD to a filled 1-size map: got nil, wanted an error") - } - - largeLimit := limits.Limit{maxFD, maxFD} - limitSet.Set(limits.NumberOfFiles, largeLimit, true /* privileged */) - - if fd, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { - t.Fatalf("Adding an FD to a resized map: got %v, want nil", err) - } else if fd != kdefs.FD(1) { - t.Fatalf("Added an FD to a resized map: got %v, want 1", fd) - } - - if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil { - t.Fatalf("Replacing FD 1 via f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) - } - - if err := f.NewFDAt(maxFD+1, file, FDFlags{}, limitSet); err == nil { - t.Fatalf("Using an FD that was too large via f.NewFDAt(%v, r, FDFlags{}): got nil, wanted an error", maxFD+1) - } - - if ref := f.GetFile(1); ref == nil { - t.Fatalf("f.GetFile(1): got nil, wanted %v", file) - } - - if ref := f.GetFile(2); ref != nil { - t.Fatalf("f.GetFile(2): got a %v, wanted nil", ref) - } - - ref, ok := f.Remove(1) - if !ok { - t.Fatalf("f.Remove(1) for an existing FD: failed, want success") - } - ref.DecRef() - - if ref, ok := f.Remove(1); ok { - ref.DecRef() - t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") - } - -} - -func TestDescriptorFlags(t *testing.T) { - file := filetest.NewTestFile(t) - f := newTestFDMap() - limitSet := limits.NewLimitSet() - limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true /* privileged */) - - origFlags := FDFlags{CloseOnExec: true} - - if err := f.NewFDAt(2, file, origFlags, limitSet); err != nil { - t.Fatalf("f.NewFDAt(2, r, FDFlags{}): got %v, wanted nil", err) - } - - newFile, newFlags := f.GetDescriptor(2) - if newFile == nil { - t.Fatalf("f.GetFile(2): got a %v, wanted nil", newFile) - } - - if newFlags != origFlags { - t.Fatalf("new File flags %+v don't match original %+v", newFlags, origFlags) - } -} diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go new file mode 100644 index 000000000..cc3f43a45 --- /dev/null +++ b/pkg/sentry/kernel/fd_table.go @@ -0,0 +1,382 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "bytes" + "fmt" + "math" + "sync" + "sync/atomic" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/limits" +) + +// FDFlags define flags for an individual descriptor. +// +// +stateify savable +type FDFlags struct { + // CloseOnExec indicates the descriptor should be closed on exec. + CloseOnExec bool +} + +// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags +// representation. +func (f FDFlags) ToLinuxFileFlags() (mask uint) { + if f.CloseOnExec { + mask |= linux.O_CLOEXEC + } + return +} + +// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags +// representation. +func (f FDFlags) ToLinuxFDFlags() (mask uint) { + if f.CloseOnExec { + mask |= linux.FD_CLOEXEC + } + return +} + +// descriptor holds the details about a file descriptor, namely a pointer to +// the file itself and the descriptor flags. +// +// Note that this is immutable and can only be changed via operations on the +// descriptorTable. +// +// +stateify savable +type descriptor struct { + file *fs.File + flags FDFlags +} + +// FDTable is used to manage File references and flags. +// +// +stateify savable +type FDTable struct { + refs.AtomicRefCount + k *Kernel + + // uid is a unique identifier. + uid uint64 + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // used contains the number of non-nil entries. It must be accessed + // atomically. It may be read atomically without holding mu (but not + // written). + used int32 + + // descriptorTable holds descriptors. + descriptorTable `state:".(map[int32]descriptor)"` +} + +func (f *FDTable) saveDescriptorTable() map[int32]descriptor { + m := make(map[int32]descriptor) + f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + m[fd] = descriptor{ + file: file, + flags: flags, + } + }) + return m +} + +func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { + f.init() // Initialize table. + for fd, d := range m { + f.set(fd, d.file, d.flags) + + // Note that we do _not_ need to acquire a extra table + // reference here. The table reference will already be + // accounted for in the file, so we drop the reference taken by + // set above. + d.file.DecRef() + } +} + +// drop drops the table reference. +func (f *FDTable) drop(file *fs.File) { + // Release locks. + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lock.UniqueID(f.uid), lock.LockRange{0, lock.LockEOF}) + + // Send inotify events. + d := file.Dirent + var ev uint32 + if fs.IsDir(d.Inode.StableAttr) { + ev |= linux.IN_ISDIR + } + if file.Flags().Write { + ev |= linux.IN_CLOSE_WRITE + } else { + ev |= linux.IN_CLOSE_NOWRITE + } + d.InotifyEvent(ev, 0) + + // Drop the table reference. + file.DecRef() +} + +// ID returns a unique identifier for this FDTable. +func (f *FDTable) ID() uint64 { + return f.uid +} + +// NewFDTable allocates a new FDTable that may be used by tasks in k. +func (k *Kernel) NewFDTable() *FDTable { + f := &FDTable{ + k: k, + uid: atomic.AddUint64(&k.fdMapUids, 1), + } + f.init() + return f +} + +// destroy removes all of the file descriptors from the map. +func (f *FDTable) destroy() { + f.RemoveIf(func(*fs.File, FDFlags) bool { + return true + }) +} + +// DecRef implements RefCounter.DecRef with destructor f.destroy. +func (f *FDTable) DecRef() { + f.DecRefWithDestructor(f.destroy) +} + +// Size returns the number of file descriptor slots currently allocated. +func (f *FDTable) Size() int { + size := atomic.LoadInt32(&f.used) + return int(size) +} + +// forEach iterates over all non-nil files. +// +// It is the caller's responsibility to acquire an appropriate lock. +func (f *FDTable) forEach(fn func(fd int32, file *fs.File, flags FDFlags)) { + fd := int32(0) + for { + file, flags, ok := f.get(fd) + if !ok { + break + } + if file != nil { + if !file.TryIncRef() { + continue // Race caught. + } + fn(int32(fd), file, flags) + file.DecRef() + } + fd++ + } +} + +// String is a stringer for FDTable. +func (f *FDTable) String() string { + var b bytes.Buffer + f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + n, _ := file.Dirent.FullName(nil /* root */) + b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n)) + }) + return b.String() +} + +// NewFDs allocates new FDs guaranteed to be the lowest number available +// greater than or equal to the fd parameter. All files will share the set +// flags. Success is guaranteed to be all or none. +func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags FDFlags) (fds []int32, err error) { + if fd < 0 { + // Don't accept negative FDs. + return nil, syscall.EINVAL + } + + // Default limit. + end := int32(math.MaxInt32) + + // Ensure we don't get past the provided limit. + if limitSet := limits.FromContext(ctx); limitSet != nil { + lim := limitSet.Get(limits.NumberOfFiles) + if lim.Cur != limits.Infinity { + end = int32(lim.Cur) + } + if fd >= end { + return nil, syscall.EMFILE + } + } + + f.mu.Lock() + defer f.mu.Unlock() + + // Install all entries. + for i := fd; i < end && len(fds) < len(files); i++ { + if d, _, _ := f.get(i); d == nil { + f.set(i, files[len(fds)], flags) // Set the descriptor. + fds = append(fds, i) // Record the file descriptor. + } + } + + // Failure? Unwind existing FDs. + if len(fds) < len(files) { + for _, i := range fds { + f.set(i, nil, FDFlags{}) // Zap entry. + } + return nil, syscall.EMFILE + } + + return fds, nil +} + +// NewFDAt sets the file reference for the given FD. If there is an active +// reference for that FD, the ref count for that existing reference is +// decremented. +func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + f.mu.Lock() + defer f.mu.Unlock() + + // Check the limit for the provided file. + if limitSet := limits.FromContext(ctx); limitSet != nil { + if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur { + return syscall.EMFILE + } + } + + // Install the entry. + f.set(fd, file, flags) + return nil +} + +// SetFlags sets the flags for the given file descriptor. +// +// True is returned iff flags were changed. +func (f *FDTable) SetFlags(fd int32, flags FDFlags) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + f.mu.Lock() + defer f.mu.Unlock() + + file, _, _ := f.get(fd) + if file == nil { + // No file found. + return syscall.EBADF + } + + // Update the flags. + f.set(fd, file, flags) + return nil +} + +// Get returns a reference to the file and the flags for the FD or nil if no +// file is defined for the given fd. +// +// N.B. Callers are required to use DecRef when they are done. +// +//go:nosplit +func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) { + if fd < 0 { + return nil, FDFlags{} + } + + for { + file, flags, _ := f.get(fd) + if file != nil { + if !file.TryIncRef() { + continue // Race caught. + } + // Reference acquired. + return file, flags + } + // No file available. + return nil, FDFlags{} + } +} + +// GetFDs returns a list of valid fds. +func (f *FDTable) GetFDs() []int32 { + fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) + f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + fds = append(fds, fd) + }) + return fds +} + +// GetRefs returns a stable slice of references to all files and bumps the +// reference count on each. The caller must use DecRef on each reference when +// they're done using the slice. +func (f *FDTable) GetRefs() []*fs.File { + files := make([]*fs.File, 0, f.Size()) + f.forEach(func(_ int32, file *fs.File, flags FDFlags) { + file.IncRef() // Acquire a reference for caller. + files = append(files, file) + }) + return files +} + +// Fork returns an independent FDTable. +func (f *FDTable) Fork() *FDTable { + clone := f.k.NewFDTable() + + f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + // The set function here will acquire an appropriate table + // reference for the clone. We don't need anything else. + clone.set(fd, file, flags) + }) + return clone +} + +// Remove removes an FD from and returns a non-file iff successful. +// +// N.B. Callers are required to use DecRef when they are done. +func (f *FDTable) Remove(fd int32) *fs.File { + if fd < 0 { + return nil + } + + f.mu.Lock() + defer f.mu.Unlock() + + orig, _, _ := f.get(fd) + if orig != nil { + orig.IncRef() // Reference for caller. + f.set(fd, nil, FDFlags{}) // Zap entry. + } + return orig +} + +// RemoveIf removes all FDs where cond is true. +func (f *FDTable) RemoveIf(cond func(*fs.File, FDFlags) bool) { + f.mu.Lock() + defer f.mu.Unlock() + + f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + if cond(file, flags) { + f.set(fd, nil, FDFlags{}) // Clear from table. + } + }) +} diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go new file mode 100644 index 000000000..2413788e7 --- /dev/null +++ b/pkg/sentry/kernel/fd_table_test.go @@ -0,0 +1,192 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "runtime" + "sync" + "testing" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/filetest" + "gvisor.dev/gvisor/pkg/sentry/limits" +) + +const ( + // maxFD is the maximum FD to try to create in the map. + // + // This number of open files has been seen in the wild. + maxFD = 2 * 1024 +) + +func runTest(t testing.TB, fn func(ctx context.Context, fdTable *FDTable, file *fs.File, limitSet *limits.LimitSet)) { + t.Helper() // Don't show in stacks. + + // Create the limits and context. + limitSet := limits.NewLimitSet() + limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true) + ctx := contexttest.WithLimitSet(contexttest.Context(t), limitSet) + + // Create a test file.; + file := filetest.NewTestFile(t) + + // Create the table. + fdTable := new(FDTable) + fdTable.init() + + // Run the test. + fn(ctx, fdTable, file, limitSet) +} + +// TestFDTableMany allocates maxFD FDs, i.e. maxes out the FDTable, until there +// is no room, then makes sure that NewFDAt works and also that if we remove +// one and add one that works too. +func TestFDTableMany(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + for i := 0; i < maxFD; i++ { + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Allocated %v FDs but wanted to allocate %v", i, maxFD) + } + } + + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err == nil { + t.Fatalf("fdTable.NewFDs(0, r) in full map: got nil, wanted error") + } + + if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil { + t.Fatalf("fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } + }) +} + +// TestFDTable does a set of simple tests to make sure simple adds, removes, +// GetRefs, and DecRefs work. The ordering is just weird enough that a +// table-driven approach seemed clumsy. +func TestFDTable(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, limitSet *limits.LimitSet) { + // Cap the limit at one. + limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD}, true) + + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Adding an FD to an empty 1-size map: got %v, want nil", err) + } + + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err == nil { + t.Fatalf("Adding an FD to a filled 1-size map: got nil, wanted an error") + } + + // Remove the previous limit. + limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true) + + if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Adding an FD to a resized map: got %v, want nil", err) + } else if len(fds) != 1 || fds[0] != 1 { + t.Fatalf("Added an FD to a resized map: got %v, want {1}", fds) + } + + if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil { + t.Fatalf("Replacing FD 1 via fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } + + if err := fdTable.NewFDAt(ctx, maxFD+1, file, FDFlags{}); err == nil { + t.Fatalf("Using an FD that was too large via fdTable.NewFDAt(%v, r, FDFlags{}): got nil, wanted an error", maxFD+1) + } + + if ref, _ := fdTable.Get(1); ref == nil { + t.Fatalf("fdTable.Get(1): got nil, wanted %v", file) + } + + if ref, _ := fdTable.Get(2); ref != nil { + t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref) + } + + ref := fdTable.Remove(1) + if ref == nil { + t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success") + } + ref.DecRef() + + if ref := fdTable.Remove(1); ref != nil { + t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") + } + }) +} + +func TestDescriptorFlags(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + if err := fdTable.NewFDAt(ctx, 2, file, FDFlags{CloseOnExec: true}); err != nil { + t.Fatalf("fdTable.NewFDAt(2, r, FDFlags{}): got %v, wanted nil", err) + } + + newFile, flags := fdTable.Get(2) + if newFile == nil { + t.Fatalf("fdTable.Get(2): got a %v, wanted nil", newFile) + } + + if !flags.CloseOnExec { + t.Fatalf("new File flags %v don't match original %d\n", flags, 0) + } + }) +} + +func BenchmarkFDLookupAndDecRef(b *testing.B) { + b.StopTimer() // Setup. + + runTest(b, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file, file, file, file, file}, FDFlags{}) + if err != nil { + b.Fatalf("fdTable.NewFDs: got %v, wanted nil", err) + } + + b.StartTimer() // Benchmark. + for i := 0; i < b.N; i++ { + tf, _ := fdTable.Get(fds[i%len(fds)]) + tf.DecRef() + } + }) +} + +func BenchmarkFDLookupAndDecRefConcurrent(b *testing.B) { + b.StopTimer() // Setup. + + runTest(b, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file, file, file, file, file}, FDFlags{}) + if err != nil { + b.Fatalf("fdTable.NewFDs: got %v, wanted nil", err) + } + + concurrency := runtime.GOMAXPROCS(0) + if concurrency < 4 { + concurrency = 4 + } + each := b.N / concurrency + + b.StartTimer() // Benchmark. + var wg sync.WaitGroup + for i := 0; i < concurrency; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < each; i++ { + tf, _ := fdTable.Get(fds[i%len(fds)]) + tf.DecRef() + } + }() + } + wg.Wait() + }) +} diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go new file mode 100644 index 000000000..e009df974 --- /dev/null +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -0,0 +1,103 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync/atomic" + "unsafe" + + "gvisor.dev/gvisor/pkg/sentry/fs" +) + +type descriptorTable struct { + // slice is a *[]unsafe.Pointer, where each element is actually + // *descriptor object, updated atomically. + // + // Changes to the slice itself requiring holding FDTable.mu. + slice unsafe.Pointer `state:".(map[int32]*descriptor)"` +} + +// init initializes the table. +func (f *FDTable) init() { + var slice []unsafe.Pointer // Empty slice. + atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) +} + +// get gets a file entry. +// +// The boolean indicates whether this was in range. +// +//go:nosplit +func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) { + slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) + if fd >= int32(len(slice)) { + return nil, FDFlags{}, false + } + d := (*descriptor)(atomic.LoadPointer(&slice[fd])) + if d == nil { + return nil, FDFlags{}, true + } + return d.file, d.flags, true +} + +// set sets an entry. +// +// This handles accounting changes, as well as acquiring and releasing the +// reference needed by the table iff the file is different. +// +// Precondition: mu must be held. +func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) { + slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) + + // Grow the table as required. + if last := int32(len(slice)); fd >= last { + end := fd + 1 + if end < 2*last { + end = 2 * last + } + slice = append(slice, make([]unsafe.Pointer, end-last)...) + atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) + } + + // Create the new element. + var d *descriptor + if file != nil { + d = &descriptor{ + file: file, + flags: flags, + } + } + + // Update the single element. + orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(d))) + + // Acquire a table reference. + if file != nil && (orig == nil || file != orig.file) { + file.IncRef() + } + + // Drop the table reference. + if orig != nil && file != orig.file { + f.drop(orig.file) + } + + // Adjust used. + switch { + case orig == nil && file != nil: + atomic.AddInt32(&f.used, 1) + case orig != nil && file == nil: + atomic.AddInt32(&f.used, -1) + } +} diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index a08917889..ded27d668 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -51,18 +51,20 @@ type FSContext struct { func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { root.IncRef() cwd.IncRef() - return &FSContext{ + f := FSContext{ root: root, cwd: cwd, umask: umask, } + f.EnableLeakCheck("kernel.FSContext") + return &f } // destroy is the destructor for an FSContext. // // This will call DecRef on both root and cwd Dirents. If either call to -// DecRef returns an error, then it will be propigated. If both calls to -// DecRef return an error, then the one from root.DecRef will be propigated. +// DecRef returns an error, then it will be propagated. If both calls to +// DecRef return an error, then the one from root.DecRef will be propagated. // // Note that there may still be calls to WorkingDirectory() or RootDirectory() // (that return nil). This is because valid references may still be held via diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index a5cf1f627..6a31dc044 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -5,7 +5,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "atomicptr_bucket", - out = "atomicptr_bucket.go", + out = "atomicptr_bucket_unsafe.go", package = "futex", suffix = "Bucket", template = "//third_party/gvsync:generic_atomicptr", @@ -29,7 +29,7 @@ go_template_instance( go_library( name = "futex", srcs = [ - "atomicptr_bucket.go", + "atomicptr_bucket_unsafe.go", "futex.go", "waiter_list.go", ], diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 3bd5c04af..278cc8143 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -729,14 +729,14 @@ func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool } b := m.lockBucket(&k) - err = m.unlockPILocked(t, addr, tid, b) + err = m.unlockPILocked(t, addr, tid, b, &k) k.release() b.mu.Unlock() return err } -func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket) error { +func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket, key *Key) error { cur, err := t.LoadUint32(addr) if err != nil { return err @@ -746,7 +746,22 @@ func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *buc return syserror.EPERM } - if b.waiters.Empty() { + var next *Waiter // Who's the next owner? + var next2 *Waiter // Who's the one after that? + for w := b.waiters.Front(); w != nil; w = w.Next() { + if !w.key.matches(key) { + continue + } + + if next == nil { + next = w + } else { + next2 = w + break + } + } + + if next == nil { // It's safe to set 0 because there are no waiters, no new owner, and the // executing task is the current owner (no owner died bit). prev, err := t.CompareAndSwapUint32(addr, cur, 0) @@ -761,12 +776,10 @@ func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *buc return nil } - next := b.waiters.Front() - // Set next owner's TID, waiters if there are any. Resets owner died bit, if // set, because the executing task takes over as the owner. val := next.tid - if next.Next() != nil { + if next2 != nil { val |= linux.FUTEX_WAITERS } diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index f0db0838d..80a070d7e 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -40,7 +40,7 @@ func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { } } -// SemaphoreRegistry returns the semanphore set registry for this namespace. +// SemaphoreRegistry returns the semaphore set registry for this namespace. func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry { return i.semaphores } diff --git a/pkg/sentry/kernel/kdefs/BUILD b/pkg/sentry/kernel/kdefs/BUILD deleted file mode 100644 index 5d62f406a..000000000 --- a/pkg/sentry/kernel/kdefs/BUILD +++ /dev/null @@ -1,10 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "kdefs", - srcs = ["kdefs.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs", - visibility = ["//:sandbox"], -) diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 9fe9eb914..38b49cba2 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -155,7 +155,7 @@ type Kernel struct { // cpuClockTicker increments cpuClock. cpuClockTicker *ktime.Timer `state:"nosave"` - // fdMapUids is an ever-increasing counter for generating FDMap uids. + // fdMapUids is an ever-increasing counter for generating FDTable uids. // // fdMapUids is mutable, and is accessed using atomic memory operations. fdMapUids uint64 @@ -400,8 +400,8 @@ func (k *Kernel) flushMountSourceRefs() error { // There may be some open FDs whose filesystems have been unmounted. We // must flush those as well. - return k.tasks.forEachFDPaused(func(desc descriptor) error { - desc.file.Dirent.Inode.MountSource.FlushDirentRefs() + return k.tasks.forEachFDPaused(func(file *fs.File) error { + file.Dirent.Inode.MountSource.FlushDirentRefs() return nil }) } @@ -410,35 +410,35 @@ func (k *Kernel) flushMountSourceRefs() error { // task. // // Precondition: Must be called with the kernel paused. -func (ts *TaskSet) forEachFDPaused(f func(descriptor) error) error { +func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) { ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. - if t.fds == nil { + if t.fdTable == nil { continue } - for _, desc := range t.fds.files { - if err := f(desc); err != nil { - return err + t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + if lastErr := f(file); lastErr != nil && err == nil { + err = lastErr } - } + }) } - return nil + return err } func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { - return ts.forEachFDPaused(func(desc descriptor) error { - if flags := desc.file.Flags(); !flags.Write { + return ts.forEachFDPaused(func(file *fs.File) error { + if flags := file.Flags(); !flags.Write { return nil } - if sattr := desc.file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { + if sattr := file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { return nil } // Here we need all metadata synced. - syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) + syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) if err := fs.SaveFileFsyncError(syncErr); err != nil { - name, _ := desc.file.Dirent.FullName(nil /* root */) + name, _ := file.Dirent.FullName(nil /* root */) // Wrap this error in ErrSaveRejection // so that it will trigger a save // error, rather than a panic. This @@ -483,14 +483,12 @@ func (ts *TaskSet) unregisterEpollWaiters() { defer ts.mu.RUnlock() for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. - if fdmap := t.fds; fdmap != nil { - for _, desc := range fdmap.files { - if desc.file != nil { - if e, ok := desc.file.FileOperations.(*epoll.EventPoll); ok { - e.UnregisterEpollWaiters() - } + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + if e, ok := file.FileOperations.(*epoll.EventPoll); ok { + e.UnregisterEpollWaiters() } - } + }) } } } @@ -538,6 +536,8 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error { } log.Infof("Memory load took [%s].", time.Since(memoryStart)) + log.Infof("Overall load took [%s]", time.Since(loadStart)) + // Ensure that all pending asynchronous work is complete: // - namedpipe opening // - inode file opening @@ -602,9 +602,9 @@ type CreateProcessArgs struct { // Credentials is the initial credentials. Credentials *auth.Credentials - // FDMap is the initial set of file descriptors. If CreateProcess succeeds, - // it takes a reference on FDMap. - FDMap *FDMap + // FDTable is the initial set of file descriptors. If CreateProcess succeeds, + // it takes a reference on FDTable. + FDTable *FDTable // Umask is the initial umask. Umask uint @@ -679,7 +679,7 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { return ctx.args.Credentials case fs.CtxRoot: if ctx.args.Root != nil { - // Take a refernce on the root dirent that will be + // Take a reference on the root dirent that will be // given to the caller. ctx.args.Root.IncRef() return ctx.args.Root @@ -789,9 +789,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, return nil, 0, errors.New(se.String()) } - // Take a reference on the FDMap, which will be transferred to + // Take a reference on the FDTable, which will be transferred to // TaskSet.NewTask(). - args.FDMap.IncRef() + args.FDTable.IncRef() // Create the task. config := &TaskConfig{ @@ -799,7 +799,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, ThreadGroup: tg, TaskContext: tc, FSContext: newFSContext(root, wd, args.Umask), - FDMap: args.FDMap, + FDTable: args.FDTable, Credentials: args.Credentials, AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), UTSNamespace: args.UTSNamespace, @@ -871,7 +871,7 @@ func (k *Kernel) pauseTimeLocked() { } // By precondition, nothing else can be interacting with PIDNamespace.tids - // or FDMap.files, so we can iterate them without synchronization. (We + // or FDTable.files, so we can iterate them without synchronization. (We // can't hold the TaskSet mutex when pausing thread group timers because // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet // mutex, while holding the Timer mutex.) @@ -882,14 +882,14 @@ func (k *Kernel) pauseTimeLocked() { it.PauseTimer() } } - // This means we'll iterate FDMaps shared by multiple tasks repeatedly, + // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. - if fdm := t.fds; fdm != nil { - for _, desc := range fdm.files { - if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.PauseTimer() } - } + }) } } k.timekeeper.PauseUpdates() @@ -914,12 +914,12 @@ func (k *Kernel) resumeTimeLocked() { it.ResumeTimer() } } - if fdm := t.fds; fdm != nil { - for _, desc := range fdm.files { - if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.ResumeTimer() } - } + }) } } } diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 8e49070a9..247e2928e 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -39,19 +39,6 @@ const ( MaximumPipeSize = 8 << 20 ) -// Sizer is an interface for setting and getting the size of a pipe. -// -// It is implemented by Pipe and, through embedding, all other types. -type Sizer interface { - // PipeSize returns the pipe capacity in bytes. - PipeSize() int64 - - // SetPipeSize sets the new pipe capacity in bytes. - // - // The new size is returned (which may be capped). - SetPipeSize(int64) (int64, error) -} - // Pipe is an encapsulation of a platform-independent pipe. // It manages a buffered byte queue shared between a reader/writer // pair. @@ -399,15 +386,15 @@ func (p *Pipe) queued() int64 { return p.size } -// PipeSize implements PipeSizer.PipeSize. -func (p *Pipe) PipeSize() int64 { +// FifoSize implements fs.FifoSizer.FifoSize. +func (p *Pipe) FifoSize(context.Context, *fs.File) (int64, error) { p.mu.Lock() defer p.mu.Unlock() - return p.max + return p.max, nil } -// SetPipeSize implements PipeSize.SetPipeSize. -func (p *Pipe) SetPipeSize(size int64) (int64, error) { +// SetFifoSize implements fs.FifoSizer.SetFifoSize. +func (p *Pipe) SetFifoSize(size int64) (int64, error) { if size < 0 { return 0, syserror.EINVAL } diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index fedfcd921..f69dbf27b 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -77,7 +77,7 @@ func (rw *ReaderWriter) Readiness(mask waiter.EventMask) waiter.EventMask { } // Ioctl implements fs.FileOperations.Ioctl. -func (rw *ReaderWriter) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (rw *ReaderWriter) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { // Switch on ioctl request. switch int(args[1].Int()) { case linux.FIONREAD: diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index fb4a0e1e0..93fe68a3e 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -86,7 +86,7 @@ type Set struct { dead bool } -// sem represents a single semanphore from a set. +// sem represents a single semaphore from a set. // // +stateify savable type sem struct { diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 355984140..81fcd8258 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -294,6 +294,7 @@ func (tg *ThreadGroup) createSession() error { id: SessionID(id), leader: tg, } + s.refs.EnableLeakCheck("kernel.Session") // Create a new ProcessGroup, belonging to that Session. // This also has a single reference (assigned below). @@ -307,6 +308,7 @@ func (tg *ThreadGroup) createSession() error { session: s, ancestors: 0, } + pg.refs.EnableLeakCheck("kernel.ProcessGroup") // Tie them and return the result. s.processGroups.PushBack(pg) @@ -378,11 +380,13 @@ func (tg *ThreadGroup) CreateProcessGroup() error { // We manually adjust the ancestors if the parent is in the same // session. tg.processGroup.session.incRef() - pg := &ProcessGroup{ + pg := ProcessGroup{ id: ProcessGroupID(id), originator: tg, session: tg.processGroup.session, } + pg.refs.EnableLeakCheck("kernel.ProcessGroup") + if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session { pg.ancestors++ } @@ -390,20 +394,20 @@ func (tg *ThreadGroup) CreateProcessGroup() error { // Assign the new process group; adjust children. oldParentPG := tg.parentPG() tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { - childTG.processGroup.incRefWithParent(pg) + childTG.processGroup.incRefWithParent(&pg) childTG.processGroup.decRefWithParent(oldParentPG) }) tg.processGroup.decRefWithParent(oldParentPG) - tg.processGroup = pg + tg.processGroup = &pg // Add the new process group to the session. - pg.session.processGroups.PushBack(pg) + pg.session.processGroups.PushBack(&pg) // Ensure this translation is added to all namespaces. for ns := tg.pidns; ns != nil; ns = ns.parent { local := ns.tgids[tg] - ns.pgids[pg] = ProcessGroupID(local) - ns.processGroups[ProcessGroupID(local)] = pg + ns.pgids[&pg] = ProcessGroupID(local) + ns.processGroups[ProcessGroupID(local)] = &pg } return nil diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 3e9fe70e2..5bd610f68 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -224,6 +224,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi creatorPID: pid, changeTime: ktime.NowFromContext(ctx), } + shm.EnableLeakCheck("kernel.Shm") // Find the next available ID. for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index 175d1b247..8227ecf1d 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -67,6 +67,7 @@ func (s *syslog) Log() []byte { "Creating process schedule...", "Generating random numbers by fair dice roll...", "Rewriting operating system in Javascript...", + "Reticulating splines...", "Consulting tar man page...", "Forking spaghetti code...", "Checking naughty and nice process list...", diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index c297c5973..e91f82bb3 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -236,15 +236,15 @@ type Task struct { // tc is protected by mu, and is owned by the task goroutine. tc TaskContext - // fsc is the task's filesystem context. + // fsContext is the task's filesystem context. // - // fsc is protected by mu, and is owned by the task goroutine. - fsc *FSContext + // fsContext is protected by mu, and is owned by the task goroutine. + fsContext *FSContext - // fds is the task's file descriptor table. + // fdTable is the task's file descriptor table. // - // fds is protected by mu, and is owned by the task goroutine. - fds *FDMap + // fdTable is protected by mu, and is owned by the task goroutine. + fdTable *FDTable // If vforkParent is not nil, it is the task that created this task with // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when @@ -386,10 +386,11 @@ type Task struct { // creds is the task's credentials. // - // creds is protected by mu, however the value itself is immutable and can - // only be changed by a copy. After reading the pointer, access will - // proceed outside the scope of mu. creds is owned by the task goroutine. - creds *auth.Credentials + // creds.Load() may be called without synchronization. creds.Store() is + // serialized by mu. creds is owned by the task goroutine. All + // auth.Credentials objects that creds may point to, or have pointed to + // in the past, must be treated as immutable. + creds auth.AtomicPtrCredentials // utsns is the task's UTS namespace. // @@ -597,11 +598,11 @@ func (t *Task) Value(key interface{}) interface{} { case CtxTask: return t case auth.CtxCredentials: - return t.creds + return t.Credentials() case context.CtxThreadGroupID: return int32(t.ThreadGroup().ID()) case fs.CtxRoot: - return t.fsc.RootDirectory() + return t.fsContext.RootDirectory() case fs.CtxDirentCacheLimiter: return t.k.DirentCacheLimiter case inet.CtxStack: @@ -667,7 +668,7 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock { func (t *Task) IsChrooted() bool { realRoot := t.tg.mounts.Root() defer realRoot.DecRef() - root := t.fsc.RootDirectory() + root := t.fsContext.RootDirectory() if root != nil { defer root.DecRef() } @@ -688,16 +689,55 @@ func (t *Task) TaskContext() *TaskContext { // Precondition: The caller must be running on the task goroutine, or t.mu must // be locked. func (t *Task) FSContext() *FSContext { - return t.fsc + return t.fsContext } -// FDMap returns t's FDMap. FDMap does not take an additional reference on the -// returned FDMap. +// FDTable returns t's FDTable. FDMTable does not take an additional reference +// on the returned FDMap. // // Precondition: The caller must be running on the task goroutine, or t.mu must // be locked. -func (t *Task) FDMap() *FDMap { - return t.fds +func (t *Task) FDTable() *FDTable { + return t.fdTable +} + +// GetFile is a convenience wrapper t.FDTable().GetFile. +// +// Precondition: same as FDTable. +func (t *Task) GetFile(fd int32) *fs.File { + f, _ := t.fdTable.Get(fd) + return f +} + +// NewFDs is a convenience wrapper for t.FDTable().NewFDs. +// +// This automatically passes the task as the context. +// +// Precondition: same as FDTable. +func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error) { + return t.fdTable.NewFDs(t, fd, files, flags) +} + +// NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file. +// +// This automatically passes the task as the context. +// +// Precondition: same as FDTable. +func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error) { + fds, err := t.fdTable.NewFDs(t, fd, []*fs.File{file}, flags) + if err != nil { + return 0, err + } + return fds[0], nil +} + +// NewFDAt is a convenience wrapper for t.FDTable().NewFDAt. +// +// This automatically passes the task as the context. +// +// Precondition: same as FDTable. +func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error { + return t.fdTable.NewFDAt(t, fd, file, flags) } // WithMuLocked executes f with t.mu locked. diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 0e621f0d1..0916fd658 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -214,20 +214,20 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } } - var fsc *FSContext + var fsContext *FSContext if opts.NewFSContext { - fsc = t.fsc.Fork() + fsContext = t.fsContext.Fork() } else { - fsc = t.fsc - fsc.IncRef() + fsContext = t.fsContext + fsContext.IncRef() } - var fds *FDMap + var fdTable *FDTable if opts.NewFiles { - fds = t.fds.Fork() + fdTable = t.fdTable.Fork() } else { - fds = t.fds - fds.IncRef() + fdTable = t.fdTable + fdTable.IncRef() } pidns := t.tg.pidns @@ -251,8 +251,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { ThreadGroup: tg, SignalMask: t.SignalMask(), TaskContext: tc, - FSContext: fsc, - FDMap: fds, + FSContext: fsContext, + FDTable: fdTable, Credentials: creds, Niceness: t.Niceness(), NetworkNamespaced: t.netns, @@ -425,6 +425,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { if opts.NewAddressSpace || opts.NewSignalHandlers { return syserror.EINVAL } + creds := t.Credentials() if opts.NewThreadGroup { t.tg.signalHandlers.mu.Lock() if t.tg.tasksCount != 1 { @@ -439,8 +440,6 @@ func (t *Task) Unshare(opts *SharingOptions) error { if t.IsChrooted() { return syserror.EPERM } - // This temporary is needed because Go. - creds := t.Credentials() newUserNS, err := creds.NewChildUserNamespace() if err != nil { return err @@ -449,6 +448,8 @@ func (t *Task) Unshare(opts *SharingOptions) error { if err != nil { return err } + // Need to reload creds, becaue t.SetUserNamespace() changed task credentials. + creds = t.Credentials() } haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN) if opts.NewPIDNamespace { @@ -473,7 +474,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { } // Note that this must happen after NewUserNamespace, so the // new user namespace is used if there is one. - t.utsns = t.utsns.Clone(t.creds.UserNamespace) + t.utsns = t.utsns.Clone(creds.UserNamespace) } if opts.NewIPCNamespace { if !haveCapSysAdmin { @@ -482,24 +483,24 @@ func (t *Task) Unshare(opts *SharingOptions) error { } // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" - t.ipcns = NewIPCNamespace(t.creds.UserNamespace) + t.ipcns = NewIPCNamespace(creds.UserNamespace) } - var oldfds *FDMap + var oldFDTable *FDTable if opts.NewFiles { - oldfds = t.fds - t.fds = oldfds.Fork() + oldFDTable = t.fdTable + t.fdTable = oldFDTable.Fork() } - var oldfsc *FSContext + var oldFSContext *FSContext if opts.NewFSContext { - oldfsc = t.fsc - t.fsc = oldfsc.Fork() + oldFSContext = t.fsContext + t.fsContext = oldFSContext.Fork() } t.mu.Unlock() - if oldfds != nil { - oldfds.DecRef() + if oldFDTable != nil { + oldFDTable.DecRef() } - if oldfsc != nil { - oldfsc.DecRef() + if oldFSContext != nil { + oldFSContext.DecRef() } return nil } diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index cd85acaef..17a089b90 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -195,7 +195,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.Unlock() // Remove FDs with the CloseOnExec flag set. - t.fds.RemoveIf(func(file *fs.File, flags FDFlags) bool { + t.fdTable.RemoveIf(func(file *fs.File, flags FDFlags) bool { return flags.CloseOnExec }) diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index b97d65185..535f03e50 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -265,8 +265,8 @@ func (*runExitMain) execute(t *Task) taskRunState { // Releasing the MM unblocks a blocked CLONE_VFORK parent. t.unstopVforkParent() - t.fsc.DecRef() - t.fds.DecRef() + t.fsContext.DecRef() + t.fdTable.DecRef() // If this is the last task to exit from the thread group, release the // thread group's resources. diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index d77dabc05..c211b5b74 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -34,14 +34,14 @@ func (t *Task) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { }) } -// CompareAndSwapUint32 implemets futex.Target.CompareAndSwapUint32. +// CompareAndSwapUint32 implements futex.Target.CompareAndSwapUint32. func (t *Task) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { return t.MemoryManager().CompareAndSwapUint32(t, addr, old, new, usermem.IOOpts{ AddressSpaceActive: true, }) } -// LoadUint32 implemets futex.Target.LoadUint32. +// LoadUint32 implements futex.Target.LoadUint32. func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) { return t.MemoryManager().LoadUint32(t, addr, usermem.IOOpts{ AddressSpaceActive: true, diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index 39c138925..78ff14b20 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -25,30 +25,22 @@ import ( // // This value must be considered immutable. func (t *Task) Credentials() *auth.Credentials { - t.mu.Lock() - defer t.mu.Unlock() - return t.creds + return t.creds.Load() } // UserNamespace returns the user namespace associated with the task. func (t *Task) UserNamespace() *auth.UserNamespace { - t.mu.Lock() - defer t.mu.Unlock() - return t.creds.UserNamespace + return t.Credentials().UserNamespace } // HasCapabilityIn checks if the task has capability cp in user namespace ns. func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool { - t.mu.Lock() - defer t.mu.Unlock() - return t.creds.HasCapabilityIn(cp, ns) + return t.Credentials().HasCapabilityIn(cp, ns) } // HasCapability checks if the task has capability cp in its user namespace. func (t *Task) HasCapability(cp linux.Capability) bool { - t.mu.Lock() - defer t.mu.Unlock() - return t.creds.HasCapability(cp) + return t.Credentials().HasCapability(cp) } // SetUID implements the semantics of setuid(2). @@ -57,9 +49,12 @@ func (t *Task) SetUID(uid auth.UID) error { if !uid.Ok() { return syserror.EINVAL } + t.mu.Lock() defer t.mu.Unlock() - kuid := t.creds.UserNamespace.MapToKUID(uid) + + creds := t.Credentials() + kuid := creds.UserNamespace.MapToKUID(uid) if !kuid.Ok() { return syserror.EINVAL } @@ -67,17 +62,17 @@ func (t *Task) SetUID(uid auth.UID) error { // effective UID of the caller is root (more precisely: if the caller has // the CAP_SETUID capability), the real UID and saved set-user-ID are also // set." - setuid(2) - if t.creds.HasCapability(linux.CAP_SETUID) { + if creds.HasCapability(linux.CAP_SETUID) { t.setKUIDsUncheckedLocked(kuid, kuid, kuid) return nil } // "EPERM: The user is not privileged (Linux: does not have the CAP_SETUID // capability) and uid does not match the real UID or saved set-user-ID of // the calling process." - if kuid != t.creds.RealKUID && kuid != t.creds.SavedKUID { + if kuid != creds.RealKUID && kuid != creds.SavedKUID { return syserror.EPERM } - t.setKUIDsUncheckedLocked(t.creds.RealKUID, kuid, t.creds.SavedKUID) + t.setKUIDsUncheckedLocked(creds.RealKUID, kuid, creds.SavedKUID) return nil } @@ -87,37 +82,38 @@ func (t *Task) SetREUID(r, e auth.UID) error { defer t.mu.Unlock() // "Supplying a value of -1 for either the real or effective user ID forces // the system to leave that ID unchanged." - setreuid(2) - newR := t.creds.RealKUID + creds := t.Credentials() + newR := creds.RealKUID if r.Ok() { - newR = t.creds.UserNamespace.MapToKUID(r) + newR = creds.UserNamespace.MapToKUID(r) if !newR.Ok() { return syserror.EINVAL } } - newE := t.creds.EffectiveKUID + newE := creds.EffectiveKUID if e.Ok() { - newE = t.creds.UserNamespace.MapToKUID(e) + newE = creds.UserNamespace.MapToKUID(e) if !newE.Ok() { return syserror.EINVAL } } - if !t.creds.HasCapability(linux.CAP_SETUID) { + if !creds.HasCapability(linux.CAP_SETUID) { // "Unprivileged processes may only set the effective user ID to the // real user ID, the effective user ID, or the saved set-user-ID." - if newE != t.creds.RealKUID && newE != t.creds.EffectiveKUID && newE != t.creds.SavedKUID { + if newE != creds.RealKUID && newE != creds.EffectiveKUID && newE != creds.SavedKUID { return syserror.EPERM } // "Unprivileged users may only set the real user ID to the real user // ID or the effective user ID." - if newR != t.creds.RealKUID && newR != t.creds.EffectiveKUID { + if newR != creds.RealKUID && newR != creds.EffectiveKUID { return syserror.EPERM } } // "If the real user ID is set (i.e., ruid is not -1) or the effective user // ID is set to a value not equal to the previous real user ID, the saved // set-user-ID will be set to the new effective user ID." - newS := t.creds.SavedKUID - if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKUID) { + newS := creds.SavedKUID + if r.Ok() || (e.Ok() && newE != creds.EffectiveKUID) { newS = newE } t.setKUIDsUncheckedLocked(newR, newE, newS) @@ -136,23 +132,24 @@ func (t *Task) SetRESUID(r, e, s auth.UID) error { // arguments equals -1, the corresponding value is not changed." - // setresuid(2) var err error - newR := t.creds.RealKUID + creds := t.Credentials() + newR := creds.RealKUID if r.Ok() { - newR, err = t.creds.UseUID(r) + newR, err = creds.UseUID(r) if err != nil { return err } } - newE := t.creds.EffectiveKUID + newE := creds.EffectiveKUID if e.Ok() { - newE, err = t.creds.UseUID(e) + newE, err = creds.UseUID(e) if err != nil { return err } } - newS := t.creds.SavedKUID + newS := creds.SavedKUID if s.Ok() { - newS, err = t.creds.UseUID(s) + newS, err = creds.UseUID(s) if err != nil { return err } @@ -163,10 +160,10 @@ func (t *Task) SetRESUID(r, e, s auth.UID) error { // Preconditions: t.mu must be locked. func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { - root := t.creds.UserNamespace.MapToKUID(auth.RootUID) - oldR, oldE, oldS := t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID - t.creds = t.creds.Fork() // See doc for creds. - t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID = newR, newE, newS + creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. + root := creds.UserNamespace.MapToKUID(auth.RootUID) + oldR, oldE, oldS := creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID + creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID = newR, newE, newS // "1. If one or more of the real, effective or saved set user IDs was // previously 0, and as a result of the UID changes all of these IDs have a @@ -184,9 +181,9 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { // being cleared." (A thread's effective capability set is always // cleared when such a credential change is made, // regardless of the setting of the "keep capabilities" flag.) - if !t.creds.KeepCaps { - t.creds.PermittedCaps = 0 - t.creds.EffectiveCaps = 0 + if !creds.KeepCaps { + creds.PermittedCaps = 0 + creds.EffectiveCaps = 0 } } // """ @@ -197,9 +194,9 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { // permitted set is copied to the effective set. // """ if oldE == root && newE != root { - t.creds.EffectiveCaps = 0 + creds.EffectiveCaps = 0 } else if oldE != root && newE == root { - t.creds.EffectiveCaps = t.creds.PermittedCaps + creds.EffectiveCaps = creds.PermittedCaps } // "4. If the filesystem user ID is changed from 0 to nonzero (see // setfsuid(2)), then the following capabilities are cleared from the @@ -220,6 +217,7 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { // Not documented, but compare Linux's kernel/cred.c:commit_creds(). t.parentDeathSignal = 0 } + t.creds.Store(creds) } // SetGID implements the semantics of setgid(2). @@ -227,20 +225,23 @@ func (t *Task) SetGID(gid auth.GID) error { if !gid.Ok() { return syserror.EINVAL } + t.mu.Lock() defer t.mu.Unlock() - kgid := t.creds.UserNamespace.MapToKGID(gid) + + creds := t.Credentials() + kgid := creds.UserNamespace.MapToKGID(gid) if !kgid.Ok() { return syserror.EINVAL } - if t.creds.HasCapability(linux.CAP_SETGID) { + if creds.HasCapability(linux.CAP_SETGID) { t.setKGIDsUncheckedLocked(kgid, kgid, kgid) return nil } - if kgid != t.creds.RealKGID && kgid != t.creds.SavedKGID { + if kgid != creds.RealKGID && kgid != creds.SavedKGID { return syserror.EPERM } - t.setKGIDsUncheckedLocked(t.creds.RealKGID, kgid, t.creds.SavedKGID) + t.setKGIDsUncheckedLocked(creds.RealKGID, kgid, creds.SavedKGID) return nil } @@ -248,30 +249,32 @@ func (t *Task) SetGID(gid auth.GID) error { func (t *Task) SetREGID(r, e auth.GID) error { t.mu.Lock() defer t.mu.Unlock() - newR := t.creds.RealKGID + + creds := t.Credentials() + newR := creds.RealKGID if r.Ok() { - newR = t.creds.UserNamespace.MapToKGID(r) + newR = creds.UserNamespace.MapToKGID(r) if !newR.Ok() { return syserror.EINVAL } } - newE := t.creds.EffectiveKGID + newE := creds.EffectiveKGID if e.Ok() { - newE = t.creds.UserNamespace.MapToKGID(e) + newE = creds.UserNamespace.MapToKGID(e) if !newE.Ok() { return syserror.EINVAL } } - if !t.creds.HasCapability(linux.CAP_SETGID) { - if newE != t.creds.RealKGID && newE != t.creds.EffectiveKGID && newE != t.creds.SavedKGID { + if !creds.HasCapability(linux.CAP_SETGID) { + if newE != creds.RealKGID && newE != creds.EffectiveKGID && newE != creds.SavedKGID { return syserror.EPERM } - if newR != t.creds.RealKGID && newR != t.creds.EffectiveKGID { + if newR != creds.RealKGID && newR != creds.EffectiveKGID { return syserror.EPERM } } - newS := t.creds.SavedKGID - if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKGID) { + newS := creds.SavedKGID + if r.Ok() || (e.Ok() && newE != creds.EffectiveKGID) { newS = newE } t.setKGIDsUncheckedLocked(newR, newE, newS) @@ -280,26 +283,29 @@ func (t *Task) SetREGID(r, e auth.GID) error { // SetRESGID implements the semantics of the setresgid(2) syscall. func (t *Task) SetRESGID(r, e, s auth.GID) error { + var err error + t.mu.Lock() defer t.mu.Unlock() - var err error - newR := t.creds.RealKGID + + creds := t.Credentials() + newR := creds.RealKGID if r.Ok() { - newR, err = t.creds.UseGID(r) + newR, err = creds.UseGID(r) if err != nil { return err } } - newE := t.creds.EffectiveKGID + newE := creds.EffectiveKGID if e.Ok() { - newE, err = t.creds.UseGID(e) + newE, err = creds.UseGID(e) if err != nil { return err } } - newS := t.creds.SavedKGID + newS := creds.SavedKGID if s.Ok() { - newS, err = t.creds.UseGID(s) + newS, err = creds.UseGID(s) if err != nil { return err } @@ -309,9 +315,9 @@ func (t *Task) SetRESGID(r, e, s auth.GID) error { } func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { - oldE := t.creds.EffectiveKGID - t.creds = t.creds.Fork() // See doc for creds. - t.creds.RealKGID, t.creds.EffectiveKGID, t.creds.SavedKGID = newR, newE, newS + creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. + oldE := creds.EffectiveKGID + creds.RealKGID, creds.EffectiveKGID, creds.SavedKGID = newR, newE, newS if oldE != newE { // "[dumpability] is reset to the current value contained in @@ -327,6 +333,7 @@ func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { // kernel/cred.c:commit_creds(). t.parentDeathSignal = 0 } + t.creds.Store(creds) } // SetExtraGIDs attempts to change t's supplemental groups. All IDs are @@ -334,19 +341,21 @@ func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { func (t *Task) SetExtraGIDs(gids []auth.GID) error { t.mu.Lock() defer t.mu.Unlock() - if !t.creds.HasCapability(linux.CAP_SETGID) { + creds := t.Credentials() + if !creds.HasCapability(linux.CAP_SETGID) { return syserror.EPERM } kgids := make([]auth.KGID, len(gids)) for i, gid := range gids { - kgid := t.creds.UserNamespace.MapToKGID(gid) + kgid := creds.UserNamespace.MapToKGID(gid) if !kgid.Ok() { return syserror.EINVAL } kgids[i] = kgid } - t.creds = t.creds.Fork() // See doc for creds. - t.creds.ExtraKGIDs = kgids + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.ExtraKGIDs = kgids + t.creds.Store(creds) return nil } @@ -360,27 +369,29 @@ func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.Capabili if effective & ^permitted != 0 { return syserror.EPERM } + creds := t.Credentials() // "It is also a limiting superset for the capabilities that may be added // to the inheritable set by a thread that does not have the CAP_SETPCAP // capability in its effective set." - if !t.creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(t.creds.InheritableCaps|t.creds.PermittedCaps) != 0) { + if !creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(creds.InheritableCaps|creds.PermittedCaps) != 0) { return syserror.EPERM } // "If a thread drops a capability from its permitted set, it can never // reacquire that capability (unless it execve(2)s ..." - if permitted & ^t.creds.PermittedCaps != 0 { + if permitted & ^creds.PermittedCaps != 0 { return syserror.EPERM } // "... if a capability is not in the bounding set, then a thread can't add // this capability to its inheritable set, even if it was in its permitted // capabilities ..." - if inheritable & ^(t.creds.InheritableCaps|t.creds.BoundingCaps) != 0 { + if inheritable & ^(creds.InheritableCaps|creds.BoundingCaps) != 0 { return syserror.EPERM } - t.creds = t.creds.Fork() // See doc for creds. - t.creds.PermittedCaps = permitted - t.creds.InheritableCaps = inheritable - t.creds.EffectiveCaps = effective + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.PermittedCaps = permitted + creds.InheritableCaps = inheritable + creds.EffectiveCaps = effective + t.creds.Store(creds) return nil } @@ -389,11 +400,13 @@ func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.Capabili func (t *Task) DropBoundingCapability(cp linux.Capability) error { t.mu.Lock() defer t.mu.Unlock() - if !t.creds.HasCapability(linux.CAP_SETPCAP) { + creds := t.Credentials() + if !creds.HasCapability(linux.CAP_SETPCAP) { return syserror.EPERM } - t.creds = t.creds.Fork() // See doc for creds. - t.creds.BoundingCaps &^= auth.CapabilitySetOf(cp) + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.BoundingCaps &^= auth.CapabilitySetOf(cp) + t.creds.Store(creds) return nil } @@ -402,31 +415,33 @@ func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { t.mu.Lock() defer t.mu.Unlock() + creds := t.Credentials() // "A process reassociating itself with a user namespace must have the // CAP_SYS_ADMIN capability in the target user namespace." - setns(2) // // If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN // in ns (by rule 3 in auth.Credentials.HasCapability). - if !t.creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) { + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) { return syserror.EPERM } - t.creds = t.creds.Fork() // See doc for creds. - t.creds.UserNamespace = ns + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.UserNamespace = ns // "The child process created by clone(2) with the CLONE_NEWUSER flag // starts out with a complete set of capabilities in the new user // namespace. Likewise, a process that creates a new user namespace using // unshare(2) or joins an existing user namespace using setns(2) gains a // full set of capabilities in that namespace." - t.creds.PermittedCaps = auth.AllCapabilities - t.creds.InheritableCaps = 0 - t.creds.EffectiveCaps = auth.AllCapabilities - t.creds.BoundingCaps = auth.AllCapabilities + creds.PermittedCaps = auth.AllCapabilities + creds.InheritableCaps = 0 + creds.EffectiveCaps = auth.AllCapabilities + creds.BoundingCaps = auth.AllCapabilities // "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER // flag sets the "securebits" flags (see capabilities(7)) to their default // values (all flags disabled) in the child (for clone(2)) or caller (for // unshare(2), or setns(2)." - user_namespaces(7) - t.creds.KeepCaps = false + creds.KeepCaps = false + t.creds.Store(creds) return nil } @@ -435,8 +450,9 @@ func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { func (t *Task) SetKeepCaps(k bool) { t.mu.Lock() defer t.mu.Unlock() - t.creds = t.creds.Fork() // See doc for creds. - t.creds.KeepCaps = k + creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. + creds.KeepCaps = k + t.creds.Store(creds) } // updateCredsForExec updates t.creds to reflect an execve(). @@ -512,15 +528,16 @@ func (t *Task) updateCredsForExecLocked() { // the effective user ID. var newPermitted auth.CapabilitySet // since F(inheritable) == F(permitted) == 0 fileEffective := false - root := t.creds.UserNamespace.MapToKUID(auth.RootUID) - if t.creds.EffectiveKUID == root || t.creds.RealKUID == root { - newPermitted = t.creds.InheritableCaps | t.creds.BoundingCaps - if t.creds.EffectiveKUID == root { + creds := t.Credentials() + root := creds.UserNamespace.MapToKUID(auth.RootUID) + if creds.EffectiveKUID == root || creds.RealKUID == root { + newPermitted = creds.InheritableCaps | creds.BoundingCaps + if creds.EffectiveKUID == root { fileEffective = true } } - t.creds = t.creds.Fork() // See doc for creds. + creds = creds.Fork() // The credentials object is immutable. See doc for creds. // Now we enter poorly-documented, somewhat confusing territory. (The // accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds @@ -562,27 +579,28 @@ func (t *Task) updateCredsForExecLocked() { // But since no_new_privs is always set (A3 is always true), this becomes // much simpler. If B1 and B2 are false, C2 is a no-op. If B3 is false, C1 // is a no-op. So we can just do C1 and C2 unconditionally. - if t.creds.EffectiveKUID != t.creds.RealKUID || t.creds.EffectiveKGID != t.creds.RealKGID { - t.creds.EffectiveKUID = t.creds.RealKUID - t.creds.EffectiveKGID = t.creds.RealKGID + if creds.EffectiveKUID != creds.RealKUID || creds.EffectiveKGID != creds.RealKGID { + creds.EffectiveKUID = creds.RealKUID + creds.EffectiveKGID = creds.RealKGID t.parentDeathSignal = 0 } // (Saved set-user-ID is always set to the new effective user ID, and saved // set-group-ID is always set to the new effective group ID, regardless of // the above.) - t.creds.SavedKUID = t.creds.RealKUID - t.creds.SavedKGID = t.creds.RealKGID - t.creds.PermittedCaps &= newPermitted + creds.SavedKUID = creds.RealKUID + creds.SavedKGID = creds.RealKGID + creds.PermittedCaps &= newPermitted if fileEffective { - t.creds.EffectiveCaps = t.creds.PermittedCaps + creds.EffectiveCaps = creds.PermittedCaps } else { - t.creds.EffectiveCaps = 0 + creds.EffectiveCaps = 0 } // prctl(2): The "keep capabilities" value will be reset to 0 on subsequent // calls to execve(2). - t.creds.KeepCaps = false + creds.KeepCaps = false // "The bounding set is inherited at fork(2) from the thread's parent, and // is preserved across an execve(2)". So we're done. + t.creds.Store(creds) } diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index cf48663b6..a29e9b9eb 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -63,7 +63,7 @@ func (t *Task) DebugDumpState() { if mm := t.MemoryManager(); mm != nil { t.Debugf("Mappings:\n%s", mm) } - t.Debugf("FDMap:\n%s", t.fds) + t.Debugf("FDTable:\n%s", t.fdTable) } // debugDumpRegisters logs register state at log level debug. diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 9458f5c2a..d60cd62c7 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -52,9 +52,10 @@ type TaskConfig struct { // succeeds. FSContext *FSContext - // FDMap is the FDMap of the new task. A reference must be held on FDMap, - // which is transferred to TaskSet.NewTask whether or not it succeeds. - FDMap *FDMap + // FDTable is the FDTableof the new task. A reference must be held on + // FDMap, which is transferred to TaskSet.NewTask whether or not it + // succeeds. + FDTable *FDTable // Credentials is the Credentials of the new task. Credentials *auth.Credentials @@ -90,7 +91,7 @@ func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { if err != nil { cfg.TaskContext.release() cfg.FSContext.DecRef() - cfg.FDMap.DecRef() + cfg.FDTable.DecRef() return nil, err } return t, nil @@ -112,14 +113,13 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { signalMask: cfg.SignalMask, signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, tc: *tc, - fsc: cfg.FSContext, - fds: cfg.FDMap, + fsContext: cfg.FSContext, + fdTable: cfg.FDTable, p: cfg.Kernel.Platform.NewContext(), k: cfg.Kernel, ptraceTracees: make(map[*Task]struct{}), allowedCPUMask: cfg.AllowedCPUMask.Copy(), ioUsage: &usage.IO{}, - creds: cfg.Credentials, niceness: cfg.Niceness, netns: cfg.NetworkNamespaced, utsns: cfg.UTSNamespace, @@ -129,6 +129,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { futexWaiter: futex.NewWaiter(), containerID: cfg.ContainerID, } + t.creds.Store(cfg.Credentials) t.endStopCond.L = &t.tg.signalHandlers.mu t.ptraceTracer.Store((*Task)(nil)) // We don't construct t.blockingTimer until Task.run(); see that function @@ -249,8 +250,20 @@ func (ns *PIDNamespace) allocateTID() (ThreadID, error) { } // Is it available? - _, ok := ns.tasks[tid] - if !ok { + tidInUse := func() bool { + if _, ok := ns.tasks[tid]; ok { + return true + } + if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok { + return true + } + if _, ok := ns.sessions[SessionID(tid)]; ok { + return true + } + return false + }() + + if !tidInUse { ns.last = tid return tid, nil } diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go index e735a5dd0..10c6e455c 100644 --- a/pkg/sentry/kernel/task_stop.go +++ b/pkg/sentry/kernel/task_stop.go @@ -172,7 +172,7 @@ func (t *Task) beginStopLocked() { } } -// endStopLocked decerements t.stopCount to indicate that an existing internal +// endStopLocked decrements t.stopCount to indicate that an existing internal // or external stop no longer applies to t. // // Preconditions: The signal mutex must be locked. diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 3562ef179..2a97e3e8e 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -268,7 +268,7 @@ func (k *Kernel) newThreadGroup(mounts *fs.MountNamespace, ns *PIDNamespace, sh return tg } -// saveRscr is invopked by stateify. +// saveRscr is invoked by stateify. func (tg *ThreadGroup) saveRscr() *RSEQCriticalRegion { return tg.rscr.Load().(*RSEQCriticalRegion) } diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index eadacfea2..76417342a 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -122,7 +122,7 @@ func (t *Timekeeper) SetClocks(c sentrytime.Clocks) { // // In a restored sentry, monotonic time jumps forward by approximately // the same amount as real time. There are no guarantees here, we are - // just making a best-effort attempt to to make it appear that the app + // just making a best-effort attempt to make it appear that the app // was simply not scheduled for a long period, rather than that the // real time clock was changed. // diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index edfdac2a7..baa12d9a0 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -54,7 +54,7 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in // openPath opens name for loading. // // openPath returns the fs.Dirent and an *fs.File for name, which is not -// installed in the Task FDMap. The caller takes ownership of both. +// installed in the Task FDTable. The caller takes ownership of both. // // name must be a readable, executable, regular file. func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, name string) (*fs.Dirent, *fs.File, error) { diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go index c87d4687a..0a5b7ce45 100644 --- a/pkg/sentry/memmap/mapping_set.go +++ b/pkg/sentry/memmap/mapping_set.go @@ -85,7 +85,7 @@ func (mappingSetFunctions) Merge(r1 MappableRange, val1 MappingsOfRange, r2 Mapp // Each MappingOfRange in val1 must have a matching region in val2, forming // one contiguous region. for k1 := range val1 { - // We expect val2 to to contain a key that forms a contiguous + // We expect val2 to contain a key that forms a contiguous // region with k1. k2 := MappingOfRange{ MappingSpace: k1.MappingSpace, diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index c9a942023..1b746d030 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -213,7 +213,9 @@ func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) { if err != nil { return nil, err } - return &aioMappable{mfp: mfp, fr: fr}, nil + m := aioMappable{mfp: mfp, fr: fr} + m.EnableLeakCheck("mm.aioMappable") + return &m, nil } // DecRef implements refs.RefCounter.DecRef. diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 7bb96b159..f350e0109 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -74,7 +74,7 @@ type MemoryManager struct { // privateRefs is immutable. privateRefs *privateRefs - // users is the number of dependences on the mappings in the MemoryManager. + // users is the number of dependencies on the mappings in the MemoryManager. // When the number of references in users reaches zero, all mappings are // unmapped. // diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 2f8651a0a..ea2d7af74 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -45,7 +45,9 @@ type SpecialMappable struct { // // Preconditions: fr.Length() != 0. func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *SpecialMappable { - return &SpecialMappable{mfp: mfp, fr: fr, name: name} + m := SpecialMappable{mfp: mfp, fr: fr, name: name} + m.EnableLeakCheck("mm.SpecialMappable") + return &m } // DecRef implements refs.RefCounter.DecRef. diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 074e2b141..f2fd70799 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -34,7 +34,7 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms)) } - // Find a useable range. + // Find a usable range. addr, err := mm.findAvailableLocked(opts.Length, findAvailableOpts{ Addr: opts.Addr, Fixed: opts.Fixed, diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index 0b9962b2b..9aa6ec507 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -28,6 +28,7 @@ go_library( "//pkg/abi/linux", "//pkg/atomicbitops", "//pkg/log", + "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/platform/safecopy", diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 9ccf77fdf..ad8b95744 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -14,6 +14,7 @@ go_library( "bluepill_fault.go", "bluepill_unsafe.go", "context.go", + "filters.go", "kvm.go", "kvm_amd64.go", "kvm_amd64_unsafe.go", @@ -33,6 +34,7 @@ go_library( "//pkg/cpuid", "//pkg/log", "//pkg/procid", + "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go index 0effd33ac..9d8af143e 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -30,7 +30,7 @@ func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { return &((*arch.UContext64)(context).MContext) } -// dieArchSetup initialies the state for dieTrampoline. +// dieArchSetup initializes the state for dieTrampoline. // // The amd64 dieTrampoline requires the vCPU to be set in BX, and the last RIP // to be in AX. The trampoline then simulates a call to dieHandler from the diff --git a/pkg/sentry/platform/kvm/filters.go b/pkg/sentry/platform/kvm/filters.go new file mode 100644 index 000000000..7d949f1dd --- /dev/null +++ b/pkg/sentry/platform/kvm/filters.go @@ -0,0 +1,33 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + +// SyscallFilters returns syscalls made exclusively by the KVM platform. +func (*KVM) SyscallFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_ARCH_PRCTL: {}, + syscall.SYS_IOCTL: {}, + syscall.SYS_MMAP: {}, + syscall.SYS_RT_SIGSUSPEND: {}, + syscall.SYS_RT_SIGTIMEDWAIT: {}, + 0xffffffffffffffff: {}, // KVM uses syscall -1 to transition to host. + } +} diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index b49d7f3c4..ee4cd2f4d 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -141,3 +141,17 @@ func (k *KVM) NewContext() platform.Context { machine: k.machine, } } + +type constructor struct{} + +func (*constructor) New(f *os.File) (platform.Platform, error) { + return New(f) +} + +func (*constructor) OpenDevice() (*os.File, error) { + return OpenDevice() +} + +func init() { + platform.Register("kvm", &constructor{}) +} diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 69b2f92a7..679087e25 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -135,7 +135,7 @@ type dieState struct { // newVCPU creates a returns a new vCPU. // -// Precondtion: mu must be held. +// Precondition: mu must be held. func (m *machine) newVCPU() *vCPU { id := len(m.vCPUs) @@ -426,7 +426,12 @@ func (c *vCPU) unlock() { // Normal state. case vCPUUser | vCPUGuest | vCPUWaiter: // Force a transition: this must trigger a notification when we - // return from guest mode. + // return from guest mode. We must clear vCPUWaiter here + // anyways, because BounceToKernel will force a transition only + // from ring3 to ring0, which will not clear this bit. Halt may + // workaround the issue, but if there is no exception or + // syscall in this period, BounceToKernel will hang. + atomicbitops.AndUint32(&c.state, ^vCPUWaiter) c.notify() case vCPUUser | vCPUWaiter: // Waiting for the lock to be released; the responsibility is diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index c87fa7b7c..506ec9af1 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -87,7 +87,7 @@ func (c *vCPU) setCPUID() error { // setSystemTime sets the TSC for the vCPU. // -// This has to make the call many times in order to minimize the intrinstic +// This has to make the call many times in order to minimize the intrinsic // error in the offset. Unfortunately KVM does not expose a relative offset via // the API, so this is an approximation. We do this via an iterative algorithm. // This has the advantage that it can generally deal with highly variable diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index 8d76e106e..405e00292 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -21,6 +21,7 @@ package kvm import ( "fmt" + "math" "sync/atomic" "syscall" "unsafe" @@ -134,7 +135,7 @@ func (c *vCPU) notify() { syscall.SYS_FUTEX, uintptr(unsafe.Pointer(&c.state)), linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG, - ^uintptr(0), // Number of waiters. + math.MaxInt32, // Number of waiters. 0, 0, 0) if errno != 0 { throw("futex wake error") diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index 5ad98a329..ec22dbf87 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -19,8 +19,10 @@ package platform import ( "fmt" + "os" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -93,6 +95,9 @@ type Platform interface { // Platforms for which this does not hold may panic if PreemptAllCPUs is // called. PreemptAllCPUs() error + + // SyscallFilters returns syscalls made exclusively by this platform. + SyscallFilters() seccomp.SyscallRules } // NoCPUPreemptionDetection implements Platform.DetectsCPUPreemption and @@ -256,7 +261,7 @@ type AddressSpaceIO interface { LoadUint32(addr usermem.Addr) (uint32, error) } -// NoAddressSpaceIO implements AddressSpaceIO methods by panicing. +// NoAddressSpaceIO implements AddressSpaceIO methods by panicking. type NoAddressSpaceIO struct{} // CopyOut implements AddressSpaceIO.CopyOut. @@ -347,3 +352,26 @@ type File interface { func (fr FileRange) String() string { return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End) } + +// Constructor represents a platform type. +type Constructor interface { + New(deviceFile *os.File) (Platform, error) + OpenDevice() (*os.File, error) +} + +// platforms contains all available platform types. +var platforms = map[string]Constructor{} + +// Register registers a new platform type. +func Register(name string, platform Constructor) { + platforms[name] = platform +} + +// Lookup looks up the platform constructor by name. +func Lookup(name string) (Constructor, error) { + p, ok := platforms[name] + if !ok { + return nil, fmt.Errorf("unknown platform: %v", name) + } + return p, nil +} diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index 6a1343f47..1b6c54e96 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -5,6 +5,7 @@ package(licenses = ["notice"]) go_library( name = "ptrace", srcs = [ + "filters.go", "ptrace.go", "ptrace_unsafe.go", "stub_amd64.s", diff --git a/pkg/sentry/platform/ptrace/filters.go b/pkg/sentry/platform/ptrace/filters.go new file mode 100644 index 000000000..1e07cfd0d --- /dev/null +++ b/pkg/sentry/platform/ptrace/filters.go @@ -0,0 +1,33 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ptrace + +import ( + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/seccomp" +) + +// SyscallFilters returns syscalls made exclusively by the ptrace platform. +func (*PTrace) SyscallFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + unix.SYS_GETCPU: {}, + unix.SYS_SCHED_SETAFFINITY: {}, + syscall.SYS_PTRACE: {}, + syscall.SYS_TGKILL: {}, + syscall.SYS_WAIT4: {}, + } +} diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index ee7e0640c..6fd30ed25 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -45,6 +45,7 @@ package ptrace import ( + "os" "sync" "gvisor.dev/gvisor/pkg/abi/linux" @@ -236,3 +237,17 @@ func (p *PTrace) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan s func (*PTrace) NewContext() platform.Context { return &context{} } + +type constructor struct{} + +func (*constructor) New(*os.File) (platform.Platform, error) { + return New() +} + +func (*constructor) OpenDevice() (*os.File, error) { + return nil, nil +} + +func init() { + platform.Register("ptrace", &constructor{}) +} diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index f15b3415a..15e84735e 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -123,7 +123,7 @@ type subprocess struct { contexts map[*context]struct{} } -// newSubprocess returns a useable subprocess. +// newSubprocess returns a usable subprocess. // // This will either be a newly created subprocess, or one from the global pool. // The create function will be called in the latter case, which is guaranteed @@ -155,6 +155,7 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) { errChan <- err return } + firstThread.grabInitRegs() // Ready to handle requests. errChan <- nil @@ -179,6 +180,7 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) { // Detach the thread. t.detach() + t.initRegs = firstThread.initRegs // Return the thread. r <- t @@ -253,7 +255,7 @@ func (s *subprocess) newThread() *thread { return t } -// attach attachs to the thread. +// attach attaches to the thread. func (t *thread) attach() { if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_ATTACH, uintptr(t.tid), 0); errno != 0 { panic(fmt.Sprintf("unable to attach: %v", errno)) @@ -269,7 +271,9 @@ func (t *thread) attach() { // Initialize options. t.init() +} +func (t *thread) grabInitRegs() { // Grab registers. // // Note that we adjust the current register RIP value to be just before @@ -281,9 +285,9 @@ func (t *thread) attach() { t.initRegs.Rip -= initRegsRipAdjustment } -// detach detachs from the thread. +// detach detaches from the thread. // -// Because the SIGSTOP is not supressed, the thread will enter group-stop. +// Because the SIGSTOP is not suppressed, the thread will enter group-stop. func (t *thread) detach() { if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(syscall.SIGSTOP), 0, 0); errno != 0 { panic(fmt.Sprintf("can't detach new clone: %v", errno)) @@ -370,7 +374,7 @@ func (t *thread) destroy() { // init initializes trace options. func (t *thread) init() { - // Set our TRACESYSGOOD option to differeniate real SIGTRAP. We also + // Set the TRACESYSGOOD option to differentiate real SIGTRAP. // set PTRACE_O_EXITKILL to ensure that the unexpected exit of the // sentry will immediately kill the associated stubs. const PTRACE_O_EXITKILL = 0x100000 @@ -554,7 +558,7 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { if c.signalInfo.Code > 0 { // The signal was generated by the kernel. We inspect // the signal information, and may patch it in order to - // faciliate vsyscall emulation. See patchSignalInfo. + // facilitate vsyscall emulation. See patchSignalInfo. patchSignalInfo(regs, &c.signalInfo) return false } else if c.signalInfo.Code <= 0 && c.signalInfo.Pid() == int32(os.Getpid()) { diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 84d4cf0bd..87ded0bbd 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -235,6 +235,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig) } t.attach() + t.grabInitRegs() return t, nil } @@ -305,7 +306,7 @@ func (s *subprocess) createStub() (*thread, error) { arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}) if err != nil { - return nil, err + return nil, fmt.Errorf("creating stub process: %v", err) } // Wait for child to enter group-stop, so we don't stop its @@ -324,7 +325,7 @@ func (s *subprocess) createStub() (*thread, error) { arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}) if err != nil { - return nil, err + return nil, fmt.Errorf("waiting on stub process: %v", err) } childT := &thread{ diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s index 8cb8c4996..02df38331 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.s +++ b/pkg/sentry/platform/ring0/entry_amd64.s @@ -15,7 +15,7 @@ #include "funcdata.h" #include "textflag.h" -// NB: Offsets are programatically generated (see BUILD). +// NB: Offsets are programmatically generated (see BUILD). // // This file is concatenated with the definitions. diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 3577b5127..0feff8778 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -70,6 +70,14 @@ func (c *CPU) init() { c.tss.ist1Lo = uint32(stackAddr) c.tss.ist1Hi = uint32(stackAddr >> 32) + // Set the I/O bitmap base address beyond the last byte in the TSS + // to block access to the entire I/O address range. + // + // From section 18.5.2 "I/O Permission Bit Map" from Intel SDM vol1: + // I/O addresses not spanned by the map are treated as if they had set + // bits in the map. + c.tss.ioPerm = tssLimit + 1 + // Permanently set the kernel segments. c.registers.Cs = uint64(Kcode) c.registers.Ds = uint64(Kdata) diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 7a24d4806..2b03ea87c 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -14,7 +14,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index 39de46c39..81dbd7309 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -17,7 +17,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserror", diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index b646dc258..4e95101b7 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" @@ -63,7 +62,7 @@ type RightsFiles []*fs.File func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) { files := make(RightsFiles, 0, len(fds)) for _, fd := range fds { - file, _ := t.FDMap().GetDescriptor(kdefs.FD(fd)) + file := t.GetFile(fd) if file == nil { files.Release() return nil, syserror.EBADF @@ -109,7 +108,9 @@ func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) ([]int32 files, trunc := rights.Files(t, max) fds := make([]int32, 0, len(files)) for i := 0; i < max && len(files) > 0; i++ { - fd, err := t.FDMap().NewFDFrom(0, files[0], kernel.FDFlags{cloexec}, t.ThreadGroup().Limits()) + fd, err := t.NewFDFrom(0, files[0], kernel.FDFlags{ + CloseOnExec: cloexec, + }) files[0].DecRef() files = files[1:] if err != nil { @@ -224,14 +225,14 @@ func putCmsg(buf []byte, flags int, msgType uint32, align uint, data []int32) ([ return alignSlice(buf, align), flags } -func putCmsgStruct(buf []byte, msgType uint32, align uint, data interface{}) []byte { +func putCmsgStruct(buf []byte, msgLevel, msgType uint32, align uint, data interface{}) []byte { if cap(buf)-len(buf) < linux.SizeOfControlMessageHeader { return buf } ob := buf buf = putUint64(buf, uint64(linux.SizeOfControlMessageHeader)) - buf = putUint32(buf, linux.SOL_SOCKET) + buf = putUint32(buf, msgLevel) buf = putUint32(buf, msgType) hdrBuf := buf @@ -306,17 +307,28 @@ func alignSlice(buf []byte, align uint) []byte { func PackTimestamp(t *kernel.Task, timestamp int64, buf []byte) []byte { return putCmsgStruct( buf, + linux.SOL_SOCKET, linux.SO_TIMESTAMP, t.Arch().Width(), linux.NsecToTimeval(timestamp), ) } +// PackInq packs a TCP_INQ socket control message. +func PackInq(t *kernel.Task, inq int32, buf []byte) []byte { + return putCmsgStruct( + buf, + linux.SOL_TCP, + linux.TCP_INQ, + 4, + inq, + ) +} + // Parse parses a raw socket control message into portable objects. func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (transport.ControlMessages, error) { var ( - fds linux.ControlMessageRights - + fds linux.ControlMessageRights haveCreds bool creds linux.ControlMessageCredentials ) diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index 45bb24a3f..1f014f399 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -28,7 +28,6 @@ go_library( "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", "//pkg/sentry/socket", diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 8e65e1b3f..e57aed927 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -40,7 +40,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -208,6 +207,10 @@ type commonEndpoint interface { // GetSockOpt implements tcpip.Endpoint.GetSockOpt and // transport.Endpoint.GetSockOpt. GetSockOpt(interface{}) *tcpip.Error + + // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and + // transport.Endpoint.GetSockOpt. + GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) } // SocketOperations encapsulates all the state needed to represent a network stack @@ -250,6 +253,10 @@ type SocketOperations struct { // timestampNS holds the timestamp to use with SIOCTSTAMP. It is only // valid when timestampValid is true. It is protected by readMu. timestampNS int64 + + // sockOptInq corresponds to TCP_INQ. It is implemented on the epsocket + // level, because it takes into account data from readView. + sockOptInq bool } // New creates a new endpoint socket. @@ -286,14 +293,14 @@ func bytesToIPAddress(addr []byte) tcpip.Address { // GetAddress reads an sockaddr struct from the given address and converts it // to the FullAddress format. It supports AF_UNIX, AF_INET and AF_INET6 // addresses. -func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) { +func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syserr.Error) { // Make sure we have at least 2 bytes for the address family. if len(addr) < 2 { return tcpip.FullAddress{}, syserr.ErrInvalidArgument } family := usermem.ByteOrder.Uint16(addr) - if family != uint16(sfamily) { + if family != uint16(sfamily) && (!strict && family != linux.AF_UNSPEC) { return tcpip.FullAddress{}, syserr.ErrAddressFamilyNotSupported } @@ -318,7 +325,7 @@ func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) { case linux.AF_INET: var a linux.SockAddrInet if len(addr) < sockAddrInetSize { - return tcpip.FullAddress{}, syserr.ErrBadAddress + return tcpip.FullAddress{}, syserr.ErrInvalidArgument } binary.Unmarshal(addr[:sockAddrInetSize], usermem.ByteOrder, &a) @@ -331,7 +338,7 @@ func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) { case linux.AF_INET6: var a linux.SockAddrInet6 if len(addr) < sockAddrInet6Size { - return tcpip.FullAddress{}, syserr.ErrBadAddress + return tcpip.FullAddress{}, syserr.ErrInvalidArgument } binary.Unmarshal(addr[:sockAddrInet6Size], usermem.ByteOrder, &a) @@ -344,6 +351,9 @@ func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) { } return out, nil + case linux.AF_UNSPEC: + return tcpip.FullAddress{}, nil + default: return tcpip.FullAddress{}, syserr.ErrAddressFamilyNotSupported } @@ -466,7 +476,7 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { // Connect implements the linux syscall connect(2) for sockets backed by // tpcip.Endpoint. func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { - addr, err := GetAddress(s.family, sockaddr) + addr, err := GetAddress(s.family, sockaddr, false /* strict */) if err != nil { return err } @@ -499,7 +509,7 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo // Bind implements the linux syscall bind(2) for sockets backed by // tcpip.Endpoint. func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { - addr, err := GetAddress(s.family, sockaddr) + addr, err := GetAddress(s.family, sockaddr, true /* strict */) if err != nil { return err } @@ -537,7 +547,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *wait // Accept implements the linux syscall accept(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { +func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) { // Issue the accept request to get the new endpoint. ep, wq, terr := s.Endpoint.Accept() if terr != nil { @@ -575,10 +585,9 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } } - fdFlags := kernel.FDFlags{ + fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, - } - fd, e := t.FDMap().NewFDFrom(0, ns, fdFlags, t.ThreadGroup().Limits()) + }) t.Kernel().RecordSocket(ns) @@ -633,6 +642,18 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) ( } return val, nil } + if level == linux.SOL_TCP && name == linux.TCP_INQ { + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + val := int32(0) + s.readMu.Lock() + defer s.readMu.Unlock() + if s.sockOptInq { + val = 1 + } + return val, nil + } return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen) } @@ -866,6 +887,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return int32(v), nil + case linux.TCP_MAXSEG: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.MaxSegOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + case linux.TCP_KEEPIDLE: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument @@ -1035,6 +1068,15 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa s.sockOptTimestamp = usermem.ByteOrder.Uint32(optVal) != 0 return nil } + if level == linux.SOL_TCP && name == linux.TCP_INQ { + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + s.readMu.Lock() + defer s.readMu.Unlock() + s.sockOptInq = usermem.ByteOrder.Uint32(optVal) != 0 + return nil + } return SetSockOpt(t, s, s.Endpoint, level, name, optVal) } @@ -1218,6 +1260,14 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * v := usermem.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.QuickAckOption(v))) + case linux.TCP_MAXSEG: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MaxSegOption(v))) + case linux.TCP_KEEPIDLE: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument @@ -1246,6 +1296,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * return syserr.TranslateNetstackError(err) } return nil + case linux.TCP_REPAIR_OPTIONS: t.Kernel().EmitUnimplementedEvent(t) @@ -1471,7 +1522,6 @@ func emitUnimplementedEventTCP(t *kernel.Task, name int) { linux.TCP_FASTOPEN_CONNECT, linux.TCP_FASTOPEN_KEY, linux.TCP_FASTOPEN_NO_COOKIE, - linux.TCP_INQ, linux.TCP_KEEPCNT, linux.TCP_KEEPIDLE, linux.TCP_KEEPINTVL, @@ -1726,6 +1776,18 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq return 0, err } +func (s *SocketOperations) fillCmsgInq(cmsg *socket.ControlMessages) { + if !s.sockOptInq { + return + } + rcvBufUsed, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) + if err != nil { + return + } + cmsg.IP.HasInq = true + cmsg.IP.Inq = int32(len(s.readView) + rcvBufUsed) +} + // nonBlockingRead issues a non-blocking read. // // TODO(b/78348848): Support timestamps for stream sockets. @@ -1745,7 +1807,9 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe s.readMu.Lock() n, err := s.coalescingRead(ctx, dst, trunc) s.readMu.Unlock() - return n, 0, nil, 0, socket.ControlMessages{}, err + cmsg := s.controlMessages() + s.fillCmsgInq(&cmsg) + return n, 0, nil, 0, cmsg, err } s.readMu.Lock() @@ -1758,8 +1822,8 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe if !isPacket && peek && trunc { // MSG_TRUNC with MSG_PEEK on a TCP socket returns the // amount that could be read. - var rql tcpip.ReceiveQueueSizeOption - if err := s.Endpoint.GetSockOpt(&rql); err != nil { + rql, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) + if err != nil { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err) } available := len(s.readView) + int(rql) @@ -1827,7 +1891,9 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe n = msgLen } - return n, flags, addr, addrLen, s.controlMessages(), syserr.FromError(err) + cmsg := s.controlMessages() + s.fillCmsgInq(&cmsg) + return n, flags, addr, addrLen, cmsg, syserr.FromError(err) } func (s *SocketOperations) controlMessages() socket.ControlMessages { @@ -1924,7 +1990,7 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] var addr *tcpip.FullAddress if len(to) > 0 { - addrBuf, err := GetAddress(s.family, to) + addrBuf, err := GetAddress(s.family, to, true /* strict */) if err != nil { return 0, err } @@ -1993,7 +2059,7 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] } // Ioctl implements fs.FileOperations.Ioctl. -func (s *SocketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { // SIOCGSTAMP is implemented by epsocket rather than all commonEndpoint // sockets. // TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP. @@ -2065,9 +2131,9 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc return 0, err case linux.TIOCINQ: - var v tcpip.ReceiveQueueSizeOption - if err := ep.GetSockOpt(&v); err != nil { - return 0, syserr.TranslateNetstackError(err).ToError() + v, terr := ep.GetSockOptInt(tcpip.ReceiveQueueSizeOption) + if terr != nil { + return 0, syserr.TranslateNetstackError(terr).ToError() } if v > math.MaxInt32 { diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go index 6d2b5d038..421f93dc4 100644 --- a/pkg/sentry/socket/epsocket/provider.go +++ b/pkg/sentry/socket/epsocket/provider.go @@ -40,42 +40,49 @@ type provider struct { } // getTransportProtocol figures out transport protocol. Currently only TCP, -// UDP, and ICMP are supported. -func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol int) (tcpip.TransportProtocolNumber, *syserr.Error) { +// UDP, and ICMP are supported. The bool return value is true when this socket +// is associated with a transport protocol. This is only false for SOCK_RAW, +// IPPROTO_IP sockets. +func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol int) (tcpip.TransportProtocolNumber, bool, *syserr.Error) { switch stype { case linux.SOCK_STREAM: if protocol != 0 && protocol != syscall.IPPROTO_TCP { - return 0, syserr.ErrInvalidArgument + return 0, true, syserr.ErrInvalidArgument } - return tcp.ProtocolNumber, nil + return tcp.ProtocolNumber, true, nil case linux.SOCK_DGRAM: switch protocol { case 0, syscall.IPPROTO_UDP: - return udp.ProtocolNumber, nil + return udp.ProtocolNumber, true, nil case syscall.IPPROTO_ICMP: - return header.ICMPv4ProtocolNumber, nil + return header.ICMPv4ProtocolNumber, true, nil case syscall.IPPROTO_ICMPV6: - return header.ICMPv6ProtocolNumber, nil + return header.ICMPv6ProtocolNumber, true, nil } case linux.SOCK_RAW: // Raw sockets require CAP_NET_RAW. creds := auth.CredentialsFromContext(ctx) if !creds.HasCapability(linux.CAP_NET_RAW) { - return 0, syserr.ErrPermissionDenied + return 0, true, syserr.ErrPermissionDenied } switch protocol { case syscall.IPPROTO_ICMP: - return header.ICMPv4ProtocolNumber, nil + return header.ICMPv4ProtocolNumber, true, nil case syscall.IPPROTO_UDP: - return header.UDPProtocolNumber, nil + return header.UDPProtocolNumber, true, nil case syscall.IPPROTO_TCP: - return header.TCPProtocolNumber, nil + return header.TCPProtocolNumber, true, nil + // IPPROTO_RAW signifies that the raw socket isn't assigned to + // a transport protocol. Users will be able to write packets' + // IP headers and won't receive anything. + case syscall.IPPROTO_RAW: + return tcpip.TransportProtocolNumber(0), false, nil } } - return 0, syserr.ErrProtocolNotSupported + return 0, true, syserr.ErrProtocolNotSupported } // Socket creates a new socket object for the AF_INET or AF_INET6 family. @@ -93,7 +100,7 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (* } // Figure out the transport protocol. - transProto, err := getTransportProtocol(t, stype, protocol) + transProto, associated, err := getTransportProtocol(t, stype, protocol) if err != nil { return nil, err } @@ -103,7 +110,7 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (* var e *tcpip.Error wq := &waiter.Queue{} if stype == linux.SOCK_RAW { - ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq) + ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated) } else { ep, e = eps.Stack.NewEndpoint(transProto, p.netProto, wq) } diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go index 1627a4f68..7eef19f74 100644 --- a/pkg/sentry/socket/epsocket/stack.go +++ b/pkg/sentry/socket/epsocket/stack.go @@ -138,3 +138,8 @@ func (s *Stack) TCPSACKEnabled() (bool, error) { func (s *Stack) SetTCPSACKEnabled(enabled bool) error { return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enabled))).ToError() } + +// Statistics implements inet.Stack.Statistics. +func (s *Stack) Statistics(stat interface{}, arg string) error { + return syserr.ErrEndpointOperation.ToError() +} diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 4f670beb4..a951f1bb0 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -26,7 +26,6 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/inet", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", "//pkg/sentry/socket", diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index c63f3aacf..7f69406b7 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -190,7 +189,7 @@ func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo } // Accept implements socket.Socket.Accept. -func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { +func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) { var peerAddr []byte var peerAddrlen uint32 var peerAddrPtr *byte @@ -236,11 +235,11 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } defer f.DecRef() - fdFlags := kernel.FDFlags{ + kfd, kerr := t.NewFDFrom(0, f, kernel.FDFlags{ CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, - } - kfd, kerr := t.FDMap().NewFDFrom(0, f, fdFlags, t.ThreadGroup().Limits()) + }) t.Kernel().RecordSocket(f) + return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr) } diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go index 7bd3a70c4..6c69ba9c7 100644 --- a/pkg/sentry/socket/hostinet/socket_unsafe.go +++ b/pkg/sentry/socket/hostinet/socket_unsafe.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" @@ -52,7 +53,7 @@ func writev(fd int, srcs []syscall.Iovec) (uint64, error) { } // Ioctl implements fs.FileOperations.Ioctl. -func (s *socketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (s *socketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { switch cmd := uintptr(args[1].Int()); cmd { case syscall.TIOCINQ, syscall.TIOCOUTQ: var val int32 diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index 11f94281c..cc1f66fa1 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -244,3 +244,8 @@ func (s *Stack) TCPSACKEnabled() (bool, error) { func (s *Stack) SetTCPSACKEnabled(enabled bool) error { return syserror.EACCES } + +// Statistics implements inet.Stack.Statistics. +func (s *Stack) Statistics(stat interface{}, arg string) error { + return syserror.EOPNOTSUPP +} diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index f6b001b63..45ebb2a0e 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -20,7 +20,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/socket", "//pkg/sentry/socket/netlink/port", diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 87c0c77bc..f3d6c1e9b 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port" @@ -173,7 +172,7 @@ func (s *Socket) EventUnregister(e *waiter.Entry) { } // Ioctl implements fs.FileOperations.Ioctl. -func (s *Socket) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (*Socket) Ioctl(context.Context, *fs.File, usermem.IO, arch.SyscallArguments) (uintptr, error) { // TODO(b/68878065): no ioctls supported. return 0, syserror.ENOTTY } @@ -272,7 +271,7 @@ func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr } // Accept implements socket.Socket.Accept. -func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { +func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) { // Netlink sockets never support accept. return 0, nil, 0, syserr.ErrNotSupported } diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD index 96d374383..5061dcbde 100644 --- a/pkg/sentry/socket/rpcinet/BUILD +++ b/pkg/sentry/socket/rpcinet/BUILD @@ -25,7 +25,6 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/inet", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/socket", "//pkg/sentry/socket/hostinet", diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index c76b48ead..ccaaddbfc 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn" @@ -286,7 +285,7 @@ func rpcAccept(t *kernel.Task, fd uint32, peer bool) (*pb.AcceptResponse_ResultP } // Accept implements socket.Socket.Accept. -func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { +func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) { payload, se := rpcAccept(t, s.fd, peerRequested) // Check if we need to block. @@ -336,10 +335,9 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, }) defer file.DecRef() - fdFlags := kernel.FDFlags{ + fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, - } - fd, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits()) + }) if err != nil { return 0, nil, 0, syserr.FromError(err) } @@ -564,7 +562,7 @@ func ifconfIoctlFromStack(ctx context.Context, io usermem.IO, ifc *linux.IFConf) } // Ioctl implements fs.FileOperations.Ioctl. -func (s *socketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (s *socketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { t := ctx.(*kernel.Task) cmd := uint32(args[1].Int()) diff --git a/pkg/sentry/socket/rpcinet/stack.go b/pkg/sentry/socket/rpcinet/stack.go index 3038f25a7..49bd3a220 100644 --- a/pkg/sentry/socket/rpcinet/stack.go +++ b/pkg/sentry/socket/rpcinet/stack.go @@ -133,3 +133,8 @@ func (s *Stack) TCPSACKEnabled() (bool, error) { func (s *Stack) SetTCPSACKEnabled(enabled bool) error { panic("rpcinet handles procfs directly this method should not be called") } + +// Statistics implements inet.Stack.Statistics. +func (s *Stack) Statistics(stat interface{}, arg string) error { + return syserr.ErrEndpointOperation.ToError() +} diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 933120f34..0efa58a58 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -53,7 +52,7 @@ type Socket interface { // Accept implements the accept4(2) linux syscall. // Returns fd, real peer address length and error. Real peer address // length is only set if len(peer) > 0. - Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) + Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) // Bind implements the bind(2) linux syscall. Bind(t *kernel.Task, sockaddr []byte) *syserr.Error diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 8580eb87d..da9977fde 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -20,7 +20,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", "//pkg/sentry/socket", diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index e4c416233..73d2df15d 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -143,7 +143,9 @@ func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (E } q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit} + q1.EnableLeakCheck("transport.queue") q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit} + q2.EnableLeakCheck("transport.queue") if stype == linux.SOCK_STREAM { a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}} @@ -294,12 +296,14 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn } readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit} + readQueue.EnableLeakCheck("transport.queue") ne.connected = &connectedEndpoint{ endpoint: ce, writeQueue: readQueue, } writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit} + writeQueue.EnableLeakCheck("transport.queue") if e.stype == linux.SOCK_STREAM { ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}} } else { diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index cb2b60339..c7f7c5b16 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -23,7 +23,7 @@ import ( ) // connectionlessEndpoint is a unix endpoint for unix sockets that support operating in -// a conectionless fashon. +// a connectionless fashon. // // Specifically, this means datagram unix sockets not created with // socketpair(2). @@ -41,7 +41,9 @@ var ( // NewConnectionless creates a new unbound dgram endpoint. func NewConnectionless(ctx context.Context) Endpoint { ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}} - ep.receiver = &queueReceiver{readQueue: &queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit}} + q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit} + q.EnableLeakCheck("transport.queue") + ep.receiver = &queueReceiver{readQueue: &q} return ep } @@ -52,29 +54,24 @@ func (e *connectionlessEndpoint) isBound() bool { // Close puts the endpoint in a closed state and frees all resources associated // with it. -// -// The socket will be a fresh state after a call to close and may be reused. -// That is, close may be used to "unbind" or "disconnect" the socket in error -// paths. func (e *connectionlessEndpoint) Close() { e.Lock() - var r Receiver - if e.Connected() { - e.receiver.CloseRecv() - r = e.receiver - e.receiver = nil - + if e.connected != nil { e.connected.Release() e.connected = nil } + if e.isBound() { e.path = "" } + + e.receiver.CloseRecv() + r := e.receiver + e.receiver = nil e.Unlock() - if r != nil { - r.CloseNotify() - r.Release() - } + + r.CloseNotify() + r.Release() } // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. @@ -137,6 +134,9 @@ func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoi } e.Lock() + if e.connected != nil { + e.connected.Release() + } e.connected = connected e.Unlock() diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index f67f6fee0..0415fae9a 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -100,7 +100,7 @@ func (q *queue) IsWritable() bool { // Enqueue adds an entry to the data queue if room is available. // -// If truncate is true, Enqueue may truncate the message beforing enqueuing it. +// If truncate is true, Enqueue may truncate the message before enqueuing it. // Otherwise, the entire message must fit. If n < e.Length(), err indicates why. // // If notify is true, ReaderQueue.Notify must be called: diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index b0765ba55..7fb9cb1e0 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -179,6 +179,10 @@ type Endpoint interface { // tcpip.*Option types. GetSockOpt(opt interface{}) *tcpip.Error + // GetSockOptInt gets a socket option for simple cases when a return + // value has the int type. + GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) + // State returns the current state of the socket, as represented by Linux in // procfs. State() uint32 @@ -834,33 +838,39 @@ func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error { return nil } -// GetSockOpt implements tcpip.Endpoint.GetSockOpt. -func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { - switch o := opt.(type) { - case tcpip.ErrorOption: - return nil - - case *tcpip.SendQueueSizeOption: +func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) { + switch opt { + case tcpip.ReceiveQueueSizeOption: + v := 0 e.Lock() if !e.Connected() { e.Unlock() - return tcpip.ErrNotConnected + return -1, tcpip.ErrNotConnected } - qs := tcpip.SendQueueSizeOption(e.connected.SendQueuedSize()) + v = int(e.receiver.RecvQueuedSize()) e.Unlock() - if qs < 0 { - return tcpip.ErrQueueSizeNotSupported + if v < 0 { + return -1, tcpip.ErrQueueSizeNotSupported } - *o = qs + return v, nil + default: + return -1, tcpip.ErrUnknownProtocolOption + } +} + +// GetSockOpt implements tcpip.Endpoint.GetSockOpt. +func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { + switch o := opt.(type) { + case tcpip.ErrorOption: return nil - case *tcpip.ReceiveQueueSizeOption: + case *tcpip.SendQueueSizeOption: e.Lock() if !e.Connected() { e.Unlock() return tcpip.ErrNotConnected } - qs := tcpip.ReceiveQueueSizeOption(e.receiver.RecvQueuedSize()) + qs := tcpip.SendQueueSizeOption(e.connected.SendQueuedSize()) e.Unlock() if qs < 0 { return tcpip.ErrQueueSizeNotSupported diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 5fc43db8c..eb262ecaf 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" @@ -69,10 +68,19 @@ func New(ctx context.Context, endpoint transport.Endpoint, stype linux.SockType) // NewWithDirent creates a new unix socket using an existing dirent. func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, stype linux.SockType, flags fs.FileFlags) *fs.File { - return fs.NewFile(ctx, d, flags, &SocketOperations{ + // You can create AF_UNIX, SOCK_RAW sockets. They're the same as + // SOCK_DGRAM and don't require CAP_NET_RAW. + if stype == linux.SOCK_RAW { + stype = linux.SOCK_DGRAM + } + + s := SocketOperations{ ep: ep, stype: stype, - }) + } + s.EnableLeakCheck("unix.SocketOperations") + + return fs.NewFile(ctx, d, flags, &s) } // DecRef implements RefCounter.DecRef. @@ -108,7 +116,7 @@ func (s *SocketOperations) Endpoint() transport.Endpoint { // extractPath extracts and validates the address. func extractPath(sockaddr []byte) (string, *syserr.Error) { - addr, err := epsocket.GetAddress(linux.AF_UNIX, sockaddr) + addr, err := epsocket.GetAddress(linux.AF_UNIX, sockaddr, true /* strict */) if err != nil { return "", err } @@ -152,7 +160,7 @@ func (s *SocketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *sy } // Ioctl implements fs.FileOperations.Ioctl. -func (s *SocketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { return epsocket.Ioctl(ctx, s.ep, io, args) } @@ -191,7 +199,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, * // Accept implements the linux syscall accept(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { +func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) { // Issue the accept request to get the new endpoint. ep, err := s.ep.Accept() if err != nil { @@ -226,10 +234,9 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } } - fdFlags := kernel.FDFlags{ + fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, - } - fd, e := t.FDMap().NewFDFrom(0, ns, fdFlags, t.ThreadGroup().Limits()) + }) if e != nil { return 0, nil, 0, syserr.FromError(e) } @@ -638,7 +645,7 @@ func (*provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs // Create the endpoint and socket. var ep transport.Endpoint switch stype { - case linux.SOCK_DGRAM: + case linux.SOCK_DGRAM, linux.SOCK_RAW: ep = transport.NewConnectionless(t) case linux.SOCK_SEQPACKET, linux.SOCK_STREAM: ep = transport.NewConnectioned(t, stype, t.Kernel()) @@ -657,7 +664,7 @@ func (*provider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.F } switch stype { - case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET: + case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET, linux.SOCK_RAW: // Ok default: return nil, nil, syserr.ErrInvalidArgument diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index d77c7a433..445d25010 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -30,7 +30,6 @@ go_library( "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/socket/control", "//pkg/sentry/socket/epsocket", "//pkg/sentry/socket/netlink", diff --git a/pkg/sentry/strace/capability.go b/pkg/sentry/strace/capability.go index f4a8e6365..3255dc18d 100644 --- a/pkg/sentry/strace/capability.go +++ b/pkg/sentry/strace/capability.go @@ -19,7 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" ) -// CapabilityBitset is the set of capabilties in a bitset. +// CapabilityBitset is the set of capabilities in a bitset. var CapabilityBitset = abi.FlagSet{ { Flag: 1 << uint32(linux.CAP_CHOWN), diff --git a/pkg/sentry/strace/poll.go b/pkg/sentry/strace/poll.go index 57cf6b139..5187594a7 100644 --- a/pkg/sentry/strace/poll.go +++ b/pkg/sentry/strace/poll.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/usermem" ) @@ -50,7 +49,7 @@ func pollFD(t *kernel.Task, pfd *linux.PollFD, post bool) string { if post { revents = PollEventSet.Parse(uint64(pfd.REvents)) } - return fmt.Sprintf("{FD: %s, Events: %s, REvents: %s}", fd(t, kdefs.FD(pfd.FD)), PollEventSet.Parse(uint64(pfd.Events)), revents) + return fmt.Sprintf("{FD: %s, Events: %s, REvents: %s}", fd(t, pfd.FD), PollEventSet.Parse(uint64(pfd.Events)), revents) } func pollFDs(t *kernel.Task, addr usermem.Addr, nfds uint, post bool) string { diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index f9cf2eb21..386b40af7 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -332,7 +332,7 @@ func sockAddr(t *kernel.Task, addr usermem.Addr, length uint32) string { switch family { case linux.AF_INET, linux.AF_INET6, linux.AF_UNIX: - fa, err := epsocket.GetAddress(int(family), b) + fa, err := epsocket.GetAddress(int(family), b, true /* strict */) if err != nil { return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err) } diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 86e9c5690..311389547 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -31,7 +31,6 @@ import ( "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" pb "gvisor.dev/gvisor/pkg/sentry/strace/strace_go_proto" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -133,7 +132,7 @@ func path(t *kernel.Task, addr usermem.Addr) string { return fmt.Sprintf("%#x %s", addr, path) } -func fd(t *kernel.Task, fd kdefs.FD) string { +func fd(t *kernel.Task, fd int32) string { root := t.FSContext().RootDirectory() if root != nil { defer root.DecRef() @@ -151,7 +150,7 @@ func fd(t *kernel.Task, fd kdefs.FD) string { return fmt.Sprintf("AT_FDCWD %s", name) } - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { // Cast FD to uint64 to avoid printing negative hex. return fmt.Sprintf("%#x (bad FD)", uint64(fd)) @@ -375,7 +374,7 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo } switch i.format[arg] { case FD: - output = append(output, fd(t, kdefs.FD(args[arg].Int()))) + output = append(output, fd(t, args[arg].Int())) case WriteBuffer: output = append(output, dump(t, args[arg].Pointer(), args[arg+1].SizeT(), maximumBlobSize)) case WriteIOVec: diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD index 18fddee76..79d972202 100644 --- a/pkg/sentry/syscalls/BUILD +++ b/pkg/sentry/syscalls/BUILD @@ -15,7 +15,6 @@ go_library( "//pkg/sentry/arch", "//pkg/sentry/kernel", "//pkg/sentry/kernel/epoll", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/syserror", "//pkg/waiter", diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go index c710ec9e3..87dcad18b 100644 --- a/pkg/sentry/syscalls/epoll.go +++ b/pkg/sentry/syscalls/epoll.go @@ -15,25 +15,23 @@ package syscalls import ( - "syscall" "time" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) // CreateEpoll implements the epoll_create(2) linux syscall. -func CreateEpoll(t *kernel.Task, closeOnExec bool) (kdefs.FD, error) { +func CreateEpoll(t *kernel.Task, closeOnExec bool) (int32, error) { file := epoll.NewEventPoll(t) defer file.DecRef() - flags := kernel.FDFlags{ + fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: closeOnExec, - } - fd, err := t.FDMap().NewFDFrom(0, file, flags, t.ThreadGroup().Limits()) + }) if err != nil { return 0, err } @@ -42,25 +40,25 @@ func CreateEpoll(t *kernel.Task, closeOnExec bool) (kdefs.FD, error) { } // AddEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_ADD. -func AddEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error { +func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error { // Get epoll from the file descriptor. - epollfile := t.FDMap().GetFile(epfd) + epollfile := t.GetFile(epfd) if epollfile == nil { - return syscall.EBADF + return syserror.EBADF } defer epollfile.DecRef() // Get the target file id. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return syscall.EBADF + return syserror.EBADF } defer file.DecRef() // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return syscall.EBADF + return syserror.EBADF } // Try to add the entry. @@ -68,25 +66,25 @@ func AddEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags } // UpdateEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_MOD. -func UpdateEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error { +func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error { // Get epoll from the file descriptor. - epollfile := t.FDMap().GetFile(epfd) + epollfile := t.GetFile(epfd) if epollfile == nil { - return syscall.EBADF + return syserror.EBADF } defer epollfile.DecRef() // Get the target file id. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return syscall.EBADF + return syserror.EBADF } defer file.DecRef() // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return syscall.EBADF + return syserror.EBADF } // Try to update the entry. @@ -94,25 +92,25 @@ func UpdateEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFl } // RemoveEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_DEL. -func RemoveEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD) error { +func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { // Get epoll from the file descriptor. - epollfile := t.FDMap().GetFile(epfd) + epollfile := t.GetFile(epfd) if epollfile == nil { - return syscall.EBADF + return syserror.EBADF } defer epollfile.DecRef() // Get the target file id. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return syscall.EBADF + return syserror.EBADF } defer file.DecRef() // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return syscall.EBADF + return syserror.EBADF } // Try to remove the entry. @@ -120,18 +118,18 @@ func RemoveEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD) error { } // WaitEpoll implements the epoll_wait(2) linux syscall. -func WaitEpoll(t *kernel.Task, fd kdefs.FD, max int, timeout int) ([]epoll.Event, error) { +func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]epoll.Event, error) { // Get epoll from the file descriptor. - epollfile := t.FDMap().GetFile(fd) + epollfile := t.GetFile(fd) if epollfile == nil { - return nil, syscall.EBADF + return nil, syserror.EBADF } defer epollfile.DecRef() // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return nil, syscall.EBADF + return nil, syserror.EBADF } // Try to read events and return right away if we got them or if the @@ -164,7 +162,7 @@ func WaitEpoll(t *kernel.Task, fd kdefs.FD, max int, timeout int) ([]epoll.Event } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { - if err == syscall.ETIMEDOUT { + if err == syserror.ETIMEDOUT { return nil, nil } diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 6e5be0158..33a40b9c6 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -71,7 +71,6 @@ go_library( "//pkg/sentry/kernel/epoll", "//pkg/sentry/kernel/eventfd", "//pkg/sentry/kernel/fasync", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/sched", "//pkg/sentry/kernel/shm", diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index ac3905c5c..264301bfa 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -17,7 +17,6 @@ package linux import ( "io" "sync" - "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" @@ -54,7 +53,7 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // Do not consume the error and return it as EFBIG. // Simultaneously send a SIGXFSZ per setrlimit(2). t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) - return syscall.EFBIG + return syserror.EFBIG case syserror.ErrInterrupted: // The syscall was interrupted. Return nil if it completed // partially, otherwise return the error code that the syscall diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go index 0c1b5ec27..444f2b004 100644 --- a/pkg/sentry/syscalls/linux/flags.go +++ b/pkg/sentry/syscalls/linux/flags.go @@ -41,6 +41,7 @@ func flagsToPermissions(mask uint) (p fs.PermMask) { func linuxToFlags(mask uint) fs.FileFlags { return fs.FileFlags{ Direct: mask&linux.O_DIRECT != 0, + DSync: mask&(linux.O_DSYNC|linux.O_SYNC) != 0, Sync: mask&linux.O_SYNC != 0, NonBlocking: mask&linux.O_NONBLOCK != 0, Read: (mask & linux.O_ACCMODE) != linux.O_WRONLY, diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 7f18b1ac8..51db2d8f7 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -16,8 +16,6 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -50,335 +48,335 @@ var AMD64 = &kernel.SyscallTable{ Table: map[uintptr]kernel.Syscall{ 0: syscalls.Supported("read", Read), 1: syscalls.Supported("write", Write), - 2: syscalls.Supported("open", Open), + 2: syscalls.PartiallySupported("open", Open, "Options O_DIRECT, O_NOATIME, O_PATH, O_TMPFILE, O_SYNC are not supported.", nil), 3: syscalls.Supported("close", Close), - 4: syscalls.Undocumented("stat", Stat), - 5: syscalls.Undocumented("fstat", Fstat), - 6: syscalls.Undocumented("lstat", Lstat), - 7: syscalls.Undocumented("poll", Poll), - 8: syscalls.Undocumented("lseek", Lseek), - 9: syscalls.Undocumented("mmap", Mmap), - 10: syscalls.Undocumented("mprotect", Mprotect), - 11: syscalls.Undocumented("munmap", Munmap), - 12: syscalls.Undocumented("brk", Brk), - 13: syscalls.Undocumented("rt_sigaction", RtSigaction), - 14: syscalls.Undocumented("rt_sigprocmask", RtSigprocmask), - 15: syscalls.Undocumented("rt_sigreturn", RtSigreturn), - 16: syscalls.Undocumented("ioctl", Ioctl), - 17: syscalls.Undocumented("pread64", Pread64), - 18: syscalls.Undocumented("pwrite64", Pwrite64), - 19: syscalls.Undocumented("readv", Readv), - 20: syscalls.Undocumented("writev", Writev), - 21: syscalls.Undocumented("access", Access), - 22: syscalls.Undocumented("pipe", Pipe), - 23: syscalls.Undocumented("select", Select), - 24: syscalls.Undocumented("sched_yield", SchedYield), - 25: syscalls.Undocumented("mremap", Mremap), - 26: syscalls.Undocumented("msync", Msync), - 27: syscalls.Undocumented("mincore", Mincore), - 28: syscalls.Undocumented("madvise", Madvise), - 29: syscalls.Undocumented("shmget", Shmget), - 30: syscalls.Undocumented("shmat", Shmat), - 31: syscalls.Undocumented("shmctl", Shmctl), - 32: syscalls.Undocumented("dup", Dup), - 33: syscalls.Undocumented("dup2", Dup2), - 34: syscalls.Undocumented("pause", Pause), - 35: syscalls.Undocumented("nanosleep", Nanosleep), - 36: syscalls.Undocumented("getitimer", Getitimer), - 37: syscalls.Undocumented("alarm", Alarm), - 38: syscalls.Undocumented("setitimer", Setitimer), - 39: syscalls.Undocumented("getpid", Getpid), - 40: syscalls.Undocumented("sendfile", Sendfile), - 41: syscalls.Undocumented("socket", Socket), - 42: syscalls.Undocumented("connect", Connect), - 43: syscalls.Undocumented("accept", Accept), - 44: syscalls.Undocumented("sendto", SendTo), - 45: syscalls.Undocumented("recvfrom", RecvFrom), - 46: syscalls.Undocumented("sendmsg", SendMsg), - 47: syscalls.Undocumented("recvmsg", RecvMsg), - 48: syscalls.Undocumented("shutdown", Shutdown), - 49: syscalls.Undocumented("bind", Bind), - 50: syscalls.Undocumented("listen", Listen), - 51: syscalls.Undocumented("getsockname", GetSockName), - 52: syscalls.Undocumented("getpeername", GetPeerName), - 53: syscalls.Undocumented("socketpair", SocketPair), - 54: syscalls.Undocumented("setsockopt", SetSockOpt), - 55: syscalls.Undocumented("getsockopt", GetSockOpt), - 56: syscalls.Undocumented("clone", Clone), - 57: syscalls.Undocumented("fork", Fork), - 58: syscalls.Undocumented("vfork", Vfork), - 59: syscalls.Undocumented("execve", Execve), - 60: syscalls.Undocumented("exit", Exit), - 61: syscalls.Undocumented("wait4", Wait4), - 62: syscalls.Undocumented("kill", Kill), - 63: syscalls.Undocumented("uname", Uname), - 64: syscalls.Undocumented("semget", Semget), - 65: syscalls.Undocumented("semop", Semop), - 66: syscalls.Undocumented("semctl", Semctl), - 67: syscalls.Undocumented("shmdt", Shmdt), - 68: syscalls.ErrorWithEvent("msgget", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 69: syscalls.ErrorWithEvent("msgsnd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 70: syscalls.ErrorWithEvent("msgrcv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 71: syscalls.ErrorWithEvent("msgctl", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 72: syscalls.Undocumented("fcntl", Fcntl), - 73: syscalls.Undocumented("flock", Flock), - 74: syscalls.Undocumented("fsync", Fsync), - 75: syscalls.Undocumented("fdatasync", Fdatasync), - 76: syscalls.Undocumented("truncate", Truncate), - 77: syscalls.Undocumented("ftruncate", Ftruncate), - 78: syscalls.Undocumented("getdents", Getdents), - 79: syscalls.Undocumented("getcwd", Getcwd), - 80: syscalls.Undocumented("chdir", Chdir), - 81: syscalls.Undocumented("fchdir", Fchdir), - 82: syscalls.Undocumented("rename", Rename), - 83: syscalls.Undocumented("mkdir", Mkdir), - 84: syscalls.Undocumented("rmdir", Rmdir), - 85: syscalls.Undocumented("creat", Creat), - 86: syscalls.Undocumented("link", Link), - 87: syscalls.Undocumented("link", Unlink), - 88: syscalls.Undocumented("symlink", Symlink), - 89: syscalls.Undocumented("readlink", Readlink), - 90: syscalls.Undocumented("chmod", Chmod), - 91: syscalls.Undocumented("fchmod", Fchmod), - 92: syscalls.Undocumented("chown", Chown), - 93: syscalls.Undocumented("fchown", Fchown), - 94: syscalls.Undocumented("lchown", Lchown), - 95: syscalls.Undocumented("umask", Umask), - 96: syscalls.Undocumented("gettimeofday", Gettimeofday), - 97: syscalls.Undocumented("getrlimit", Getrlimit), - 98: syscalls.Undocumented("getrusage", Getrusage), - 99: syscalls.Undocumented("sysinfo", Sysinfo), - 100: syscalls.Undocumented("times", Times), - 101: syscalls.Undocumented("ptrace", Ptrace), - 102: syscalls.Undocumented("getuid", Getuid), - 103: syscalls.Undocumented("syslog", Syslog), - 104: syscalls.Undocumented("getgid", Getgid), - 105: syscalls.Undocumented("setuid", Setuid), - 106: syscalls.Undocumented("setgid", Setgid), - 107: syscalls.Undocumented("geteuid", Geteuid), - 108: syscalls.Undocumented("getegid", Getegid), - 109: syscalls.Undocumented("setpgid", Setpgid), - 110: syscalls.Undocumented("getppid", Getppid), - 111: syscalls.Undocumented("getpgrp", Getpgrp), - 112: syscalls.Undocumented("setsid", Setsid), - 113: syscalls.Undocumented("setreuid", Setreuid), - 114: syscalls.Undocumented("setregid", Setregid), - 115: syscalls.Undocumented("getgroups", Getgroups), - 116: syscalls.Undocumented("setgroups", Setgroups), - 117: syscalls.Undocumented("setresuid", Setresuid), - 118: syscalls.Undocumented("getresuid", Getresuid), - 119: syscalls.Undocumented("setresgid", Setresgid), - 120: syscalls.Undocumented("setresgid", Getresgid), - 121: syscalls.Undocumented("getpgid", Getpgid), - 122: syscalls.ErrorWithEvent("setfsuid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 123: syscalls.ErrorWithEvent("setfsgid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 124: syscalls.Undocumented("getsid", Getsid), - 125: syscalls.Undocumented("capget", Capget), - 126: syscalls.Undocumented("capset", Capset), - 127: syscalls.Undocumented("rt_sigpending", RtSigpending), - 128: syscalls.Undocumented("rt_sigtimedwait", RtSigtimedwait), - 129: syscalls.Undocumented("rt_sigqueueinfo", RtSigqueueinfo), - 130: syscalls.Undocumented("rt_sigsuspend", RtSigsuspend), - 131: syscalls.Undocumented("sigaltstack", Sigaltstack), - 132: syscalls.Undocumented("utime", Utime), - 133: syscalls.Undocumented("mknod", Mknod), - 134: syscalls.Error("uselib", syscall.ENOSYS, "Obsolete", nil), - 135: syscalls.ErrorWithEvent("personality", syscall.EINVAL, "Unable to change personality.", nil), - 136: syscalls.ErrorWithEvent("ustat", syscall.ENOSYS, "Needs filesystem support.", nil), - 137: syscalls.Undocumented("statfs", Statfs), - 138: syscalls.Undocumented("fstatfs", Fstatfs), - 139: syscalls.ErrorWithEvent("sysfs", syscall.ENOSYS, "", []string{"gvisor.dev/issue/165"}), - 140: syscalls.Undocumented("getpriority", Getpriority), - 141: syscalls.Undocumented("setpriority", Setpriority), + 4: syscalls.Supported("stat", Stat), + 5: syscalls.Supported("fstat", Fstat), + 6: syscalls.Supported("lstat", Lstat), + 7: syscalls.Supported("poll", Poll), + 8: syscalls.Supported("lseek", Lseek), + 9: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil), + 10: syscalls.Supported("mprotect", Mprotect), + 11: syscalls.Supported("munmap", Munmap), + 12: syscalls.Supported("brk", Brk), + 13: syscalls.Supported("rt_sigaction", RtSigaction), + 14: syscalls.Supported("rt_sigprocmask", RtSigprocmask), + 15: syscalls.Supported("rt_sigreturn", RtSigreturn), + 16: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil), + 17: syscalls.Supported("pread64", Pread64), + 18: syscalls.Supported("pwrite64", Pwrite64), + 19: syscalls.Supported("readv", Readv), + 20: syscalls.Supported("writev", Writev), + 21: syscalls.Supported("access", Access), + 22: syscalls.Supported("pipe", Pipe), + 23: syscalls.Supported("select", Select), + 24: syscalls.Supported("sched_yield", SchedYield), + 25: syscalls.Supported("mremap", Mremap), + 26: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil), + 27: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil), + 28: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), + 29: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), + 30: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil), + 31: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), + 32: syscalls.Supported("dup", Dup), + 33: syscalls.Supported("dup2", Dup2), + 34: syscalls.Supported("pause", Pause), + 35: syscalls.Supported("nanosleep", Nanosleep), + 36: syscalls.Supported("getitimer", Getitimer), + 37: syscalls.Supported("alarm", Alarm), + 38: syscalls.Supported("setitimer", Setitimer), + 39: syscalls.Supported("getpid", Getpid), + 40: syscalls.Supported("sendfile", Sendfile), + 41: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil), + 42: syscalls.Supported("connect", Connect), + 43: syscalls.Supported("accept", Accept), + 44: syscalls.Supported("sendto", SendTo), + 45: syscalls.Supported("recvfrom", RecvFrom), + 46: syscalls.Supported("sendmsg", SendMsg), + 47: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil), + 48: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil), + 49: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil), + 50: syscalls.Supported("listen", Listen), + 51: syscalls.Supported("getsockname", GetSockName), + 52: syscalls.Supported("getpeername", GetPeerName), + 53: syscalls.Supported("socketpair", SocketPair), + 54: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil), + 55: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil), + 56: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), + 57: syscalls.Supported("fork", Fork), + 58: syscalls.Supported("vfork", Vfork), + 59: syscalls.Supported("execve", Execve), + 60: syscalls.Supported("exit", Exit), + 61: syscalls.Supported("wait4", Wait4), + 62: syscalls.Supported("kill", Kill), + 63: syscalls.Supported("uname", Uname), + 64: syscalls.Supported("semget", Semget), + 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), + 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), + 67: syscalls.Supported("shmdt", Shmdt), + 68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 70: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 71: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), + 73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), + 74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), + 75: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil), + 76: syscalls.Supported("truncate", Truncate), + 77: syscalls.Supported("ftruncate", Ftruncate), + 78: syscalls.Supported("getdents", Getdents), + 79: syscalls.Supported("getcwd", Getcwd), + 80: syscalls.Supported("chdir", Chdir), + 81: syscalls.Supported("fchdir", Fchdir), + 82: syscalls.Supported("rename", Rename), + 83: syscalls.Supported("mkdir", Mkdir), + 84: syscalls.Supported("rmdir", Rmdir), + 85: syscalls.Supported("creat", Creat), + 86: syscalls.Supported("link", Link), + 87: syscalls.Supported("unlink", Unlink), + 88: syscalls.Supported("symlink", Symlink), + 89: syscalls.Supported("readlink", Readlink), + 90: syscalls.Supported("chmod", Chmod), + 91: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil), + 92: syscalls.Supported("chown", Chown), + 93: syscalls.Supported("fchown", Fchown), + 94: syscalls.Supported("lchown", Lchown), + 95: syscalls.Supported("umask", Umask), + 96: syscalls.Supported("gettimeofday", Gettimeofday), + 97: syscalls.Supported("getrlimit", Getrlimit), + 98: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil), + 99: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), + 100: syscalls.Supported("times", Times), + 101: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil), + 102: syscalls.Supported("getuid", Getuid), + 103: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil), + 104: syscalls.Supported("getgid", Getgid), + 105: syscalls.Supported("setuid", Setuid), + 106: syscalls.Supported("setgid", Setgid), + 107: syscalls.Supported("geteuid", Geteuid), + 108: syscalls.Supported("getegid", Getegid), + 109: syscalls.Supported("setpgid", Setpgid), + 110: syscalls.Supported("getppid", Getppid), + 111: syscalls.Supported("getpgrp", Getpgrp), + 112: syscalls.Supported("setsid", Setsid), + 113: syscalls.Supported("setreuid", Setreuid), + 114: syscalls.Supported("setregid", Setregid), + 115: syscalls.Supported("getgroups", Getgroups), + 116: syscalls.Supported("setgroups", Setgroups), + 117: syscalls.Supported("setresuid", Setresuid), + 118: syscalls.Supported("getresuid", Getresuid), + 119: syscalls.Supported("setresgid", Setresgid), + 120: syscalls.Supported("getresgid", Getresgid), + 121: syscalls.Supported("getpgid", Getpgid), + 122: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 123: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 124: syscalls.Supported("getsid", Getsid), + 125: syscalls.Supported("capget", Capget), + 126: syscalls.Supported("capset", Capset), + 127: syscalls.Supported("rt_sigpending", RtSigpending), + 128: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait), + 129: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo), + 130: syscalls.Supported("rt_sigsuspend", RtSigsuspend), + 131: syscalls.Supported("sigaltstack", Sigaltstack), + 132: syscalls.Supported("utime", Utime), + 133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil), + 134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil), + 135: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), + 136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil), + 137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), + 138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), + 139: syscalls.ErrorWithEvent("sysfs", syserror.ENOSYS, "", []string{"gvisor.dev/issue/165"}), + 140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), + 141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), - 143: syscalls.Undocumented("sched_getparam", SchedGetparam), - 144: syscalls.Undocumented("sched_setscheduler", SchedSetscheduler), - 145: syscalls.Undocumented("sched_getscheduler", SchedGetscheduler), - 146: syscalls.Undocumented("sched_get_priority_max", SchedGetPriorityMax), - 147: syscalls.Undocumented("sched_get_priority_min", SchedGetPriorityMin), - 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syscall.EPERM, "", nil), - 149: syscalls.Undocumented("mlock", Mlock), - 150: syscalls.Undocumented("munlock", Munlock), - 151: syscalls.Undocumented("mlockall", Mlockall), - 152: syscalls.Undocumented("munlockall", Munlockall), + 143: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil), + 144: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil), + 145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), + 146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), + 147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), + 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), + 149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), - 154: syscalls.Error("modify_ldt", syscall.EPERM, "", nil), - 155: syscalls.Error("pivot_root", syscall.EPERM, "", nil), - 156: syscalls.Error("sysctl", syscall.EPERM, `syscall is "worthless"`, nil), - 157: syscalls.Undocumented("prctl", Prctl), - 158: syscalls.Undocumented("arch_prctl", ArchPrctl), + 154: syscalls.Error("modify_ldt", syserror.EPERM, "", nil), + 155: syscalls.Error("pivot_root", syserror.EPERM, "", nil), + 156: syscalls.Error("sysctl", syserror.EPERM, "Deprecated. Use /proc/sys instead.", nil), + 157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), + 158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil), 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), - 160: syscalls.Undocumented("setrlimit", Setrlimit), - 161: syscalls.Undocumented("chroot", Chroot), - 162: syscalls.Undocumented("sync", Sync), + 160: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil), + 161: syscalls.Supported("chroot", Chroot), + 162: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil), 163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), 164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), - 165: syscalls.Undocumented("mount", Mount), - 166: syscalls.Undocumented("umount2", Umount2), + 165: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), + 166: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), 167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), 168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), 169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), - 170: syscalls.Undocumented("sethostname", Sethostname), - 171: syscalls.Undocumented("setdomainname", Setdomainname), + 170: syscalls.Supported("sethostname", Sethostname), + 171: syscalls.Supported("setdomainname", Setdomainname), 172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil), 173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil), 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil), 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), - 177: syscalls.Error("get_kernel_syms", syscall.ENOSYS, "Not supported in > 2.6", nil), - 178: syscalls.Error("query_module", syscall.ENOSYS, "Not supported in > 2.6", nil), + 177: syscalls.Error("get_kernel_syms", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), + 178: syscalls.Error("query_module", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations - 180: syscalls.Error("nfsservctl", syscall.ENOSYS, "Does not exist > 3.1", nil), - 181: syscalls.Error("getpmsg", syscall.ENOSYS, "Not implemented in Linux", nil), - 182: syscalls.Error("putpmsg", syscall.ENOSYS, "Not implemented in Linux", nil), - 183: syscalls.Error("afs_syscall", syscall.ENOSYS, "Not implemented in Linux", nil), - 184: syscalls.Error("tuxcall", syscall.ENOSYS, "Not implemented in Linux", nil), - 185: syscalls.Error("security", syscall.ENOSYS, "Not implemented in Linux", nil), - 186: syscalls.Undocumented("gettid", Gettid), - 187: syscalls.ErrorWithEvent("readahead", syscall.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341) - 188: syscalls.ErrorWithEvent("setxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 189: syscalls.ErrorWithEvent("lsetxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 190: syscalls.ErrorWithEvent("fsetxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 191: syscalls.ErrorWithEvent("getxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 192: syscalls.ErrorWithEvent("lgetxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 193: syscalls.ErrorWithEvent("fgetxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 194: syscalls.ErrorWithEvent("listxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 195: syscalls.ErrorWithEvent("llistxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 196: syscalls.ErrorWithEvent("flistxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 197: syscalls.ErrorWithEvent("removexattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 198: syscalls.ErrorWithEvent("lremovexattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 199: syscalls.ErrorWithEvent("fremovexattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 200: syscalls.Undocumented("tkill", Tkill), - 201: syscalls.Undocumented("time", Time), - 202: syscalls.Undocumented("futex", Futex), - 203: syscalls.Undocumented("sched_setaffinity", SchedSetaffinity), - 204: syscalls.Undocumented("sched_getaffinity", SchedGetaffinity), - 205: syscalls.Error("set_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), - 206: syscalls.Undocumented("io_setup", IoSetup), - 207: syscalls.Undocumented("io_destroy", IoDestroy), - 208: syscalls.Undocumented("io_getevents", IoGetevents), - 209: syscalls.Undocumented("io_submit", IoSubmit), - 210: syscalls.Undocumented("io_cancel", IoCancel), - 211: syscalls.Error("get_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 180: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), + 181: syscalls.Error("getpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), + 182: syscalls.Error("putpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), + 183: syscalls.Error("afs_syscall", syserror.ENOSYS, "Not implemented in Linux.", nil), + 184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil), + 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil), + 186: syscalls.Supported("gettid", Gettid), + 187: syscalls.ErrorWithEvent("readahead", syserror.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341) + 188: syscalls.Error("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 200: syscalls.Supported("tkill", Tkill), + 201: syscalls.Supported("time", Time), + 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), + 203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), + 204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), + 205: syscalls.Error("set_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 211: syscalls.Error("get_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), - 213: syscalls.Undocumented("epoll_create", EpollCreate), - 214: syscalls.ErrorWithEvent("epoll_ctl_old", syscall.ENOSYS, "Deprecated", nil), - 215: syscalls.ErrorWithEvent("epoll_wait_old", syscall.ENOSYS, "Deprecated", nil), - 216: syscalls.ErrorWithEvent("remap_file_pages", syscall.ENOSYS, "Deprecated since 3.16", nil), - 217: syscalls.Undocumented("getdents64", Getdents64), - 218: syscalls.Undocumented("set_tid_address", SetTidAddress), - 219: syscalls.Undocumented("restart_syscall", RestartSyscall), - 220: syscalls.ErrorWithEvent("semtimedop", syscall.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920) - 221: syscalls.Undocumented("fadvise64", Fadvise64), - 222: syscalls.Undocumented("timer_create", TimerCreate), - 223: syscalls.Undocumented("timer_settime", TimerSettime), - 224: syscalls.Undocumented("timer_gettime", TimerGettime), - 225: syscalls.Undocumented("timer_getoverrun", TimerGetoverrun), - 226: syscalls.Undocumented("timer_delete", TimerDelete), - 227: syscalls.Undocumented("clock_settime", ClockSettime), - 228: syscalls.Undocumented("clock_gettime", ClockGettime), - 229: syscalls.Undocumented("clock_getres", ClockGetres), - 230: syscalls.Undocumented("clock_nanosleep", ClockNanosleep), - 231: syscalls.Undocumented("exit_group", ExitGroup), - 232: syscalls.Undocumented("epoll_wait", EpollWait), - 233: syscalls.Undocumented("epoll_ctl", EpollCtl), - 234: syscalls.Undocumented("tgkill", Tgkill), - 235: syscalls.Undocumented("utimes", Utimes), - 236: syscalls.Error("vserver", syscall.ENOSYS, "Not implemented by Linux", nil), + 213: syscalls.Supported("epoll_create", EpollCreate), + 214: syscalls.ErrorWithEvent("epoll_ctl_old", syserror.ENOSYS, "Deprecated.", nil), + 215: syscalls.ErrorWithEvent("epoll_wait_old", syserror.ENOSYS, "Deprecated.", nil), + 216: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), + 217: syscalls.Supported("getdents64", Getdents64), + 218: syscalls.Supported("set_tid_address", SetTidAddress), + 219: syscalls.Supported("restart_syscall", RestartSyscall), + 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920) + 221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), + 222: syscalls.Supported("timer_create", TimerCreate), + 223: syscalls.Supported("timer_settime", TimerSettime), + 224: syscalls.Supported("timer_gettime", TimerGettime), + 225: syscalls.Supported("timer_getoverrun", TimerGetoverrun), + 226: syscalls.Supported("timer_delete", TimerDelete), + 227: syscalls.Supported("clock_settime", ClockSettime), + 228: syscalls.Supported("clock_gettime", ClockGettime), + 229: syscalls.Supported("clock_getres", ClockGetres), + 230: syscalls.Supported("clock_nanosleep", ClockNanosleep), + 231: syscalls.Supported("exit_group", ExitGroup), + 232: syscalls.Supported("epoll_wait", EpollWait), + 233: syscalls.Supported("epoll_ctl", EpollCtl), + 234: syscalls.Supported("tgkill", Tgkill), + 235: syscalls.Supported("utimes", Utimes), + 236: syscalls.Error("vserver", syserror.ENOSYS, "Not implemented by Linux", nil), 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), - 238: syscalls.Undocumented("set_mempolicy", SetMempolicy), - 239: syscalls.Undocumented("get_mempolicy", GetMempolicy), - 240: syscalls.ErrorWithEvent("mq_open", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 241: syscalls.ErrorWithEvent("mq_unlink", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 242: syscalls.ErrorWithEvent("mq_timedsend", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 243: syscalls.ErrorWithEvent("mq_timedreceive", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 244: syscalls.ErrorWithEvent("mq_notify", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 245: syscalls.ErrorWithEvent("mq_getsetattr", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), + 239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), + 240: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 241: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 242: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 243: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 244: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), - 247: syscalls.Undocumented("waitid", Waitid), - 248: syscalls.Error("add_key", syscall.EACCES, "Not available to user", nil), - 249: syscalls.Error("request_key", syscall.EACCES, "Not available to user", nil), - 250: syscalls.Error("keyctl", syscall.EACCES, "Not available to user", nil), + 247: syscalls.Supported("waitid", Waitid), + 248: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), + 249: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), + 250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) - 253: syscalls.Undocumented("inotify_init", InotifyInit), - 254: syscalls.Undocumented("inotify_add_watch", InotifyAddWatch), - 255: syscalls.Undocumented("inotify_rm_watch", InotifyRmWatch), + 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil), + 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), + 255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), 256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), - 257: syscalls.Undocumented("openat", Openat), - 258: syscalls.Undocumented("mkdirat", Mkdirat), - 259: syscalls.Undocumented("mknodat", Mknodat), - 260: syscalls.Undocumented("fchownat", Fchownat), - 261: syscalls.Undocumented("futimesat", Futimesat), - 262: syscalls.Undocumented("fstatat", Fstatat), - 263: syscalls.Undocumented("unlinkat", Unlinkat), - 264: syscalls.Undocumented("renameat", Renameat), - 265: syscalls.Undocumented("linkat", Linkat), - 266: syscalls.Undocumented("symlinkat", Symlinkat), - 267: syscalls.Undocumented("readlinkat", Readlinkat), - 268: syscalls.Undocumented("fchmodat", Fchmodat), - 269: syscalls.Undocumented("faccessat", Faccessat), - 270: syscalls.Undocumented("pselect", Pselect), - 271: syscalls.Undocumented("ppoll", Ppoll), - 272: syscalls.Undocumented("unshare", Unshare), - 273: syscalls.Error("set_robust_list", syscall.ENOSYS, "Obsolete", nil), - 274: syscalls.Error("get_robust_list", syscall.ENOSYS, "Obsolete", nil), - 275: syscalls.PartiallySupported("splice", Splice, "Stub implementation", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 276: syscalls.ErrorWithEvent("tee", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 277: syscalls.Undocumented("sync_file_range", SyncFileRange), - 278: syscalls.ErrorWithEvent("vmsplice", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) - 280: syscalls.Undocumented("utimensat", Utimensat), - 281: syscalls.Undocumented("epoll_pwait", EpollPwait), - 282: syscalls.ErrorWithEvent("signalfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) - 283: syscalls.Undocumented("timerfd_create", TimerfdCreate), - 284: syscalls.Undocumented("eventfd", Eventfd), - 285: syscalls.Undocumented("fallocate", Fallocate), - 286: syscalls.Undocumented("timerfd_settime", TimerfdSettime), - 287: syscalls.Undocumented("timerfd_gettime", TimerfdGettime), - 288: syscalls.Undocumented("accept4", Accept4), - 289: syscalls.ErrorWithEvent("signalfd4", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) - 290: syscalls.Undocumented("eventfd2", Eventfd2), - 291: syscalls.Undocumented("epoll_create1", EpollCreate1), - 292: syscalls.Undocumented("dup3", Dup3), - 293: syscalls.Undocumented("pipe2", Pipe2), - 294: syscalls.Undocumented("inotify_init1", InotifyInit1), - 295: syscalls.Undocumented("preadv", Preadv), - 296: syscalls.Undocumented("pwritev", Pwritev), - 297: syscalls.Undocumented("rt_tgsigqueueinfo", RtTgsigqueueinfo), - 298: syscalls.ErrorWithEvent("perf_event_open", syscall.ENODEV, "No support for perf counters", nil), - 299: syscalls.Undocumented("recvmmsg", RecvMMsg), - 300: syscalls.ErrorWithEvent("fanotify_init", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 301: syscalls.ErrorWithEvent("fanotify_mark", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 302: syscalls.Undocumented("prlimit64", Prlimit64), - 303: syscalls.ErrorWithEvent("name_to_handle_at", syscall.EOPNOTSUPP, "Needs filesystem support", nil), - 304: syscalls.ErrorWithEvent("open_by_handle_at", syscall.EOPNOTSUPP, "Needs filesystem support", nil), + 257: syscalls.Supported("openat", Openat), + 258: syscalls.Supported("mkdirat", Mkdirat), + 259: syscalls.Supported("mknodat", Mknodat), + 260: syscalls.Supported("fchownat", Fchownat), + 261: syscalls.Supported("futimesat", Futimesat), + 262: syscalls.Supported("fstatat", Fstatat), + 263: syscalls.Supported("unlinkat", Unlinkat), + 264: syscalls.Supported("renameat", Renameat), + 265: syscalls.Supported("linkat", Linkat), + 266: syscalls.Supported("symlinkat", Symlinkat), + 267: syscalls.Supported("readlinkat", Readlinkat), + 268: syscalls.Supported("fchmodat", Fchmodat), + 269: syscalls.Supported("faccessat", Faccessat), + 270: syscalls.Supported("pselect", Pselect), + 271: syscalls.Supported("ppoll", Ppoll), + 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), + 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 275: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 276: syscalls.ErrorWithEvent("tee", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), + 278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) + 280: syscalls.Supported("utimensat", Utimensat), + 281: syscalls.Supported("epoll_pwait", EpollPwait), + 282: syscalls.ErrorWithEvent("signalfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) + 283: syscalls.Supported("timerfd_create", TimerfdCreate), + 284: syscalls.Supported("eventfd", Eventfd), + 285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), + 286: syscalls.Supported("timerfd_settime", TimerfdSettime), + 287: syscalls.Supported("timerfd_gettime", TimerfdGettime), + 288: syscalls.Supported("accept4", Accept4), + 289: syscalls.ErrorWithEvent("signalfd4", syserror.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) + 290: syscalls.Supported("eventfd2", Eventfd2), + 291: syscalls.Supported("epoll_create1", EpollCreate1), + 292: syscalls.Supported("dup3", Dup3), + 293: syscalls.Supported("pipe2", Pipe2), + 294: syscalls.Supported("inotify_init1", InotifyInit1), + 295: syscalls.Supported("preadv", Preadv), + 296: syscalls.Supported("pwritev", Pwritev), + 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), + 298: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), + 299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), + 300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 302: syscalls.Supported("prlimit64", Prlimit64), + 303: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 304: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), - 306: syscalls.Undocumented("syncfs", Syncfs), - 307: syscalls.Undocumented("sendmmsg", SendMMsg), - 308: syscalls.ErrorWithEvent("setns", syscall.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) - 309: syscalls.Undocumented("getcpu", Getcpu), - 310: syscalls.ErrorWithEvent("process_vm_readv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}), - 311: syscalls.ErrorWithEvent("process_vm_writev", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), + 307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), + 308: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) + 309: syscalls.Supported("getcpu", Getcpu), + 310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 311: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), - 314: syscalls.ErrorWithEvent("sched_setattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 315: syscalls.ErrorWithEvent("sched_getattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 316: syscalls.ErrorWithEvent("renameat2", syscall.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) - 317: syscalls.Undocumented("seccomp", Seccomp), - 318: syscalls.Undocumented("getrandom", GetRandom), - 319: syscalls.Undocumented("memfd_create", MemfdCreate), + 314: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 315: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 316: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) + 317: syscalls.Supported("seccomp", Seccomp), + 318: syscalls.Supported("getrandom", GetRandom), + 319: syscalls.Supported("memfd_create", MemfdCreate), 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil), 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), - 322: syscalls.ErrorWithEvent("execveat", syscall.ENOSYS, "", []string{"gvisor.dev/issue/265"}), // TODO(b/118901836) - 323: syscalls.ErrorWithEvent("userfaultfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) - 324: syscalls.ErrorWithEvent("membarrier", syscall.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(b/118904897) - 325: syscalls.Undocumented("mlock2", Mlock2), + 322: syscalls.ErrorWithEvent("execveat", syserror.ENOSYS, "", []string{"gvisor.dev/issue/265"}), // TODO(b/118901836) + 323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) + 324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(b/118904897) + 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), // Syscalls after 325 are "backports" from versions of Linux after 4.4. - 326: syscalls.ErrorWithEvent("copy_file_range", syscall.ENOSYS, "", nil), - 327: syscalls.Undocumented("preadv2", Preadv2), - 328: syscalls.Undocumented("pwritev2", Pwritev2), + 326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), + 327: syscalls.Supported("preadv2", Preadv2), + 328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), 332: syscalls.Supported("statx", Statx), }, diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go index 00b7e7cf2..333013d8c 100644 --- a/pkg/sentry/syscalls/linux/sigset.go +++ b/pkg/sentry/syscalls/linux/sigset.go @@ -15,8 +15,6 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -27,7 +25,7 @@ import ( // STOP are clear. func copyInSigSet(t *kernel.Task, sigSetAddr usermem.Addr, size uint) (linux.SignalSet, error) { if size != linux.SignalSetSize { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } b := t.CopyScratchBuffer(8) if _, err := t.CopyInBytes(sigSetAddr, b); err != nil { diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index 7081d1a45..f56411bfe 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -55,7 +54,7 @@ type ioCallback struct { OpCode uint16 ReqPrio int16 - FD uint32 + FD int32 Buf uint64 Bytes uint64 @@ -65,7 +64,7 @@ type ioCallback struct { Flags uint32 // eventfd to signal if IOCB_FLAG_RESFD is set in flags. - ResFD uint32 + ResFD int32 } // ioEvent describes an I/O result. @@ -292,7 +291,7 @@ func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioC // submitCallback processes a single callback. func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Addr) error { - file := t.FDMap().GetFile(kdefs.FD(cb.FD)) + file := t.GetFile(cb.FD) if file == nil { // File not found. return syserror.EBADF @@ -302,7 +301,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad // Was there an eventFD? Extract it. var eventFile *fs.File if cb.Flags&_IOCB_FLAG_RESFD != 0 { - eventFile = t.FDMap().GetFile(kdefs.FD(cb.ResFD)) + eventFile = t.GetFile(cb.ResFD) if eventFile == nil { // Bad FD. return syserror.EBADF diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index 14a61cfa5..4a2b9f061 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -15,12 +15,10 @@ package linux import ( - "syscall" - + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/syscalls" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" @@ -30,11 +28,11 @@ import ( // EpollCreate1 implements the epoll_create1(2) linux syscall. func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() - if flags & ^syscall.EPOLL_CLOEXEC != 0 { + if flags & ^linux.EPOLL_CLOEXEC != 0 { return 0, nil, syserror.EINVAL } - closeOnExec := flags&syscall.EPOLL_CLOEXEC != 0 + closeOnExec := flags&linux.EPOLL_CLOEXEC != 0 fd, err := syscalls.CreateEpoll(t, closeOnExec) if err != nil { return 0, nil, err @@ -61,46 +59,43 @@ func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // EpollCtl implements the epoll_ctl(2) linux syscall. func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - epfd := kdefs.FD(args[0].Int()) + epfd := args[0].Int() op := args[1].Int() - fd := kdefs.FD(args[2].Int()) + fd := args[2].Int() eventAddr := args[3].Pointer() // Capture the event state if needed. flags := epoll.EntryFlags(0) mask := waiter.EventMask(0) var data [2]int32 - if op != syscall.EPOLL_CTL_DEL { - var e syscall.EpollEvent + if op != linux.EPOLL_CTL_DEL { + var e linux.EpollEvent if _, err := t.CopyIn(eventAddr, &e); err != nil { return 0, nil, err } - if e.Events&syscall.EPOLLONESHOT != 0 { + if e.Events&linux.EPOLLONESHOT != 0 { flags |= epoll.OneShot } - // syscall.EPOLLET is incorrectly generated as a negative number - // in Go, see https://github.com/golang/go/issues/5328 for - // details. - if e.Events&-syscall.EPOLLET != 0 { + if e.Events&linux.EPOLLET != 0 { flags |= epoll.EdgeTriggered } mask = waiter.EventMaskFromLinux(e.Events) data[0] = e.Fd - data[1] = e.Pad + data[1] = e.Data } // Perform the requested operations. switch op { - case syscall.EPOLL_CTL_ADD: + case linux.EPOLL_CTL_ADD: // See fs/eventpoll.c. mask |= waiter.EventHUp | waiter.EventErr return 0, nil, syscalls.AddEpoll(t, epfd, fd, flags, mask, data) - case syscall.EPOLL_CTL_DEL: + case linux.EPOLL_CTL_DEL: return 0, nil, syscalls.RemoveEpoll(t, epfd, fd) - case syscall.EPOLL_CTL_MOD: + case linux.EPOLL_CTL_MOD: // Same as EPOLL_CTL_ADD. mask |= waiter.EventHUp | waiter.EventErr return 0, nil, syscalls.UpdateEpoll(t, epfd, fd, flags, mask, data) @@ -132,7 +127,7 @@ func copyOutEvents(t *kernel.Task, addr usermem.Addr, e []epoll.Event) error { // EpollWait implements the epoll_wait(2) linux syscall. func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - epfd := kdefs.FD(args[0].Int()) + epfd := args[0].Int() eventsAddr := args[1].Pointer() maxEvents := int(args[2].Int()) timeout := int(args[3].Int()) diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go index 7dbe84884..8a34c4e99 100644 --- a/pkg/sentry/syscalls/linux/sys_eventfd.go +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -15,12 +15,11 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd" + "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -38,7 +37,7 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc allOps := uint(EFD_SEMAPHORE | EFD_NONBLOCK | EFD_CLOEXEC) if flags & ^allOps != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } event := eventfd.New(t, uint64(initVal), flags&EFD_SEMAPHORE != 0) @@ -47,10 +46,9 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc }) defer event.DecRef() - fd, err := t.FDMap().NewFDFrom(0, event, kernel.FDFlags{ + fd, err := t.NewFDFrom(0, event, kernel.FDFlags{ CloseOnExec: flags&EFD_CLOEXEC != 0, - }, - t.ThreadGroup().Limits()) + }) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index d9ed02c99..2e00a91ce 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -26,8 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/fasync" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" - "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -35,7 +33,7 @@ import ( ) // fileOpAt performs an operation on the second last component in the path. -func fileOpAt(t *kernel.Task, dirFD kdefs.FD, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error) error { +func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error) error { // Extract the last component. dir, name := fs.SplitLast(path) if dir == "/" { @@ -61,7 +59,7 @@ func fileOpAt(t *kernel.Task, dirFD kdefs.FD, path string, fn func(root *fs.Dire } // fileOpOn performs an operation on the last entry of the path. -func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error) error { +func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error) error { var ( d *fs.Dirent // The file. wd *fs.Dirent // The working directory (if required.) @@ -79,7 +77,7 @@ func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func rel = wd } else { // Need to extract the given FD. - f = t.FDMap().GetFile(dirFD) + f = t.GetFile(dirFD) if f == nil { return syserror.EBADF } @@ -132,7 +130,7 @@ func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string return path, dirPath, nil } -func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd uintptr, err error) { +func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return 0, err @@ -185,8 +183,9 @@ func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd u defer file.DecRef() // Success. - fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0} - newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits()) + newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) if err != nil { return err } @@ -202,7 +201,7 @@ func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd u return fd, err // Use result in frame. } -func mknodAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { +func mknodAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err @@ -286,7 +285,7 @@ func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Mknodat implements the linux syscall mknodat(2). func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() path := args[1].Pointer() mode := linux.FileMode(args[2].ModeT()) // We don't need this argument until we support creation of device nodes. @@ -295,7 +294,7 @@ func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca return 0, nil, mknodAt(t, dirFD, path, mode) } -func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) { +func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return 0, err @@ -304,44 +303,108 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod return 0, syserror.ENOENT } - err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error { - if !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR - } - - fileFlags := linuxToFlags(flags) - // Linux always adds the O_LARGEFILE flag when running in 64-bit mode. - fileFlags.LargeFile = true + fileFlags := linuxToFlags(flags) + // Linux always adds the O_LARGEFILE flag when running in 64-bit mode. + fileFlags.LargeFile = true + + err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, parent *fs.Dirent, name string, remainingTraversals uint) error { + // Resolve the name to see if it exists, and follow any + // symlinks along the way. We must do the symlink resolution + // manually because if the symlink target does not exist, we + // must create the target (and not the symlink itself). + var ( + found *fs.Dirent + err error + ) + for { + if !fs.IsDir(parent.Inode.StableAttr) { + return syserror.ENOTDIR + } - // Does this file exist already? - targetDirent, err := t.MountNamespace().FindInode(t, root, d, name, &remainingTraversals) - var newFile *fs.File - switch err { - case nil: - // The file existed. - defer targetDirent.DecRef() + // Start by looking up the dirent at 'name'. + found, err = t.MountNamespace().FindLink(t, root, parent, name, &remainingTraversals) + if err != nil { + break + } + defer found.DecRef() - // Check if we wanted to create. + // We found something (possibly a symlink). If the + // O_EXCL flag was passed, then we can immediately + // return EEXIST. if flags&linux.O_EXCL != 0 { return syserror.EEXIST } + // If we have a non-symlink, then we can proceed. + if !fs.IsSymlink(found.Inode.StableAttr) { + break + } + + // If O_NOFOLLOW was passed, then don't try to resolve + // anything. + if flags&linux.O_NOFOLLOW != 0 { + return syserror.ELOOP + } + + // Try to resolve the symlink directly to a Dirent. + var resolved *fs.Dirent + resolved, err = found.Inode.Getlink(t) + if err == nil { + // No more resolution necessary. + defer resolved.DecRef() + break + } + if err != fs.ErrResolveViaReadlink { + return err + } + + // Are we able to resolve further? + if remainingTraversals == 0 { + return syscall.ELOOP + } + + // Resolve the symlink to a path via Readlink. + var path string + path, err = found.Inode.Readlink(t) + if err != nil { + break + } + remainingTraversals-- + + // Get the new parent from the target path. + var newParent *fs.Dirent + newParentPath, newName := fs.SplitLast(path) + newParent, err = t.MountNamespace().FindInode(t, root, parent, newParentPath, &remainingTraversals) + if err != nil { + break + } + defer newParent.DecRef() + + // Repeat the process with the parent and name of the + // symlink target. + parent = newParent + name = newName + } + + var newFile *fs.File + switch err { + case nil: // Like sys_open, check for a few things about the // filesystem before trying to get a reference to the // fs.File. The same constraints on Check apply. - if err := targetDirent.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil { + if err := found.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil { return err } // Should we truncate the file? if flags&linux.O_TRUNC != 0 { - if err := targetDirent.Inode.Truncate(t, targetDirent, 0); err != nil { + if err := found.Inode.Truncate(t, found, 0); err != nil { return err } } // Create a new fs.File. - newFile, err = targetDirent.Inode.GetFile(t, targetDirent, fileFlags) + newFile, err = found.Inode.GetFile(t, found, fileFlags) if err != nil { return syserror.ConvertIntr(err, kernel.ERESTARTSYS) } @@ -350,26 +413,27 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod // File does not exist. Proceed with creation. // Do we have write permissions on the parent? - if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + if err := parent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { return err } // Attempt a creation. perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask())) - newFile, err = d.Create(t, root, name, fileFlags, perms) + newFile, err = parent.Create(t, root, name, fileFlags, perms) if err != nil { // No luck, bail. return err } defer newFile.DecRef() - targetDirent = newFile.Dirent + found = newFile.Dirent default: return err } // Success. - fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0} - newFD, err := t.FDMap().NewFDFrom(0, newFile, fdFlags, t.ThreadGroup().Limits()) + newFD, err := t.NewFDFrom(0, newFile, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) if err != nil { return err } @@ -378,10 +442,10 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod fd = uintptr(newFD) // Queue the open inotify event. The creation event is - // automatically queued when the dirent is targetDirent. The - // open events are implemented at the syscall layer so we need - // to manually queue one here. - targetDirent.InotifyEvent(linux.IN_OPEN, 0) + // automatically queued when the dirent is found. The open + // events are implemented at the syscall layer so we need to + // manually queue one here. + found.InotifyEvent(linux.IN_OPEN, 0) return nil }) @@ -403,7 +467,7 @@ func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Openat implements linux syscall openat(2). func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() flags := uint(args[2].Uint()) if flags&linux.O_CREAT != 0 { @@ -442,7 +506,7 @@ func (ac accessContext) Value(key interface{}) interface{} { } } -func accessAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, resolve bool, mode uint) error { +func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, resolve bool, mode uint) error { const rOK = 4 const wOK = 2 const xOK = 1 @@ -497,7 +561,7 @@ func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Faccessat implements linux syscall faccessat(2). func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() mode := args[2].ModeT() flags := args[3].Int() @@ -507,10 +571,10 @@ func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Ioctl implements linux syscall ioctl(2). func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() request := int(args[1].Int()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -519,12 +583,12 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Shared flags between file and socket. switch request { case linux.FIONCLEX: - t.FDMap().SetFlags(fd, kernel.FDFlags{ + t.FDTable().SetFlags(fd, kernel.FDFlags{ CloseOnExec: false, }) return 0, nil, nil case linux.FIOCLEX: - t.FDMap().SetFlags(fd, kernel.FDFlags{ + t.FDTable().SetFlags(fd, kernel.FDFlags{ CloseOnExec: true, }) return 0, nil, nil @@ -571,7 +635,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err default: - ret, err := file.FileOperations.Ioctl(t, t.MemoryManager(), args) + ret, err := file.FileOperations.Ioctl(t, file, t.MemoryManager(), args) if err != nil { return 0, nil, err } @@ -668,9 +732,9 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Fchdir implements the linux syscall fchdir(2). func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -692,44 +756,47 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Close implements linux syscall close(2). func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file, ok := t.FDMap().Remove(fd) - if !ok { + // Note that Remove provides a reference on the file that we may use to + // flush. It is still active until we drop the final reference below + // (and other reference-holding operations complete). + file := t.FDTable().Remove(fd) + if file == nil { return 0, nil, syserror.EBADF } defer file.DecRef() err := file.Flush(t) - return 0, nil, handleIOError(t, false /* partial */, err, syscall.EINTR, "close", file) + return 0, nil, handleIOError(t, false /* partial */, err, syserror.EINTR, "close", file) } // Dup implements linux syscall dup(2). func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } defer file.DecRef() - newfd, err := t.FDMap().NewFDFrom(0, file, kernel.FDFlags{}, t.ThreadGroup().Limits()) + newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{}) if err != nil { return 0, nil, syserror.EMFILE } - return uintptr(newfd), nil, nil + return uintptr(newFD), nil, nil } // Dup2 implements linux syscall dup2(2). func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - oldfd := kdefs.FD(args[0].Int()) - newfd := kdefs.FD(args[1].Int()) + oldfd := args[0].Int() + newfd := args[1].Int() // If oldfd is a valid file descriptor, and newfd has the same value as oldfd, // then dup2() does nothing, and returns newfd. if oldfd == newfd { - oldFile := t.FDMap().GetFile(oldfd) + oldFile := t.GetFile(oldfd) if oldFile == nil { return 0, nil, syserror.EBADF } @@ -745,21 +812,21 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Dup3 implements linux syscall dup3(2). func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - oldfd := kdefs.FD(args[0].Int()) - newfd := kdefs.FD(args[1].Int()) + oldfd := args[0].Int() + newfd := args[1].Int() flags := args[2].Uint() if oldfd == newfd { return 0, nil, syserror.EINVAL } - oldFile := t.FDMap().GetFile(oldfd) + oldFile := t.GetFile(oldfd) if oldFile == nil { return 0, nil, syserror.EBADF } defer oldFile.DecRef() - err := t.FDMap().NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}, t.ThreadGroup().Limits()) + err := t.NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}) if err != nil { return 0, nil, err } @@ -802,10 +869,10 @@ func fSetOwn(t *kernel.Task, file *fs.File, who int32) { // Fcntl implements linux syscall fcntl(2). func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() cmd := args[1].Int() - file, flags := t.FDMap().GetDescriptor(fd) + file, flags := t.FDTable().Get(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -813,9 +880,10 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: - from := kdefs.FD(args[2].Int()) - fdFlags := kernel.FDFlags{CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC} - fd, err := t.FDMap().NewFDFrom(from, file, fdFlags, t.ThreadGroup().Limits()) + from := args[2].Int() + fd, err := t.NewFDFrom(from, file, kernel.FDFlags{ + CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC, + }) if err != nil { return 0, nil, err } @@ -824,7 +892,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(flags.ToLinuxFDFlags()), nil, nil case linux.F_SETFD: flags := args[2].Uint() - t.FDMap().SetFlags(fd, kernel.FDFlags{ + t.FDTable().SetFlags(fd, kernel.FDFlags{ CloseOnExec: flags&linux.FD_CLOEXEC != 0, }) case linux.F_GETFL: @@ -842,7 +910,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Copy in the lock request. flockAddr := args[2].Pointer() - var flock syscall.Flock_t + var flock linux.Flock if _, err := t.CopyIn(flockAddr, &flock); err != nil { return 0, nil, err } @@ -885,17 +953,17 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err } - // The lock uid is that of the Task's FDMap. - lockUniqueID := lock.UniqueID(t.FDMap().ID()) + // The lock uid is that of the Task's FDTable. + lockUniqueID := lock.UniqueID(t.FDTable().ID()) // These locks don't block; execute the non-blocking operation using the inode's lock // context directly. switch flock.Type { - case syscall.F_RDLCK: + case linux.F_RDLCK: if !file.Flags().Read { return 0, nil, syserror.EBADF } - if cmd == syscall.F_SETLK { + if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) { return 0, nil, syserror.EAGAIN @@ -907,11 +975,11 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } } return 0, nil, nil - case syscall.F_WRLCK: + case linux.F_WRLCK: if !file.Flags().Write { return 0, nil, syserror.EBADF } - if cmd == syscall.F_SETLK { + if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) { return 0, nil, syserror.EAGAIN @@ -923,7 +991,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } } return 0, nil, nil - case syscall.F_UNLCK: + case linux.F_UNLCK: file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lockUniqueID, rng) return 0, nil, nil default: @@ -944,17 +1012,18 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall err := tmpfs.AddSeals(file.Dirent.Inode, args[2].Uint()) return 0, nil, err case linux.F_GETPIPE_SZ: - sz, ok := file.FileOperations.(pipe.Sizer) + sz, ok := file.FileOperations.(fs.FifoSizer) if !ok { return 0, nil, syserror.EINVAL } - return uintptr(sz.PipeSize()), nil, nil + size, err := sz.FifoSize(t, file) + return uintptr(size), nil, err case linux.F_SETPIPE_SZ: - sz, ok := file.FileOperations.(pipe.Sizer) + sz, ok := file.FileOperations.(fs.FifoSizer) if !ok { return 0, nil, syserror.EINVAL } - n, err := sz.SetPipeSize(int64(args[2].Int())) + n, err := sz.SetFifoSize(int64(args[2].Int())) return uintptr(n), nil, err default: // Everything else is not yet supported. @@ -975,7 +1044,7 @@ const ( // Fadvise64 implements linux syscall fadvise64(2). // This implementation currently ignores the provided advice. func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() length := args[2].Int64() advice := args[3].Int() @@ -984,7 +1053,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, syserror.EINVAL } - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -1010,7 +1079,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, nil } -func mkdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { +func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error { path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err @@ -1055,14 +1124,14 @@ func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Mkdirat implements linux syscall mkdirat(2). func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() mode := linux.FileMode(args[2].ModeT()) return 0, nil, mkdirAt(t, dirFD, addr, mode) } -func rmdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error { +func rmdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error { path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err @@ -1102,7 +1171,7 @@ func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, rmdirAt(t, linux.AT_FDCWD, addr) } -func symlinkAt(t *kernel.Task, dirFD kdefs.FD, newAddr usermem.Addr, oldAddr usermem.Addr) error { +func symlinkAt(t *kernel.Task, dirFD int32, newAddr usermem.Addr, oldAddr usermem.Addr) error { newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */) if err != nil { return err @@ -1145,7 +1214,7 @@ func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Symlinkat implements linux syscall symlinkat(2). func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldAddr := args[0].Pointer() - dirFD := kdefs.FD(args[1].Int()) + dirFD := args[1].Int() newAddr := args[2].Pointer() return 0, nil, symlinkAt(t, dirFD, newAddr, oldAddr) @@ -1187,7 +1256,7 @@ func mayLinkAt(t *kernel.Task, target *fs.Inode) error { // linkAt creates a hard link to the target specified by oldDirFD and oldAddr, // specified by newDirFD and newAddr. If resolve is true, then the symlinks // will be followed when evaluating the target. -func linkAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr, resolve, allowEmpty bool) error { +func linkAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr, resolve, allowEmpty bool) error { oldPath, _, err := copyInPath(t, oldAddr, allowEmpty) if err != nil { return err @@ -1201,7 +1270,7 @@ func linkAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kd } if allowEmpty && oldPath == "" { - target := t.FDMap().GetFile(oldDirFD) + target := t.GetFile(oldDirFD) if target == nil { return syserror.EBADF } @@ -1263,9 +1332,9 @@ func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Linkat implements linux syscall linkat(2). func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - oldDirFD := kdefs.FD(args[0].Int()) + oldDirFD := args[0].Int() oldAddr := args[1].Pointer() - newDirFD := kdefs.FD(args[2].Int()) + newDirFD := args[2].Int() newAddr := args[3].Pointer() // man linkat(2): @@ -1290,7 +1359,7 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty) } -func readlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) { +func readlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return 0, err @@ -1340,7 +1409,7 @@ func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Readlinkat implements linux syscall readlinkat(2). func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() bufAddr := args[2].Pointer() size := args[3].SizeT() @@ -1349,7 +1418,7 @@ func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return n, nil, err } -func unlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error { +func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err @@ -1379,7 +1448,7 @@ func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Unlinkat implements linux syscall unlinkat(2). func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() flags := args[2].Uint() if flags&linux.AT_REMOVEDIR != 0 { @@ -1407,7 +1476,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { t.SendSignal(&arch.SignalInfo{ - Signo: int32(syscall.SIGXFSZ), + Signo: int32(linux.SIGXFSZ), Code: arch.SignalInfoUser, }) return 0, nil, syserror.EFBIG @@ -1440,10 +1509,10 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Ftruncate implements linux syscall ftruncate(2). func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() length := args[1].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -1467,7 +1536,7 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { t.SendSignal(&arch.SignalInfo{ - Signo: int32(syscall.SIGXFSZ), + Signo: int32(linux.SIGXFSZ), Code: arch.SignalInfoUser, }) return 0, nil, syserror.EFBIG @@ -1558,7 +1627,7 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { return nil } -func chownAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error { +func chownAt(t *kernel.Task, fd int32, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error { path, _, err := copyInPath(t, addr, allowEmpty) if err != nil { return err @@ -1566,7 +1635,7 @@ func chownAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, resolve, allowEmpty if path == "" { // Annoying. What's wrong with fchown? - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return syserror.EBADF } @@ -1600,11 +1669,11 @@ func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Fchown implements linux syscall fchown(2). func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() uid := auth.UID(args[1].Uint()) gid := auth.GID(args[2].Uint()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -1615,7 +1684,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Fchownat implements Linux syscall fchownat(2). func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() uid := auth.UID(args[2].Uint()) gid := auth.GID(args[3].Uint()) @@ -1645,7 +1714,7 @@ func chmod(t *kernel.Task, d *fs.Dirent, mode linux.FileMode) error { return nil } -func chmodAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { +func chmodAt(t *kernel.Task, fd int32, addr usermem.Addr, mode linux.FileMode) error { path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err @@ -1666,10 +1735,10 @@ func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Fchmod implements linux syscall fchmod(2). func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() mode := linux.FileMode(args[1].ModeT()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -1680,7 +1749,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Fchmodat implements linux syscall fchmodat(2). func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() mode := linux.FileMode(args[2].ModeT()) @@ -1696,7 +1765,7 @@ func defaultSetToSystemTimeSpec() fs.TimeSpec { } } -func utimes(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error { +func utimes(t *kernel.Task, dirFD int32, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error { setTimestamp := func(root *fs.Dirent, d *fs.Dirent, _ uint) error { // Does the task own the file? if !d.Inode.CheckOwnership(t) { @@ -1729,7 +1798,7 @@ func utimes(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, ts fs.TimeSpec, r // Linux returns EINVAL in this case. See utimes.c. return syserror.EINVAL } - f := t.FDMap().GetFile(dirFD) + f := t.GetFile(dirFD) if f == nil { return syserror.EBADF } @@ -1757,7 +1826,7 @@ func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // No timesAddr argument will be interpreted as current system time. ts := defaultSetToSystemTimeSpec() if timesAddr != 0 { - var times syscall.Utimbuf + var times linux.Utime if _, err := t.CopyIn(timesAddr, ×); err != nil { return 0, nil, err } @@ -1797,7 +1866,7 @@ func timespecIsValid(ts linux.Timespec) bool { // Utimensat implements linux syscall utimensat(2). func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() pathnameAddr := args[1].Pointer() timesAddr := args[2].Pointer() flags := args[3].Int() @@ -1832,7 +1901,7 @@ func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Futimesat implements linux syscall futimesat(2). func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() pathnameAddr := args[1].Pointer() timesAddr := args[2].Pointer() @@ -1856,7 +1925,7 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true) } -func renameAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr) error { +func renameAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr) error { newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */) if err != nil { return err @@ -1904,21 +1973,21 @@ func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Renameat implements linux syscall renameat(2). func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - oldDirFD := kdefs.FD(args[0].Int()) + oldDirFD := args[0].Int() oldPathAddr := args[1].Pointer() - newDirFD := kdefs.FD(args[2].Int()) + newDirFD := args[2].Int() newPathAddr := args[3].Pointer() return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr) } // Fallocate implements linux system call fallocate(2). func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() mode := args[1].Int64() offset := args[2].Int64() length := args[3].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -1949,7 +2018,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } if uint64(size) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { t.SendSignal(&arch.SignalInfo{ - Signo: int32(syscall.SIGXFSZ), + Signo: int32(linux.SIGXFSZ), Code: arch.SignalInfoUser, }) return 0, nil, syserror.EFBIG @@ -1967,10 +2036,10 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Flock implements linux syscall flock(2). func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() operation := args[1].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { // flock(2): EBADF fd is not an open file descriptor. return 0, nil, syserror.EBADF @@ -2077,8 +2146,9 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S defer dirent.DecRef() defer file.DecRef() - fdFlags := kernel.FDFlags{CloseOnExec: cloExec} - newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits()) + newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{ + CloseOnExec: cloExec, + }) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index dea872672..63e2c5a5d 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -17,20 +17,19 @@ package linux import ( "bytes" "io" - "syscall" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" ) // Getdents implements linux syscall getdents(2) for 64bit systems. func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := int(args[2].Uint()) @@ -46,7 +45,7 @@ func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Getdents64 implements linux syscall getdents64(2). func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := int(args[2].Uint()) @@ -62,8 +61,8 @@ func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // getdents implements the core of getdents(2)/getdents64(2). // f is the syscall implementation dirent serialization function. -func getdents(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) { - dir := t.FDMap().GetFile(fd) +func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) { + dir := t.GetFile(fd) if dir == nil { return 0, syserror.EBADF } @@ -83,7 +82,7 @@ func getdents(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, size int, f func(* switch err := handleIOError(t, ds.Written() > 0, rerr, kernel.ERESTARTSYS, "getdents", dir); err { case nil: - dir.Dirent.InotifyEvent(syscall.IN_ACCESS, 0) + dir.Dirent.InotifyEvent(linux.IN_ACCESS, 0) return uintptr(ds.Written()), nil case io.EOF: return 0, nil @@ -147,21 +146,21 @@ func smallestDirent64(a arch.Context) uint { func toType(nodeType fs.InodeType) uint8 { switch nodeType { case fs.RegularFile, fs.SpecialFile: - return syscall.DT_REG + return linux.DT_REG case fs.Symlink: - return syscall.DT_LNK + return linux.DT_LNK case fs.Directory, fs.SpecialDirectory: - return syscall.DT_DIR + return linux.DT_DIR case fs.Pipe: - return syscall.DT_FIFO + return linux.DT_FIFO case fs.CharacterDevice: - return syscall.DT_CHR + return linux.DT_CHR case fs.BlockDevice: - return syscall.DT_BLK + return linux.DT_BLK case fs.Socket: - return syscall.DT_SOCK + return linux.DT_SOCK default: - return syscall.DT_UNKNOWN + return linux.DT_UNKNOWN } } diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go index 9cfa660fa..b2c7b3444 100644 --- a/pkg/sentry/syscalls/linux/sys_inotify.go +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -15,14 +15,12 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.dev/gvisor/pkg/syserror" ) const allFlags = int(linux.IN_NONBLOCK | linux.IN_CLOEXEC) @@ -32,7 +30,7 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. flags := int(args[0].Int()) if flags&^allFlags != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } dirent := fs.NewDirent(t, anon.NewInode(t), "inotify") @@ -44,9 +42,9 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. n := fs.NewFile(t, dirent, fileFlags, fs.NewInotify(t)) defer n.DecRef() - fd, err := t.FDMap().NewFDFrom(0, n, kernel.FDFlags{ + fd, err := t.NewFDFrom(0, n, kernel.FDFlags{ CloseOnExec: flags&linux.IN_CLOEXEC != 0, - }, t.ThreadGroup().Limits()) + }) if err != nil { return 0, nil, err @@ -63,18 +61,18 @@ func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // fdToInotify resolves an fd to an inotify object. If successful, the file will // have an extra ref and the caller is responsible for releasing the ref. -func fdToInotify(t *kernel.Task, fd kdefs.FD) (*fs.Inotify, *fs.File, error) { - file := t.FDMap().GetFile(fd) +func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) { + file := t.GetFile(fd) if file == nil { // Invalid fd. - return nil, nil, syscall.EBADF + return nil, nil, syserror.EBADF } ino, ok := file.FileOperations.(*fs.Inotify) if !ok { // Not an inotify fd. file.DecRef() - return nil, nil, syscall.EINVAL + return nil, nil, syserror.EINVAL } return ino, file, nil @@ -82,7 +80,7 @@ func fdToInotify(t *kernel.Task, fd kdefs.FD) (*fs.Inotify, *fs.File, error) { // InotifyAddWatch implements the inotify_add_watch() syscall. func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() mask := args[2].Uint() @@ -93,7 +91,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern // "EINVAL: The given event mask contains no valid events." // -- inotify_add_watch(2) if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } ino, file, err := fdToInotify(t, fd) @@ -110,11 +108,11 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern err = fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, dirent *fs.Dirent, _ uint) error { // "IN_ONLYDIR: Only watch pathname if it is a directory." -- inotify(7) if onlyDir := mask&linux.IN_ONLYDIR != 0; onlyDir && !fs.IsDir(dirent.Inode.StableAttr) { - return syscall.ENOTDIR + return syserror.ENOTDIR } // Copy out to the return frame. - fd = kdefs.FD(ino.AddWatch(dirent, mask)) + fd = ino.AddWatch(dirent, mask) return nil }) @@ -123,7 +121,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern // InotifyRmWatch implements the inotify_rm_watch() syscall. func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() wd := args[1].Int() ino, file, err := fdToInotify(t, fd) diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go index a3813b818..297e920c4 100644 --- a/pkg/sentry/syscalls/linux/sys_lseek.go +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -18,17 +18,16 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/syserror" ) // Lseek implements linux syscall lseek(2). func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() offset := args[1].Int64() whence := args[2].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index d831833bc..58a05b5bb 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -40,7 +39,7 @@ func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { prot := args[2].Int() flags := args[3].Int() - fd := kdefs.FD(args[4].Int()) + fd := args[4].Int() fixed := flags&linux.MAP_FIXED != 0 private := flags&linux.MAP_PRIVATE != 0 shared := flags&linux.MAP_SHARED != 0 @@ -80,7 +79,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if !anon { // Convert the passed FD to a file reference. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 7c1bea43d..418d7fa5f 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -15,20 +15,19 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" ) // pipe2 implements the actual system call with flags. func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize) @@ -38,25 +37,18 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { w.SetFlags(linuxToFlags(flags).Settable()) defer w.DecRef() - rfd, err := t.FDMap().NewFDFrom(0, r, kernel.FDFlags{ - CloseOnExec: flags&linux.O_CLOEXEC != 0}, - t.ThreadGroup().Limits()) - if err != nil { - return 0, err - } - - wfd, err := t.FDMap().NewFDFrom(0, w, kernel.FDFlags{ - CloseOnExec: flags&linux.O_CLOEXEC != 0}, - t.ThreadGroup().Limits()) + fds, err := t.NewFDs(0, []*fs.File{r, w}, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) if err != nil { - t.FDMap().Remove(rfd) return 0, err } - if _, err := t.CopyOut(addr, []kdefs.FD{rfd, wfd}); err != nil { - t.FDMap().Remove(rfd) - t.FDMap().Remove(wfd) - return 0, syscall.EFAULT + if _, err := t.CopyOut(addr, fds); err != nil { + // The files are not closed in this case, the exact semantics + // of this error case are not well defined, but they could have + // already been observed by user space. + return 0, syserror.EFAULT } return 0, nil } diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index ef6211218..7a13beac2 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -64,7 +63,7 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan return } - file := t.FDMap().GetFile(kdefs.FD(pfd.FD)) + file := t.GetFile(pfd.FD) if file == nil { pfd.REvents = linux.POLLNVAL return @@ -265,7 +264,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add // immediately to ensure we don't leak. Note, another thread // might be about to close fd. This is racy, but that's // OK. Linux is racy in the same way. - file := t.FDMap().GetFile(kdefs.FD(fd)) + file := t.GetFile(fd) if file == nil { return 0, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 9d70881fd..98db32d77 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -16,15 +16,14 @@ package linux import ( "fmt" - "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/syserror" ) // Prctl implements linux syscall prctl(2). @@ -37,7 +36,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_SET_PDEATHSIG: sig := linux.Signal(args[1].Int()) if sig != 0 && !sig.IsValid() { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } t.SetParentDeathSignal(sig) return 0, nil, nil @@ -68,7 +67,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall d = mm.UserDumpable default: // N.B. Userspace may not pass SUID_DUMP_ROOT. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } t.MemoryManager().SetDumpability(d) return 0, nil, nil @@ -89,7 +88,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } else if val == 1 { t.SetKeepCaps(true) } else { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, nil @@ -97,7 +96,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_SET_NAME: addr := args[1].Pointer() name, err := t.CopyInString(addr, linux.TASK_COMM_LEN-1) - if err != nil && err != syscall.ENAMETOOLONG { + if err != nil && err != syserror.ENAMETOOLONG { return 0, nil, err } t.SetName(name) @@ -117,22 +116,22 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_SET_MM: if !t.HasCapability(linux.CAP_SYS_RESOURCE) { - return 0, nil, syscall.EPERM + return 0, nil, syserror.EPERM } switch args[1].Int() { case linux.PR_SET_MM_EXE_FILE: - fd := kdefs.FD(args[2].Int()) + fd := args[2].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // They trying to set exe to a non-file? if !fs.IsFile(file.Dirent.Inode.StableAttr) { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } // Set the underlying executable. @@ -154,12 +153,12 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall t.Kernel().EmitUnimplementedEvent(t) fallthrough default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } case linux.PR_SET_NO_NEW_PRIVS: if args[1].Int() != 1 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // no_new_privs is assumed to always be set. See // kernel.Task.updateCredsForExec. @@ -167,14 +166,14 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_GET_NO_NEW_PRIVS: if args[1].Int() != 0 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 1, nil, nil case linux.PR_SET_SECCOMP: if args[1].Int() != linux.SECCOMP_MODE_FILTER { // Unsupported mode. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, seccomp(t, linux.SECCOMP_SET_MODE_FILTER, 0, args[2].Pointer()) @@ -185,7 +184,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_CAPBSET_READ: cp := linux.Capability(args[1].Uint64()) if !cp.Ok() { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } var rv uintptr if auth.CapabilitySetOf(cp)&t.Credentials().BoundingCaps != 0 { @@ -196,7 +195,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_CAPBSET_DROP: cp := linux.Capability(args[1].Uint64()) if !cp.Ok() { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, t.DropBoundingCapability(cp) @@ -221,7 +220,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall t.Kernel().EmitUnimplementedEvent(t) fallthrough default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index a1965f490..b2474e60d 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -39,11 +38,11 @@ const ( // they can do large reads all at once. Bug for bug. Same for other read // calls below. func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -75,12 +74,12 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Pread64 implements linux syscall pread64(2). func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() offset := args[3].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -122,11 +121,11 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Readv implements linux syscall readv(2). func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -152,12 +151,12 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Preadv implements linux syscall preadv(2). func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -201,13 +200,13 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // splits the offset argument into a high/low value for compatibility with // 32-bit architectures. The flags argument is the 5th argument. - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() flags := int(args[5].Int()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go index 434bbb322..99f6993f5 100644 --- a/pkg/sentry/syscalls/linux/sys_sched.go +++ b/pkg/sentry/syscalls/linux/sys_sched.go @@ -15,11 +15,10 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -37,13 +36,13 @@ func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel pid := args[0].Int() param := args[1].Pointer() if param == 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if pid < 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { - return 0, nil, syscall.ESRCH + return 0, nil, syserror.ESRCH } r := SchedParam{schedPriority: onlyPriority} if _, err := t.CopyOut(param, r); err != nil { @@ -57,10 +56,10 @@ func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel func SchedGetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pid := args[0].Int() if pid < 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { - return 0, nil, syscall.ESRCH + return 0, nil, syserror.ESRCH } return onlyScheduler, nil, nil } @@ -71,20 +70,20 @@ func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ke policy := args[1].Int() param := args[2].Pointer() if pid < 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if policy != onlyScheduler { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { - return 0, nil, syscall.ESRCH + return 0, nil, syserror.ESRCH } var r SchedParam if _, err := t.CopyIn(param, &r); err != nil { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if r.schedPriority != onlyPriority { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go index 4885b5e40..18510ead8 100644 --- a/pkg/sentry/syscalls/linux/sys_seccomp.go +++ b/pkg/sentry/syscalls/linux/sys_seccomp.go @@ -15,13 +15,12 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" ) // userSockFprog is equivalent to Linux's struct sock_fprog on amd64. @@ -43,7 +42,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error { // We only support SECCOMP_SET_MODE_FILTER at the moment. if mode != linux.SECCOMP_SET_MODE_FILTER { // Unsupported mode. - return syscall.EINVAL + return syserror.EINVAL } tsync := flags&linux.SECCOMP_FILTER_FLAG_TSYNC != 0 @@ -51,7 +50,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error { // The only flag we support now is SECCOMP_FILTER_FLAG_TSYNC. if flags&^linux.SECCOMP_FILTER_FLAG_TSYNC != 0 { // Unsupported flag. - return syscall.EINVAL + return syserror.EINVAL } var fprog userSockFprog @@ -65,7 +64,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error { compiledFilter, err := bpf.Compile(filter) if err != nil { t.Debugf("Invalid seccomp-bpf filter: %v", err) - return syscall.EINVAL + return syserror.EINVAL } return t.AppendSyscallFilter(compiledFilter, tsync) diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index ccdb079bb..fa568a660 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -15,7 +15,6 @@ package linux import ( - "syscall" "time" "gvisor.dev/gvisor/pkg/abi/linux" @@ -23,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" @@ -132,7 +130,7 @@ func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader // from the untrusted address space range. func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) { if addrlen > maxAddrLen { - return nil, syscall.EINVAL + return nil, syserror.EINVAL } addrBuf := make([]byte, addrlen) @@ -154,7 +152,7 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user } if int32(bufLen) < 0 { - return syscall.EINVAL + return syserror.EINVAL } // Write the length unconditionally. @@ -187,7 +185,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Create the new socket. @@ -200,9 +198,9 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal }) defer s.DecRef() - fd, err := t.FDMap().NewFDFrom(0, s, kernel.FDFlags{ + fd, err := t.NewFDFrom(0, s, kernel.FDFlags{ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, - }, t.ThreadGroup().Limits()) + }) if err != nil { return 0, nil, err } @@ -219,15 +217,12 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } fileFlags := fs.SettableFileFlags{ NonBlocking: stype&linux.SOCK_NONBLOCK != 0, } - fdFlags := kernel.FDFlags{ - CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, - } // Create the socket pair. s1, s2, e := socket.Pair(t, domain, linux.SockType(stype&0xf), protocol) @@ -240,20 +235,16 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy defer s2.DecRef() // Create the FDs for the sockets. - fd1, err := t.FDMap().NewFDFrom(0, s1, fdFlags, t.ThreadGroup().Limits()) - if err != nil { - return 0, nil, err - } - fd2, err := t.FDMap().NewFDFrom(0, s2, fdFlags, t.ThreadGroup().Limits()) + fds, err := t.NewFDs(0, []*fs.File{s1, s2}, kernel.FDFlags{ + CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, + }) if err != nil { - t.FDMap().Remove(fd1) return 0, nil, err } // Copy the file descriptors out. - if _, err := t.CopyOut(socks, []int32{int32(fd1), int32(fd2)}); err != nil { - t.FDMap().Remove(fd1) - t.FDMap().Remove(fd2) + if _, err := t.CopyOut(socks, fds); err != nil { + // Note that we don't close files here; see pipe(2) also. return 0, nil, err } @@ -262,21 +253,21 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Connect implements the linux syscall connect(2). func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Uint() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Capture address and call syscall implementation. @@ -291,23 +282,23 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // accept is the implementation of the accept syscall. It is called by accept // and accept4 syscall handlers. -func accept(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) { +func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) { // Check that no unsupported flags are passed in. if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return 0, syscall.EBADF + return 0, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, syscall.ENOTSOCK + return 0, syserror.ENOTSOCK } // Call the syscall implementation for this socket, then copy the @@ -322,7 +313,7 @@ func accept(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, addrLen usermem.Addr if peerRequested { // NOTE(magi): Linux does not give you an error if it can't // write the data back out so neither do we. - if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syscall.EINVAL { + if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syserror.EINVAL { return 0, err } } @@ -331,7 +322,7 @@ func accept(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, addrLen usermem.Addr // Accept4 implements the linux syscall accept4(2). func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() flags := int(args[3].Int()) @@ -342,7 +333,7 @@ func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Accept implements the linux syscall accept(2). func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() @@ -352,21 +343,21 @@ func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Bind implements the linux syscall bind(2). func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Uint() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Capture address and call syscall implementation. @@ -380,20 +371,20 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Listen implements the linux syscall listen(2). func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() backlog := args[1].Int() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Per Linux, the backlog is silently capped to reasonable values. @@ -409,27 +400,27 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Shutdown implements the linux syscall shutdown(2). func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() how := args[1].Int() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Validate how, then call syscall implementation. switch how { case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR: default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, s.Shutdown(t, int(how)).ToError() @@ -437,23 +428,23 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // GetSockOpt implements the linux syscall getsockopt(2). func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() level := args[1].Int() name := args[2].Int() optValAddr := args[3].Pointer() optLenAddr := args[4].Pointer() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Read the length if present. Reject negative values. @@ -464,7 +455,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } if optLen < 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } } @@ -521,30 +512,30 @@ func getSockOpt(t *kernel.Task, s socket.Socket, level, name, len int) (interfac // // Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket. func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() level := args[1].Int() name := args[2].Int() optValAddr := args[3].Pointer() optLen := args[4].Int() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } if optLen <= 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if optLen > maxOptLen { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } buf := t.CopyScratchBuffer(int(optLen)) if _, err := t.CopyIn(optValAddr, &buf); err != nil { @@ -561,21 +552,21 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // GetSockName implements the linux syscall getsockname(2). func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Get the socket name and copy it to the caller. @@ -589,21 +580,21 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // GetPeerName implements the linux syscall getpeername(2). func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Get the socket peer name and copy it to the caller. @@ -617,31 +608,31 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // RecvMsg implements the linux syscall recvmsg(2). func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() msgPtr := args[1].Pointer() flags := args[2].Int() if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if file.Flags().NonBlocking { @@ -663,7 +654,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // RecvMMsg implements the linux syscall recvmmsg(2). func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() msgPtr := args[1].Pointer() vlen := args[2].Uint() flags := args[3].Int() @@ -671,25 +662,25 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } if file.Flags().NonBlocking { @@ -704,7 +695,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } if !ts.Valid() { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration()) haveDeadline = true @@ -724,7 +715,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { - return 0, nil, syscall.EFAULT + return 0, nil, syserror.EFAULT } var n uintptr if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil { @@ -734,7 +725,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Copy the received length to the caller. lp, ok := mp.AddLength(messageHeader64Len) if !ok { - return 0, nil, syscall.EFAULT + return 0, nil, syserror.EFAULT } if _, err = t.CopyOut(lp, uint32(n)); err != nil { break @@ -756,7 +747,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i } if msg.IovLen > linux.UIO_MAXIOV { - return 0, syscall.EMSGSIZE + return 0, syserror.EMSGSIZE } dst, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, @@ -767,7 +758,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i // FIXME(b/63594852): Pretend we have an empty error queue. if flags&linux.MSG_ERRQUEUE != 0 { - return 0, syscall.EAGAIN + return 0, syserror.EAGAIN } // Fast path when no control message nor name buffers are provided. @@ -792,7 +783,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i } if msg.ControlLen > maxControlLen { - return 0, syscall.ENOBUFS + return 0, syserror.ENOBUFS } n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) if e != nil { @@ -811,6 +802,11 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i controlData = control.PackTimestamp(t, cms.IP.Timestamp, controlData) } + if cms.IP.HasInq { + // In Linux, TCP_CM_INQ is added after SO_TIMESTAMP. + controlData = control.PackInq(t, cms.IP.Inq, controlData) + } + if cms.Unix.Rights != nil { controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags) } @@ -842,27 +838,27 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i // recvFrom is the implementation of the recvfrom syscall. It is called by // recvfrom and recv syscall handlers. -func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) { +func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) { if int(bufLen) < 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return 0, syscall.EBADF + return 0, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, syscall.ENOTSOCK + return 0, syserror.ENOTSOCK } if file.Flags().NonBlocking { @@ -903,7 +899,7 @@ func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, f // RecvFrom implements the linux syscall recvfrom(2). func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() bufPtr := args[1].Pointer() bufLen := args[2].Uint64() flags := args[3].Int() @@ -916,31 +912,31 @@ func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // SendMsg implements the linux syscall sendmsg(2). func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() msgPtr := args[1].Pointer() flags := args[2].Int() if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if file.Flags().NonBlocking { @@ -953,32 +949,32 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // SendMMsg implements the linux syscall sendmmsg(2). func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() msgPtr := args[1].Pointer() vlen := args[2].Uint() flags := args[3].Int() if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if file.Flags().NonBlocking { @@ -990,7 +986,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { - return 0, nil, syscall.EFAULT + return 0, nil, syserror.EFAULT } var n uintptr if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil { @@ -1000,7 +996,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Copy the received length to the caller. lp, ok := mp.AddLength(messageHeader64Len) if !ok { - return 0, nil, syscall.EFAULT + return 0, nil, syserror.EFAULT } if _, err = t.CopyOut(lp, uint32(n)); err != nil { break @@ -1025,7 +1021,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme if msg.ControlLen > 0 { // Put an upper bound to prevent large allocations. if msg.ControlLen > maxControlLen { - return 0, syscall.ENOBUFS + return 0, syserror.ENOBUFS } controlData = make([]byte, msg.ControlLen) if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil { @@ -1045,7 +1041,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme // Read data then call the sendmsg implementation. if msg.IovLen > linux.UIO_MAXIOV { - return 0, syscall.EMSGSIZE + return 0, syserror.EMSGSIZE } src, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, @@ -1079,23 +1075,23 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme // sendTo is the implementation of the sendto syscall. It is called by sendto // and send syscall handlers. -func sendTo(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) { +func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) { bl := int(bufLen) if bl < 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { - return 0, syscall.EBADF + return 0, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, syscall.ENOTSOCK + return 0, syserror.ENOTSOCK } if file.Flags().NonBlocking { @@ -1135,7 +1131,7 @@ func sendTo(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, fla // SendTo implements the linux syscall sendto(2). func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() bufPtr := args[1].Pointer() bufLen := args[2].Uint64() flags := args[3].Int() diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index b6517313f..a7c98efcb 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -81,8 +80,8 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB // Sendfile implements linux system call sendfile(2). func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - outFD := kdefs.FD(args[0].Int()) - inFD := kdefs.FD(args[1].Int()) + outFD := args[0].Int() + inFD := args[1].Int() offsetAddr := args[2].Pointer() count := int64(args[3].SizeT()) @@ -92,13 +91,13 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } // Get files. - outFile := t.FDMap().GetFile(outFD) + outFile := t.GetFile(outFD) if outFile == nil { return 0, nil, syserror.EBADF } defer outFile.DecRef() - inFile := t.FDMap().GetFile(inFD) + inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } @@ -163,9 +162,9 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Splice implements splice(2). func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - inFD := kdefs.FD(args[0].Int()) + inFD := args[0].Int() inOffset := args[1].Pointer() - outFD := kdefs.FD(args[2].Int()) + outFD := args[2].Int() outOffset := args[3].Pointer() count := int64(args[4].SizeT()) flags := args[5].Int() @@ -182,13 +181,13 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal nonBlocking := (flags & linux.SPLICE_F_NONBLOCK) != 0 // Get files. - outFile := t.FDMap().GetFile(outFD) + outFile := t.GetFile(outFD) if outFile == nil { return 0, nil, syserror.EBADF } defer outFile.DecRef() - inFile := t.FDMap().GetFile(inFD) + inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } @@ -251,8 +250,8 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Tee imlements tee(2). func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - inFD := kdefs.FD(args[0].Int()) - outFD := kdefs.FD(args[1].Int()) + inFD := args[0].Int() + outFD := args[1].Int() count := int64(args[2].SizeT()) flags := args[3].Int() @@ -265,13 +264,13 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo nonBlocking := (flags & linux.SPLICE_F_NONBLOCK) != 0 // Get files. - outFile := t.FDMap().GetFile(outFD) + outFile := t.GetFile(outFD) if outFile == nil { return 0, nil, syserror.EBADF } defer outFile.DecRef() - inFile := t.FDMap().GetFile(inFD) + inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 9a5657254..5556bc276 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" ) @@ -42,7 +41,7 @@ func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Fstatat implements linux syscall newfstatat, i.e. fstatat(2). func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() statAddr := args[2].Pointer() flags := args[3].Int() @@ -54,7 +53,7 @@ func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if path == "" { // Annoying. What's wrong with fstat? - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -93,10 +92,10 @@ func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Fstat implements linux syscall fstat(2). func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() statAddr := args[1].Pointer() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -178,7 +177,7 @@ func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs // Statx implements linux syscall statx(2). func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() pathAddr := args[1].Pointer() flags := args[2].Int() mask := args[3].Uint() @@ -197,7 +196,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if path == "" { - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -285,10 +284,10 @@ func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Fstatfs implements linux syscall fstatfs(2). func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() statfsAddr := args[1].Pointer() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 37225735f..3e55235bd 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -32,9 +31,9 @@ func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Syncfs implements linux system call syncfs(2). func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -47,9 +46,9 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Fsync implements linux syscall fsync(2). func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -63,9 +62,9 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // // At the moment, it just calls Fsync, which is a big hammer, but correct. func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -79,6 +78,7 @@ func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { var err error + fd := args[0].Int() offset := args[1].Int64() nbytes := args[2].Int64() uflags := args[3].Uint() @@ -97,8 +97,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel nbytes = fs.FileMaxOffset } - fd := kdefs.FD(args[0].Int()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 9e037bd7b..595eb9155 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -123,29 +123,29 @@ func ExitGroup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys func clone(t *kernel.Task, flags int, stack usermem.Addr, parentTID usermem.Addr, childTID usermem.Addr, tls usermem.Addr) (uintptr, *kernel.SyscallControl, error) { opts := kernel.CloneOptions{ SharingOptions: kernel.SharingOptions{ - NewAddressSpace: flags&syscall.CLONE_VM == 0, - NewSignalHandlers: flags&syscall.CLONE_SIGHAND == 0, - NewThreadGroup: flags&syscall.CLONE_THREAD == 0, + NewAddressSpace: flags&linux.CLONE_VM == 0, + NewSignalHandlers: flags&linux.CLONE_SIGHAND == 0, + NewThreadGroup: flags&linux.CLONE_THREAD == 0, TerminationSignal: linux.Signal(flags & exitSignalMask), - NewPIDNamespace: flags&syscall.CLONE_NEWPID == syscall.CLONE_NEWPID, - NewUserNamespace: flags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER, - NewNetworkNamespace: flags&syscall.CLONE_NEWNET == syscall.CLONE_NEWNET, - NewFiles: flags&syscall.CLONE_FILES == 0, - NewFSContext: flags&syscall.CLONE_FS == 0, - NewUTSNamespace: flags&syscall.CLONE_NEWUTS == syscall.CLONE_NEWUTS, - NewIPCNamespace: flags&syscall.CLONE_NEWIPC == syscall.CLONE_NEWIPC, + NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, + NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, + NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, + NewFiles: flags&linux.CLONE_FILES == 0, + NewFSContext: flags&linux.CLONE_FS == 0, + NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, + NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, }, Stack: stack, - SetTLS: flags&syscall.CLONE_SETTLS == syscall.CLONE_SETTLS, + SetTLS: flags&linux.CLONE_SETTLS == linux.CLONE_SETTLS, TLS: tls, - ChildClearTID: flags&syscall.CLONE_CHILD_CLEARTID == syscall.CLONE_CHILD_CLEARTID, - ChildSetTID: flags&syscall.CLONE_CHILD_SETTID == syscall.CLONE_CHILD_SETTID, + ChildClearTID: flags&linux.CLONE_CHILD_CLEARTID == linux.CLONE_CHILD_CLEARTID, + ChildSetTID: flags&linux.CLONE_CHILD_SETTID == linux.CLONE_CHILD_SETTID, ChildTID: childTID, - ParentSetTID: flags&syscall.CLONE_PARENT_SETTID == syscall.CLONE_PARENT_SETTID, + ParentSetTID: flags&linux.CLONE_PARENT_SETTID == linux.CLONE_PARENT_SETTID, ParentTID: parentTID, - Vfork: flags&syscall.CLONE_VFORK == syscall.CLONE_VFORK, - Untraced: flags&syscall.CLONE_UNTRACED == syscall.CLONE_UNTRACED, - InheritTracer: flags&syscall.CLONE_PTRACE == syscall.CLONE_PTRACE, + Vfork: flags&linux.CLONE_VFORK == linux.CLONE_VFORK, + Untraced: flags&linux.CLONE_UNTRACED == linux.CLONE_UNTRACED, + InheritTracer: flags&linux.CLONE_PTRACE == linux.CLONE_PTRACE, } ntid, ctrl, err := t.Clone(&opts) return uintptr(ntid), ctrl, err @@ -168,7 +168,7 @@ func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall func Fork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // "A call to fork() is equivalent to a call to clone(2) specifying flags // as just SIGCHLD." - fork(2) - return clone(t, int(syscall.SIGCHLD), 0, 0, 0, 0) + return clone(t, int(linux.SIGCHLD), 0, 0, 0, 0) } // Vfork implements Linux syscall vfork(2). @@ -178,7 +178,7 @@ func Vfork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // // CLONE_VM | CLONE_VFORK | SIGCHLD // """ - vfork(2) - return clone(t, syscall.CLONE_VM|syscall.CLONE_VFORK|int(syscall.SIGCHLD), 0, 0, 0, 0) + return clone(t, linux.CLONE_VM|linux.CLONE_VFORK|int(linux.SIGCHLD), 0, 0, 0, 0) } // parseCommonWaitOptions applies the options common to wait4 and waitid to @@ -193,7 +193,7 @@ func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error { wopts.NonCloneTasks = true wopts.CloneTasks = true default: - return syscall.EINVAL + return syserror.EINVAL } if options&linux.WCONTINUED != 0 { wopts.Events |= kernel.EventGroupContinue @@ -210,7 +210,7 @@ func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error { // wait4 waits for the given child process to exit. func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusageAddr usermem.Addr) (uintptr, error) { if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } wopts := kernel.WaitOptions{ Events: kernel.EventExit | kernel.EventTraceeStop, @@ -291,10 +291,10 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal rusageAddr := args[4].Pointer() if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } wopts := kernel.WaitOptions{ Events: kernel.EventTraceeStop, @@ -307,7 +307,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case linux.P_PGID: wopts.SpecificPGID = kernel.ProcessGroupID(id) default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if err := parseCommonWaitOptions(&wopts, options); err != nil { @@ -347,12 +347,12 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, nil } si := arch.SignalInfo{ - Signo: int32(syscall.SIGCHLD), + Signo: int32(linux.SIGCHLD), } si.SetPid(int32(wr.TID)) si.SetUid(int32(wr.UID)) // TODO(b/73541790): convert kernel.ExitStatus to functions and make - // WaitResult.Status a linux.WaitStatus + // WaitResult.Status a linux.WaitStatus. s := syscall.WaitStatus(wr.Status) switch { case s.Exited(): @@ -374,7 +374,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } case s.Continued(): si.Code = arch.CLD_CONTINUED - si.SetStatus(int32(syscall.SIGCONT)) + si.SetStatus(int32(linux.SIGCONT)) default: t.Warningf("waitid got incomprehensible wait status %d", s) } @@ -395,16 +395,16 @@ func SetTidAddress(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel func Unshare(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() opts := kernel.SharingOptions{ - NewAddressSpace: flags&syscall.CLONE_VM == syscall.CLONE_VM, - NewSignalHandlers: flags&syscall.CLONE_SIGHAND == syscall.CLONE_SIGHAND, - NewThreadGroup: flags&syscall.CLONE_THREAD == syscall.CLONE_THREAD, - NewPIDNamespace: flags&syscall.CLONE_NEWPID == syscall.CLONE_NEWPID, - NewUserNamespace: flags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER, - NewNetworkNamespace: flags&syscall.CLONE_NEWNET == syscall.CLONE_NEWNET, - NewFiles: flags&syscall.CLONE_FILES == syscall.CLONE_FILES, - NewFSContext: flags&syscall.CLONE_FS == syscall.CLONE_FS, - NewUTSNamespace: flags&syscall.CLONE_NEWUTS == syscall.CLONE_NEWUTS, - NewIPCNamespace: flags&syscall.CLONE_NEWIPC == syscall.CLONE_NEWIPC, + NewAddressSpace: flags&linux.CLONE_VM == linux.CLONE_VM, + NewSignalHandlers: flags&linux.CLONE_SIGHAND == linux.CLONE_SIGHAND, + NewThreadGroup: flags&linux.CLONE_THREAD == linux.CLONE_THREAD, + NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, + NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, + NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, + NewFiles: flags&linux.CLONE_FILES == linux.CLONE_FILES, + NewFSContext: flags&linux.CLONE_FS == linux.CLONE_FS, + NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, + NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, } // "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2) if opts.NewPIDNamespace { @@ -623,7 +623,7 @@ func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S who := kernel.ThreadID(args[1].Int()) switch which { - case syscall.PRIO_PROCESS: + case linux.PRIO_PROCESS: // Look for who, return ESRCH if not found. var task *kernel.Task if who == 0 { @@ -633,7 +633,7 @@ func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } if task == nil { - return 0, nil, syscall.ESRCH + return 0, nil, syserror.ESRCH } // From kernel/sys.c:getpriority: @@ -641,13 +641,13 @@ func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // will not return the normal nice-value, but a negated // value that has been offset by 20" return uintptr(20 - task.Niceness()), nil, nil - case syscall.PRIO_USER: + case linux.PRIO_USER: fallthrough - case syscall.PRIO_PGRP: + case linux.PRIO_PGRP: // PRIO_USER and PRIO_PGRP have no further implementation yet. return 0, nil, nil default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } } @@ -669,7 +669,7 @@ func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } switch which { - case syscall.PRIO_PROCESS: + case linux.PRIO_PROCESS: // Look for who, return ESRCH if not found. var task *kernel.Task if who == 0 { @@ -679,17 +679,17 @@ func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } if task == nil { - return 0, nil, syscall.ESRCH + return 0, nil, syserror.ESRCH } task.SetNiceness(niceval) - case syscall.PRIO_USER: + case linux.PRIO_USER: fallthrough - case syscall.PRIO_PGRP: + case linux.PRIO_PGRP: // PRIO_USER and PRIO_PGRP have no further implementation yet. return 0, nil, nil default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index ca5ccb7c3..d4134207b 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -15,13 +15,13 @@ package linux import ( - "syscall" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" ) const nsecPerSec = int64(time.Second) @@ -47,7 +47,7 @@ func copyItimerValIn(t *kernel.Task, addr usermem.Addr) (linux.ItimerVal, error) return itv, nil default: - return linux.ItimerVal{}, syscall.ENOSYS + return linux.ItimerVal{}, syserror.ENOSYS } } @@ -65,7 +65,7 @@ func copyItimerValOut(t *kernel.Task, addr usermem.Addr, itv *linux.ItimerVal) e _, err := t.CopyOut(addr, itv) return err default: - return syscall.ENOSYS + return syserror.ENOSYS } } diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go index b889a2a93..cf49b43db 100644 --- a/pkg/sentry/syscalls/linux/sys_timerfd.go +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/syserror" ) @@ -49,9 +48,9 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel NonBlocking: flags&linux.TFD_NONBLOCK != 0, }) - fd, err := t.FDMap().NewFDFrom(0, f, kernel.FDFlags{ + fd, err := t.NewFDFrom(0, f, kernel.FDFlags{ CloseOnExec: flags&linux.TFD_CLOEXEC != 0, - }, t.ThreadGroup().Limits()) + }) if err != nil { return 0, nil, err } @@ -61,7 +60,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel // TimerfdSettime implements Linux syscall timerfd_settime(2). func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() flags := args[1].Int() newValAddr := args[2].Pointer() oldValAddr := args[3].Pointer() @@ -70,7 +69,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne return 0, nil, syserror.EINVAL } - f := t.FDMap().GetFile(fd) + f := t.GetFile(fd) if f == nil { return 0, nil, syserror.EBADF } @@ -101,10 +100,10 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne // TimerfdGettime implements Linux syscall timerfd_gettime(2). func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() curValAddr := args[1].Pointer() - f := t.FDMap().GetFile(fd) + f := t.GetFile(fd) if f == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_tls.go b/pkg/sentry/syscalls/linux/sys_tls.go index e3d1c6201..b3eb96a1c 100644 --- a/pkg/sentry/syscalls/linux/sys_tls.go +++ b/pkg/sentry/syscalls/linux/sys_tls.go @@ -17,11 +17,10 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" ) // ArchPrctl implements linux syscall arch_prctl(2). @@ -39,14 +38,14 @@ func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys case linux.ARCH_SET_FS: fsbase := args[1].Uint64() if !t.Arch().SetTLS(uintptr(fsbase)) { - return 0, nil, syscall.EPERM + return 0, nil, syserror.EPERM } case linux.ARCH_GET_GS, linux.ARCH_SET_GS: t.Kernel().EmitUnimplementedEvent(t) fallthrough default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index 3a5bf9ac4..5278c96a6 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -39,11 +38,11 @@ const ( // Write implements linux syscall write(2). func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -75,12 +74,12 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Pwrite64 implements linux syscall pwrite64(2). func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() offset := args[3].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -122,11 +121,11 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Writev implements linux syscall writev(2). func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -152,12 +151,12 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Pwritev implements linux syscall pwritev(2). func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -202,7 +201,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // splits the offset argument into a high/low value for compatibility with // 32-bit architectures. The flags argument is the 5th argument. - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() @@ -212,7 +211,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, syserror.EACCES } - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go index 9ba0eba7a..4ff8f9234 100644 --- a/pkg/sentry/syscalls/linux/timespec.go +++ b/pkg/sentry/syscalls/linux/timespec.go @@ -15,7 +15,6 @@ package linux import ( - "syscall" "time" "gvisor.dev/gvisor/pkg/abi/linux" @@ -70,7 +69,7 @@ func copyTimevalIn(t *kernel.Task, addr usermem.Addr) (linux.Timeval, error) { tv.Usec = int64(usermem.ByteOrder.Uint64(in[8:])) return tv, nil default: - return linux.Timeval{}, syscall.ENOSYS + return linux.Timeval{}, syserror.ENOSYS } } @@ -84,7 +83,7 @@ func copyTimevalOut(t *kernel.Task, addr usermem.Addr, tv *linux.Timeval) error _, err := t.CopyOutBytes(addr, out) return err default: - return syscall.ENOSYS + return syserror.ENOSYS } } @@ -104,7 +103,7 @@ func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.D return 0, err } if !timespec.Valid() { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } timeout = time.Duration(timespec.ToNsecCapped()) } diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go index a5f3d8407..f88055676 100644 --- a/pkg/sentry/syscalls/syscalls.go +++ b/pkg/sentry/syscalls/syscalls.go @@ -26,7 +26,6 @@ package syscalls import ( "fmt" - "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -40,16 +39,7 @@ func Supported(name string, fn kernel.SyscallFn) kernel.Syscall { Name: name, Fn: fn, SupportLevel: kernel.SupportFull, - Note: "Full Support", - } -} - -// Undocumented returns a syscall that is undocumented. -func Undocumented(name string, fn kernel.SyscallFn) kernel.Syscall { - return kernel.Syscall{ - Name: name, - Fn: fn, - SupportLevel: kernel.SupportUndocumented, + Note: "Fully Supported.", } } @@ -65,7 +55,7 @@ func PartiallySupported(name string, fn kernel.SyscallFn, note string, urls []st } // Error returns a syscall handler that will always give the passed error. -func Error(name string, err syscall.Errno, note string, urls []string) kernel.Syscall { +func Error(name string, err error, note string, urls []string) kernel.Syscall { if note != "" { note = note + "; " } @@ -75,14 +65,14 @@ func Error(name string, err syscall.Errno, note string, urls []string) kernel.Sy return 0, nil, err }, SupportLevel: kernel.SupportUnimplemented, - Note: fmt.Sprintf("%sReturns %q", note, err.Error()), + Note: fmt.Sprintf("%sReturns %q.", note, err.Error()), URLs: urls, } } // ErrorWithEvent gives a syscall function that sends an unimplemented // syscall event via the event channel and returns the passed error. -func ErrorWithEvent(name string, err syscall.Errno, note string, urls []string) kernel.Syscall { +func ErrorWithEvent(name string, err error, note string, urls []string) kernel.Syscall { if note != "" { note = note + "; " } @@ -93,7 +83,7 @@ func ErrorWithEvent(name string, err syscall.Errno, note string, urls []string) return 0, nil, err }, SupportLevel: kernel.SupportUnimplemented, - Note: fmt.Sprintf("%sReturns %q", note, err.Error()), + Note: fmt.Sprintf("%sReturns %q.", note, err.Error()), URLs: urls, } } @@ -115,7 +105,7 @@ func CapError(name string, c linux.Capability, note string, urls []string) kerne return 0, nil, syserror.ENOSYS }, SupportLevel: kernel.SupportUnimplemented, - Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise", note, syserror.EPERM, c.String(), syserror.ENOSYS), + Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise.", note, syserror.EPERM, c.String(), syserror.ENOSYS), URLs: urls, } } diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index d2ede0353..8aa6a3017 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -6,7 +6,7 @@ load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( name = "seqatomic_parameters", - out = "seqatomic_parameters.go", + out = "seqatomic_parameters_unsafe.go", package = "time", suffix = "Parameters", template = "//third_party/gvsync:generic_seqatomic", @@ -27,7 +27,7 @@ go_library( "parameters.go", "sampler.go", "sampler_unsafe.go", - "seqatomic_parameters.go", + "seqatomic_parameters_unsafe.go", "tsc_amd64.s", "tsc_arm64.s", ], diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD new file mode 100644 index 000000000..4de6c41cf --- /dev/null +++ b/pkg/sentry/vfs/BUILD @@ -0,0 +1,46 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "vfs", + srcs = [ + "context.go", + "debug.go", + "dentry.go", + "file_description.go", + "file_description_impl_util.go", + "filesystem.go", + "filesystem_type.go", + "mount.go", + "mount_unsafe.go", + "options.go", + "permissions.go", + "resolving_path.go", + "syscalls.go", + "vfs.go", + ], + importpath = "gvisor.dev/gvisor/pkg/sentry/vfs", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/fspath", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", + "//pkg/sentry/usermem", + "//pkg/syserror", + "//pkg/waiter", + "//third_party/gvsync", + ], +) + +go_test( + name = "vfs_test", + size = "small", + srcs = [ + "mount_test.go", + ], + embed = [":vfs"], +) diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md new file mode 100644 index 000000000..7847854bc --- /dev/null +++ b/pkg/sentry/vfs/README.md @@ -0,0 +1,197 @@ +# The gVisor Virtual Filesystem + +THIS PACKAGE IS CURRENTLY EXPERIMENTAL AND NOT READY OR ENABLED FOR PRODUCTION +USE. For the filesystem implementation currently used by gVisor, see the `fs` +package. + +## Implementation Notes + +### Reference Counting + +Filesystem, Dentry, Mount, MountNamespace, and FileDescription are all +reference-counted. Mount and MountNamespace are exclusively VFS-managed; when +their reference count reaches zero, VFS releases their resources. Filesystem and +FileDescription management is shared between VFS and filesystem implementations; +when their reference count reaches zero, VFS notifies the implementation by +calling `FilesystemImpl.Release()` or `FileDescriptionImpl.Release()` +respectively and then releases VFS-owned resources. Dentries are exclusively +managed by filesystem implementations; reference count changes are abstracted +through DentryImpl, which should release resources when reference count reaches +zero. + +Filesystem references are held by: + +- Mount: Each referenced Mount holds a reference on the mounted Filesystem. + +Dentry references are held by: + +- FileDescription: Each referenced FileDescription holds a reference on the + Dentry through which it was opened, via `FileDescription.vd.dentry`. + +- Mount: Each referenced Mount holds a reference on its mount point and on the + mounted filesystem root. The mount point is mutable (`mount(MS_MOVE)`). + +Mount references are held by: + +- FileDescription: Each referenced FileDescription holds a reference on the + Mount on which it was opened, via `FileDescription.vd.mount`. + +- Mount: Each referenced Mount holds a reference on its parent, which is the + mount containing its mount point. + +- VirtualFilesystem: A reference is held on all Mounts that are attached + (reachable by Mount traversal). + +MountNamespace and FileDescription references are held by users of VFS. The +expectation is that each `kernel.Task` holds a reference on its corresponding +MountNamespace, and each file descriptor holds a reference on its represented +FileDescription. + +Notes: + +- Dentries do not hold a reference on their owning Filesystem. Instead, all + uses of a Dentry occur in the context of a Mount, which holds a reference on + the relevant Filesystem (see e.g. the VirtualDentry type). As a corollary, + when releasing references on both a Dentry and its corresponding Mount, the + Dentry's reference must be released first (because releasing the Mount's + reference may release the last reference on the Filesystem, whose state may + be required to release the Dentry reference). + +### The Inheritance Pattern + +Filesystem, Dentry, and FileDescription are all concepts featuring both state +that must be shared between VFS and filesystem implementations, and operations +that are implementation-defined. To facilitate this, each of these three +concepts follows the same pattern, shown below for Dentry: + +```go +// Dentry represents a node in a filesystem tree. +type Dentry struct { + // VFS-required dentry state. + parent *Dentry + // ... + + // impl is the DentryImpl associated with this Dentry. impl is immutable. + // This should be the last field in Dentry. + impl DentryImpl +} + +// Init must be called before first use of d. +func (d *Dentry) Init(impl DentryImpl) { + d.impl = impl +} + +// Impl returns the DentryImpl associated with d. +func (d *Dentry) Impl() DentryImpl { + return d.impl +} + +// DentryImpl contains implementation-specific details of a Dentry. +// Implementations of DentryImpl should contain their associated Dentry by +// value as their first field. +type DentryImpl interface { + // VFS-required implementation-defined dentry operations. + IncRef() + // ... +} +``` + +This construction, which is essentially a type-safe analogue to Linux's +`container_of` pattern, has the following properties: + +- VFS works almost exclusively with pointers to Dentry rather than DentryImpl + interface objects, such as in the type of `Dentry.parent`. This avoids + interface method calls (which are somewhat expensive to perform, and defeat + inlining and escape analysis), reduces the size of VFS types (since an + interface object is two pointers in size), and allows pointers to be loaded + and stored atomically using `sync/atomic`. Implementation-defined behavior + is accessed via `Dentry.impl` when required. + +- Filesystem implementations can access the implementation-defined state + associated with objects of VFS types by type-asserting or type-switching + (e.g. `Dentry.Impl().(*myDentry)`). Type assertions to a concrete type + require only an equality comparison of the interface object's type pointer + to a static constant, and are consequently very fast. + +- Filesystem implementations can access the VFS state associated with objects + of implementation-defined types directly. + +- VFS and implementation-defined state for a given type occupy the same + object, minimizing memory allocations and maximizing memory locality. `impl` + is the last field in `Dentry`, and `Dentry` is the first field in + `DentryImpl` implementations, for similar reasons: this tends to cause + fetching of the `Dentry.impl` interface object to also fetch `DentryImpl` + fields, either because they are in the same cache line or via next-line + prefetching. + +## Future Work + +- Most `mount(2)` features, and unmounting, are incomplete. + +- VFS1 filesystems are not directly compatible with VFS2. It may be possible + to implement shims that implement `vfs.FilesystemImpl` for + `fs.MountNamespace`, `vfs.DentryImpl` for `fs.Dirent`, and + `vfs.FileDescriptionImpl` for `fs.File`, which may be adequate for + filesystems that are not performance-critical (e.g. sysfs); however, it is + not clear that this will be less effort than simply porting the filesystems + in question. Practically speaking, the following filesystems will probably + need to be ported or made compatible through a shim to evaluate filesystem + performance on realistic workloads: + + - devfs/procfs/sysfs, which will realistically be necessary to execute + most applications. (Note that procfs and sysfs do not support hard + links, so they do not require the complexity of separate inode objects. + Also note that Linux's /dev is actually a variant of tmpfs called + devtmpfs.) + + - tmpfs. This should be relatively straightforward: copy/paste memfs, + store regular file contents in pgalloc-allocated memory instead of + `[]byte`, and add support for file timestamps. (In fact, it probably + makes more sense to convert memfs to tmpfs and not keep the former.) + + - A remote filesystem, either lisafs (if it is ready by the time that + other benchmarking prerequisites are) or v9fs (aka 9P, aka gofers). + + - epoll files. + + Filesystems that will need to be ported before switching to VFS2, but can + probably be skipped for early testing: + + - overlayfs, which is needed for (at least) synthetic mount points. + + - Support for host ttys. + + - timerfd files. + + Filesystems that can be probably dropped: + + - ashmem, which is far too incomplete to use. + + - binder, which is similarly far too incomplete to use. + + - whitelistfs, which we are already actively attempting to remove. + +- Save/restore. For instance, it is unclear if the current implementation of + the `state` package supports the inheritance pattern described above. + +- Many features that were previously implemented by VFS must now be + implemented by individual filesystems (though, in most cases, this should + consist of calls to hooks or libraries provided by `vfs` or other packages). + This includes, but is not necessarily limited to: + + - Block and character device special files + + - Inotify + + - File locking + + - `O_ASYNC` + +- Reference counts in the `vfs` package do not use the `refs` package since + `refs.AtomicRefCount` adds 64 bytes of overhead to each 8-byte reference + count, resulting in considerable cache bloat. 24 bytes of this overhead is + for weak reference support, which have poor performance and will not be used + by VFS2. The remaining 40 bytes is to store a descriptive string and stack + trace for reference leak checking; we can support reference leak checking + without incurring this space overhead by including the applicable + information directly in finalizers for applicable types. diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go new file mode 100644 index 000000000..32cf9151b --- /dev/null +++ b/pkg/sentry/vfs/context.go @@ -0,0 +1,37 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/sentry/context" +) + +// contextID is this package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxMountNamespace is a Context.Value key for a MountNamespace. + CtxMountNamespace contextID = iota +) + +// MountNamespaceFromContext returns the MountNamespace used by ctx. It does +// not take a reference on the returned MountNamespace. If ctx is not +// associated with a MountNamespace, MountNamespaceFromContext returns nil. +func MountNamespaceFromContext(ctx context.Context) *MountNamespace { + if v := ctx.Value(CtxMountNamespace); v != nil { + return v.(*MountNamespace) + } + return nil +} diff --git a/pkg/abi/linux/binder.go b/pkg/sentry/vfs/debug.go index 63b08324a..0ed20f249 100644 --- a/pkg/abi/linux/binder.go +++ b/pkg/sentry/vfs/debug.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,9 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -package linux +package vfs -// BinderVersion structure is used for BINDER_VERSION ioctl. -type BinderVersion struct { - ProtocolVersion int32 -} +const ( + // If checkInvariants is true, perform runtime checks for invariants + // expected by the vfs package. This is normally disabled since VFS is + // often a hot path. + checkInvariants = false +) diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go new file mode 100644 index 000000000..45912fc58 --- /dev/null +++ b/pkg/sentry/vfs/dentry.go @@ -0,0 +1,347 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/syserror" +) + +// Dentry represents a node in a Filesystem tree which may represent a file. +// +// Dentries are reference-counted. Unless otherwise specified, all Dentry +// methods require that a reference is held. +// +// A Dentry transitions through up to 3 different states through its lifetime: +// +// - Dentries are initially "independent". Independent Dentries have no parent, +// and consequently no name. +// +// - Dentry.InsertChild() causes an independent Dentry to become a "child" of +// another Dentry. A child node has a parent node, and a name in that parent, +// both of which are mutable by DentryMoveChild(). Each child Dentry's name is +// unique within its parent. +// +// - Dentry.RemoveChild() causes a child Dentry to become "disowned". A +// disowned Dentry can still refer to its former parent and its former name in +// said parent, but the disowned Dentry is no longer reachable from its parent, +// and a new Dentry with the same name may become a child of the parent. (This +// is analogous to a struct dentry being "unhashed" in Linux.) +// +// Dentry is loosely analogous to Linux's struct dentry, but: +// +// - VFS does not associate Dentries with inodes. gVisor interacts primarily +// with filesystems that are accessed through filesystem APIs (as opposed to +// raw block devices); many such APIs support only paths and file descriptors, +// and not inodes. Furthermore, when parties outside the scope of VFS can +// rename inodes on such filesystems, VFS generally cannot "follow" the rename, +// both due to synchronization issues and because it may not even be able to +// name the destination path; this implies that it would in fact be *incorrect* +// for Dentries to be associated with inodes on such filesystems. Consequently, +// operations that are inode operations in Linux are FilesystemImpl methods +// and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do +// support inodes may store appropriate state in implementations of DentryImpl. +// +// - VFS does not provide synchronization for mutable Dentry fields, other than +// mount-related ones. +// +// - VFS does not require that Dentries are instantiated for all paths accessed +// through VFS, only those that are tracked beyond the scope of a single +// Filesystem operation. This includes file descriptions, mount points, mount +// roots, process working directories, and chroots. This avoids instantiation +// of Dentries for operations on mutable remote filesystems that can't actually +// cache any state in the Dentry. +// +// - For the reasons above, VFS is not directly responsible for managing Dentry +// lifetime. Dentry reference counts only indicate the extent to which VFS +// requires Dentries to exist; Filesystems may elect to cache or discard +// Dentries with zero references. +type Dentry struct { + // parent is this Dentry's parent in this Filesystem. If this Dentry is + // independent, parent is nil. + parent *Dentry + + // name is this Dentry's name in parent. + name string + + flags uint32 + + // mounts is the number of Mounts for which this Dentry is Mount.point. + // mounts is accessed using atomic memory operations. + mounts uint32 + + // children are child Dentries. + children map[string]*Dentry + + // impl is the DentryImpl associated with this Dentry. impl is immutable. + // This should be the last field in Dentry. + impl DentryImpl +} + +const ( + // dflagsDisownedMask is set in Dentry.flags if the Dentry has been + // disowned. + dflagsDisownedMask = 1 << iota +) + +// Init must be called before first use of d. +func (d *Dentry) Init(impl DentryImpl) { + d.impl = impl +} + +// Impl returns the DentryImpl associated with d. +func (d *Dentry) Impl() DentryImpl { + return d.impl +} + +// DentryImpl contains implementation details for a Dentry. Implementations of +// DentryImpl should contain their associated Dentry by value as their first +// field. +type DentryImpl interface { + // IncRef increments the Dentry's reference count. A Dentry with a non-zero + // reference count must remain coherent with the state of the filesystem. + IncRef(fs *Filesystem) + + // TryIncRef increments the Dentry's reference count and returns true. If + // the Dentry's reference count is zero, TryIncRef may do nothing and + // return false. (It is also permitted to succeed if it can restore the + // guarantee that the Dentry is coherent with the state of the filesystem.) + // + // TryIncRef does not require that a reference is held on the Dentry. + TryIncRef(fs *Filesystem) bool + + // DecRef decrements the Dentry's reference count. + DecRef(fs *Filesystem) +} + +// IsDisowned returns true if d is disowned. +func (d *Dentry) IsDisowned() bool { + return atomic.LoadUint32(&d.flags)&dflagsDisownedMask != 0 +} + +// Preconditions: !d.IsDisowned(). +func (d *Dentry) setDisowned() { + atomic.AddUint32(&d.flags, dflagsDisownedMask) +} + +func (d *Dentry) isMounted() bool { + return atomic.LoadUint32(&d.mounts) != 0 +} + +func (d *Dentry) incRef(fs *Filesystem) { + d.impl.IncRef(fs) +} + +func (d *Dentry) tryIncRef(fs *Filesystem) bool { + return d.impl.TryIncRef(fs) +} + +func (d *Dentry) decRef(fs *Filesystem) { + d.impl.DecRef(fs) +} + +// These functions are exported so that filesystem implementations can use +// them. The vfs package, and users of VFS, should not call these functions. +// Unless otherwise specified, these methods require that there are no +// concurrent mutators of d. + +// Name returns d's name in its parent in its owning Filesystem. If d is +// independent, Name returns an empty string. +func (d *Dentry) Name() string { + return d.name +} + +// Parent returns d's parent in its owning Filesystem. It does not take a +// reference on the returned Dentry. If d is independent, Parent returns nil. +func (d *Dentry) Parent() *Dentry { + return d.parent +} + +// ParentOrSelf is equivalent to Parent, but returns d if d is independent. +func (d *Dentry) ParentOrSelf() *Dentry { + if d.parent == nil { + return d + } + return d.parent +} + +// Child returns d's child with the given name in its owning Filesystem. It +// does not take a reference on the returned Dentry. If no such child exists, +// Child returns nil. +func (d *Dentry) Child(name string) *Dentry { + return d.children[name] +} + +// HasChildren returns true if d has any children. +func (d *Dentry) HasChildren() bool { + return len(d.children) != 0 +} + +// InsertChild makes child a child of d with the given name. +// +// InsertChild is a mutator of d and child. +// +// Preconditions: child must be an independent Dentry. d and child must be from +// the same Filesystem. d must not already have a child with the given name. +func (d *Dentry) InsertChild(child *Dentry, name string) { + if checkInvariants { + if _, ok := d.children[name]; ok { + panic(fmt.Sprintf("parent already contains a child named %q", name)) + } + if child.parent != nil || child.name != "" { + panic(fmt.Sprintf("child is not independent: parent = %v, name = %q", child.parent, child.name)) + } + } + if d.children == nil { + d.children = make(map[string]*Dentry) + } + d.children[name] = child + child.parent = d + child.name = name +} + +// PrepareDeleteDentry must be called before attempting to delete the file +// represented by d. If PrepareDeleteDentry succeeds, the caller must call +// AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome. +// +// Preconditions: d is a child Dentry. +func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error { + if checkInvariants { + if d.parent == nil { + panic("d is independent") + } + if d.IsDisowned() { + panic("d is already disowned") + } + } + vfs.mountMu.RLock() + if _, ok := mntns.mountpoints[d]; ok { + vfs.mountMu.RUnlock() + return syserror.EBUSY + } + // Return with vfs.mountMu locked, which will be unlocked by + // AbortDeleteDentry or CommitDeleteDentry. + return nil +} + +// AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion +// fails. +func (vfs *VirtualFilesystem) AbortDeleteDentry() { + vfs.mountMu.RUnlock() +} + +// CommitDeleteDentry must be called after the file represented by d is +// deleted, and causes d to become disowned. +// +// Preconditions: PrepareDeleteDentry was previously called on d. +func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { + delete(d.parent.children, d.name) + d.setDisowned() + // TODO: lazily unmount mounts at d + vfs.mountMu.RUnlock() +} + +// DeleteDentry combines PrepareDeleteDentry and CommitDeleteDentry, as +// appropriate for in-memory filesystems that don't need to ensure that some +// external state change succeeds before committing the deletion. +func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) error { + if err := vfs.PrepareDeleteDentry(mntns, d); err != nil { + return err + } + vfs.CommitDeleteDentry(d) + return nil +} + +// PrepareRenameDentry must be called before attempting to rename the file +// represented by from. If to is not nil, it represents the file that will be +// replaced or exchanged by the rename. If PrepareRenameDentry succeeds, the +// caller must call AbortRenameDentry, CommitRenameReplaceDentry, or +// CommitRenameExchangeDentry depending on the rename's outcome. +// +// Preconditions: from is a child Dentry. If to is not nil, it must be a child +// Dentry from the same Filesystem. +func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { + if checkInvariants { + if from.parent == nil { + panic("from is independent") + } + if from.IsDisowned() { + panic("from is already disowned") + } + if to != nil { + if to.parent == nil { + panic("to is independent") + } + if to.IsDisowned() { + panic("to is already disowned") + } + } + } + vfs.mountMu.RLock() + if _, ok := mntns.mountpoints[from]; ok { + vfs.mountMu.RUnlock() + return syserror.EBUSY + } + if to != nil { + if _, ok := mntns.mountpoints[to]; ok { + vfs.mountMu.RUnlock() + return syserror.EBUSY + } + } + // Return with vfs.mountMu locked, which will be unlocked by + // AbortRenameDentry, CommitRenameReplaceDentry, or + // CommitRenameExchangeDentry. + return nil +} + +// AbortRenameDentry must be called after PrepareRenameDentry if the rename +// fails. +func (vfs *VirtualFilesystem) AbortRenameDentry() { + vfs.mountMu.RUnlock() +} + +// CommitRenameReplaceDentry must be called after the file represented by from +// is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file +// that was replaced by from. +// +// Preconditions: PrepareRenameDentry was previously called on from and to. +// newParent.Child(newName) == to. +func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, newName string, to *Dentry) { + if to != nil { + to.setDisowned() + // TODO: lazily unmount mounts at d + } + if newParent.children == nil { + newParent.children = make(map[string]*Dentry) + } + newParent.children[newName] = from + from.parent = newParent + from.name = newName + vfs.mountMu.RUnlock() +} + +// CommitRenameExchangeDentry must be called after the files represented by +// from and to are exchanged by rename(RENAME_EXCHANGE). +// +// Preconditions: PrepareRenameDentry was previously called on from and to. +func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { + from.parent, to.parent = to.parent, from.parent + from.name, to.name = to.name, from.name + from.parent.children[from.name] = from + to.parent.children[to.name] = to + vfs.mountMu.RUnlock() +} diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go new file mode 100644 index 000000000..86bde7fb3 --- /dev/null +++ b/pkg/sentry/vfs/file_description.go @@ -0,0 +1,213 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// A FileDescription represents an open file description, which is the entity +// referred to by a file descriptor (POSIX.1-2017 3.258 "Open File +// Description"). +// +// FileDescriptions are reference-counted. Unless otherwise specified, all +// FileDescription methods require that a reference is held. +// +// FileDescription is analogous to Linux's struct file. +type FileDescription struct { + // refs is the reference count. refs is accessed using atomic memory + // operations. + refs int64 + + // vd is the filesystem location at which this FileDescription was opened. + // A reference is held on vd. vd is immutable. + vd VirtualDentry + + // impl is the FileDescriptionImpl associated with this Filesystem. impl is + // immutable. This should be the last field in FileDescription. + impl FileDescriptionImpl +} + +// Init must be called before first use of fd. It takes references on mnt and +// d. +func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) { + fd.refs = 1 + fd.vd = VirtualDentry{ + mount: mnt, + dentry: d, + } + fd.vd.IncRef() + fd.impl = impl +} + +// Impl returns the FileDescriptionImpl associated with fd. +func (fd *FileDescription) Impl() FileDescriptionImpl { + return fd.impl +} + +// VirtualDentry returns the location at which fd was opened. It does not take +// a reference on the returned VirtualDentry. +func (fd *FileDescription) VirtualDentry() VirtualDentry { + return fd.vd +} + +// IncRef increments fd's reference count. +func (fd *FileDescription) IncRef() { + atomic.AddInt64(&fd.refs, 1) +} + +// DecRef decrements fd's reference count. +func (fd *FileDescription) DecRef() { + if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { + fd.impl.Release() + fd.vd.DecRef() + } else if refs < 0 { + panic("FileDescription.DecRef() called without holding a reference") + } +} + +// FileDescriptionImpl contains implementation details for an FileDescription. +// Implementations of FileDescriptionImpl should contain their associated +// FileDescription by value as their first field. +// +// For all functions that return linux.Statx, Statx.Uid and Statx.Gid will +// be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and +// auth.KGID respectively). +// +// FileDescriptionImpl is analogous to Linux's struct file_operations. +type FileDescriptionImpl interface { + // Release is called when the associated FileDescription reaches zero + // references. + Release() + + // OnClose is called when a file descriptor representing the + // FileDescription is closed. Note that returning a non-nil error does not + // prevent the file descriptor from being closed. + OnClose() error + + // StatusFlags returns file description status flags, as for + // fcntl(F_GETFL). + StatusFlags(ctx context.Context) (uint32, error) + + // SetStatusFlags sets file description status flags, as for + // fcntl(F_SETFL). + SetStatusFlags(ctx context.Context, flags uint32) error + + // Stat returns metadata for the file represented by the FileDescription. + Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) + + // SetStat updates metadata for the file represented by the + // FileDescription. + SetStat(ctx context.Context, opts SetStatOptions) error + + // StatFS returns metadata for the filesystem containing the file + // represented by the FileDescription. + StatFS(ctx context.Context) (linux.Statfs, error) + + // waiter.Waitable methods may be used to poll for I/O events. + waiter.Waitable + + // PRead reads from the file into dst, starting at the given offset, and + // returns the number of bytes read. PRead is permitted to return partial + // reads with a nil error. + PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) + + // Read is similar to PRead, but does not specify an offset. + // + // For files with an implicit FileDescription offset (e.g. regular files), + // Read begins at the FileDescription offset, and advances the offset by + // the number of bytes read; note that POSIX 2.9.7 "Thread Interactions + // with Regular File Operations" requires that all operations that may + // mutate the FileDescription offset are serialized. + Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) + + // PWrite writes src to the file, starting at the given offset, and returns + // the number of bytes written. PWrite is permitted to return partial + // writes with a nil error. + // + // As in Linux (but not POSIX), if O_APPEND is in effect for the + // FileDescription, PWrite should ignore the offset and append data to the + // end of the file. + PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) + + // Write is similar to PWrite, but does not specify an offset, which is + // implied as for Read. + // + // Write is a FileDescriptionImpl method, instead of a wrapper around + // PWrite that uses a FileDescription offset, to make it possible for + // remote filesystems to implement O_APPEND correctly (i.e. atomically with + // respect to writers outside the scope of VFS). + Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) + + // IterDirents invokes cb on each entry in the directory represented by the + // FileDescription. If IterDirents has been called since the last call to + // Seek, it continues iteration from the end of the last call. + IterDirents(ctx context.Context, cb IterDirentsCallback) error + + // Seek changes the FileDescription offset (assuming one exists) and + // returns its new value. + // + // For directories, if whence == SEEK_SET and offset == 0, the caller is + // rewinddir(), such that Seek "shall also cause the directory stream to + // refer to the current state of the corresponding directory" - + // POSIX.1-2017. + Seek(ctx context.Context, offset int64, whence int32) (int64, error) + + // Sync requests that cached state associated with the file represented by + // the FileDescription is synchronized with persistent storage, and blocks + // until this is complete. + Sync(ctx context.Context) error + + // ConfigureMMap mutates opts to implement mmap(2) for the file. Most + // implementations that support memory mapping can call + // GenericConfigureMMap with the appropriate memmap.Mappable. + ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error + + // Ioctl implements the ioctl(2) syscall. + Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) + + // TODO: extended attributes; file locking +} + +// Dirent holds the information contained in struct linux_dirent64. +type Dirent struct { + // Name is the filename. + Name string + + // Type is the file type, a linux.DT_* constant. + Type uint8 + + // Ino is the inode number. + Ino uint64 + + // Off is this Dirent's offset. + Off int64 +} + +// IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents. +type IterDirentsCallback interface { + // Handle handles the given iterated Dirent. It returns true if iteration + // should continue, and false if FileDescriptionImpl.IterDirents should + // terminate now and restart with the same Dirent the next time it is + // called. + Handle(dirent Dirent) bool +} diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go new file mode 100644 index 000000000..486893e70 --- /dev/null +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -0,0 +1,142 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// FileDescriptionDefaultImpl may be embedded by implementations of +// FileDescriptionImpl to obtain implementations of many FileDescriptionImpl +// methods with default behavior analogous to Linux's. +type FileDescriptionDefaultImpl struct{} + +// OnClose implements FileDescriptionImpl.OnClose analogously to +// file_operations::flush == NULL in Linux. +func (FileDescriptionDefaultImpl) OnClose() error { + return nil +} + +// StatFS implements FileDescriptionImpl.StatFS analogously to +// super_operations::statfs == NULL in Linux. +func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, error) { + return linux.Statfs{}, syserror.ENOSYS +} + +// Readiness implements waiter.Waitable.Readiness analogously to +// file_operations::poll == NULL in Linux. +func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask { + // include/linux/poll.h:vfs_poll() => DEFAULT_POLLMASK + return waiter.EventIn | waiter.EventOut +} + +// EventRegister implements waiter.Waitable.EventRegister analogously to +// file_operations::poll == NULL in Linux. +func (FileDescriptionDefaultImpl) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +} + +// EventUnregister implements waiter.Waitable.EventUnregister analogously to +// file_operations::poll == NULL in Linux. +func (FileDescriptionDefaultImpl) EventUnregister(e *waiter.Entry) { +} + +// PRead implements FileDescriptionImpl.PRead analogously to +// file_operations::read == file_operations::read_iter == NULL in Linux. +func (FileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return 0, syserror.EINVAL +} + +// Read implements FileDescriptionImpl.Read analogously to +// file_operations::read == file_operations::read_iter == NULL in Linux. +func (FileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + return 0, syserror.EINVAL +} + +// PWrite implements FileDescriptionImpl.PWrite analogously to +// file_operations::write == file_operations::write_iter == NULL in Linux. +func (FileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return 0, syserror.EINVAL +} + +// Write implements FileDescriptionImpl.Write analogously to +// file_operations::write == file_operations::write_iter == NULL in Linux. +func (FileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return 0, syserror.EINVAL +} + +// IterDirents implements FileDescriptionImpl.IterDirents analogously to +// file_operations::iterate == file_operations::iterate_shared == NULL in +// Linux. +func (FileDescriptionDefaultImpl) IterDirents(ctx context.Context, cb IterDirentsCallback) error { + return syserror.ENOTDIR +} + +// Seek implements FileDescriptionImpl.Seek analogously to +// file_operations::llseek == NULL in Linux. +func (FileDescriptionDefaultImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, syserror.ESPIPE +} + +// Sync implements FileDescriptionImpl.Sync analogously to +// file_operations::fsync == NULL in Linux. +func (FileDescriptionDefaultImpl) Sync(ctx context.Context) error { + return syserror.EINVAL +} + +// ConfigureMMap implements FileDescriptionImpl.ConfigureMMap analogously to +// file_operations::mmap == NULL in Linux. +func (FileDescriptionDefaultImpl) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error { + return syserror.ENODEV +} + +// Ioctl implements FileDescriptionImpl.Ioctl analogously to +// file_operations::unlocked_ioctl == NULL in Linux. +func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return 0, syserror.ENOTTY +} + +// DirectoryFileDescriptionDefaultImpl may be embedded by implementations of +// FileDescriptionImpl that always represent directories to obtain +// implementations of non-directory I/O methods that return EISDIR, and +// implementations of other methods consistent with FileDescriptionDefaultImpl. +type DirectoryFileDescriptionDefaultImpl struct { + FileDescriptionDefaultImpl +} + +// PRead implements FileDescriptionImpl.PRead. +func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// Read implements FileDescriptionImpl.Read. +func (DirectoryFileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// PWrite implements FileDescriptionImpl.PWrite. +func (DirectoryFileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// Write implements FileDescriptionImpl.Write. +func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return 0, syserror.EISDIR +} diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go new file mode 100644 index 000000000..7a074b718 --- /dev/null +++ b/pkg/sentry/vfs/filesystem.go @@ -0,0 +1,155 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" +) + +// A Filesystem is a tree of nodes represented by Dentries, which forms part of +// a VirtualFilesystem. +// +// Filesystems are reference-counted. Unless otherwise specified, all +// Filesystem methods require that a reference is held. +// +// Filesystem is analogous to Linux's struct super_block. +type Filesystem struct { + // refs is the reference count. refs is accessed using atomic memory + // operations. + refs int64 + + // impl is the FilesystemImpl associated with this Filesystem. impl is + // immutable. This should be the last field in Dentry. + impl FilesystemImpl +} + +// Init must be called before first use of fs. +func (fs *Filesystem) Init(impl FilesystemImpl) { + fs.refs = 1 + fs.impl = impl +} + +// Impl returns the FilesystemImpl associated with fs. +func (fs *Filesystem) Impl() FilesystemImpl { + return fs.impl +} + +func (fs *Filesystem) incRef() { + if atomic.AddInt64(&fs.refs, 1) <= 1 { + panic("Filesystem.incRef() called without holding a reference") + } +} + +func (fs *Filesystem) decRef() { + if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 { + fs.impl.Release() + } else if refs < 0 { + panic("Filesystem.decRef() called without holding a reference") + } +} + +// FilesystemImpl contains implementation details for a Filesystem. +// Implementations of FilesystemImpl should contain their associated Filesystem +// by value as their first field. +// +// All methods that take a ResolvingPath must resolve the path before +// performing any other checks, including rejection of the operation if not +// supported by the FilesystemImpl. This is because the final FilesystemImpl +// (responsible for actually implementing the operation) isn't known until path +// resolution is complete. +// +// For all methods that take or return linux.Statx, Statx.Uid and Statx.Gid +// should be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID +// and auth.KGID respectively). +// +// FilesystemImpl combines elements of Linux's struct super_operations and +// struct inode_operations, for reasons described in the documentation for +// Dentry. +type FilesystemImpl interface { + // Release is called when the associated Filesystem reaches zero + // references. + Release() + + // Sync "causes all pending modifications to filesystem metadata and cached + // file data to be written to the underlying [filesystem]", as by syncfs(2). + Sync(ctx context.Context) error + + // GetDentryAt returns a Dentry representing the file at rp. A reference is + // taken on the returned Dentry. + // + // GetDentryAt does not correspond directly to a Linux syscall; it is used + // in the implementation of: + // + // - Syscalls that need to resolve two paths: rename(), renameat(), + // renameat2(), link(), linkat(). + // + // - Syscalls that need to refer to a filesystem position outside the + // context of a file description: chdir(), fchdir(), chroot(), mount(), + // umount(). + GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) + + // LinkAt creates a hard link at rp representing the same file as vd. It + // does not take ownership of references on vd. + // + // The implementation is responsible for checking that vd.Mount() == + // rp.Mount(), and that vd does not represent a directory. + LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error + + // MkdirAt creates a directory at rp. + MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error + + // MknodAt creates a regular file, device special file, or named pipe at + // rp. + MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error + + // OpenAt returns an FileDescription providing access to the file at rp. A + // reference is taken on the returned FileDescription. + OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) + + // ReadlinkAt returns the target of the symbolic link at rp. + ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) + + // RenameAt renames the Dentry represented by vd to rp. It does not take + // ownership of references on vd. + // + // The implementation is responsible for checking that vd.Mount() == + // rp.Mount(). + RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error + + // RmdirAt removes the directory at rp. + RmdirAt(ctx context.Context, rp *ResolvingPath) error + + // SetStatAt updates metadata for the file at the given path. + SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error + + // StatAt returns metadata for the file at rp. + StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) + + // StatFSAt returns metadata for the filesystem containing the file at rp. + // (This method takes a path because a FilesystemImpl may consist of any + // number of constituent filesystems.) + StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) + + // SymlinkAt creates a symbolic link at rp referring to the given target. + SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error + + // UnlinkAt removes the non-directory file at rp. + UnlinkAt(ctx context.Context, rp *ResolvingPath) error + + // TODO: d_path(); extended attributes; inotify_add_watch(); bind() +} diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go new file mode 100644 index 000000000..f401ad7f3 --- /dev/null +++ b/pkg/sentry/vfs/filesystem_type.go @@ -0,0 +1,70 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// A FilesystemType constructs filesystems. +// +// FilesystemType is analogous to Linux's struct file_system_type. +type FilesystemType interface { + // NewFilesystem returns a Filesystem configured by the given options, + // along with its mount root. A reference is taken on the returned + // Filesystem and Dentry. + NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error) +} + +// NewFilesystemOptions contains options to FilesystemType.NewFilesystem. +type NewFilesystemOptions struct { + // Data is the string passed as the 5th argument to mount(2), which is + // usually a comma-separated list of filesystem-specific mount options. + Data string + + // InternalData holds opaque FilesystemType-specific data. There is + // intentionally no way for applications to specify InternalData; if it is + // not nil, the call to NewFilesystem originates from within the sentry. + InternalData interface{} +} + +// RegisterFilesystemType registers the given FilesystemType in vfs with the +// given name. +func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType) error { + vfs.fsTypesMu.Lock() + defer vfs.fsTypesMu.Unlock() + if existing, ok := vfs.fsTypes[name]; ok { + return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing) + } + vfs.fsTypes[name] = fsType + return nil +} + +// MustRegisterFilesystemType is equivalent to RegisterFilesystemType but +// panics on failure. +func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType) { + if err := vfs.RegisterFilesystemType(name, fsType); err != nil { + panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err)) + } +} + +func (vfs *VirtualFilesystem) getFilesystemType(name string) FilesystemType { + vfs.fsTypesMu.RLock() + defer vfs.fsTypesMu.RUnlock() + return vfs.fsTypes[name] +} diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go new file mode 100644 index 000000000..11702f720 --- /dev/null +++ b/pkg/sentry/vfs/mount.go @@ -0,0 +1,411 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "math" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem +// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem +// (Mount.fs), which applies to path resolution in the context of a particular +// Mount (Mount.key.parent). +// +// Mounts are reference-counted. Unless otherwise specified, all Mount methods +// require that a reference is held. +// +// Mount and Filesystem are distinct types because it's possible for a single +// Filesystem to be mounted at multiple locations and/or in multiple mount +// namespaces. +// +// Mount is analogous to Linux's struct mount. (gVisor does not distinguish +// between struct mount and struct vfsmount.) +type Mount struct { + // The lower 63 bits of refs are a reference count. The MSB of refs is set + // if the Mount has been eagerly unmounted, as by umount(2) without the + // MNT_DETACH flag. refs is accessed using atomic memory operations. + refs int64 + + // The lower 63 bits of writers is the number of calls to + // Mount.CheckBeginWrite() that have not yet been paired with a call to + // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect. + // writers is accessed using atomic memory operations. + writers int64 + + // key is protected by VirtualFilesystem.mountMu and + // VirtualFilesystem.mounts.seq, and may be nil. References are held on + // key.parent and key.point if they are not nil. + // + // Invariant: key.parent != nil iff key.point != nil. key.point belongs to + // key.parent.fs. + key mountKey + + // fs, root, and ns are immutable. References are held on fs and root (but + // not ns). + // + // Invariant: root belongs to fs. + fs *Filesystem + root *Dentry + ns *MountNamespace +} + +// A MountNamespace is a collection of Mounts. +// +// MountNamespaces are reference-counted. Unless otherwise specified, all +// MountNamespace methods require that a reference is held. +// +// MountNamespace is analogous to Linux's struct mnt_namespace. +type MountNamespace struct { + refs int64 // accessed using atomic memory operations + + // root is the MountNamespace's root mount. root is immutable. + root *Mount + + // mountpoints contains all Dentries which are mount points in this + // namespace. mountpoints is protected by VirtualFilesystem.mountMu. + // + // mountpoints is used to determine if a Dentry can be moved or removed + // (which requires that the Dentry is not a mount point in the calling + // namespace). + // + // mountpoints is maintained even if there are no references held on the + // MountNamespace; this is required to ensure that + // VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate + // correctly on unreferenced MountNamespaces. + mountpoints map[*Dentry]struct{} +} + +// NewMountNamespace returns a new mount namespace with a root filesystem +// configured by the given arguments. A reference is taken on the returned +// MountNamespace. +func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *NewFilesystemOptions) (*MountNamespace, error) { + fsType := vfs.getFilesystemType(fsTypeName) + if fsType == nil { + return nil, syserror.ENODEV + } + fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts) + if err != nil { + return nil, err + } + mntns := &MountNamespace{ + refs: 1, + mountpoints: make(map[*Dentry]struct{}), + } + mntns.root = &Mount{ + fs: fs, + root: root, + ns: mntns, + refs: 1, + } + return mntns, nil +} + +// NewMount creates and mounts a new Filesystem. +func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *NewFilesystemOptions) error { + fsType := vfs.getFilesystemType(fsTypeName) + if fsType == nil { + return syserror.ENODEV + } + fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts) + if err != nil { + return err + } + // We can't hold vfs.mountMu while calling FilesystemImpl methods due to + // lock ordering. + vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{}) + if err != nil { + root.decRef(fs) + fs.decRef() + return err + } + vfs.mountMu.Lock() + for { + if vd.dentry.IsDisowned() { + vfs.mountMu.Unlock() + vd.DecRef() + root.decRef(fs) + fs.decRef() + return syserror.ENOENT + } + // vd might have been mounted over between vfs.GetDentryAt() and + // vfs.mountMu.Lock(). + if !vd.dentry.isMounted() { + break + } + nextmnt := vfs.mounts.Lookup(vd.mount, vd.dentry) + if nextmnt == nil { + break + } + nextmnt.incRef() + nextmnt.root.incRef(nextmnt.fs) + vd.DecRef() + vd = VirtualDentry{ + mount: nextmnt, + dentry: nextmnt.root, + } + } + // TODO: Linux requires that either both the mount point and the mount root + // are directories, or neither are, and returns ENOTDIR if this is not the + // case. + mntns := vd.mount.ns + mnt := &Mount{ + fs: fs, + root: root, + ns: mntns, + refs: 1, + } + mnt.storeKey(vd.mount, vd.dentry) + atomic.AddUint32(&vd.dentry.mounts, 1) + mntns.mountpoints[vd.dentry] = struct{}{} + vfsmpmounts, ok := vfs.mountpoints[vd.dentry] + if !ok { + vfsmpmounts = make(map[*Mount]struct{}) + vfs.mountpoints[vd.dentry] = vfsmpmounts + } + vfsmpmounts[mnt] = struct{}{} + vfs.mounts.Insert(mnt) + vfs.mountMu.Unlock() + return nil +} + +// getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes +// a reference on the returned Mount. If (mnt, d) is not a mount point, +// getMountAt returns nil. +// +// getMountAt is analogous to Linux's fs/namei.c:follow_mount(). +// +// Preconditions: References are held on mnt and d. +func (vfs *VirtualFilesystem) getMountAt(mnt *Mount, d *Dentry) *Mount { + // The first mount is special-cased: + // + // - The caller is assumed to have checked d.isMounted() already. (This + // isn't a precondition because it doesn't matter for correctness.) + // + // - We return nil, instead of mnt, if there is no mount at (mnt, d). + // + // - We don't drop the caller's references on mnt and d. +retryFirst: + next := vfs.mounts.Lookup(mnt, d) + if next == nil { + return nil + } + if !next.tryIncMountedRef() { + // Raced with umount. + goto retryFirst + } + mnt = next + d = next.root + // We don't need to take Dentry refs anywhere in this function because + // Mounts hold references on Mount.root, which is immutable. + for d.isMounted() { + next := vfs.mounts.Lookup(mnt, d) + if next == nil { + break + } + if !next.tryIncMountedRef() { + // Raced with umount. + continue + } + mnt.decRef() + mnt = next + d = next.root + } + return mnt +} + +// getMountpointAt returns the mount point for the stack of Mounts including +// mnt. It takes a reference on the returned Mount and Dentry. If no such mount +// point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil). +// +// Preconditions: References are held on mnt and root. vfsroot is not (mnt, +// mnt.root). +func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) (*Mount, *Dentry) { + // The first mount is special-cased: + // + // - The caller must have already checked mnt against vfsroot. + // + // - We return nil, instead of mnt, if there is no mount point for mnt. + // + // - We don't drop the caller's reference on mnt. +retryFirst: + epoch := vfs.mounts.seq.BeginRead() + parent, point := mnt.loadKey() + if !vfs.mounts.seq.ReadOk(epoch) { + goto retryFirst + } + if parent == nil { + return nil, nil + } + if !parent.tryIncMountedRef() { + // Raced with umount. + goto retryFirst + } + if !point.tryIncRef(parent.fs) { + // Since Mount holds a reference on Mount.key.point, this can only + // happen due to a racing change to Mount.key. + parent.decRef() + goto retryFirst + } + mnt = parent + d := point + for { + if mnt == vfsroot.mount && d == vfsroot.dentry { + break + } + if d != mnt.root { + break + } + retryNotFirst: + epoch := vfs.mounts.seq.BeginRead() + parent, point := mnt.loadKey() + if !vfs.mounts.seq.ReadOk(epoch) { + goto retryNotFirst + } + if parent == nil { + break + } + if !parent.tryIncMountedRef() { + // Raced with umount. + goto retryNotFirst + } + if !point.tryIncRef(parent.fs) { + // Since Mount holds a reference on Mount.key.point, this can + // only happen due to a racing change to Mount.key. + parent.decRef() + goto retryNotFirst + } + if !vfs.mounts.seq.ReadOk(epoch) { + point.decRef(parent.fs) + parent.decRef() + goto retryNotFirst + } + d.decRef(mnt.fs) + mnt.decRef() + mnt = parent + d = point + } + return mnt, d +} + +// tryIncMountedRef increments mnt's reference count and returns true. If mnt's +// reference count is already zero, or has been eagerly unmounted, +// tryIncMountedRef does nothing and returns false. +// +// tryIncMountedRef does not require that a reference is held on mnt. +func (mnt *Mount) tryIncMountedRef() bool { + for { + refs := atomic.LoadInt64(&mnt.refs) + if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted + return false + } + if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) { + return true + } + } +} + +func (mnt *Mount) incRef() { + // In general, negative values for mnt.refs are valid because the MSB is + // the eager-unmount bit. + atomic.AddInt64(&mnt.refs, 1) +} + +func (mnt *Mount) decRef() { + refs := atomic.AddInt64(&mnt.refs, -1) + if refs&^math.MinInt64 == 0 { // mask out MSB + parent, point := mnt.loadKey() + if point != nil { + point.decRef(parent.fs) + parent.decRef() + } + mnt.root.decRef(mnt.fs) + mnt.fs.decRef() + } +} + +// CheckBeginWrite increments the counter of in-progress write operations on +// mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns +// EROFS. +// +// If CheckBeginWrite succeeds, EndWrite must be called when the write +// operation is finished. +func (mnt *Mount) CheckBeginWrite() error { + if atomic.AddInt64(&mnt.writers, 1) < 0 { + atomic.AddInt64(&mnt.writers, -1) + return syserror.EROFS + } + return nil +} + +// EndWrite indicates that a write operation signaled by a previous successful +// call to CheckBeginWrite has finished. +func (mnt *Mount) EndWrite() { + atomic.AddInt64(&mnt.writers, -1) +} + +// Preconditions: VirtualFilesystem.mountMu must be locked for writing. +func (mnt *Mount) setReadOnlyLocked(ro bool) error { + if oldRO := atomic.LoadInt64(&mnt.writers) < 0; oldRO == ro { + return nil + } + if ro { + if !atomic.CompareAndSwapInt64(&mnt.writers, 0, math.MinInt64) { + return syserror.EBUSY + } + return nil + } + // Unset MSB without dropping any temporary increments from failed calls to + // mnt.CheckBeginWrite(). + atomic.AddInt64(&mnt.writers, math.MinInt64) + return nil +} + +// Filesystem returns the mounted Filesystem. It does not take a reference on +// the returned Filesystem. +func (mnt *Mount) Filesystem() *Filesystem { + return mnt.fs +} + +// IncRef increments mntns' reference count. +func (mntns *MountNamespace) IncRef() { + if atomic.AddInt64(&mntns.refs, 1) <= 1 { + panic("MountNamespace.IncRef() called without holding a reference") + } +} + +// DecRef decrements mntns' reference count. +func (mntns *MountNamespace) DecRef() { + if refs := atomic.AddInt64(&mntns.refs, 0); refs == 0 { + // TODO: unmount mntns.root + } else if refs < 0 { + panic("MountNamespace.DecRef() called without holding a reference") + } +} + +// Root returns mntns' root. A reference is taken on the returned +// VirtualDentry. +func (mntns *MountNamespace) Root() VirtualDentry { + vd := VirtualDentry{ + mount: mntns.root, + dentry: mntns.root.root, + } + vd.IncRef() + return vd +} diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go new file mode 100644 index 000000000..f394d7483 --- /dev/null +++ b/pkg/sentry/vfs/mount_test.go @@ -0,0 +1,465 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "fmt" + "runtime" + "sync" + "testing" +) + +func TestMountTableLookupEmpty(t *testing.T) { + var mt mountTable + mt.Init() + + parent := &Mount{} + point := &Dentry{} + if m := mt.Lookup(parent, point); m != nil { + t.Errorf("empty mountTable lookup: got %p, wanted nil", m) + } +} + +func TestMountTableInsertLookup(t *testing.T) { + var mt mountTable + mt.Init() + + mount := &Mount{} + mount.storeKey(&Mount{}, &Dentry{}) + mt.Insert(mount) + + if m := mt.Lookup(mount.parent(), mount.point()); m != mount { + t.Errorf("mountTable positive lookup: got %p, wanted %p", m, mount) + } + + otherParent := &Mount{} + if m := mt.Lookup(otherParent, mount.point()); m != nil { + t.Errorf("mountTable lookup with wrong mount parent: got %p, wanted nil", m) + } + otherPoint := &Dentry{} + if m := mt.Lookup(mount.parent(), otherPoint); m != nil { + t.Errorf("mountTable lookup with wrong mount point: got %p, wanted nil", m) + } +} + +// TODO: concurrent lookup/insertion/removal + +// must be powers of 2 +var benchNumMounts = []int{1 << 2, 1 << 5, 1 << 8} + +// For all of the following: +// +// - BenchmarkMountTableFoo tests usage pattern "Foo" for mountTable. +// +// - BenchmarkMountMapFoo tests usage pattern "Foo" for a +// sync.RWMutex-protected map. (Mutator benchmarks do not use a RWMutex, since +// mountTable also requires external synchronization between mutators.) +// +// - BenchmarkMountSyncMapFoo tests usage pattern "Foo" for a sync.Map. +// +// ParallelLookup is by far the most common and performance-sensitive operation +// for this application. NegativeLookup is also important, but less so (only +// relevant with multiple mount namespaces and significant differences in +// mounts between them). Insertion and removal are benchmarked for +// completeness. +const enableComparativeBenchmarks = false + +func newBenchMount() *Mount { + mount := &Mount{} + mount.storeKey(&Mount{}, &Dentry{}) + return mount +} + +func vdkey(mnt *Mount) VirtualDentry { + parent, point := mnt.loadKey() + return VirtualDentry{ + mount: parent, + dentry: point, + } +} + +func BenchmarkMountTableParallelLookup(b *testing.B) { + for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 { + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%dx%d", numG, numMounts) + b.Run(desc, func(b *testing.B) { + var mt mountTable + mt.Init() + keys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + mount := newBenchMount() + mt.Insert(mount) + keys = append(keys, vdkey(mount)) + } + + var ready sync.WaitGroup + begin := make(chan struct{}) + var end sync.WaitGroup + for g := 0; g < numG; g++ { + ready.Add(1) + end.Add(1) + go func() { + defer end.Done() + ready.Done() + <-begin + for i := 0; i < b.N; i++ { + k := keys[i&(numMounts-1)] + m := mt.Lookup(k.mount, k.dentry) + if m == nil { + b.Fatalf("lookup failed") + } + if parent := m.parent(); parent != k.mount { + b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount) + } + if point := m.point(); point != k.dentry { + b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry) + } + } + }() + } + + ready.Wait() + b.ResetTimer() + close(begin) + end.Wait() + }) + } + } +} + +func BenchmarkMountMapParallelLookup(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 { + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%dx%d", numG, numMounts) + b.Run(desc, func(b *testing.B) { + var mu sync.RWMutex + ms := make(map[VirtualDentry]*Mount) + keys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + mount := newBenchMount() + key := vdkey(mount) + ms[key] = mount + keys = append(keys, key) + } + + var ready sync.WaitGroup + begin := make(chan struct{}) + var end sync.WaitGroup + for g := 0; g < numG; g++ { + ready.Add(1) + end.Add(1) + go func() { + defer end.Done() + ready.Done() + <-begin + for i := 0; i < b.N; i++ { + k := keys[i&(numMounts-1)] + mu.RLock() + m := ms[k] + mu.RUnlock() + if m == nil { + b.Fatalf("lookup failed") + } + if parent := m.parent(); parent != k.mount { + b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount) + } + if point := m.point(); point != k.dentry { + b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry) + } + } + }() + } + + ready.Wait() + b.ResetTimer() + close(begin) + end.Wait() + }) + } + } +} + +func BenchmarkMountSyncMapParallelLookup(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 { + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%dx%d", numG, numMounts) + b.Run(desc, func(b *testing.B) { + var ms sync.Map + keys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + mount := newBenchMount() + key := vdkey(mount) + ms.Store(key, mount) + keys = append(keys, key) + } + + var ready sync.WaitGroup + begin := make(chan struct{}) + var end sync.WaitGroup + for g := 0; g < numG; g++ { + ready.Add(1) + end.Add(1) + go func() { + defer end.Done() + ready.Done() + <-begin + for i := 0; i < b.N; i++ { + k := keys[i&(numMounts-1)] + mi, ok := ms.Load(k) + if !ok { + b.Fatalf("lookup failed") + } + m := mi.(*Mount) + if parent := m.parent(); parent != k.mount { + b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount) + } + if point := m.point(); point != k.dentry { + b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry) + } + } + }() + } + + ready.Wait() + b.ResetTimer() + close(begin) + end.Wait() + }) + } + } +} + +func BenchmarkMountTableNegativeLookup(b *testing.B) { + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%d", numMounts) + b.Run(desc, func(b *testing.B) { + var mt mountTable + mt.Init() + for i := 0; i < numMounts; i++ { + mt.Insert(newBenchMount()) + } + negkeys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + negkeys = append(negkeys, VirtualDentry{ + mount: &Mount{}, + dentry: &Dentry{}, + }) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + k := negkeys[i&(numMounts-1)] + m := mt.Lookup(k.mount, k.dentry) + if m != nil { + b.Fatalf("lookup got %p, wanted nil", m) + } + } + }) + } +} + +func BenchmarkMountMapNegativeLookup(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%d", numMounts) + b.Run(desc, func(b *testing.B) { + var mu sync.RWMutex + ms := make(map[VirtualDentry]*Mount) + for i := 0; i < numMounts; i++ { + mount := newBenchMount() + ms[vdkey(mount)] = mount + } + negkeys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + negkeys = append(negkeys, VirtualDentry{ + mount: &Mount{}, + dentry: &Dentry{}, + }) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + k := negkeys[i&(numMounts-1)] + mu.RLock() + m := ms[k] + mu.RUnlock() + if m != nil { + b.Fatalf("lookup got %p, wanted nil", m) + } + } + }) + } +} + +func BenchmarkMountSyncMapNegativeLookup(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%d", numMounts) + b.Run(desc, func(b *testing.B) { + var ms sync.Map + for i := 0; i < numMounts; i++ { + mount := newBenchMount() + ms.Store(vdkey(mount), mount) + } + negkeys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + negkeys = append(negkeys, VirtualDentry{ + mount: &Mount{}, + dentry: &Dentry{}, + }) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + k := negkeys[i&(numMounts-1)] + m, _ := ms.Load(k) + if m != nil { + b.Fatalf("lookup got %p, wanted nil", m) + } + } + }) + } +} + +func BenchmarkMountTableInsert(b *testing.B) { + // Preallocate Mounts so that allocation time isn't included in the + // benchmark. + mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + + var mt mountTable + mt.Init() + b.ResetTimer() + for i := range mounts { + mt.Insert(mounts[i]) + } +} + +func BenchmarkMountMapInsert(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + // Preallocate Mounts so that allocation time isn't included in the + // benchmark. + mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + + ms := make(map[VirtualDentry]*Mount) + b.ResetTimer() + for i := range mounts { + mount := mounts[i] + ms[vdkey(mount)] = mount + } +} + +func BenchmarkMountSyncMapInsert(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + // Preallocate Mounts so that allocation time isn't included in the + // benchmark. + mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + + var ms sync.Map + b.ResetTimer() + for i := range mounts { + mount := mounts[i] + ms.Store(vdkey(mount), mount) + } +} + +func BenchmarkMountTableRemove(b *testing.B) { + mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + var mt mountTable + mt.Init() + for i := range mounts { + mt.Insert(mounts[i]) + } + + b.ResetTimer() + for i := range mounts { + mt.Remove(mounts[i]) + } +} + +func BenchmarkMountMapRemove(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + ms := make(map[VirtualDentry]*Mount) + for i := range mounts { + mount := mounts[i] + ms[vdkey(mount)] = mount + } + + b.ResetTimer() + for i := range mounts { + mount := mounts[i] + delete(ms, vdkey(mount)) + } +} + +func BenchmarkMountSyncMapRemove(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + var ms sync.Map + for i := range mounts { + mount := mounts[i] + ms.Store(vdkey(mount), mount) + } + + b.ResetTimer() + for i := range mounts { + mount := mounts[i] + ms.Delete(vdkey(mount)) + } +} diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go new file mode 100644 index 000000000..b0511aa40 --- /dev/null +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -0,0 +1,356 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build go1.12 +// +build !go1.14 + +// Check go:linkname function signatures when updating Go version. + +package vfs + +import ( + "fmt" + "math/bits" + "reflect" + "sync/atomic" + "unsafe" + + "gvisor.dev/gvisor/third_party/gvsync" +) + +// mountKey represents the location at which a Mount is mounted. It is +// structurally identical to VirtualDentry, but stores its fields as +// unsafe.Pointer since mutators synchronize with VFS path traversal using +// seqcounts. +type mountKey struct { + parent unsafe.Pointer // *Mount + point unsafe.Pointer // *Dentry +} + +// Invariant: mnt.key's fields are nil. parent and point are non-nil. +func (mnt *Mount) storeKey(parent *Mount, point *Dentry) { + atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(parent)) + atomic.StorePointer(&mnt.key.point, unsafe.Pointer(point)) +} + +func (mnt *Mount) loadKey() (*Mount, *Dentry) { + return (*Mount)(atomic.LoadPointer(&mnt.key.parent)), (*Dentry)(atomic.LoadPointer(&mnt.key.point)) +} + +func (mnt *Mount) parent() *Mount { + return (*Mount)(atomic.LoadPointer(&mnt.key.parent)) +} + +func (mnt *Mount) point() *Dentry { + return (*Dentry)(atomic.LoadPointer(&mnt.key.point)) +} + +// mountTable maps (mount parent, mount point) pairs to mounts. It supports +// efficient concurrent lookup, even in the presence of concurrent mutators +// (provided mutation is sufficiently uncommon). +// +// mountTable.Init() must be called on new mountTables before use. +type mountTable struct { + // mountTable is implemented as a seqcount-protected hash table that + // resolves collisions with linear probing, featuring Robin Hood insertion + // and backward shift deletion. These minimize probe length variance, + // significantly improving the performance of linear probing at high load + // factors. (mountTable doesn't use bucketing, which is the other major + // technique commonly used in high-performance hash tables; the efficiency + // of bucketing is largely due to SIMD lookup, and Go lacks both SIMD + // intrinsics and inline assembly, limiting the performance of this + // approach.) + + seq gvsync.SeqCount + seed uint32 // for hashing keys + + // size holds both length (number of elements) and capacity (number of + // slots): capacity is stored as its base-2 log (referred to as order) in + // the least significant bits of size, and length is stored in the + // remaining bits. Go defines bit shifts >= width of shifted unsigned + // operand as shifting to 0, which differs from x86's SHL, so the Go + // compiler inserts a bounds check for each bit shift unless we mask order + // anyway (cf. runtime.bucketShift()), and length isn't used by lookup; + // thus this bit packing gets us more bits for the length (vs. storing + // length and cap in separate uint32s) for ~free. + size uint64 + + slots unsafe.Pointer // []mountSlot; never nil after Init +} + +type mountSlot struct { + // We don't store keys in slots; instead, we just check Mount.parent and + // Mount.point directly. Any practical use of lookup will need to touch + // Mounts anyway, and comparing hashes means that false positives are + // extremely rare, so this isn't an extra cache line touch overall. + value unsafe.Pointer // *Mount + hash uintptr +} + +const ( + mtSizeOrderBits = 6 // log2 of pointer size in bits + mtSizeOrderMask = (1 << mtSizeOrderBits) - 1 + mtSizeOrderOne = 1 + mtSizeLenLSB = mtSizeOrderBits + mtSizeLenOne = 1 << mtSizeLenLSB + mtSizeLenNegOne = ^uint64(mtSizeOrderMask) // uint64(-1) << mtSizeLenLSB + + mountSlotBytes = unsafe.Sizeof(mountSlot{}) + mountKeyBytes = unsafe.Sizeof(mountKey{}) + + // Tuning parameters. + // + // Essentially every mountTable will contain at least /proc, /sys, and + // /dev/shm, so there is ~no reason for mtInitCap to be < 4. + mtInitOrder = 2 + mtInitCap = 1 << mtInitOrder + mtMaxLoadNum = 13 + mtMaxLoadDen = 16 +) + +func init() { + // We can't just define mtSizeOrderBits as follows because Go doesn't have + // constexpr. + if ptrBits := uint(unsafe.Sizeof(uintptr(0)) * 8); mtSizeOrderBits != bits.TrailingZeros(ptrBits) { + panic(fmt.Sprintf("mtSizeOrderBits (%d) must be %d = log2 of pointer size in bits (%d)", mtSizeOrderBits, bits.TrailingZeros(ptrBits), ptrBits)) + } + if bits.OnesCount(uint(mountSlotBytes)) != 1 { + panic(fmt.Sprintf("sizeof(mountSlotBytes) (%d) must be a power of 2 to use bit masking for wraparound", mountSlotBytes)) + } + if mtInitCap <= 1 { + panic(fmt.Sprintf("mtInitCap (%d) must be at least 2 since mountTable methods assume that there will always be at least one empty slot", mtInitCap)) + } + if mtMaxLoadNum >= mtMaxLoadDen { + panic(fmt.Sprintf("invalid mountTable maximum load factor (%d/%d)", mtMaxLoadNum, mtMaxLoadDen)) + } +} + +// Init must be called exactly once on each mountTable before use. +func (mt *mountTable) Init() { + mt.seed = rand32() + mt.size = mtInitOrder + mt.slots = newMountTableSlots(mtInitCap) +} + +func newMountTableSlots(cap uintptr) unsafe.Pointer { + slice := make([]mountSlot, cap, cap) + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) + return unsafe.Pointer(hdr.Data) +} + +// Lookup returns the Mount with the given parent, mounted at the given point. +// If no such Mount exists, Lookup returns nil. +// +// Lookup may be called even if there are concurrent mutators of mt. +func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount { + key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)} + hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes) + +loop: + for { + epoch := mt.seq.BeginRead() + size := atomic.LoadUint64(&mt.size) + slots := atomic.LoadPointer(&mt.slots) + if !mt.seq.ReadOk(epoch) { + continue + } + tcap := uintptr(1) << (size & mtSizeOrderMask) + mask := tcap - 1 + off := (hash & mask) * mountSlotBytes + offmask := mask * mountSlotBytes + for { + // This avoids bounds checking. + slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) + slotValue := atomic.LoadPointer(&slot.value) + slotHash := atomic.LoadUintptr(&slot.hash) + if !mt.seq.ReadOk(epoch) { + // The element we're looking for might have been moved into a + // slot we've previously checked, so restart entirely. + continue loop + } + if slotValue == nil { + return nil + } + if slotHash == hash { + mount := (*Mount)(slotValue) + var mountKey mountKey + mountKey.parent = atomic.LoadPointer(&mount.key.parent) + mountKey.point = atomic.LoadPointer(&mount.key.point) + if !mt.seq.ReadOk(epoch) { + continue loop + } + if key == mountKey { + return mount + } + } + off = (off + mountSlotBytes) & offmask + } + } +} + +// Insert inserts the given mount into mt. +// +// Preconditions: There are no concurrent mutators of mt. mt must not already +// contain a Mount with the same mount point and parent. +func (mt *mountTable) Insert(mount *Mount) { + hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) + + // We're under the maximum load factor if: + // + // (len+1) / cap <= mtMaxLoadNum / mtMaxLoadDen + // (len+1) * mtMaxLoadDen <= mtMaxLoadNum * cap + tlen := mt.size >> mtSizeLenLSB + order := mt.size & mtSizeOrderMask + tcap := uintptr(1) << order + if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) { + // Atomically insert the new element into the table. + mt.seq.BeginWrite() + atomic.AddUint64(&mt.size, mtSizeLenOne) + mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash) + mt.seq.EndWrite() + return + } + + // Otherwise, we have to expand. Double the number of slots in the new + // table. + newOrder := order + 1 + if newOrder > mtSizeOrderMask { + panic("mount table size overflow") + } + newCap := uintptr(1) << newOrder + newSlots := newMountTableSlots(newCap) + // Copy existing elements to the new table. + oldCur := mt.slots + // Go does not permit pointers to the end of allocated objects, so we + // must use a pointer to the last element of the old table. The + // following expression is equivalent to + // `slots+(cap-1)*mountSlotBytes` but has a critical path length of 2 + // arithmetic instructions instead of 3. + oldLast := unsafe.Pointer((uintptr(mt.slots) - mountSlotBytes) + (tcap * mountSlotBytes)) + for { + oldSlot := (*mountSlot)(oldCur) + if oldSlot.value != nil { + // Don't need to lock mt.seq yet since newSlots isn't visible + // to readers. + mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash) + } + if oldCur == oldLast { + break + } + oldCur = unsafe.Pointer(uintptr(oldCur) + mountSlotBytes) + } + // Insert the new element into the new table. + mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash) + // Atomically switch to the new table. + mt.seq.BeginWrite() + atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne) + atomic.StorePointer(&mt.slots, newSlots) + mt.seq.EndWrite() +} + +// Preconditions: There are no concurrent mutators of the table (slots, cap). +// If the table is visible to readers, then mt.seq must be in a writer critical +// section. cap must be a power of 2. +func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) { + mask := cap - 1 + off := (hash & mask) * mountSlotBytes + offmask := mask * mountSlotBytes + disp := uintptr(0) + for { + slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) + slotValue := slot.value + if slotValue == nil { + atomic.StorePointer(&slot.value, value) + atomic.StoreUintptr(&slot.hash, hash) + return + } + // If we've been displaced farther from our first-probed slot than the + // element stored in this one, swap elements and switch to inserting + // the replaced one. (This is Robin Hood insertion.) + slotHash := slot.hash + slotDisp := ((off / mountSlotBytes) - slotHash) & mask + if disp > slotDisp { + atomic.StorePointer(&slot.value, value) + atomic.StoreUintptr(&slot.hash, hash) + value = slotValue + hash = slotHash + disp = slotDisp + } + off = (off + mountSlotBytes) & offmask + disp++ + } +} + +// Remove removes the given mount from mt. +// +// Preconditions: There are no concurrent mutators of mt. mt must contain +// mount. +func (mt *mountTable) Remove(mount *Mount) { + hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) + tcap := uintptr(1) << (mt.size & mtSizeOrderMask) + mask := tcap - 1 + slots := mt.slots + off := (hash & mask) * mountSlotBytes + offmask := mask * mountSlotBytes + for { + slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) + slotValue := slot.value + if slotValue == unsafe.Pointer(mount) { + // Found the element to remove. Move all subsequent elements + // backward until we either find an empty slot, or an element that + // is already in its first-probed slot. (This is backward shift + // deletion.) + mt.seq.BeginWrite() + for { + nextOff := (off + mountSlotBytes) & offmask + nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff)) + nextSlotValue := nextSlot.value + if nextSlotValue == nil { + break + } + nextSlotHash := nextSlot.hash + if (nextOff / mountSlotBytes) == (nextSlotHash & mask) { + break + } + atomic.StorePointer(&slot.value, nextSlotValue) + atomic.StoreUintptr(&slot.hash, nextSlotHash) + off = nextOff + slot = nextSlot + } + atomic.StorePointer(&slot.value, nil) + atomic.AddUint64(&mt.size, mtSizeLenNegOne) + mt.seq.EndWrite() + return + } + if checkInvariants && slotValue == nil { + panic(fmt.Sprintf("mountTable.Remove() called on missing Mount %v", mount)) + } + off = (off + mountSlotBytes) & offmask + } +} + +//go:linkname memhash runtime.memhash +func memhash(p unsafe.Pointer, seed, s uintptr) uintptr + +//go:linkname rand32 runtime.fastrand +func rand32() uint32 + +// This is copy/pasted from runtime.noescape(), and is needed because arguments +// apparently escape from all functions defined by linkname. +// +//go:nosplit +func noescape(p unsafe.Pointer) unsafe.Pointer { + x := uintptr(p) + return unsafe.Pointer(x ^ 0) +} diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go new file mode 100644 index 000000000..187e5410c --- /dev/null +++ b/pkg/sentry/vfs/options.go @@ -0,0 +1,123 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" +) + +// GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and +// FilesystemImpl.GetDentryAt(). +type GetDentryOptions struct { + // If CheckSearchable is true, FilesystemImpl.GetDentryAt() must check that + // the returned Dentry is a directory for which creds has search + // permission. + CheckSearchable bool +} + +// MkdirOptions contains options to VirtualFilesystem.MkdirAt() and +// FilesystemImpl.MkdirAt(). +type MkdirOptions struct { + // Mode is the file mode bits for the created directory. + Mode uint16 +} + +// MknodOptions contains options to VirtualFilesystem.MknodAt() and +// FilesystemImpl.MknodAt(). +type MknodOptions struct { + // Mode is the file type and mode bits for the created file. + Mode uint16 + + // If Mode specifies a character or block device special file, DevMajor and + // DevMinor are the major and minor device numbers for the created device. + DevMajor uint32 + DevMinor uint32 +} + +// OpenOptions contains options to VirtualFilesystem.OpenAt() and +// FilesystemImpl.OpenAt(). +type OpenOptions struct { + // Flags contains access mode and flags as specified for open(2). + // + // FilesystemImpls is reponsible for implementing the following flags: + // O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC, + // O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and + // O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and + // O_NOFOLLOW. VFS users are responsible for handling O_CLOEXEC, since file + // descriptors are mostly outside the scope of VFS. + Flags uint32 + + // If FilesystemImpl.OpenAt() creates a file, Mode is the file mode for the + // created file. + Mode uint16 +} + +// ReadOptions contains options to FileDescription.PRead(), +// FileDescriptionImpl.PRead(), FileDescription.Read(), and +// FileDescriptionImpl.Read(). +type ReadOptions struct { + // Flags contains flags as specified for preadv2(2). + Flags uint32 +} + +// RenameOptions contains options to VirtualFilesystem.RenameAt() and +// FilesystemImpl.RenameAt(). +type RenameOptions struct { + // Flags contains flags as specified for renameat2(2). + Flags uint32 +} + +// SetStatOptions contains options to VirtualFilesystem.SetStatAt(), +// FilesystemImpl.SetStatAt(), FileDescription.SetStat(), and +// FileDescriptionImpl.SetStat(). +type SetStatOptions struct { + // Stat is the metadata that should be set. Only fields indicated by + // Stat.Mask should be set. + // + // If Stat specifies that a timestamp should be set, + // FilesystemImpl.SetStatAt() and FileDescriptionImpl.SetStat() must + // special-case StatxTimestamp.Nsec == UTIME_NOW as described by + // utimensat(2); however, they do not need to check for StatxTimestamp.Nsec + // == UTIME_OMIT (VFS users must unset the corresponding bit in Stat.Mask + // instead). + Stat linux.Statx +} + +// StatOptions contains options to VirtualFilesystem.StatAt(), +// FilesystemImpl.StatAt(), FileDescription.Stat(), and +// FileDescriptionImpl.Stat(). +type StatOptions struct { + // Mask is the set of fields in the returned Statx that the FilesystemImpl + // or FileDescriptionImpl should provide. Bits are as in linux.Statx.Mask. + // + // The FilesystemImpl or FileDescriptionImpl may return fields not + // requested in Mask, and may fail to return fields requested in Mask that + // are not supported by the underlying filesystem implementation, without + // returning an error. + Mask uint32 + + // Sync specifies the synchronization required, and is one of + // linux.AT_STATX_SYNC_AS_STAT (which is 0, and therefore the default), + // linux.AT_STATX_SYNC_FORCE_SYNC, or linux.AT_STATX_SYNC_DONT_SYNC. + Sync uint32 +} + +// WriteOptions contains options to FileDescription.PWrite(), +// FileDescriptionImpl.PWrite(), FileDescription.Write(), and +// FileDescriptionImpl.Write(). +type WriteOptions struct { + // Flags contains flags as specified for pwritev2(2). + Flags uint32 +} diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go new file mode 100644 index 000000000..f8e74355c --- /dev/null +++ b/pkg/sentry/vfs/permissions.go @@ -0,0 +1,121 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +// AccessTypes is a bitmask of Unix file permissions. +type AccessTypes uint16 + +// Bits in AccessTypes. +const ( + MayRead AccessTypes = 4 + MayWrite = 2 + MayExec = 1 +) + +// GenericCheckPermissions checks that creds has the given access rights on a +// file with the given permissions, UID, and GID, subject to the rules of +// fs/namei.c:generic_permission(). isDir is true if the file is a directory. +func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir bool, mode uint16, kuid auth.KUID, kgid auth.KGID) error { + // Check permission bits. + perms := mode + if creds.EffectiveKUID == kuid { + perms >>= 6 + } else if creds.InGroup(kgid) { + perms >>= 3 + } + if uint16(ats)&perms == uint16(ats) { + return nil + } + + // Caller capabilities require that the file's KUID and KGID are mapped in + // the caller's user namespace; compare + // kernel/capability.c:privileged_wrt_inode_uidgid(). + if !kuid.In(creds.UserNamespace).Ok() || !kgid.In(creds.UserNamespace).Ok() { + return syserror.EACCES + } + // CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary + // directories, and read arbitrary non-directory files. + if (isDir && (ats&MayWrite == 0)) || ats == MayRead { + if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) { + return nil + } + } + // CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write + // access to non-directory files, and execute access to non-directory files + // for which at least one execute bit is set. + if isDir || (ats&MayExec == 0) || (mode&0111 != 0) { + if creds.HasCapability(linux.CAP_DAC_OVERRIDE) { + return nil + } + } + return syserror.EACCES +} + +// AccessTypesForOpenFlags returns the access types required to open a file +// with the given OpenOptions.Flags. Note that this is NOT the same thing as +// the set of accesses permitted for the opened file: +// +// - O_TRUNC causes MayWrite to be set in the returned AccessTypes (since it +// mutates the file), but does not permit the opened to write to the file +// thereafter. +// +// - "Linux reserves the special, nonstandard access mode 3 (binary 11) in +// flags to mean: check for read and write permission on the file and return a +// file descriptor that can't be used for reading or writing." - open(2). Thus +// AccessTypesForOpenFlags returns MayRead|MayWrite in this case, but +// filesystems are responsible for ensuring that access is denied. +// +// Use May{Read,Write}FileWithOpenFlags() for these checks instead. +func AccessTypesForOpenFlags(flags uint32) AccessTypes { + switch flags & linux.O_ACCMODE { + case linux.O_RDONLY: + if flags&linux.O_TRUNC != 0 { + return MayRead | MayWrite + } + return MayRead + case linux.O_WRONLY: + return MayWrite + default: + return MayRead | MayWrite + } +} + +// MayReadFileWithOpenFlags returns true if a file with the given open flags +// should be readable. +func MayReadFileWithOpenFlags(flags uint32) bool { + switch flags & linux.O_ACCMODE { + case linux.O_RDONLY, linux.O_RDWR: + return true + default: + return false + } +} + +// MayWriteFileWithOpenFlags returns true if a file with the given open flags +// should be writable. +func MayWriteFileWithOpenFlags(flags uint32) bool { + switch flags & linux.O_ACCMODE { + case linux.O_WRONLY, linux.O_RDWR: + return true + default: + return false + } +} diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go new file mode 100644 index 000000000..8d05c8583 --- /dev/null +++ b/pkg/sentry/vfs/resolving_path.go @@ -0,0 +1,453 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "fmt" + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +// ResolvingPath represents the state of an in-progress path resolution, shared +// between VFS and FilesystemImpl methods that take a path. +// +// From the perspective of FilesystemImpl methods, a ResolvingPath represents a +// starting Dentry on the associated Filesystem (on which a reference is +// already held) and a stream of path components relative to that Dentry. +// +// ResolvingPath is loosely analogous to Linux's struct nameidata. +type ResolvingPath struct { + vfs *VirtualFilesystem + root VirtualDentry // refs borrowed from PathOperation + mount *Mount + start *Dentry + pit fspath.Iterator + + flags uint16 + mustBeDir bool // final file must be a directory? + mustBeDirOrig bool + symlinks uint8 // number of symlinks traversed + symlinksOrig uint8 + curPart uint8 // index into parts + numOrigParts uint8 + + creds *auth.Credentials + + // Data associated with resolve*Errors, stored in ResolvingPath so that + // those errors don't need to allocate. + nextMount *Mount // ref held if not nil + nextStart *Dentry // ref held if not nil + absSymlinkTarget fspath.Path + + // ResolvingPath must track up to two relative paths: the "current" + // relative path, which is updated whenever a relative symlink is + // encountered, and the "original" relative path, which is updated from the + // current relative path by handleError() when resolution must change + // filesystems (due to reaching a mount boundary or absolute symlink) and + // overwrites the current relative path when Restart() is called. + parts [1 + linux.MaxSymlinkTraversals]fspath.Iterator + origParts [1 + linux.MaxSymlinkTraversals]fspath.Iterator +} + +const ( + rpflagsHaveMountRef = 1 << iota // do we hold a reference on mount? + rpflagsHaveStartRef // do we hold a reference on start? + rpflagsFollowFinalSymlink // same as PathOperation.FollowFinalSymlink +) + +func init() { + if maxParts := len(ResolvingPath{}.parts); maxParts > 255 { + panic(fmt.Sprintf("uint8 is insufficient to accommodate len(ResolvingPath.parts) (%d)", maxParts)) + } +} + +// Error types that communicate state from the FilesystemImpl-caller, +// VFS-callee side of path resolution (i.e. errors returned by +// ResolvingPath.Resolve*()) to the VFS-caller, FilesystemImpl-callee side +// (i.e. VFS methods => ResolvingPath.handleError()). These are empty structs +// rather than error values because Go doesn't support non-primitive constants, +// so error "constants" are really mutable vars, necessitating somewhat +// expensive interface object comparisons. + +type resolveMountRootError struct{} + +// Error implements error.Error. +func (resolveMountRootError) Error() string { + return "resolving mount root" +} + +type resolveMountPointError struct{} + +// Error implements error.Error. +func (resolveMountPointError) Error() string { + return "resolving mount point" +} + +type resolveAbsSymlinkError struct{} + +// Error implements error.Error. +func (resolveAbsSymlinkError) Error() string { + return "resolving absolute symlink" +} + +var resolvingPathPool = sync.Pool{ + New: func() interface{} { + return &ResolvingPath{} + }, +} + +func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) (*ResolvingPath, error) { + path, err := fspath.Parse(pop.Pathname) + if err != nil { + return nil, err + } + rp := resolvingPathPool.Get().(*ResolvingPath) + rp.vfs = vfs + rp.root = pop.Root + rp.mount = pop.Start.mount + rp.start = pop.Start.dentry + rp.pit = path.Begin + rp.flags = 0 + if pop.FollowFinalSymlink { + rp.flags |= rpflagsFollowFinalSymlink + } + rp.mustBeDir = path.Dir + rp.mustBeDirOrig = path.Dir + rp.symlinks = 0 + rp.curPart = 0 + rp.numOrigParts = 1 + rp.creds = creds + rp.parts[0] = path.Begin + rp.origParts[0] = path.Begin + return rp, nil +} + +func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) { + rp.root = VirtualDentry{} + rp.decRefStartAndMount() + rp.mount = nil + rp.start = nil + rp.releaseErrorState() + resolvingPathPool.Put(rp) +} + +func (rp *ResolvingPath) decRefStartAndMount() { + if rp.flags&rpflagsHaveStartRef != 0 { + rp.start.decRef(rp.mount.fs) + } + if rp.flags&rpflagsHaveMountRef != 0 { + rp.mount.decRef() + } +} + +func (rp *ResolvingPath) releaseErrorState() { + if rp.nextStart != nil { + rp.nextStart.decRef(rp.nextMount.fs) + rp.nextStart = nil + } + if rp.nextMount != nil { + rp.nextMount.decRef() + rp.nextMount = nil + } +} + +// VirtualFilesystem returns the containing VirtualFilesystem. +func (rp *ResolvingPath) VirtualFilesystem() *VirtualFilesystem { + return rp.vfs +} + +// Credentials returns the credentials of rp's provider. +func (rp *ResolvingPath) Credentials() *auth.Credentials { + return rp.creds +} + +// Mount returns the Mount on which path resolution is currently occurring. It +// does not take a reference on the returned Mount. +func (rp *ResolvingPath) Mount() *Mount { + return rp.mount +} + +// Start returns the starting Dentry represented by rp. It does not take a +// reference on the returned Dentry. +func (rp *ResolvingPath) Start() *Dentry { + return rp.start +} + +// Done returns true if there are no remaining path components in the stream +// represented by rp. +func (rp *ResolvingPath) Done() bool { + // We don't need to check for rp.curPart == 0 because rp.Advance() won't + // set rp.pit to a terminal iterator otherwise. + return !rp.pit.Ok() +} + +// Final returns true if there is exactly one remaining path component in the +// stream represented by rp. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) Final() bool { + return rp.curPart == 0 && !rp.pit.NextOk() +} + +// Component returns the current path component in the stream represented by +// rp. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) Component() string { + if checkInvariants { + if !rp.pit.Ok() { + panic("ResolvingPath.Component() called at end of relative path") + } + } + return rp.pit.String() +} + +// Advance advances the stream of path components represented by rp. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) Advance() { + if checkInvariants { + if !rp.pit.Ok() { + panic("ResolvingPath.Advance() called at end of relative path") + } + } + next := rp.pit.Next() + if next.Ok() || rp.curPart == 0 { // have next component, or at end of path + rp.pit = next + } else { // at end of path segment, continue with next one + rp.curPart-- + rp.pit = rp.parts[rp.curPart-1] + } +} + +// Restart resets the stream of path components represented by rp to its state +// on entry to the current FilesystemImpl method. +func (rp *ResolvingPath) Restart() { + rp.pit = rp.origParts[rp.numOrigParts-1] + rp.mustBeDir = rp.mustBeDirOrig + rp.symlinks = rp.symlinksOrig + rp.curPart = rp.numOrigParts - 1 + copy(rp.parts[:], rp.origParts[:rp.numOrigParts]) + rp.releaseErrorState() +} + +func (rp *ResolvingPath) relpathCommit() { + rp.mustBeDirOrig = rp.mustBeDir + rp.symlinksOrig = rp.symlinks + rp.numOrigParts = rp.curPart + 1 + copy(rp.origParts[:rp.curPart], rp.parts[:]) + rp.origParts[rp.curPart] = rp.pit +} + +// ResolveParent returns the VFS parent of d. It does not take a reference on +// the returned Dentry. +// +// Preconditions: There are no concurrent mutators of d. +// +// Postconditions: If the returned error is nil, then the returned Dentry is +// not nil. +func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) { + var parent *Dentry + if d == rp.root.dentry && rp.mount == rp.root.mount { + // At contextual VFS root. + parent = d + } else if d == rp.mount.root { + // At mount root ... + mnt, mntpt := rp.vfs.getMountpointAt(rp.mount, rp.root) + if mnt != nil { + // ... of non-root mount. + rp.nextMount = mnt + rp.nextStart = mntpt + return nil, resolveMountRootError{} + } + // ... of root mount. + parent = d + } else if d.parent == nil { + // At filesystem root. + parent = d + } else { + parent = d.parent + } + if parent.isMounted() { + if mnt := rp.vfs.getMountAt(rp.mount, parent); mnt != nil { + rp.nextMount = mnt + return nil, resolveMountPointError{} + } + } + return parent, nil +} + +// ResolveChild returns the VFS child of d with the given name. It does not +// take a reference on the returned Dentry. If no such child exists, +// ResolveChild returns (nil, nil). +// +// Preconditions: There are no concurrent mutators of d. +func (rp *ResolvingPath) ResolveChild(d *Dentry, name string) (*Dentry, error) { + child := d.children[name] + if child == nil { + return nil, nil + } + if child.isMounted() { + if mnt := rp.vfs.getMountAt(rp.mount, child); mnt != nil { + rp.nextMount = mnt + return nil, resolveMountPointError{} + } + } + return child, nil +} + +// ResolveComponent returns the Dentry reached by starting at d and resolving +// the current path component in the stream represented by rp. It does not +// advance the stream. It does not take a reference on the returned Dentry. If +// no such Dentry exists, ResolveComponent returns (nil, nil). +// +// Preconditions: !rp.Done(). There are no concurrent mutators of d. +func (rp *ResolvingPath) ResolveComponent(d *Dentry) (*Dentry, error) { + switch pc := rp.Component(); pc { + case ".": + return d, nil + case "..": + return rp.ResolveParent(d) + default: + return rp.ResolveChild(d, pc) + } +} + +// ShouldFollowSymlink returns true if, supposing that the current path +// component in pcs represents a symbolic link, the symbolic link should be +// followed. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) ShouldFollowSymlink() bool { + // Non-final symlinks are always followed. + return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final() +} + +// HandleSymlink is called when the current path component is a symbolic link +// to the given target. If the calling Filesystem method should continue path +// traversal, HandleSymlink updates the path component stream to reflect the +// symlink target and returns nil. Otherwise it returns a non-nil error. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) HandleSymlink(target string) error { + if rp.symlinks >= linux.MaxSymlinkTraversals { + return syserror.ELOOP + } + targetPath, err := fspath.Parse(target) + if err != nil { + return err + } + rp.symlinks++ + if targetPath.Absolute { + rp.absSymlinkTarget = targetPath + return resolveAbsSymlinkError{} + } + if !targetPath.Begin.Ok() { + panic(fmt.Sprintf("symbolic link has non-empty target %q that is both relative and has no path components?", target)) + } + // Consume the path component that represented the symlink. + rp.Advance() + // Prepend the symlink target to the relative path. + rp.relpathPrepend(targetPath) + return nil +} + +func (rp *ResolvingPath) relpathPrepend(path fspath.Path) { + if rp.pit.Ok() { + rp.parts[rp.curPart] = rp.pit + rp.pit = path.Begin + rp.curPart++ + } else { + // The symlink was the final path component, so now the symlink target + // is the whole path. + rp.pit = path.Begin + // Symlink targets can set rp.mustBeDir (if they end in a trailing /), + // but can't unset it. + if path.Dir { + rp.mustBeDir = true + } + } +} + +func (rp *ResolvingPath) handleError(err error) bool { + switch err.(type) { + case resolveMountRootError: + // Switch to the new Mount. We hold references on the Mount and Dentry + // (from VFS.getMountpointAt()). + rp.decRefStartAndMount() + rp.mount = rp.nextMount + rp.start = rp.nextStart + rp.flags |= rpflagsHaveMountRef | rpflagsHaveStartRef + rp.nextMount = nil + rp.nextStart = nil + // Commit the previous FileystemImpl's progress through the relative + // path. (Don't consume the path component that caused us to traverse + // through the mount root - i.e. the ".." - because we still need to + // resolve the mount point's parent in the new FilesystemImpl.) + rp.relpathCommit() + // Restart path resolution on the new Mount. Don't bother calling + // rp.releaseErrorState() since we already set nextMount and nextStart + // to nil above. + return true + + case resolveMountPointError: + // Switch to the new Mount. We hold a reference on the Mount (from + // VFS.getMountAt()), but borrow the reference on the mount root from + // the Mount. + rp.decRefStartAndMount() + rp.mount = rp.nextMount + rp.start = rp.nextMount.root + rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef + rp.nextMount = nil + // Consume the path component that represented the mount point. + rp.Advance() + // Commit the previous FilesystemImpl's progress through the relative + // path. + rp.relpathCommit() + // Restart path resolution on the new Mount. + rp.releaseErrorState() + return true + + case resolveAbsSymlinkError: + // Switch to the new Mount. References are borrowed from rp.root. + rp.decRefStartAndMount() + rp.mount = rp.root.mount + rp.start = rp.root.dentry + rp.flags &^= rpflagsHaveMountRef | rpflagsHaveStartRef + // Consume the path component that represented the symlink. + rp.Advance() + // Prepend the symlink target to the relative path. + rp.relpathPrepend(rp.absSymlinkTarget) + // Commit the previous FilesystemImpl's progress through the relative + // path, including the symlink target we just prepended. + rp.relpathCommit() + // Restart path resolution on the new Mount. + rp.releaseErrorState() + return true + + default: + // Not an error we can handle. + return false + } +} + +// MustBeDir returns true if the file traversed by rp must be a directory. +func (rp *ResolvingPath) MustBeDir() bool { + return rp.mustBeDir +} diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go new file mode 100644 index 000000000..23f2b9e08 --- /dev/null +++ b/pkg/sentry/vfs/syscalls.go @@ -0,0 +1,217 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +// PathOperation specifies the path operated on by a VFS method. +// +// PathOperation is passed to VFS methods by pointer to reduce memory copying: +// it's somewhat large and should never escape. (Options structs are passed by +// pointer to VFS and FileDescription methods for the same reason.) +type PathOperation struct { + // Root is the VFS root. References on Root are borrowed from the provider + // of the PathOperation. + // + // Invariants: Root.Ok(). + Root VirtualDentry + + // Start is the starting point for the path traversal. References on Start + // are borrowed from the provider of the PathOperation (i.e. the caller of + // the VFS method to which the PathOperation was passed). + // + // Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root. + Start VirtualDentry + + // Path is the pathname traversed by this operation. + Pathname string + + // If FollowFinalSymlink is true, and the Dentry traversed by the final + // path component represents a symbolic link, the symbolic link should be + // followed. + FollowFinalSymlink bool +} + +// GetDentryAt returns a VirtualDentry representing the given path, at which a +// file must exist. A reference is taken on the returned VirtualDentry. +func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return VirtualDentry{}, err + } + for { + d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts) + if err == nil { + vd := VirtualDentry{ + mount: rp.mount, + dentry: d, + } + rp.mount.incRef() + vfs.putResolvingPath(rp) + return vd, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return VirtualDentry{}, err + } + } +} + +// MkdirAt creates a directory at the given path. +func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { + // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is + // also honored." - mkdir(2) + opts.Mode &= 01777 + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return err + } + for { + err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// OpenAt returns a FileDescription providing access to the file at the given +// path. A reference is taken on the returned FileDescription. +func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { + // Remove: + // + // - O_LARGEFILE, which we always report in FileDescription status flags + // since only 64-bit architectures are supported at this time. + // + // - O_CLOEXEC, which affects file descriptors and therefore must be + // handled outside of VFS. + // + // - Unknown flags. + opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE + // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC. + if opts.Flags&linux.O_SYNC != 0 { + opts.Flags |= linux.O_DSYNC + } + // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified + // with O_DIRECTORY and a writable access mode (to ensure that it fails on + // filesystem implementations that do not support it). + if opts.Flags&linux.O_TMPFILE != 0 { + if opts.Flags&linux.O_DIRECTORY == 0 { + return nil, syserror.EINVAL + } + if opts.Flags&linux.O_CREAT != 0 { + return nil, syserror.EINVAL + } + if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { + return nil, syserror.EINVAL + } + } + // O_PATH causes most other flags to be ignored. + if opts.Flags&linux.O_PATH != 0 { + opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH + } + // "On Linux, the following bits are also honored in mode: [S_ISUID, + // S_ISGID, S_ISVTX]" - open(2) + opts.Mode &= 07777 + + if opts.Flags&linux.O_NOFOLLOW != 0 { + pop.FollowFinalSymlink = false + } + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return nil, err + } + if opts.Flags&linux.O_DIRECTORY != 0 { + rp.mustBeDir = true + rp.mustBeDirOrig = true + } + for { + fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return fd, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return nil, err + } + } +} + +// StatAt returns metadata for the file at the given path. +func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return linux.Statx{}, err + } + for { + stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return stat, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return linux.Statx{}, err + } + } +} + +// StatusFlags returns file description status flags. +func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) { + flags, err := fd.impl.StatusFlags(ctx) + flags |= linux.O_LARGEFILE + return flags, err +} + +// SetStatusFlags sets file description status flags. +func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { + return fd.impl.SetStatusFlags(ctx, flags) +} + +// TODO: +// +// - VFS.SyncAllFilesystems() for sync(2) +// +// - Something for syncfs(2) +// +// - VFS.LinkAt() +// +// - VFS.MknodAt() +// +// - VFS.ReadlinkAt() +// +// - VFS.RenameAt() +// +// - VFS.RmdirAt() +// +// - VFS.SetStatAt() +// +// - VFS.StatFSAt() +// +// - VFS.SymlinkAt() +// +// - VFS.UnlinkAt() +// +// - FileDescription.(almost everything) diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go new file mode 100644 index 000000000..4a8a69540 --- /dev/null +++ b/pkg/sentry/vfs/vfs.go @@ -0,0 +1,135 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package vfs implements a virtual filesystem layer. +// +// Lock order: +// +// Filesystem implementation locks +// VirtualFilesystem.mountMu +// VirtualFilesystem.fsTypesMu +package vfs + +import ( + "sync" +) + +// A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts. +// +// There is no analogue to the VirtualFilesystem type in Linux, as the +// equivalent state in Linux is global. +type VirtualFilesystem struct { + // mountMu serializes mount mutations. + // + // mountMu is analogous to Linux's namespace_sem. + mountMu sync.RWMutex + + // mounts maps (mount parent, mount point) pairs to mounts. (Since mounts + // are uniquely namespaced, including mount parent in the key correctly + // handles both bind mounts and mount namespaces; Linux does the same.) + // Synchronization between mutators and readers is provided by mounts.seq; + // synchronization between mutators is provided by mountMu. + // + // mounts is used to follow mount points during path traversal. We use a + // single table rather than per-Dentry tables to reduce size (and therefore + // cache footprint) for the vast majority of Dentries that are not mount + // points. + // + // mounts is analogous to Linux's mount_hashtable. + mounts mountTable + + // mountpoints maps mount points to mounts at those points in all + // namespaces. mountpoints is protected by mountMu. + // + // mountpoints is used to find mounts that must be unmounted due to + // removal of a mount point Dentry from another mount namespace. ("A file + // or directory that is a mount point in one namespace that is not a mount + // point in another namespace, may be renamed, unlinked, or removed + // (rmdir(2)) in the mount namespace in which it is not a mount point + // (subject to the usual permission checks)." - mount_namespaces(7)) + // + // mountpoints is analogous to Linux's mountpoint_hashtable. + mountpoints map[*Dentry]map[*Mount]struct{} + + // fsTypes contains all FilesystemTypes that are usable in the + // VirtualFilesystem. fsTypes is protected by fsTypesMu. + fsTypesMu sync.RWMutex + fsTypes map[string]FilesystemType +} + +// New returns a new VirtualFilesystem with no mounts or FilesystemTypes. +func New() *VirtualFilesystem { + vfs := &VirtualFilesystem{ + mountpoints: make(map[*Dentry]map[*Mount]struct{}), + fsTypes: make(map[string]FilesystemType), + } + vfs.mounts.Init() + return vfs +} + +// A VirtualDentry represents a node in a VFS tree, by combining a Dentry +// (which represents a node in a Filesystem's tree) and a Mount (which +// represents the Filesystem's position in a VFS mount tree). +// +// VirtualDentry's semantics are similar to that of a Go interface object +// representing a pointer: it is a copyable value type that represents +// references to another entity. The zero value of VirtualDentry is an "empty +// VirtualDentry", directly analogous to a nil interface object. +// VirtualDentry.Ok() checks that a VirtualDentry is not zero-valued; unless +// otherwise specified, all other VirtualDentry methods require +// VirtualDentry.Ok() == true. +// +// Mounts and Dentries are reference-counted, requiring that users call +// VirtualDentry.{Inc,Dec}Ref() as appropriate. We often colloquially refer to +// references on the Mount and Dentry referred to by a VirtualDentry as +// references on the VirtualDentry itself. Unless otherwise specified, all +// VirtualDentry methods require that a reference is held on the VirtualDentry. +// +// VirtualDentry is analogous to Linux's struct path. +type VirtualDentry struct { + mount *Mount + dentry *Dentry +} + +// Ok returns true if vd is not empty. It does not require that a reference is +// held. +func (vd VirtualDentry) Ok() bool { + return vd.mount != nil +} + +// IncRef increments the reference counts on the Mount and Dentry represented +// by vd. +func (vd VirtualDentry) IncRef() { + vd.mount.incRef() + vd.dentry.incRef(vd.mount.fs) +} + +// DecRef decrements the reference counts on the Mount and Dentry represented +// by vd. +func (vd VirtualDentry) DecRef() { + vd.dentry.decRef(vd.mount.fs) + vd.mount.decRef() +} + +// Mount returns the Mount associated with vd. It does not take a reference on +// the returned Mount. +func (vd VirtualDentry) Mount() *Mount { + return vd.mount +} + +// Dentry returns the Dentry associated with vd. It does not take a reference +// on the returned Dentry. +func (vd VirtualDentry) Dentry() *Dentry { + return vd.dentry +} diff --git a/pkg/state/encode.go b/pkg/state/encode.go index 1cfdff643..5d9409a45 100644 --- a/pkg/state/encode.go +++ b/pkg/state/encode.go @@ -84,7 +84,7 @@ type encodeState struct { // register looks up an ID, registering if necessary. // -// If the object was not previosly registered, it is enqueued to be serialized. +// If the object was not previously registered, it is enqueued to be serialized. // See the documentation for idsByObject for more information. func (es *encodeState) register(obj reflect.Value) uint64 { // It is not legal to call register for any non-pointer objects (see diff --git a/pkg/syserr/netstack.go b/pkg/syserr/netstack.go index 965808a27..8ff922c69 100644 --- a/pkg/syserr/netstack.go +++ b/pkg/syserr/netstack.go @@ -49,43 +49,44 @@ var ( ) var netstackErrorTranslations = map[*tcpip.Error]*Error{ - tcpip.ErrUnknownProtocol: ErrUnknownProtocol, - tcpip.ErrUnknownNICID: ErrUnknownNICID, - tcpip.ErrUnknownDevice: ErrUnknownDevice, - tcpip.ErrUnknownProtocolOption: ErrUnknownProtocolOption, - tcpip.ErrDuplicateNICID: ErrDuplicateNICID, - tcpip.ErrDuplicateAddress: ErrDuplicateAddress, - tcpip.ErrNoRoute: ErrNoRoute, - tcpip.ErrBadLinkEndpoint: ErrBadLinkEndpoint, - tcpip.ErrAlreadyBound: ErrAlreadyBound, - tcpip.ErrInvalidEndpointState: ErrInvalidEndpointState, - tcpip.ErrAlreadyConnecting: ErrAlreadyConnecting, - tcpip.ErrAlreadyConnected: ErrAlreadyConnected, - tcpip.ErrNoPortAvailable: ErrNoPortAvailable, - tcpip.ErrPortInUse: ErrPortInUse, - tcpip.ErrBadLocalAddress: ErrBadLocalAddress, - tcpip.ErrClosedForSend: ErrClosedForSend, - tcpip.ErrClosedForReceive: ErrClosedForReceive, - tcpip.ErrWouldBlock: ErrWouldBlock, - tcpip.ErrConnectionRefused: ErrConnectionRefused, - tcpip.ErrTimeout: ErrTimeout, - tcpip.ErrAborted: ErrAborted, - tcpip.ErrConnectStarted: ErrConnectStarted, - tcpip.ErrDestinationRequired: ErrDestinationRequired, - tcpip.ErrNotSupported: ErrNotSupported, - tcpip.ErrQueueSizeNotSupported: ErrQueueSizeNotSupported, - tcpip.ErrNotConnected: ErrNotConnected, - tcpip.ErrConnectionReset: ErrConnectionReset, - tcpip.ErrConnectionAborted: ErrConnectionAborted, - tcpip.ErrNoSuchFile: ErrNoSuchFile, - tcpip.ErrInvalidOptionValue: ErrInvalidOptionValue, - tcpip.ErrNoLinkAddress: ErrHostDown, - tcpip.ErrBadAddress: ErrBadAddress, - tcpip.ErrNetworkUnreachable: ErrNetworkUnreachable, - tcpip.ErrMessageTooLong: ErrMessageTooLong, - tcpip.ErrNoBufferSpace: ErrNoBufferSpace, - tcpip.ErrBroadcastDisabled: ErrBroadcastDisabled, - tcpip.ErrNotPermitted: ErrNotPermittedNet, + tcpip.ErrUnknownProtocol: ErrUnknownProtocol, + tcpip.ErrUnknownNICID: ErrUnknownNICID, + tcpip.ErrUnknownDevice: ErrUnknownDevice, + tcpip.ErrUnknownProtocolOption: ErrUnknownProtocolOption, + tcpip.ErrDuplicateNICID: ErrDuplicateNICID, + tcpip.ErrDuplicateAddress: ErrDuplicateAddress, + tcpip.ErrNoRoute: ErrNoRoute, + tcpip.ErrBadLinkEndpoint: ErrBadLinkEndpoint, + tcpip.ErrAlreadyBound: ErrAlreadyBound, + tcpip.ErrInvalidEndpointState: ErrInvalidEndpointState, + tcpip.ErrAlreadyConnecting: ErrAlreadyConnecting, + tcpip.ErrAlreadyConnected: ErrAlreadyConnected, + tcpip.ErrNoPortAvailable: ErrNoPortAvailable, + tcpip.ErrPortInUse: ErrPortInUse, + tcpip.ErrBadLocalAddress: ErrBadLocalAddress, + tcpip.ErrClosedForSend: ErrClosedForSend, + tcpip.ErrClosedForReceive: ErrClosedForReceive, + tcpip.ErrWouldBlock: ErrWouldBlock, + tcpip.ErrConnectionRefused: ErrConnectionRefused, + tcpip.ErrTimeout: ErrTimeout, + tcpip.ErrAborted: ErrAborted, + tcpip.ErrConnectStarted: ErrConnectStarted, + tcpip.ErrDestinationRequired: ErrDestinationRequired, + tcpip.ErrNotSupported: ErrNotSupported, + tcpip.ErrQueueSizeNotSupported: ErrQueueSizeNotSupported, + tcpip.ErrNotConnected: ErrNotConnected, + tcpip.ErrConnectionReset: ErrConnectionReset, + tcpip.ErrConnectionAborted: ErrConnectionAborted, + tcpip.ErrNoSuchFile: ErrNoSuchFile, + tcpip.ErrInvalidOptionValue: ErrInvalidOptionValue, + tcpip.ErrNoLinkAddress: ErrHostDown, + tcpip.ErrBadAddress: ErrBadAddress, + tcpip.ErrNetworkUnreachable: ErrNetworkUnreachable, + tcpip.ErrMessageTooLong: ErrMessageTooLong, + tcpip.ErrNoBufferSpace: ErrNoBufferSpace, + tcpip.ErrBroadcastDisabled: ErrBroadcastDisabled, + tcpip.ErrNotPermitted: ErrNotPermittedNet, + tcpip.ErrAddressFamilyNotSupported: ErrAddressFamilyNotSupported, } // TranslateNetstackError converts an error from the tcpip package to a sentry diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index 345653544..1987e89cc 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -48,6 +48,7 @@ var ( EMSGSIZE = error(syscall.EMSGSIZE) ENAMETOOLONG = error(syscall.ENAMETOOLONG) ENOATTR = ENODATA + ENOBUFS = error(syscall.ENOBUFS) ENODATA = error(syscall.ENODATA) ENODEV = error(syscall.ENODEV) ENOENT = error(syscall.ENOENT) @@ -59,6 +60,7 @@ var ( ENOSYS = error(syscall.ENOSYS) ENOTDIR = error(syscall.ENOTDIR) ENOTEMPTY = error(syscall.ENOTEMPTY) + ENOTSOCK = error(syscall.ENOTSOCK) ENOTSUP = error(syscall.ENOTSUP) ENOTTY = error(syscall.ENOTTY) ENXIO = error(syscall.ENXIO) diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go index 1a9d40778..150310c11 100644 --- a/pkg/tcpip/buffer/view.go +++ b/pkg/tcpip/buffer/view.go @@ -50,7 +50,7 @@ func (v View) ToVectorisedView() VectorisedView { return NewVectorisedView(len(v), []View{v}) } -// VectorisedView is a vectorised version of View using non contigous memory. +// VectorisedView is a vectorised version of View using non contiguous memory. // It supports all the convenience methods supported by View. // // +stateify savable diff --git a/pkg/tcpip/header/icmpv4.go b/pkg/tcpip/header/icmpv4.go index c081de61f..c52c0d851 100644 --- a/pkg/tcpip/header/icmpv4.go +++ b/pkg/tcpip/header/icmpv4.go @@ -24,15 +24,11 @@ import ( type ICMPv4 []byte const ( - // ICMPv4MinimumSize is the minimum size of a valid ICMP packet. - ICMPv4MinimumSize = 4 - - // ICMPv4EchoMinimumSize is the minimum size of a valid ICMP echo packet. - ICMPv4EchoMinimumSize = 6 + // ICMPv4PayloadOffset defines the start of ICMP payload. + ICMPv4PayloadOffset = 4 - // ICMPv4DstUnreachableMinimumSize is the minimum size of a valid ICMP - // destination unreachable packet. - ICMPv4DstUnreachableMinimumSize = ICMPv4MinimumSize + 4 + // ICMPv4MinimumSize is the minimum size of a valid ICMP packet. + ICMPv4MinimumSize = 8 // ICMPv4ProtocolNumber is the ICMP transport protocol number. ICMPv4ProtocolNumber tcpip.TransportProtocolNumber = 1 @@ -104,5 +100,5 @@ func (ICMPv4) SetDestinationPort(uint16) { // Payload implements Transport.Payload. func (b ICMPv4) Payload() []byte { - return b[ICMPv4MinimumSize:] + return b[ICMPv4PayloadOffset:] } diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go index 7da4c4845..94a3af289 100644 --- a/pkg/tcpip/header/ipv4.go +++ b/pkg/tcpip/header/ipv4.go @@ -85,6 +85,10 @@ const ( // units, the header cannot exceed 15*4 = 60 bytes. IPv4MaximumHeaderSize = 60 + // MinIPFragmentPayloadSize is the minimum number of payload bytes that + // the first fragment must carry when an IPv4 packet is fragmented. + MinIPFragmentPayloadSize = 8 + // IPv4AddressSize is the size, in bytes, of an IPv4 address. IPv4AddressSize = 4 @@ -268,6 +272,10 @@ func (b IPv4) IsValid(pktSize int) bool { return false } + if IPVersion(b) != IPv4Version { + return false + } + return true } diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go index 7163eaa36..95fe8bfc3 100644 --- a/pkg/tcpip/header/ipv6.go +++ b/pkg/tcpip/header/ipv6.go @@ -184,6 +184,10 @@ func (b IPv6) IsValid(pktSize int) bool { return false } + if IPVersion(b) != IPv6Version { + return false + } + return true } diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go index 1141443bb..82cfe785c 100644 --- a/pkg/tcpip/header/tcp.go +++ b/pkg/tcpip/header/tcp.go @@ -176,6 +176,21 @@ const ( // TCPProtocolNumber is TCP's transport protocol number. TCPProtocolNumber tcpip.TransportProtocolNumber = 6 + + // TCPMinimumMSS is the minimum acceptable value for MSS. This is the + // same as the value TCP_MIN_MSS defined net/tcp.h. + TCPMinimumMSS = IPv4MaximumHeaderSize + TCPHeaderMaximumSize + MinIPFragmentPayloadSize - IPv4MinimumSize - TCPMinimumSize + + // TCPMaximumMSS is the maximum acceptable value for MSS. + TCPMaximumMSS = 0xffff + + // TCPDefaultMSS is the MSS value that should be used if an MSS option + // is not received from the peer. It's also the value returned by + // TCP_MAXSEG option for a socket in an unconnected state. + // + // Per RFC 1122, page 85: "If an MSS option is not received at + // connection setup, TCP MUST assume a default send MSS of 536." + TCPDefaultMSS = 536 ) // SourcePort returns the "source port" field of the tcp header. @@ -306,7 +321,7 @@ func ParseSynOptions(opts []byte, isAck bool) TCPSynOptions { synOpts := TCPSynOptions{ // Per RFC 1122, page 85: "If an MSS option is not received at // connection setup, TCP MUST assume a default send MSS of 536." - MSS: 536, + MSS: TCPDefaultMSS, // If no window scale option is specified, WS in options is // returned as -1; this is because the absence of the option // indicates that the we cannot use window scaling on the diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD new file mode 100644 index 000000000..fc9abbb55 --- /dev/null +++ b/pkg/tcpip/iptables/BUILD @@ -0,0 +1,18 @@ +package(licenses = ["notice"]) + +load("//tools/go_stateify:defs.bzl", "go_library") + +go_library( + name = "iptables", + srcs = [ + "iptables.go", + "targets.go", + "types.go", + ], + importpath = "gvisor.dev/gvisor/pkg/tcpip/iptables", + visibility = ["//visibility:public"], + deps = [ + "//pkg/tcpip", + "//pkg/tcpip/buffer", + ], +) diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go new file mode 100644 index 000000000..f1e1d1fad --- /dev/null +++ b/pkg/tcpip/iptables/iptables.go @@ -0,0 +1,81 @@ +// Copyright 2019 The gVisor authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package iptables supports packet filtering and manipulation via the iptables +// tool. +package iptables + +const ( + tablenameNat = "nat" + tablenameMangle = "mangle" +) + +// Chain names as defined by net/ipv4/netfilter/ip_tables.c. +const ( + chainNamePrerouting = "PREROUTING" + chainNameInput = "INPUT" + chainNameForward = "FORWARD" + chainNameOutput = "OUTPUT" + chainNamePostrouting = "POSTROUTING" +) + +// DefaultTables returns a default set of tables. Each chain is set to accept +// all packets. +func DefaultTables() *IPTables { + return &IPTables{ + Tables: map[string]Table{ + tablenameNat: Table{ + BuiltinChains: map[Hook]Chain{ + Prerouting: unconditionalAcceptChain(chainNamePrerouting), + Input: unconditionalAcceptChain(chainNameInput), + Output: unconditionalAcceptChain(chainNameOutput), + Postrouting: unconditionalAcceptChain(chainNamePostrouting), + }, + DefaultTargets: map[Hook]Target{ + Prerouting: UnconditionalAcceptTarget{}, + Input: UnconditionalAcceptTarget{}, + Output: UnconditionalAcceptTarget{}, + Postrouting: UnconditionalAcceptTarget{}, + }, + UserChains: map[string]Chain{}, + }, + tablenameMangle: Table{ + BuiltinChains: map[Hook]Chain{ + Prerouting: unconditionalAcceptChain(chainNamePrerouting), + Output: unconditionalAcceptChain(chainNameOutput), + }, + DefaultTargets: map[Hook]Target{ + Prerouting: UnconditionalAcceptTarget{}, + Output: UnconditionalAcceptTarget{}, + }, + UserChains: map[string]Chain{}, + }, + }, + Priorities: map[Hook][]string{ + Prerouting: []string{tablenameMangle, tablenameNat}, + Output: []string{tablenameMangle, tablenameNat}, + }, + } +} + +func unconditionalAcceptChain(name string) Chain { + return Chain{ + Name: name, + Rules: []Rule{ + Rule{ + Target: UnconditionalAcceptTarget{}, + }, + }, + } +} diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go new file mode 100644 index 000000000..19a7f77e3 --- /dev/null +++ b/pkg/tcpip/iptables/targets.go @@ -0,0 +1,35 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file contains various Targets. + +package iptables + +import "gvisor.dev/gvisor/pkg/tcpip/buffer" + +// UnconditionalAcceptTarget accepts all packets. +type UnconditionalAcceptTarget struct{} + +// Action implements Target.Action. +func (UnconditionalAcceptTarget) Action(packet buffer.VectorisedView) (Verdict, string) { + return Accept, "" +} + +// UnconditionalDropTarget denies all packets. +type UnconditionalDropTarget struct{} + +// Action implements Target.Action. +func (UnconditionalDropTarget) Action(packet buffer.VectorisedView) (Verdict, string) { + return Drop, "" +} diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go new file mode 100644 index 000000000..600bd9a10 --- /dev/null +++ b/pkg/tcpip/iptables/types.go @@ -0,0 +1,183 @@ +// Copyright 2019 The gVisor authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package iptables + +import ( + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" +) + +// A Hook specifies one of the hooks built into the network stack. +// +// Userspace app Userspace app +// ^ | +// | v +// [Input] [Output] +// ^ | +// | v +// | routing +// | | +// | v +// ----->[Prerouting]----->routing----->[Forward]---------[Postrouting]-----> +type Hook uint + +// These values correspond to values in include/uapi/linux/netfilter.h. +const ( + // Prerouting happens before a packet is routed to applications or to + // be forwarded. + Prerouting Hook = iota + + // Input happens before a packet reaches an application. + Input + + // Forward happens once it's decided that a packet should be forwarded + // to another host. + Forward + + // Output happens after a packet is written by an application to be + // sent out. + Output + + // Postrouting happens just before a packet goes out on the wire. + Postrouting + + // The total number of hooks. + NumHooks +) + +// A Verdict is returned by a rule's target to indicate how traversal of rules +// should (or should not) continue. +type Verdict int + +const ( + // Accept indicates the packet should continue traversing netstack as + // normal. + Accept Verdict = iota + + // Drop inicates the packet should be dropped, stopping traversing + // netstack. + Drop + + // Stolen indicates the packet was co-opted by the target and should + // stop traversing netstack. + Stolen + + // Queue indicates the packet should be queued for userspace processing. + Queue + + // Repeat indicates the packet should re-traverse the chains for the + // current hook. + Repeat + + // None indicates no verdict was reached. + None + + // Jump indicates a jump to another chain. + Jump + + // Continue indicates that traversal should continue at the next rule. + Continue + + // Return indicates that traversal should return to the calling chain. + Return +) + +// IPTables holds all the tables for a netstack. +type IPTables struct { + // Tables maps table names to tables. User tables have arbitrary names. + Tables map[string]Table + + // Priorities maps each hook to a list of table names. The order of the + // list is the order in which each table should be visited for that + // hook. + Priorities map[Hook][]string +} + +// A Table defines a set of chains and hooks into the network stack. The +// currently supported tables are: +// * nat +// * mangle +type Table struct { + // BuiltinChains holds the un-deletable chains built into netstack. If + // a hook isn't present in the map, this table doesn't utilize that + // hook. + BuiltinChains map[Hook]Chain + + // DefaultTargets holds a target for each hook that will be executed if + // chain traversal doesn't yield a verdict. + DefaultTargets map[Hook]Target + + // UserChains holds user-defined chains for the keyed by name. Users + // can give their chains arbitrary names. + UserChains map[string]Chain + + // Chains maps names to chains for both builtin and user-defined chains. + // Its entries point to Chains already either in BuiltinChains or + // UserChains, and its purpose is to make looking up tables by name + // fast. + Chains map[string]*Chain +} + +// ValidHooks returns a bitmap of the builtin hooks for the given table. +func (table *Table) ValidHooks() (uint32, *tcpip.Error) { + hooks := uint32(0) + for hook, _ := range table.BuiltinChains { + hooks |= 1 << hook + } + return hooks, nil +} + +// A Chain defines a list of rules for packet processing. When a packet +// traverses a chain, it is checked against each rule until either a rule +// returns a verdict or the chain ends. +// +// By convention, builtin chains end with a rule that matches everything and +// returns either Accept or Drop. User-defined chains end with Return. These +// aren't strictly necessary here, but the iptables tool writes tables this way. +type Chain struct { + // Name is the chain name. + Name string + + // Rules is the list of rules to traverse. + Rules []Rule +} + +// A Rule is a packet processing rule. It consists of two pieces. First it +// contains zero or more matchers, each of which is a specification of which +// packets this rule applies to. If there are no matchers in the rule, it +// applies to any packet. +type Rule struct { + // Matchers is the list of matchers for this rule. + Matchers []Matcher + + // Target is the action to invoke if all the matchers match the packet. + Target Target +} + +// A Matcher is the interface for matching packets. +type Matcher interface { + // Match returns whether the packet matches and whether the packet + // should be "hotdropped", i.e. dropped immediately. This is usually + // used for suspicious packets. + Match(hook Hook, packet buffer.VectorisedView, interfaceName string) (matches bool, hotdrop bool) +} + +// A Target is the interface for taking an action for a packet. +type Target interface { + // Action takes an action on the packet and returns a verdict on how + // traversal should (or should not) continue. If the return value is + // Jump, it also returns the name of the chain to jump to. + Action(packet buffer.VectorisedView) (Verdict, string) +} diff --git a/pkg/tcpip/link/rawfile/errors.go b/pkg/tcpip/link/rawfile/errors.go index 80e91bb34..a0a873c84 100644 --- a/pkg/tcpip/link/rawfile/errors.go +++ b/pkg/tcpip/link/rawfile/errors.go @@ -30,7 +30,7 @@ var translations [maxErrno]*tcpip.Error // TranslateErrno translate an errno from the syscall package into a // *tcpip.Error. // -// Valid, but unreconigized errnos will be translated to +// Valid, but unrecognized errnos will be translated to // tcpip.ErrInvalidEndpointState (EINVAL). Panics on invalid errnos. func TranslateErrno(e syscall.Errno) *tcpip.Error { if err := translations[e]; err != nil { diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go index 08847f95f..e3fbb15c2 100644 --- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go +++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go @@ -110,7 +110,7 @@ type PollEvent struct { // BlockingRead reads from a file descriptor that is set up as non-blocking. If // no data is available, it will block in a poll() syscall until the file -// descirptor becomes readable. +// descriptor becomes readable. func BlockingRead(fd int, b []byte) (int, *tcpip.Error) { for { n, _, e := syscall.RawSyscall(syscall.SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(&b[0])), uintptr(len(b))) diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index ca3d6c0bf..cb35635fc 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -83,6 +83,10 @@ func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, buffer.Prependable, buf return tcpip.ErrNotSupported } +func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error { + return tcpip.ErrNotSupported +} + func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { v := vv.First() h := header.ARP(v) diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go index 6822059d6..1628a82be 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation.go +++ b/pkg/tcpip/network/fragmentation/fragmentation.go @@ -60,7 +60,7 @@ type Fragmentation struct { // lowMemoryLimit specifies the limit on which we will reach by dropping // fragments after reaching highMemoryLimit. // -// reassemblingTimeout specifes the maximum time allowed to reassemble a packet. +// reassemblingTimeout specifies the maximum time allowed to reassemble a packet. // Fragments are lazily evicted only when a new a packet with an // already existing fragmentation-id arrives after the timeout. func NewFragmentation(highMemoryLimit, lowMemoryLimit int, reassemblingTimeout time.Duration) *Fragmentation { @@ -80,7 +80,7 @@ func NewFragmentation(highMemoryLimit, lowMemoryLimit int, reassemblingTimeout t } } -// Process processes an incoming fragment beloning to an ID +// Process processes an incoming fragment belonging to an ID // and returns a complete packet when all the packets belonging to that ID have been received. func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool) { f.mu.Lock() diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index db65ee7cc..8ff428445 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -282,10 +282,10 @@ func TestIPv4ReceiveControl(t *testing.T) { {"Truncated (10 bytes missing)", 0, 0, header.ICMPv4FragmentationNeeded, stack.ControlPacketTooBig, mtu, 10}, {"Truncated (missing IPv4 header)", 0, 0, header.ICMPv4FragmentationNeeded, stack.ControlPacketTooBig, mtu, header.IPv4MinimumSize + 8}, {"Truncated (missing 'extra info')", 0, 0, header.ICMPv4FragmentationNeeded, stack.ControlPacketTooBig, mtu, 4 + header.IPv4MinimumSize + 8}, - {"Truncated (missing ICMP header)", 0, 0, header.ICMPv4FragmentationNeeded, stack.ControlPacketTooBig, mtu, header.ICMPv4DstUnreachableMinimumSize + header.IPv4MinimumSize + 8}, + {"Truncated (missing ICMP header)", 0, 0, header.ICMPv4FragmentationNeeded, stack.ControlPacketTooBig, mtu, header.ICMPv4MinimumSize + header.IPv4MinimumSize + 8}, {"Port unreachable", 1, 0, header.ICMPv4PortUnreachable, stack.ControlPortUnreachable, 0, 0}, {"Non-zero fragment offset", 0, 100, header.ICMPv4PortUnreachable, stack.ControlPortUnreachable, 0, 0}, - {"Zero-length packet", 0, 0, header.ICMPv4PortUnreachable, stack.ControlPortUnreachable, 0, 2*header.IPv4MinimumSize + header.ICMPv4DstUnreachableMinimumSize + 8}, + {"Zero-length packet", 0, 0, header.ICMPv4PortUnreachable, stack.ControlPortUnreachable, 0, 2*header.IPv4MinimumSize + header.ICMPv4MinimumSize + 8}, } r, err := buildIPv4Route(localIpv4Addr, "\x0a\x00\x00\xbb") if err != nil { @@ -301,7 +301,7 @@ func TestIPv4ReceiveControl(t *testing.T) { } defer ep.Close() - const dataOffset = header.IPv4MinimumSize*2 + header.ICMPv4MinimumSize + 4 + const dataOffset = header.IPv4MinimumSize*2 + header.ICMPv4MinimumSize view := buffer.NewView(dataOffset + 8) // Create the outer IPv4 header. @@ -319,10 +319,10 @@ func TestIPv4ReceiveControl(t *testing.T) { icmp := header.ICMPv4(view[header.IPv4MinimumSize:]) icmp.SetType(header.ICMPv4DstUnreachable) icmp.SetCode(c.code) - copy(view[header.IPv4MinimumSize+header.ICMPv4MinimumSize:], []byte{0xde, 0xad, 0xbe, 0xef}) + copy(view[header.IPv4MinimumSize+header.ICMPv4PayloadOffset:], []byte{0xde, 0xad, 0xbe, 0xef}) // Create the inner IPv4 header. - ip = header.IPv4(view[header.IPv4MinimumSize+header.ICMPv4MinimumSize+4:]) + ip = header.IPv4(view[header.IPv4MinimumSize+header.ICMPv4MinimumSize:]) ip.Encode(&header.IPv4Fields{ IHL: header.IPv4MinimumSize, TotalLength: 100, diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index bc7f1c42a..fbef6947d 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -68,10 +68,6 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V switch h.Type() { case header.ICMPv4Echo: received.Echo.Increment() - if len(v) < header.ICMPv4EchoMinimumSize { - received.Invalid.Increment() - return - } // Only send a reply if the checksum is valid. wantChecksum := h.Checksum() @@ -93,9 +89,9 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, netHeader, vv) vv := vv.Clone(nil) - vv.TrimFront(header.ICMPv4EchoMinimumSize) - hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv4EchoMinimumSize) - pkt := header.ICMPv4(hdr.Prepend(header.ICMPv4EchoMinimumSize)) + vv.TrimFront(header.ICMPv4MinimumSize) + hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv4MinimumSize) + pkt := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize)) copy(pkt, h) pkt.SetType(header.ICMPv4EchoReply) pkt.SetChecksum(^header.Checksum(pkt, header.ChecksumVV(vv, 0))) @@ -108,25 +104,19 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V case header.ICMPv4EchoReply: received.EchoReply.Increment() - if len(v) < header.ICMPv4EchoMinimumSize { - received.Invalid.Increment() - return - } + e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, netHeader, vv) case header.ICMPv4DstUnreachable: received.DstUnreachable.Increment() - if len(v) < header.ICMPv4DstUnreachableMinimumSize { - received.Invalid.Increment() - return - } - vv.TrimFront(header.ICMPv4DstUnreachableMinimumSize) + + vv.TrimFront(header.ICMPv4MinimumSize) switch h.Code() { case header.ICMPv4PortUnreachable: e.handleControl(stack.ControlPortUnreachable, 0, vv) case header.ICMPv4FragmentationNeeded: - mtu := uint32(binary.BigEndian.Uint16(v[header.ICMPv4DstUnreachableMinimumSize-2:])) + mtu := uint32(binary.BigEndian.Uint16(v[header.ICMPv4PayloadOffset+2:])) e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), vv) } diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index 1e3a7425a..e44a73d96 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -232,6 +232,55 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen return nil } +// WriteHeaderIncludedPacket writes a packet already containing a network +// header through the given route. +func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error { + // The packet already has an IP header, but there are a few required + // checks. + ip := header.IPv4(payload.First()) + if !ip.IsValid(payload.Size()) { + return tcpip.ErrInvalidOptionValue + } + + // Always set the total length. + ip.SetTotalLength(uint16(payload.Size())) + + // Set the source address when zero. + if ip.SourceAddress() == tcpip.Address(([]byte{0, 0, 0, 0})) { + ip.SetSourceAddress(r.LocalAddress) + } + + // Set the destination. If the packet already included a destination, + // it will be part of the route. + ip.SetDestinationAddress(r.RemoteAddress) + + // Set the packet ID when zero. + if ip.ID() == 0 { + id := uint32(0) + if payload.Size() > header.IPv4MaximumHeaderSize+8 { + // Packets of 68 bytes or less are required by RFC 791 to not be + // fragmented, so we only assign ids to larger packets. + id = atomic.AddUint32(&ids[hashRoute(r, 0 /* protocol */)%buckets], 1) + } + ip.SetID(uint16(id)) + } + + // Always set the checksum. + ip.SetChecksum(0) + ip.SetChecksum(^ip.CalculateChecksum()) + + if loop&stack.PacketLoop != 0 { + e.HandlePacket(r, payload) + } + if loop&stack.PacketOut == 0 { + return nil + } + + hdr := buffer.NewPrependableFromView(payload.ToView()) + r.Stats().IP.PacketsSent.Increment() + return e.linkEP.WritePacket(r, nil /* gso */, hdr, buffer.VectorisedView{}, ProtocolNumber) +} + // HandlePacket is called by the link layer when new ipv4 packets arrive for // this endpoint. func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index 27367d6c5..e3e8739fd 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -120,6 +120,13 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen return e.linkEP.WritePacket(r, gso, hdr, payload, ProtocolNumber) } +// WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet +// supported by IPv6. +func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error { + // TODO(b/119580726): Support IPv6 header-included packets. + return tcpip.ErrNotSupported +} + // HandlePacket is called by the link layer when new ipv6 packets arrive for // this endpoint. func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index 0ecaa0833..462265281 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -174,6 +174,10 @@ type NetworkEndpoint interface { // protocol. WritePacket(r *Route, gso *GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8, loop PacketLooping) *tcpip.Error + // WriteHeaderIncludedPacket writes a packet that includes a network + // header to the given destination address. + WriteHeaderIncludedPacket(r *Route, payload buffer.VectorisedView, loop PacketLooping) *tcpip.Error + // ID returns the network protocol endpoint ID. ID() *NetworkEndpointID @@ -357,10 +361,19 @@ type TransportProtocolFactory func() TransportProtocol // instantiate network protocols. type NetworkProtocolFactory func() NetworkProtocol +// UnassociatedEndpointFactory produces endpoints for writing packets not +// associated with a particular transport protocol. Such endpoints can be used +// to write arbitrary packets that include the IP header. +type UnassociatedEndpointFactory interface { + NewUnassociatedRawEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) +} + var ( transportProtocols = make(map[string]TransportProtocolFactory) networkProtocols = make(map[string]NetworkProtocolFactory) + unassociatedFactory UnassociatedEndpointFactory + linkEPMu sync.RWMutex nextLinkEndpointID tcpip.LinkEndpointID = 1 linkEndpoints = make(map[tcpip.LinkEndpointID]LinkEndpoint) @@ -380,6 +393,13 @@ func RegisterNetworkProtocolFactory(name string, p NetworkProtocolFactory) { networkProtocols[name] = p } +// RegisterUnassociatedFactory registers a factory to produce endpoints not +// associated with any particular transport protocol. This function is intended +// to be called by init() functions of the protocols. +func RegisterUnassociatedFactory(f UnassociatedEndpointFactory) { + unassociatedFactory = f +} + // RegisterLinkEndpoint register a link-layer protocol endpoint and returns an // ID that can be used to refer to it. func RegisterLinkEndpoint(linkEP LinkEndpoint) tcpip.LinkEndpointID { diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index 36d7b6ac7..391ab4344 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -163,6 +163,18 @@ func (r *Route) WritePacket(gso *GSO, hdr buffer.Prependable, payload buffer.Vec return err } +// WriteHeaderIncludedPacket writes a packet already containing a network +// header through the given route. +func (r *Route) WriteHeaderIncludedPacket(payload buffer.VectorisedView) *tcpip.Error { + if err := r.ref.ep.WriteHeaderIncludedPacket(r, payload, r.loop); err != nil { + r.Stats().IP.OutgoingPacketErrors.Increment() + return err + } + r.ref.nic.stats.Tx.Packets.Increment() + r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(payload.Size())) + return nil +} + // DefaultTTL returns the default TTL of the underlying network endpoint. func (r *Route) DefaultTTL() uint8 { return r.ref.ep.DefaultTTL() diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 2d7f56ca9..3e8fb2a6c 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -340,6 +340,8 @@ type Stack struct { networkProtocols map[tcpip.NetworkProtocolNumber]NetworkProtocol linkAddrResolvers map[tcpip.NetworkProtocolNumber]LinkAddressResolver + unassociatedFactory UnassociatedEndpointFactory + demux *transportDemuxer stats tcpip.Stats @@ -442,6 +444,8 @@ func New(network []string, transport []string, opts Options) *Stack { } } + s.unassociatedFactory = unassociatedFactory + // Create the global transport demuxer. s.demux = newTransportDemuxer(s) @@ -574,11 +578,15 @@ func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcp // NewRawEndpoint creates a new raw transport layer endpoint of the given // protocol. Raw endpoints receive all traffic for a given protocol regardless // of address. -func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { +func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, *tcpip.Error) { if !s.raw { return nil, tcpip.ErrNotPermitted } + if !associated { + return s.unassociatedFactory.NewUnassociatedRawEndpoint(s, network, transport, waiterQueue) + } + t, ok := s.transportProtocols[transport] if !ok { return nil, tcpip.ErrUnknownProtocol diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index 69884af03..959071dbe 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -137,6 +137,10 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr bu return f.linkEP.WritePacket(r, gso, hdr, payload, fakeNetNumber) } +func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error { + return tcpip.ErrNotSupported +} + func (*fakeNetworkEndpoint) Close() {} type fakeNetGoodOption bool diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index 788ffcc8c..b418db046 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -90,6 +90,11 @@ func (*fakeTransportEndpoint) SetSockOpt(interface{}) *tcpip.Error { return tcpip.ErrInvalidEndpointState } +// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. +func (*fakeTransportEndpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) { + return -1, tcpip.ErrUnknownProtocolOption +} + // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (*fakeTransportEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { switch opt.(type) { diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 4aafb51ab..c5d79da5e 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -66,43 +66,44 @@ func (e *Error) IgnoreStats() bool { // Errors that can be returned by the network stack. var ( - ErrUnknownProtocol = &Error{msg: "unknown protocol"} - ErrUnknownNICID = &Error{msg: "unknown nic id"} - ErrUnknownDevice = &Error{msg: "unknown device"} - ErrUnknownProtocolOption = &Error{msg: "unknown option for protocol"} - ErrDuplicateNICID = &Error{msg: "duplicate nic id"} - ErrDuplicateAddress = &Error{msg: "duplicate address"} - ErrNoRoute = &Error{msg: "no route"} - ErrBadLinkEndpoint = &Error{msg: "bad link layer endpoint"} - ErrAlreadyBound = &Error{msg: "endpoint already bound", ignoreStats: true} - ErrInvalidEndpointState = &Error{msg: "endpoint is in invalid state"} - ErrAlreadyConnecting = &Error{msg: "endpoint is already connecting", ignoreStats: true} - ErrAlreadyConnected = &Error{msg: "endpoint is already connected", ignoreStats: true} - ErrNoPortAvailable = &Error{msg: "no ports are available"} - ErrPortInUse = &Error{msg: "port is in use"} - ErrBadLocalAddress = &Error{msg: "bad local address"} - ErrClosedForSend = &Error{msg: "endpoint is closed for send"} - ErrClosedForReceive = &Error{msg: "endpoint is closed for receive"} - ErrWouldBlock = &Error{msg: "operation would block", ignoreStats: true} - ErrConnectionRefused = &Error{msg: "connection was refused"} - ErrTimeout = &Error{msg: "operation timed out"} - ErrAborted = &Error{msg: "operation aborted"} - ErrConnectStarted = &Error{msg: "connection attempt started", ignoreStats: true} - ErrDestinationRequired = &Error{msg: "destination address is required"} - ErrNotSupported = &Error{msg: "operation not supported"} - ErrQueueSizeNotSupported = &Error{msg: "queue size querying not supported"} - ErrNotConnected = &Error{msg: "endpoint not connected"} - ErrConnectionReset = &Error{msg: "connection reset by peer"} - ErrConnectionAborted = &Error{msg: "connection aborted"} - ErrNoSuchFile = &Error{msg: "no such file"} - ErrInvalidOptionValue = &Error{msg: "invalid option value specified"} - ErrNoLinkAddress = &Error{msg: "no remote link address"} - ErrBadAddress = &Error{msg: "bad address"} - ErrNetworkUnreachable = &Error{msg: "network is unreachable"} - ErrMessageTooLong = &Error{msg: "message too long"} - ErrNoBufferSpace = &Error{msg: "no buffer space available"} - ErrBroadcastDisabled = &Error{msg: "broadcast socket option disabled"} - ErrNotPermitted = &Error{msg: "operation not permitted"} + ErrUnknownProtocol = &Error{msg: "unknown protocol"} + ErrUnknownNICID = &Error{msg: "unknown nic id"} + ErrUnknownDevice = &Error{msg: "unknown device"} + ErrUnknownProtocolOption = &Error{msg: "unknown option for protocol"} + ErrDuplicateNICID = &Error{msg: "duplicate nic id"} + ErrDuplicateAddress = &Error{msg: "duplicate address"} + ErrNoRoute = &Error{msg: "no route"} + ErrBadLinkEndpoint = &Error{msg: "bad link layer endpoint"} + ErrAlreadyBound = &Error{msg: "endpoint already bound", ignoreStats: true} + ErrInvalidEndpointState = &Error{msg: "endpoint is in invalid state"} + ErrAlreadyConnecting = &Error{msg: "endpoint is already connecting", ignoreStats: true} + ErrAlreadyConnected = &Error{msg: "endpoint is already connected", ignoreStats: true} + ErrNoPortAvailable = &Error{msg: "no ports are available"} + ErrPortInUse = &Error{msg: "port is in use"} + ErrBadLocalAddress = &Error{msg: "bad local address"} + ErrClosedForSend = &Error{msg: "endpoint is closed for send"} + ErrClosedForReceive = &Error{msg: "endpoint is closed for receive"} + ErrWouldBlock = &Error{msg: "operation would block", ignoreStats: true} + ErrConnectionRefused = &Error{msg: "connection was refused"} + ErrTimeout = &Error{msg: "operation timed out"} + ErrAborted = &Error{msg: "operation aborted"} + ErrConnectStarted = &Error{msg: "connection attempt started", ignoreStats: true} + ErrDestinationRequired = &Error{msg: "destination address is required"} + ErrNotSupported = &Error{msg: "operation not supported"} + ErrQueueSizeNotSupported = &Error{msg: "queue size querying not supported"} + ErrNotConnected = &Error{msg: "endpoint not connected"} + ErrConnectionReset = &Error{msg: "connection reset by peer"} + ErrConnectionAborted = &Error{msg: "connection aborted"} + ErrNoSuchFile = &Error{msg: "no such file"} + ErrInvalidOptionValue = &Error{msg: "invalid option value specified"} + ErrNoLinkAddress = &Error{msg: "no remote link address"} + ErrBadAddress = &Error{msg: "bad address"} + ErrNetworkUnreachable = &Error{msg: "network is unreachable"} + ErrMessageTooLong = &Error{msg: "message too long"} + ErrNoBufferSpace = &Error{msg: "no buffer space available"} + ErrBroadcastDisabled = &Error{msg: "broadcast socket option disabled"} + ErrNotPermitted = &Error{msg: "operation not permitted"} + ErrAddressFamilyNotSupported = &Error{msg: "address family not supported by protocol"} ) // Errors related to Subnet @@ -287,6 +288,12 @@ type ControlMessages struct { // Timestamp is the time (in ns) that the last packed used to create // the read data was received. Timestamp int64 + + // HasInq indicates whether Inq is valid/set. + HasInq bool + + // Inq is the number of bytes ready to be received. + Inq int32 } // Endpoint is the interface implemented by transport protocols (e.g., tcp, udp) @@ -339,6 +346,10 @@ type Endpoint interface { // get the actual result. The first call to Connect after the socket has // connected returns nil. Calling connect again results in ErrAlreadyConnected. // Anything else -- the attempt to connect failed. + // + // If address.Addr is empty, this means that Enpoint has to be + // disconnected if this is supported, otherwise + // ErrAddressFamilyNotSupported must be returned. Connect(address FullAddress) *Error // Shutdown closes the read and/or write end of the endpoint connection @@ -378,6 +389,10 @@ type Endpoint interface { // *Option types. GetSockOpt(opt interface{}) *Error + // GetSockOptInt gets a socket option for simple cases where a return + // value has the int type. + GetSockOptInt(SockOpt) (int, *Error) + // State returns a socket's lifecycle state. The returned value is // protocol-specific and is primarily used for diagnostics. State() uint32 @@ -403,6 +418,18 @@ type WriteOptions struct { EndOfRecord bool } +// SockOpt represents socket options which values have the int type. +type SockOpt int + +const ( + // ReceiveQueueSizeOption is used in GetSockOpt to specify that the number of + // unread bytes in the input buffer should be returned. + ReceiveQueueSizeOption SockOpt = iota + + // TODO(b/137664753): convert all int socket options to be handled via + // GetSockOptInt. +) + // ErrorOption is used in GetSockOpt to specify that the last error reported by // the endpoint should be cleared and returned. type ErrorOption struct{} @@ -419,10 +446,6 @@ type ReceiveBufferSizeOption int // unread bytes in the output buffer should be returned. type SendQueueSizeOption int -// ReceiveQueueSizeOption is used in GetSockOpt to specify that the number of -// unread bytes in the input buffer should be returned. -type ReceiveQueueSizeOption int - // V6OnlyOption is used by SetSockOpt/GetSockOpt to specify whether an IPv6 // socket is to be restricted to sending and receiving IPv6 packets only. type V6OnlyOption int @@ -491,6 +514,10 @@ type AvailableCongestionControlOption string // buffer moderation. type ModerateReceiveBufferOption bool +// MaxSegOption is used by SetSockOpt/GetSockOpt to set/get the current +// Maximum Segment Size(MSS) value as specified using the TCP_MAXSEG option. +type MaxSegOption int + // MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default // TTL value for multicast messages. The default is 1. type MulticastTTLOption uint8 @@ -534,7 +561,7 @@ type BroadcastOption int // Route is a row in the routing table. It specifies through which NIC (and // gateway) sets of packets should be routed. A row is considered viable if the -// masked target address matches the destination adddress in the row. +// masked target address matches the destination address in the row. type Route struct { // Destination is the address that must be matched against the masked // target address to check if this row is viable. diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index 33cefd937..ba6671c26 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -291,7 +291,7 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-c switch e.netProto { case header.IPv4ProtocolNumber: - err = e.send4(route, v) + err = send4(route, e.id.LocalPort, v) case header.IPv6ProtocolNumber: err = send6(route, e.id.LocalPort, v) @@ -314,6 +314,22 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { return nil } +// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. +func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) { + switch opt { + case tcpip.ReceiveQueueSizeOption: + v := 0 + e.rcvMu.Lock() + if !e.rcvList.Empty() { + p := e.rcvList.Front() + v = p.data.Size() + } + e.rcvMu.Unlock() + return v, nil + } + return -1, tcpip.ErrUnknownProtocolOption +} + // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { switch o := opt.(type) { @@ -332,17 +348,6 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { e.rcvMu.Unlock() return nil - case *tcpip.ReceiveQueueSizeOption: - e.rcvMu.Lock() - if e.rcvList.Empty() { - *o = 0 - } else { - p := e.rcvList.Front() - *o = tcpip.ReceiveQueueSizeOption(p.data.Size()) - } - e.rcvMu.Unlock() - return nil - case *tcpip.KeepaliveEnabledOption: *o = 0 return nil @@ -352,20 +357,20 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { } } -func (e *endpoint) send4(r *stack.Route, data buffer.View) *tcpip.Error { - if len(data) < header.ICMPv4EchoMinimumSize { +func send4(r *stack.Route, ident uint16, data buffer.View) *tcpip.Error { + if len(data) < header.ICMPv4MinimumSize { return tcpip.ErrInvalidEndpointState } // Set the ident to the user-specified port. Sequence number should // already be set by the user. - binary.BigEndian.PutUint16(data[header.ICMPv4MinimumSize:], e.id.LocalPort) + binary.BigEndian.PutUint16(data[header.ICMPv4PayloadOffset:], ident) - hdr := buffer.NewPrependable(header.ICMPv4EchoMinimumSize + int(r.MaxHeaderLength())) + hdr := buffer.NewPrependable(header.ICMPv4MinimumSize + int(r.MaxHeaderLength())) - icmpv4 := header.ICMPv4(hdr.Prepend(header.ICMPv4EchoMinimumSize)) + icmpv4 := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize)) copy(icmpv4, data) - data = data[header.ICMPv4EchoMinimumSize:] + data = data[header.ICMPv4MinimumSize:] // Linux performs these basic checks. if icmpv4.Type() != header.ICMPv4Echo || icmpv4.Code() != 0 { @@ -422,6 +427,11 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { e.mu.Lock() defer e.mu.Unlock() + if addr.Addr == "" { + // AF_UNSPEC isn't supported. + return tcpip.ErrAddressFamilyNotSupported + } + nicid := addr.NIC localPort := uint16(0) switch e.state { diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go index c89538131..7fdba5d56 100644 --- a/pkg/tcpip/transport/icmp/protocol.go +++ b/pkg/tcpip/transport/icmp/protocol.go @@ -90,19 +90,18 @@ func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProt func (p *protocol) MinimumPacketSize() int { switch p.number { case ProtocolNumber4: - return header.ICMPv4EchoMinimumSize + return header.ICMPv4MinimumSize case ProtocolNumber6: return header.ICMPv6EchoMinimumSize } panic(fmt.Sprint("unknown protocol number: ", p.number)) } -// ParsePorts returns the source and destination ports stored in the given icmp -// packet. +// ParsePorts in case of ICMP sets src to 0, dst to ICMP ID, and err to nil. func (p *protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) { switch p.number { case ProtocolNumber4: - return 0, binary.BigEndian.Uint16(v[header.ICMPv4MinimumSize:]), nil + return 0, binary.BigEndian.Uint16(v[header.ICMPv4PayloadOffset:]), nil case ProtocolNumber6: return 0, binary.BigEndian.Uint16(v[header.ICMPv6MinimumSize:]), nil } diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD index 34a14bf7f..bc4b255b4 100644 --- a/pkg/tcpip/transport/raw/BUILD +++ b/pkg/tcpip/transport/raw/BUILD @@ -21,6 +21,7 @@ go_library( "endpoint.go", "endpoint_state.go", "packet_list.go", + "protocol.go", ], importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/raw", imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"], diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index 03f495e48..b633cd9d8 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -16,7 +16,7 @@ // sockets allow applications to: // // * manually write and inspect transport layer headers and payloads -// * receive all traffic of a given transport protcol (e.g. ICMP or UDP) +// * receive all traffic of a given transport protocol (e.g. ICMP or UDP) // * optionally write and inspect network layer and link layer headers for // packets // @@ -67,6 +67,7 @@ type endpoint struct { netProto tcpip.NetworkProtocolNumber transProto tcpip.TransportProtocolNumber waiterQueue *waiter.Queue + associated bool // The following fields are used to manage the receive queue and are // protected by rcvMu. @@ -97,8 +98,12 @@ type endpoint struct { } // NewEndpoint returns a raw endpoint for the given protocols. -// TODO(b/129292371): IP_HDRINCL, IPPROTO_RAW, and AF_PACKET. +// TODO(b/129292371): IP_HDRINCL and AF_PACKET. func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + return newEndpoint(stack, netProto, transProto, waiterQueue, true /* associated */) +} + +func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, *tcpip.Error) { if netProto != header.IPv4ProtocolNumber { return nil, tcpip.ErrUnknownProtocol } @@ -110,6 +115,16 @@ func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, trans waiterQueue: waiterQueue, rcvBufSizeMax: 32 * 1024, sndBufSize: 32 * 1024, + associated: associated, + } + + // Unassociated endpoints are write-only and users call Write() with IP + // headers included. Because they're write-only, We don't need to + // register with the stack. + if !associated { + ep.rcvBufSizeMax = 0 + ep.waiterQueue = nil + return ep, nil } if err := ep.stack.RegisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep); err != nil { @@ -124,7 +139,7 @@ func (ep *endpoint) Close() { ep.mu.Lock() defer ep.mu.Unlock() - if ep.closed { + if ep.closed || !ep.associated { return } @@ -142,8 +157,11 @@ func (ep *endpoint) Close() { if ep.connected { ep.route.Release() + ep.connected = false } + ep.closed = true + ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) } @@ -152,6 +170,10 @@ func (ep *endpoint) ModerateRecvBuf(copied int) {} // Read implements tcpip.Endpoint.Read. func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { + if !ep.associated { + return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidOptionValue + } + ep.rcvMu.Lock() // If there's no data to read, return that read would block or that the @@ -192,6 +214,33 @@ func (ep *endpoint) Write(payload tcpip.Payload, opts tcpip.WriteOptions) (uintp return 0, nil, tcpip.ErrInvalidEndpointState } + payloadBytes, err := payload.Get(payload.Size()) + if err != nil { + ep.mu.RUnlock() + return 0, nil, err + } + + // If this is an unassociated socket and callee provided a nonzero + // destination address, route using that address. + if !ep.associated { + ip := header.IPv4(payloadBytes) + if !ip.IsValid(payload.Size()) { + ep.mu.RUnlock() + return 0, nil, tcpip.ErrInvalidOptionValue + } + dstAddr := ip.DestinationAddress() + // Update dstAddr with the address in the IP header, unless + // opts.To is set (e.g. if sendto specifies a specific + // address). + if dstAddr != tcpip.Address([]byte{0, 0, 0, 0}) && opts.To == nil { + opts.To = &tcpip.FullAddress{ + NIC: 0, // NIC is unset. + Addr: dstAddr, // The address from the payload. + Port: 0, // There are no ports here. + } + } + } + // Did the user caller provide a destination? If not, use the connected // destination. if opts.To == nil { @@ -216,12 +265,12 @@ func (ep *endpoint) Write(payload tcpip.Payload, opts tcpip.WriteOptions) (uintp return 0, nil, tcpip.ErrInvalidEndpointState } - n, ch, err := ep.finishWrite(payload, savedRoute) + n, ch, err := ep.finishWrite(payloadBytes, savedRoute) ep.mu.Unlock() return n, ch, err } - n, ch, err := ep.finishWrite(payload, &ep.route) + n, ch, err := ep.finishWrite(payloadBytes, &ep.route) ep.mu.RUnlock() return n, ch, err } @@ -248,7 +297,7 @@ func (ep *endpoint) Write(payload tcpip.Payload, opts tcpip.WriteOptions) (uintp return 0, nil, err } - n, ch, err := ep.finishWrite(payload, &route) + n, ch, err := ep.finishWrite(payloadBytes, &route) route.Release() ep.mu.RUnlock() return n, ch, err @@ -256,7 +305,7 @@ func (ep *endpoint) Write(payload tcpip.Payload, opts tcpip.WriteOptions) (uintp // finishWrite writes the payload to a route. It resolves the route if // necessary. It's really just a helper to make defer unnecessary in Write. -func (ep *endpoint) finishWrite(payload tcpip.Payload, route *stack.Route) (uintptr, <-chan struct{}, *tcpip.Error) { +func (ep *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (uintptr, <-chan struct{}, *tcpip.Error) { // We may need to resolve the route (match a link layer address to the // network address). If that requires blocking (e.g. to use ARP), // return a channel on which the caller can wait. @@ -269,13 +318,14 @@ func (ep *endpoint) finishWrite(payload tcpip.Payload, route *stack.Route) (uint } } - payloadBytes, err := payload.Get(payload.Size()) - if err != nil { - return 0, nil, err - } - switch ep.netProto { case header.IPv4ProtocolNumber: + if !ep.associated { + if err := route.WriteHeaderIncludedPacket(buffer.View(payloadBytes).ToVectorisedView()); err != nil { + return 0, nil, err + } + break + } hdr := buffer.NewPrependable(len(payloadBytes) + int(route.MaxHeaderLength())) if err := route.WritePacket(nil /* gso */, hdr, buffer.View(payloadBytes).ToVectorisedView(), ep.transProto, route.DefaultTTL()); err != nil { return 0, nil, err @@ -298,6 +348,11 @@ func (ep *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { ep.mu.Lock() defer ep.mu.Unlock() + if addr.Addr == "" { + // AF_UNSPEC isn't supported. + return tcpip.ErrAddressFamilyNotSupported + } + if ep.closed { return tcpip.ErrInvalidEndpointState } @@ -330,15 +385,17 @@ func (ep *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { } defer route.Release() - // Re-register the endpoint with the appropriate NIC. - if err := ep.stack.RegisterRawTransportEndpoint(addr.NIC, ep.netProto, ep.transProto, ep); err != nil { - return err + if ep.associated { + // Re-register the endpoint with the appropriate NIC. + if err := ep.stack.RegisterRawTransportEndpoint(addr.NIC, ep.netProto, ep.transProto, ep); err != nil { + return err + } + ep.stack.UnregisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep) + ep.registeredNIC = nic } - ep.stack.UnregisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep) - // Save the route and NIC we've connected via. + // Save the route we've connected via. ep.route = route.Clone() - ep.registeredNIC = nic ep.connected = true return nil @@ -381,14 +438,16 @@ func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error { return tcpip.ErrBadLocalAddress } - // Re-register the endpoint with the appropriate NIC. - if err := ep.stack.RegisterRawTransportEndpoint(addr.NIC, ep.netProto, ep.transProto, ep); err != nil { - return err + if ep.associated { + // Re-register the endpoint with the appropriate NIC. + if err := ep.stack.RegisterRawTransportEndpoint(addr.NIC, ep.netProto, ep.transProto, ep); err != nil { + return err + } + ep.stack.UnregisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep) + ep.registeredNIC = addr.NIC + ep.boundNIC = addr.NIC } - ep.stack.UnregisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep) - ep.registeredNIC = addr.NIC - ep.boundNIC = addr.NIC ep.boundAddr = addr.Addr ep.bound = true @@ -428,6 +487,23 @@ func (ep *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { return nil } +// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. +func (ep *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) { + switch opt { + case tcpip.ReceiveQueueSizeOption: + v := 0 + ep.rcvMu.Lock() + if !ep.rcvList.Empty() { + p := ep.rcvList.Front() + v = p.data.Size() + } + ep.rcvMu.Unlock() + return v, nil + } + + return -1, tcpip.ErrUnknownProtocolOption +} + // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { switch o := opt.(type) { @@ -446,17 +522,6 @@ func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { ep.rcvMu.Unlock() return nil - case *tcpip.ReceiveQueueSizeOption: - ep.rcvMu.Lock() - if ep.rcvList.Empty() { - *o = 0 - } else { - p := ep.rcvList.Front() - *o = tcpip.ReceiveQueueSizeOption(p.data.Size()) - } - ep.rcvMu.Unlock() - return nil - case *tcpip.KeepaliveEnabledOption: *o = 0 return nil diff --git a/pkg/tcpip/transport/raw/protocol.go b/pkg/tcpip/transport/raw/protocol.go new file mode 100644 index 000000000..783c21e6b --- /dev/null +++ b/pkg/tcpip/transport/raw/protocol.go @@ -0,0 +1,32 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package raw + +import ( + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/waiter" +) + +type factory struct{} + +// NewUnassociatedRawEndpoint implements stack.UnassociatedEndpointFactory. +func (factory) NewUnassociatedRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + return newEndpoint(stack, netProto, transProto, waiterQueue, false /* associated */) +} + +func init() { + stack.RegisterUnassociatedFactory(factory{}) +} diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index ee60ebf58..89154391b 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -117,6 +117,7 @@ const ( notifyDrain notifyReset notifyKeepaliveChanged + notifyMSSChanged ) // SACKInfo holds TCP SACK related information for a given endpoint. @@ -218,8 +219,6 @@ type endpoint struct { mu sync.RWMutex `state:"nosave"` id stack.TransportEndpointID - // state endpointState `state:".(endpointState)"` - // pState ProtocolState state EndpointState `state:".(EndpointState)"` isPortReserved bool `state:"manual"` @@ -313,6 +312,10 @@ type endpoint struct { // in SYN-RCVD state. synRcvdCount int + // userMSS if non-zero is the MSS value explicitly set by the user + // for this endpoint using the TCP_MAXSEG setsockopt. + userMSS int + // The following fields are used to manage the send buffer. When // segments are ready to be sent, they are added to sndQueue and the // protocol goroutine is signaled via sndWaker. @@ -917,6 +920,17 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { } return nil + case tcpip.MaxSegOption: + userMSS := v + if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS { + return tcpip.ErrInvalidOptionValue + } + e.mu.Lock() + e.userMSS = int(userMSS) + e.mu.Unlock() + e.notifyProtocolGoroutine(notifyMSSChanged) + return nil + case tcpip.ReceiveBufferSizeOption: // Make sure the receive buffer size is within the min and max // allowed. @@ -1086,6 +1100,15 @@ func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) { return e.rcvBufUsed, nil } +// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. +func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) { + switch opt { + case tcpip.ReceiveQueueSizeOption: + return e.readyReceiveSize() + } + return -1, tcpip.ErrUnknownProtocolOption +} + // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { switch o := opt.(type) { @@ -1096,6 +1119,14 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { e.lastErrorMu.Unlock() return err + case *tcpip.MaxSegOption: + // This is just stubbed out. Linux never returns the user_mss + // value as it either returns the defaultMSS or returns the + // actual current MSS. Netstack just returns the defaultMSS + // always for now. + *o = header.TCPDefaultMSS + return nil + case *tcpip.SendBufferSizeOption: e.sndBufMu.Lock() *o = tcpip.SendBufferSizeOption(e.sndBufSize) @@ -1108,15 +1139,6 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { e.rcvListMu.Unlock() return nil - case *tcpip.ReceiveQueueSizeOption: - v, err := e.readyReceiveSize() - if err != nil { - return err - } - - *o = tcpip.ReceiveQueueSizeOption(v) - return nil - case *tcpip.DelayOption: *o = 0 if v := atomic.LoadUint32(&e.delay); v != 0 { @@ -1271,6 +1293,11 @@ func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocol // Connect connects the endpoint to its peer. func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { + if addr.Addr == "" && addr.Port == 0 { + // AF_UNSPEC isn't supported. + return tcpip.ErrAddressFamilyNotSupported + } + return e.connect(addr, true, true) } diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index ec61a3886..b93959034 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -342,6 +342,7 @@ func loadError(s string) *tcpip.Error { tcpip.ErrNoBufferSpace, tcpip.ErrBroadcastDisabled, tcpip.ErrNotPermitted, + tcpip.ErrAddressFamilyNotSupported, } messageToError = make(map[string]*tcpip.Error) diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index 630dd7925..bcc0f3e28 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -271,7 +271,7 @@ func (c *Context) GetPacketNonBlocking() []byte { // SendICMPPacket builds and sends an ICMPv4 packet via the link layer endpoint. func (c *Context) SendICMPPacket(typ header.ICMPv4Type, code uint8, p1, p2 []byte, maxTotalSize int) { // Allocate a buffer data and headers. - buf := buffer.NewView(header.IPv4MinimumSize + header.ICMPv4MinimumSize + len(p1) + len(p2)) + buf := buffer.NewView(header.IPv4MinimumSize + header.ICMPv4PayloadOffset + len(p1) + len(p2)) if len(buf) > maxTotalSize { buf = buf[:maxTotalSize] } @@ -291,8 +291,8 @@ func (c *Context) SendICMPPacket(typ header.ICMPv4Type, code uint8, p1, p2 []byt icmp.SetType(typ) icmp.SetCode(code) - copy(icmp[header.ICMPv4MinimumSize:], p1) - copy(icmp[header.ICMPv4MinimumSize+len(p1):], p2) + copy(icmp[header.ICMPv4PayloadOffset:], p1) + copy(icmp[header.ICMPv4PayloadOffset+len(p1):], p2) // Inject packet. c.linkEP.Inject(ipv4.ProtocolNumber, buf.ToVectorisedView()) diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 99fdfb795..91f89a781 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -189,7 +189,6 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess p := e.rcvList.Front() e.rcvList.Remove(p) e.rcvBufSize -= p.data.Size() - e.rcvMu.Unlock() if addr != nil { @@ -253,7 +252,7 @@ func (e *endpoint) connectRoute(nicid tcpip.NICID, addr tcpip.FullAddress) (stac if nicid == 0 { nicid = e.multicastNICID } - if localAddr == "" { + if localAddr == "" && nicid == 0 { localAddr = e.multicastAddr } } @@ -539,6 +538,22 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { return nil } +// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. +func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) { + switch opt { + case tcpip.ReceiveQueueSizeOption: + v := 0 + e.rcvMu.Lock() + if !e.rcvList.Empty() { + p := e.rcvList.Front() + v = p.data.Size() + } + e.rcvMu.Unlock() + return v, nil + } + return -1, tcpip.ErrUnknownProtocolOption +} + // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { switch o := opt.(type) { @@ -573,17 +588,6 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { } return nil - case *tcpip.ReceiveQueueSizeOption: - e.rcvMu.Lock() - if e.rcvList.Empty() { - *o = 0 - } else { - p := e.rcvList.Front() - *o = tcpip.ReceiveQueueSizeOption(p.data.Size()) - } - e.rcvMu.Unlock() - return nil - case *tcpip.MulticastTTLOption: e.mu.Lock() *o = tcpip.MulticastTTLOption(e.multicastTTL) @@ -671,6 +675,9 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) { netProto := e.netProto + if len(addr.Addr) == 0 { + return netProto, nil + } if header.IsV4MappedAddress(addr.Addr) { // Fail if using a v4 mapped address on a v6only endpoint. if e.v6only { @@ -698,8 +705,44 @@ func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (t return netProto, nil } +func (e *endpoint) disconnect() *tcpip.Error { + e.mu.Lock() + defer e.mu.Unlock() + + if e.state != stateConnected { + return nil + } + id := stack.TransportEndpointID{} + // Exclude ephemerally bound endpoints. + if e.bindNICID != 0 || e.id.LocalAddress == "" { + var err *tcpip.Error + id = stack.TransportEndpointID{ + LocalPort: e.id.LocalPort, + LocalAddress: e.id.LocalAddress, + } + id, err = e.registerWithStack(e.regNICID, e.effectiveNetProtos, id) + if err != nil { + return err + } + e.state = stateBound + } else { + e.state = stateInitial + } + + e.stack.UnregisterTransportEndpoint(e.regNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e) + e.id = id + e.route.Release() + e.route = stack.Route{} + e.dstPort = 0 + + return nil +} + // Connect connects the endpoint to its peer. Specifying a NIC is optional. func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { + if addr.Addr == "" { + return e.disconnect() + } if addr.Port == 0 { // We don't support connecting to port zero. return tcpip.ErrInvalidEndpointState @@ -734,12 +777,16 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { defer r.Release() id := stack.TransportEndpointID{ - LocalAddress: r.LocalAddress, + LocalAddress: e.id.LocalAddress, LocalPort: localPort, RemotePort: addr.Port, RemoteAddress: r.RemoteAddress, } + if e.state == stateInitial { + id.LocalAddress = r.LocalAddress + } + // Even if we're connected, this endpoint can still be used to send // packets on a different network protocol, so we register both even if // v6only is set to false and this is an ipv6 endpoint. diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index 701bdd72b..18e786397 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -92,8 +92,6 @@ func (e *endpoint) afterLoad() { if err != nil { panic(*err) } - - e.id.LocalAddress = e.route.LocalAddress } else if len(e.id.LocalAddress) != 0 { // stateBound if e.stack.CheckLocalAddress(e.regNICID, netProto, e.id.LocalAddress) == 0 { panic(tcpip.ErrBadLocalAddress) diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 75129a2ff..958d5712e 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -847,9 +847,7 @@ func TestWriteIncrementsPacketsSent(t *testing.T) { } } -func TestTTL(t *testing.T) { - payload := tcpip.SlicePayload(buffer.View(newPayload())) - +func setSockOptVariants(t *testing.T, optFunc func(*testing.T, string, tcpip.NetworkProtocolNumber, string)) { for _, name := range []string{"v4", "v6", "dual"} { t.Run(name, func(t *testing.T) { var networkProtocolNumber tcpip.NetworkProtocolNumber @@ -874,134 +872,219 @@ func TestTTL(t *testing.T) { for _, variant := range variants { t.Run(variant, func(t *testing.T) { - for _, typ := range []string{"unicast", "multicast"} { - t.Run(typ, func(t *testing.T) { - var addr tcpip.Address - var port uint16 - switch typ { - case "unicast": - port = testPort - switch variant { - case "v4": - addr = testAddr - case "mapped": - addr = testV4MappedAddr - case "v6": - addr = testV6Addr - default: - t.Fatal("unknown test variant") - } - case "multicast": - port = multicastPort - switch variant { - case "v4": - addr = multicastAddr - case "mapped": - addr = multicastV4MappedAddr - case "v6": - addr = multicastV6Addr - default: - t.Fatal("unknown test variant") - } - default: - t.Fatal("unknown test variant") - } - - c := newDualTestContext(t, defaultMTU) - defer c.cleanup() - - var err *tcpip.Error - c.ep, err = c.s.NewEndpoint(udp.ProtocolNumber, networkProtocolNumber, &c.wq) - if err != nil { - c.t.Fatalf("NewEndpoint failed: %v", err) - } - - switch name { - case "v4": - case "v6": - if err := c.ep.SetSockOpt(tcpip.V6OnlyOption(1)); err != nil { - c.t.Fatalf("SetSockOpt failed: %v", err) - } - case "dual": - if err := c.ep.SetSockOpt(tcpip.V6OnlyOption(0)); err != nil { - c.t.Fatalf("SetSockOpt failed: %v", err) - } - default: - t.Fatal("unknown test variant") - } + optFunc(t, name, networkProtocolNumber, variant) + }) + } + }) + } +} - const multicastTTL = 42 - if err := c.ep.SetSockOpt(tcpip.MulticastTTLOption(multicastTTL)); err != nil { - c.t.Fatalf("SetSockOpt failed: %v", err) - } +func TestTTL(t *testing.T) { + payload := tcpip.SlicePayload(buffer.View(newPayload())) - n, _, err := c.ep.Write(payload, tcpip.WriteOptions{To: &tcpip.FullAddress{Addr: addr, Port: port}}) - if err != nil { - c.t.Fatalf("Write failed: %v", err) - } - if n != uintptr(len(payload)) { - c.t.Fatalf("got c.ep.Write(...) = %d, want = %d", n, len(payload)) - } + setSockOptVariants(t, func(t *testing.T, name string, networkProtocolNumber tcpip.NetworkProtocolNumber, variant string) { + for _, typ := range []string{"unicast", "multicast"} { + t.Run(typ, func(t *testing.T) { + var addr tcpip.Address + var port uint16 + switch typ { + case "unicast": + port = testPort + switch variant { + case "v4": + addr = testAddr + case "mapped": + addr = testV4MappedAddr + case "v6": + addr = testV6Addr + default: + t.Fatal("unknown test variant") + } + case "multicast": + port = multicastPort + switch variant { + case "v4": + addr = multicastAddr + case "mapped": + addr = multicastV4MappedAddr + case "v6": + addr = multicastV6Addr + default: + t.Fatal("unknown test variant") + } + default: + t.Fatal("unknown test variant") + } + + c := newDualTestContext(t, defaultMTU) + defer c.cleanup() + + var err *tcpip.Error + c.ep, err = c.s.NewEndpoint(udp.ProtocolNumber, networkProtocolNumber, &c.wq) + if err != nil { + c.t.Fatalf("NewEndpoint failed: %v", err) + } + + switch name { + case "v4": + case "v6": + if err := c.ep.SetSockOpt(tcpip.V6OnlyOption(1)); err != nil { + c.t.Fatalf("SetSockOpt failed: %v", err) + } + case "dual": + if err := c.ep.SetSockOpt(tcpip.V6OnlyOption(0)); err != nil { + c.t.Fatalf("SetSockOpt failed: %v", err) + } + default: + t.Fatal("unknown test variant") + } + + const multicastTTL = 42 + if err := c.ep.SetSockOpt(tcpip.MulticastTTLOption(multicastTTL)); err != nil { + c.t.Fatalf("SetSockOpt failed: %v", err) + } + + n, _, err := c.ep.Write(payload, tcpip.WriteOptions{To: &tcpip.FullAddress{Addr: addr, Port: port}}) + if err != nil { + c.t.Fatalf("Write failed: %v", err) + } + if n != uintptr(len(payload)) { + c.t.Fatalf("got c.ep.Write(...) = %d, want = %d", n, len(payload)) + } + + checkerFn := checker.IPv4 + switch variant { + case "v4", "mapped": + case "v6": + checkerFn = checker.IPv6 + default: + t.Fatal("unknown test variant") + } + var wantTTL uint8 + var multicast bool + switch typ { + case "unicast": + multicast = false + switch variant { + case "v4", "mapped": + ep, err := ipv4.NewProtocol().NewEndpoint(0, "", nil, nil, nil) + if err != nil { + t.Fatal(err) + } + wantTTL = ep.DefaultTTL() + ep.Close() + case "v6": + ep, err := ipv6.NewProtocol().NewEndpoint(0, "", nil, nil, nil) + if err != nil { + t.Fatal(err) + } + wantTTL = ep.DefaultTTL() + ep.Close() + default: + t.Fatal("unknown test variant") + } + case "multicast": + wantTTL = multicastTTL + multicast = true + default: + t.Fatal("unknown test variant") + } + + var networkProtocolNumber tcpip.NetworkProtocolNumber + switch variant { + case "v4", "mapped": + networkProtocolNumber = ipv4.ProtocolNumber + case "v6": + networkProtocolNumber = ipv6.ProtocolNumber + default: + t.Fatal("unknown test variant") + } + + b := c.getPacket(networkProtocolNumber, multicast) + checkerFn(c.t, b, + checker.TTL(wantTTL), + checker.UDP( + checker.DstPort(port), + ), + ) + }) + } + }) +} - checkerFn := checker.IPv4 - switch variant { - case "v4", "mapped": - case "v6": - checkerFn = checker.IPv6 - default: - t.Fatal("unknown test variant") - } - var wantTTL uint8 - var multicast bool - switch typ { - case "unicast": - multicast = false - switch variant { - case "v4", "mapped": - ep, err := ipv4.NewProtocol().NewEndpoint(0, "", nil, nil, nil) - if err != nil { - t.Fatal(err) - } - wantTTL = ep.DefaultTTL() - ep.Close() - case "v6": - ep, err := ipv6.NewProtocol().NewEndpoint(0, "", nil, nil, nil) - if err != nil { - t.Fatal(err) - } - wantTTL = ep.DefaultTTL() - ep.Close() - default: - t.Fatal("unknown test variant") - } - case "multicast": - wantTTL = multicastTTL - multicast = true - default: - t.Fatal("unknown test variant") +func TestMulticastInterfaceOption(t *testing.T) { + setSockOptVariants(t, func(t *testing.T, name string, networkProtocolNumber tcpip.NetworkProtocolNumber, variant string) { + for _, bindTyp := range []string{"bound", "unbound"} { + t.Run(bindTyp, func(t *testing.T) { + for _, optTyp := range []string{"use local-addr", "use NICID", "use local-addr and NIC"} { + t.Run(optTyp, func(t *testing.T) { + var mcastAddr, localIfAddr tcpip.Address + switch variant { + case "v4": + mcastAddr = multicastAddr + localIfAddr = stackAddr + case "mapped": + mcastAddr = multicastV4MappedAddr + localIfAddr = stackAddr + case "v6": + mcastAddr = multicastV6Addr + localIfAddr = stackV6Addr + default: + t.Fatal("unknown test variant") + } + + var ifoptSet tcpip.MulticastInterfaceOption + switch optTyp { + case "use local-addr": + ifoptSet.InterfaceAddr = localIfAddr + case "use NICID": + ifoptSet.NIC = 1 + case "use local-addr and NIC": + ifoptSet.InterfaceAddr = localIfAddr + ifoptSet.NIC = 1 + default: + t.Fatal("unknown test variant") + } + + c := newDualTestContext(t, defaultMTU) + defer c.cleanup() + + var err *tcpip.Error + c.ep, err = c.s.NewEndpoint(udp.ProtocolNumber, networkProtocolNumber, &c.wq) + if err != nil { + c.t.Fatalf("NewEndpoint failed: %v", err) + } + + if bindTyp == "bound" { + // Bind the socket by connecting to the multicast address. + // This may have an influence on how the multicast interface + // is set. + addr := tcpip.FullAddress{ + Addr: mcastAddr, + Port: multicastPort, } - - var networkProtocolNumber tcpip.NetworkProtocolNumber - switch variant { - case "v4", "mapped": - networkProtocolNumber = ipv4.ProtocolNumber - case "v6": - networkProtocolNumber = ipv6.ProtocolNumber - default: - t.Fatal("unknown test variant") + if err := c.ep.Connect(addr); err != nil { + c.t.Fatalf("Connect failed: %v", err) } - - b := c.getPacket(networkProtocolNumber, multicast) - checkerFn(c.t, b, - checker.TTL(wantTTL), - checker.UDP( - checker.DstPort(port), - ), - ) - }) - } - }) - } - }) - } + } + + if err := c.ep.SetSockOpt(ifoptSet); err != nil { + c.t.Fatalf("SetSockOpt failed: %v", err) + } + + // Verify multicast interface addr and NIC were set correctly. + // Note that NIC must be 1 since this is our outgoing interface. + ifoptWant := tcpip.MulticastInterfaceOption{NIC: 1, InterfaceAddr: ifoptSet.InterfaceAddr} + var ifoptGot tcpip.MulticastInterfaceOption + if err := c.ep.GetSockOpt(&ifoptGot); err != nil { + c.t.Fatalf("GetSockOpt failed: %v", err) + } + if ifoptGot != ifoptWant { + c.t.Errorf("got GetSockOpt() = %#v, want = %#v", ifoptGot, ifoptWant) + } + }) + } + }) + } + }) } diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go index be6b4f95b..df59ffab1 100644 --- a/pkg/urpc/urpc.go +++ b/pkg/urpc/urpc.go @@ -72,7 +72,7 @@ type FilePayload struct { Files []*os.File `json:"-"` } -// ReleaseFD releases the indexth FD. +// ReleaseFD releases the FD at the specified index. func (f *FilePayload) ReleaseFD(index int) (*fd.FD, error) { return fd.NewFromFile(f.Files[index]) } |