-rw-r--r--  pkg/abi/linux/clone.go | 41
-rw-r--r--  pkg/abi/linux/epoll.go | 56
-rw-r--r--  pkg/abi/linux/fcntl.go | 36
-rw-r--r--  pkg/abi/linux/file.go | 43
-rw-r--r--  pkg/abi/linux/ioctl.go | 25
-rw-r--r--  pkg/abi/linux/ipc.go | 6
-rw-r--r--  pkg/abi/linux/netlink.go | 8
-rw-r--r--  pkg/abi/linux/netlink_route.go | 12
-rw-r--r--  pkg/abi/linux/sched.go | 6
-rw-r--r--  pkg/abi/linux/time.go | 6
-rw-r--r--  pkg/refs/refcounter.go | 164
-rwxr-xr-x  pkg/refs/refs_state_autogen.go | 4
-rw-r--r--  pkg/sentry/control/proc.go | 12
-rw-r--r--  pkg/sentry/fs/ashmem/area.go | 308
-rwxr-xr-x  pkg/sentry/fs/ashmem/ashmem_state_autogen.go | 123
-rw-r--r--  pkg/sentry/fs/ashmem/device.go | 61
-rw-r--r--  pkg/sentry/fs/ashmem/pin_board.go | 127
-rwxr-xr-x  pkg/sentry/fs/ashmem/uint64_range.go | 62
-rwxr-xr-x  pkg/sentry/fs/ashmem/uint64_set.go | 1270
-rw-r--r--  pkg/sentry/fs/binder/binder.go | 260
-rwxr-xr-x  pkg/sentry/fs/binder/binder_state_autogen.go | 40
-rw-r--r--  pkg/sentry/fs/dev/dev.go | 14
-rw-r--r--  pkg/sentry/fs/dev/fs.go | 37
-rw-r--r--  pkg/sentry/fs/dirent.go | 4
-rw-r--r--  pkg/sentry/fs/file.go | 5
-rw-r--r--  pkg/sentry/fs/file_overlay.go | 5
-rw-r--r--  pkg/sentry/fs/gofer/handles.go | 5
-rw-r--r--  pkg/sentry/fs/gofer/path.go | 7
-rw-r--r--  pkg/sentry/fs/gofer/session.go | 7
-rw-r--r--  pkg/sentry/fs/gofer/session_state.go | 1
-rwxr-xr-x  pkg/sentry/fs/host/host_state_autogen.go | 4
-rw-r--r--  pkg/sentry/fs/host/socket.go | 8
-rw-r--r--  pkg/sentry/fs/inode.go | 4
-rw-r--r--  pkg/sentry/fs/inode_overlay.go | 6
-rw-r--r--  pkg/sentry/fs/mount.go | 4
-rw-r--r--  pkg/sentry/fs/mounts.go | 11
-rw-r--r--  pkg/sentry/fs/proc/fds.go | 38
-rw-r--r--  pkg/sentry/fs/proc/task.go | 4
-rw-r--r--  pkg/sentry/fs/tty/terminal.go | 4
-rwxr-xr-x  pkg/sentry/kernel/auth/atomicptr_credentials_unsafe.go (renamed from pkg/sentry/kernel/auth/atomicptr_credentials.go) | 0
-rw-r--r--  pkg/sentry/kernel/epoll/epoll.go | 3
-rw-r--r--  pkg/sentry/kernel/fd_map.go | 364
-rw-r--r--  pkg/sentry/kernel/fd_table.go | 380
-rw-r--r--  pkg/sentry/kernel/fd_table_unsafe.go | 103
-rw-r--r--  pkg/sentry/kernel/fs_context.go | 4
-rwxr-xr-x  pkg/sentry/kernel/futex/atomicptr_bucket_unsafe.go (renamed from pkg/sentry/kernel/futex/atomicptr_bucket.go) | 0
-rw-r--r--  pkg/sentry/kernel/futex/futex.go | 25
-rwxr-xr-x  pkg/sentry/kernel/kdefs/kdefs_state_autogen.go | 4
-rw-r--r--  pkg/sentry/kernel/kernel.go | 76
-rwxr-xr-x  pkg/sentry/kernel/kernel_state_autogen.go | 25
-rwxr-xr-x  pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo_unsafe.go (renamed from pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go) | 3
-rw-r--r--  pkg/sentry/kernel/sessions.go | 16
-rw-r--r--  pkg/sentry/kernel/shm/shm.go | 1
-rw-r--r--  pkg/sentry/kernel/syslog.go | 1
-rw-r--r--  pkg/sentry/kernel/task.go | 65
-rw-r--r--  pkg/sentry/kernel/task_clone.go | 40
-rw-r--r--  pkg/sentry/kernel/task_exec.go | 2
-rw-r--r--  pkg/sentry/kernel/task_exit.go | 4
-rw-r--r--  pkg/sentry/kernel/task_log.go | 2
-rw-r--r--  pkg/sentry/kernel/task_start.go | 13
-rw-r--r--  pkg/sentry/loader/loader.go | 2
-rw-r--r--  pkg/sentry/mm/aio_context.go | 4
-rw-r--r--  pkg/sentry/mm/special_mappable.go | 4
-rw-r--r--  pkg/sentry/platform/kvm/filters.go (renamed from pkg/sentry/kernel/kdefs/kdefs.go) | 25
-rw-r--r--  pkg/sentry/platform/kvm/kvm.go | 14
-rw-r--r--  pkg/sentry/platform/kvm/machine.go | 7
-rw-r--r--  pkg/sentry/platform/platform.go | 28
-rw-r--r--  pkg/sentry/platform/ptrace/filters.go (renamed from pkg/abi/linux/binder.go) | 23
-rw-r--r--  pkg/sentry/platform/ptrace/ptrace.go | 15
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess_linux.go | 4
-rwxr-xr-x  pkg/sentry/platform/ring0/defs_impl.go | 6
-rw-r--r--  pkg/sentry/platform/ring0/kernel_amd64.go | 8
-rw-r--r--  pkg/sentry/socket/control/control.go | 10
-rw-r--r--  pkg/sentry/socket/epsocket/epsocket.go | 25
-rw-r--r--  pkg/sentry/socket/hostinet/socket.go | 9
-rw-r--r--  pkg/sentry/socket/netlink/socket.go | 3
-rw-r--r--  pkg/sentry/socket/rpcinet/socket.go | 8
-rw-r--r--  pkg/sentry/socket/socket.go | 3
-rw-r--r--  pkg/sentry/socket/unix/transport/connectioned.go | 4
-rw-r--r--  pkg/sentry/socket/unix/transport/connectionless.go | 30
-rw-r--r--  pkg/sentry/socket/unix/unix.go | 17
-rw-r--r--  pkg/sentry/strace/poll.go | 3
-rw-r--r--  pkg/sentry/strace/socket.go | 2
-rw-r--r--  pkg/sentry/strace/strace.go | 7
-rw-r--r--  pkg/sentry/syscalls/epoll.go | 56
-rw-r--r--  pkg/sentry/syscalls/linux/error.go | 3
-rw-r--r--  pkg/sentry/syscalls/linux/linux64.go | 604
-rw-r--r--  pkg/sentry/syscalls/linux/sigset.go | 4
-rw-r--r--  pkg/sentry/syscalls/linux/sys_aio.go | 9
-rw-r--r--  pkg/sentry/syscalls/linux/sys_epoll.go | 33
-rw-r--r--  pkg/sentry/syscalls/linux/sys_eventfd.go | 10
-rw-r--r--  pkg/sentry/syscalls/linux/sys_file.go | 220
-rw-r--r--  pkg/sentry/syscalls/linux/sys_getdents.go | 29
-rw-r--r--  pkg/sentry/syscalls/linux/sys_inotify.go | 28
-rw-r--r--  pkg/sentry/syscalls/linux/sys_lseek.go | 5
-rw-r--r--  pkg/sentry/syscalls/linux/sys_mmap.go | 5
-rw-r--r--  pkg/sentry/syscalls/linux/sys_pipe.go | 30
-rw-r--r--  pkg/sentry/syscalls/linux/sys_poll.go | 5
-rw-r--r--  pkg/sentry/syscalls/linux/sys_prctl.go | 35
-rw-r--r--  pkg/sentry/syscalls/linux/sys_read.go | 21
-rw-r--r--  pkg/sentry/syscalls/linux/sys_sched.go | 23
-rw-r--r--  pkg/sentry/syscalls/linux/sys_seccomp.go | 9
-rw-r--r--  pkg/sentry/syscalls/linux/sys_socket.go | 213
-rw-r--r--  pkg/sentry/syscalls/linux/sys_splice.go | 25
-rw-r--r--  pkg/sentry/syscalls/linux/sys_stat.go | 17
-rw-r--r--  pkg/sentry/syscalls/linux/sys_sync.go | 17
-rw-r--r--  pkg/sentry/syscalls/linux/sys_thread.go | 94
-rw-r--r--  pkg/sentry/syscalls/linux/sys_timer.go | 6
-rw-r--r--  pkg/sentry/syscalls/linux/sys_timerfd.go | 13
-rw-r--r--  pkg/sentry/syscalls/linux/sys_tls.go | 7
-rw-r--r--  pkg/sentry/syscalls/linux/sys_write.go | 21
-rw-r--r--  pkg/sentry/syscalls/linux/timespec.go | 7
-rw-r--r--  pkg/sentry/syscalls/syscalls.go | 22
-rwxr-xr-x  pkg/sentry/time/seqatomic_parameters_unsafe.go (renamed from pkg/sentry/time/seqatomic_parameters.go) | 3
-rw-r--r--  pkg/syserr/netstack.go | 75
-rw-r--r--  pkg/syserror/syserror.go | 2
-rw-r--r--  pkg/tcpip/tcpip.go | 79
-rw-r--r--  pkg/tcpip/transport/icmp/endpoint.go | 5
-rw-r--r--  pkg/tcpip/transport/raw/endpoint.go | 5
-rw-r--r--  pkg/tcpip/transport/tcp/endpoint.go | 5
-rw-r--r--  pkg/tcpip/transport/tcp/endpoint_state.go | 1
-rw-r--r--  pkg/tcpip/transport/udp/endpoint.go | 42
-rw-r--r--  pkg/tcpip/transport/udp/endpoint_state.go | 2
-rw-r--r--  runsc/boot/config.go | 38
-rw-r--r--  runsc/boot/fds.go | 27
-rw-r--r--  runsc/boot/filter/config.go | 23
-rw-r--r--  runsc/boot/filter/extra_filters_race.go | 1
-rw-r--r--  runsc/boot/filter/filter.go | 13
-rw-r--r--  runsc/boot/fs.go | 17
-rw-r--r--  runsc/boot/loader.go | 50
-rw-r--r--  runsc/boot/platforms/platforms.go (renamed from pkg/abi/linux/ashmem.go) | 27
-rw-r--r--  runsc/cmd/boot.go | 3
-rw-r--r--  runsc/main.go | 7
-rw-r--r--  runsc/sandbox/sandbox.go | 23
134 files changed, 2229 insertions, 4020 deletions
diff --git a/pkg/abi/linux/clone.go b/pkg/abi/linux/clone.go
new file mode 100644
index 000000000..c2cbfca5e
--- /dev/null
+++ b/pkg/abi/linux/clone.go
@@ -0,0 +1,41 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// Clone constants per clone(2).
+const (
+ CLONE_VM = 0x100
+ CLONE_FS = 0x200
+ CLONE_FILES = 0x400
+ CLONE_SIGHAND = 0x800
+ CLONE_PARENT = 0x8000
+ CLONE_PTRACE = 0x2000
+ CLONE_VFORK = 0x4000
+ CLONE_THREAD = 0x10000
+ CLONE_NEWNS = 0x20000
+ CLONE_SYSVSEM = 0x40000
+ CLONE_SETTLS = 0x80000
+ CLONE_PARENT_SETTID = 0x100000
+ CLONE_CHILD_CLEARTID = 0x200000
+ CLONE_DETACHED = 0x400000
+ CLONE_UNTRACED = 0x800000
+ CLONE_CHILD_SETTID = 0x1000000
+ CLONE_NEWUTS = 0x4000000
+ CLONE_NEWIPC = 0x8000000
+ CLONE_NEWUSER = 0x10000000
+ CLONE_NEWPID = 0x20000000
+ CLONE_NEWNET = 0x40000000
+ CLONE_IO = 0x80000000
+)
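
The CLONE_* values above are plain bit flags that callers OR together and test with masks. A minimal sketch of that usage, assuming only the gvisor.dev/gvisor/pkg/abi/linux import path already used elsewhere in this change; isThreadCreate is a hypothetical helper, not part of the commit:

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
)

// isThreadCreate reports whether a clone(2) flag set looks like a
// pthread-style thread creation: shared address space, filesystem info,
// file descriptors and signal handlers, plus CLONE_THREAD.
func isThreadCreate(flags uint64) bool {
	const mask = linux.CLONE_VM | linux.CLONE_FS | linux.CLONE_FILES |
		linux.CLONE_SIGHAND | linux.CLONE_THREAD
	return flags&mask == mask
}

func main() {
	flags := uint64(linux.CLONE_VM | linux.CLONE_FS | linux.CLONE_FILES |
		linux.CLONE_SIGHAND | linux.CLONE_THREAD | linux.CLONE_SETTLS)
	fmt.Println(isThreadCreate(flags)) // true
}
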
diff --git a/pkg/abi/linux/epoll.go b/pkg/abi/linux/epoll.go
new file mode 100644
index 000000000..72083b604
--- /dev/null
+++ b/pkg/abi/linux/epoll.go
@@ -0,0 +1,56 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// EpollEvent is equivalent to struct epoll_event from epoll(2).
+type EpollEvent struct {
+ Events uint32
+ Fd int32
+ Data int32
+}
+
+// Event masks.
+const (
+ EPOLLIN = 0x1
+ EPOLLPRI = 0x2
+ EPOLLOUT = 0x4
+ EPOLLERR = 0x8
+ EPOLLHUP = 0x10
+ EPOLLRDNORM = 0x40
+ EPOLLRDBAND = 0x80
+ EPOLLWRNORM = 0x100
+ EPOLLWRBAND = 0x200
+ EPOLLMSG = 0x400
+ EPOLLRDHUP = 0x2000
+)
+
+// Per-file descriptor flags.
+const (
+ EPOLLET = 0x80000000
+ EPOLLONESHOT = 0x40000000
+)
+
+// Operation flags.
+const (
+ EPOLL_CLOEXEC = 0x80000
+ EPOLL_NONBLOCK = 0x800
+)
+
+// Control operations.
+const (
+ EPOLL_CTL_ADD = 0x1
+ EPOLL_CTL_DEL = 0x2
+ EPOLL_CTL_MOD = 0x3
+)
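
For orientation, EpollEvent mirrors the C struct's layout on amd64: Events is a bitmask built from the masks above, and Fd/Data together occupy the slot of the 64-bit epoll_data_t union. A minimal sketch, not part of this change (the descriptor number is made up):

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
)

func main() {
	// Watch a descriptor for readability with edge-triggered semantics.
	ev := linux.EpollEvent{
		Events: linux.EPOLLIN | linux.EPOLLET,
		Fd:     3, // hypothetical descriptor number
	}

	if ev.Events&linux.EPOLLIN != 0 {
		fmt.Printf("fd %d registered for EPOLLIN\n", ev.Fd)
	}
}
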
diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go
index 30902a8ca..f78315ebf 100644
--- a/pkg/abi/linux/fcntl.go
+++ b/pkg/abi/linux/fcntl.go
@@ -16,21 +16,39 @@ package linux
// Commands from linux/fcntl.h.
const (
- F_DUPFD = 0
- F_GETFD = 1
- F_GETFL = 3
- F_GETOWN = 9
- F_SETFD = 2
- F_SETFL = 4
- F_SETLK = 6
- F_SETLKW = 7
- F_SETOWN = 8
+ F_DUPFD = 0x0
+ F_GETFD = 0x1
+ F_SETFD = 0x2
+ F_GETFL = 0x3
+ F_SETFL = 0x4
+ F_SETLK = 0x6
+ F_SETLKW = 0x7
+ F_SETOWN = 0x8
+ F_GETOWN = 0x9
F_DUPFD_CLOEXEC = 1024 + 6
F_SETPIPE_SZ = 1024 + 7
F_GETPIPE_SZ = 1024 + 8
)
+// Commands for F_SETLK.
+const (
+ F_RDLCK = 0x0
+ F_WRLCK = 0x1
+ F_UNLCK = 0x2
+)
+
// Flags for fcntl.
const (
FD_CLOEXEC = 00000001
)
+
+// Flock is the lock structure for F_SETLK.
+type Flock struct {
+ Type int16
+ Whence int16
+ _ [4]byte
+ Start int64
+ Len int64
+ Pid int32
+ _ [4]byte
+}
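
As a usage illustration (not part of this change), a caller of fcntl(F_SETLK) fills Flock with one of the lock types above; Start and Len of zero with Whence 0 (SEEK_SET) conventionally describe a whole-file lock:

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
)

func main() {
	// Describe a whole-file read lock, as passed to fcntl(fd, F_SETLK, &lock).
	lock := linux.Flock{
		Type:   linux.F_RDLCK,
		Whence: 0, // SEEK_SET: Start is relative to the beginning of the file
		Start:  0,
		Len:    0, // a length of 0 extends the lock to EOF
	}
	fmt.Printf("type=%d whence=%d start=%d len=%d\n", lock.Type, lock.Whence, lock.Start, lock.Len)
}
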
diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index 426003eb7..f78ffaa82 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -161,23 +161,36 @@ const (
// Stat represents struct stat.
type Stat struct {
- Dev uint64
- Ino uint64
- Nlink uint64
- Mode uint32
- UID uint32
- GID uint32
- X_pad0 int32
- Rdev uint64
- Size int64
- Blksize int64
- Blocks int64
- ATime Timespec
- MTime Timespec
- CTime Timespec
- X_unused [3]int64
+ Dev uint64
+ Ino uint64
+ Nlink uint64
+ Mode uint32
+ UID uint32
+ GID uint32
+ _ int32
+ Rdev uint64
+ Size int64
+ Blksize int64
+ Blocks int64
+ ATime Timespec
+ MTime Timespec
+ CTime Timespec
+ _ [3]int64
}
+// File types.
+const (
+ DT_BLK = 0x6
+ DT_CHR = 0x2
+ DT_DIR = 0x4
+ DT_FIFO = 0x1
+ DT_LNK = 0xa
+ DT_REG = 0x8
+ DT_SOCK = 0xc
+ DT_UNKNOWN = 0x0
+ DT_WHT = 0xe
+)
+
// SizeOfStat is the size of a Stat struct.
var SizeOfStat = binary.Size(Stat{})
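
The DT_* values above are the d_type codes reported in directory entries. A small decoding sketch; dtypeName is a hypothetical helper, not part of this change:

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
)

// dtypeName maps a dirent d_type code to a short human-readable label.
func dtypeName(t uint8) string {
	switch t {
	case linux.DT_REG:
		return "regular file"
	case linux.DT_DIR:
		return "directory"
	case linux.DT_LNK:
		return "symlink"
	case linux.DT_CHR:
		return "character device"
	case linux.DT_BLK:
		return "block device"
	case linux.DT_FIFO:
		return "fifo"
	case linux.DT_SOCK:
		return "socket"
	default:
		return "unknown"
	}
}

func main() {
	fmt.Println(dtypeName(linux.DT_DIR)) // directory
}
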
diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go
index 04bb767dc..0e18db9ef 100644
--- a/pkg/abi/linux/ioctl.go
+++ b/pkg/abi/linux/ioctl.go
@@ -72,28 +72,3 @@ const (
SIOCGMIIPHY = 0x8947
SIOCGMIIREG = 0x8948
)
-
-// ioctl(2) requests provided by uapi/linux/android/binder.h
-const (
- BinderWriteReadIoctl = 0xc0306201
- BinderSetIdleTimeoutIoctl = 0x40086203
- BinderSetMaxThreadsIoctl = 0x40046205
- BinderSetIdlePriorityIoctl = 0x40046206
- BinderSetContextMgrIoctl = 0x40046207
- BinderThreadExitIoctl = 0x40046208
- BinderVersionIoctl = 0xc0046209
-)
-
-// ioctl(2) requests provided by drivers/staging/android/uapi/ashmem.h
-const (
- AshmemSetNameIoctl = 0x41007701
- AshmemGetNameIoctl = 0x81007702
- AshmemSetSizeIoctl = 0x40087703
- AshmemGetSizeIoctl = 0x00007704
- AshmemSetProtMaskIoctl = 0x40087705
- AshmemGetProtMaskIoctl = 0x00007706
- AshmemPinIoctl = 0x40087707
- AshmemUnpinIoctl = 0x40087708
- AshmemGetPinStatusIoctl = 0x00007709
- AshmemPurgeAllCachesIoctl = 0x0000770a
-)
diff --git a/pkg/abi/linux/ipc.go b/pkg/abi/linux/ipc.go
index 2ef8d6cbb..22acd2d43 100644
--- a/pkg/abi/linux/ipc.go
+++ b/pkg/abi/linux/ipc.go
@@ -44,10 +44,10 @@ type IPCPerm struct {
CUID uint32
CGID uint32
Mode uint16
- pad1 uint16
+ _ uint16
Seq uint16
- pad2 uint16
- pad3 uint32
+ _ uint16
+ _ uint32
unused1 uint64
unused2 uint64
}
diff --git a/pkg/abi/linux/netlink.go b/pkg/abi/linux/netlink.go
index 5e718c363..e8b6544b4 100644
--- a/pkg/abi/linux/netlink.go
+++ b/pkg/abi/linux/netlink.go
@@ -41,10 +41,10 @@ const (
// SockAddrNetlink is struct sockaddr_nl, from uapi/linux/netlink.h.
type SockAddrNetlink struct {
- Family uint16
- Padding uint16
- PortID uint32
- Groups uint32
+ Family uint16
+ _ uint16
+ PortID uint32
+ Groups uint32
}
// SockAddrNetlinkSize is the size of SockAddrNetlink.
diff --git a/pkg/abi/linux/netlink_route.go b/pkg/abi/linux/netlink_route.go
index 630dc339a..dd698e2bc 100644
--- a/pkg/abi/linux/netlink_route.go
+++ b/pkg/abi/linux/netlink_route.go
@@ -86,12 +86,12 @@ const (
// InterfaceInfoMessage is struct ifinfomsg, from uapi/linux/rtnetlink.h.
type InterfaceInfoMessage struct {
- Family uint8
- Padding uint8
- Type uint16
- Index int32
- Flags uint32
- Change uint32
+ Family uint8
+ _ uint8
+ Type uint16
+ Index int32
+ Flags uint32
+ Change uint32
}
// Interface flags, from uapi/linux/if.h.
diff --git a/pkg/abi/linux/sched.go b/pkg/abi/linux/sched.go
index 193d9a242..70e820823 100644
--- a/pkg/abi/linux/sched.go
+++ b/pkg/abi/linux/sched.go
@@ -28,3 +28,9 @@ const (
// reverted back to SCHED_NORMAL on fork.
SCHED_RESET_ON_FORK = 0x40000000
)
+
+const (
+ PRIO_PGRP = 0x1
+ PRIO_PROCESS = 0x0
+ PRIO_USER = 0x2
+)
diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go
index e727066d7..546668bca 100644
--- a/pkg/abi/linux/time.go
+++ b/pkg/abi/linux/time.go
@@ -241,3 +241,9 @@ func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) {
Nsec: uint32(nsec % 1e9),
}
}
+
+// Utime represents struct utimbuf used by utime(2).
+type Utime struct {
+ Actime int64
+ Modtime int64
+}
diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go
index 20f515391..6ecbace9e 100644
--- a/pkg/refs/refcounter.go
+++ b/pkg/refs/refcounter.go
@@ -17,9 +17,14 @@
package refs
import (
+ "bytes"
+ "fmt"
"reflect"
+ "runtime"
"sync"
"sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/log"
)
// RefCounter is the interface to be implemented by objects that are reference
@@ -189,6 +194,16 @@ type AtomicRefCount struct {
// how these fields are used.
refCount int64
+ // name is the name of the type which owns this ref count.
+ //
+ // name is immutable after EnableLeakCheck is called.
+ name string
+
+ // stack optionally records the caller of EnableLeakCheck.
+ //
+ // stack is immutable after EnableLeakCheck is called.
+ stack []uintptr
+
// mu protects the list below.
mu sync.Mutex `state:"nosave"`
@@ -196,6 +211,148 @@ type AtomicRefCount struct {
weakRefs weakRefList `state:"nosave"`
}
+// LeakMode configures the leak checker.
+type LeakMode uint32
+
+const (
+ // uninitializedLeakChecking indicates that the leak checker has not yet been initialized.
+ uninitializedLeakChecking LeakMode = iota
+
+ // NoLeakChecking indicates that no effort should be made to check for
+ // leaks.
+ NoLeakChecking
+
+ // LeaksLogWarning indicates that a warning should be logged when leaks
+ // are found.
+ LeaksLogWarning
+
+ // LeaksLogTraces indicates that a trace collected during allocation
+ // should be logged when leaks are found.
+ LeaksLogTraces
+)
+
+// leakMode stores the current mode for the reference leak checker.
+//
+// Values must be one of the LeakMode values.
+//
+// leakMode must be accessed atomically.
+var leakMode uint32
+
+// SetLeakMode configures the reference leak checker.
+func SetLeakMode(mode LeakMode) {
+ atomic.StoreUint32(&leakMode, uint32(mode))
+}
+
+const maxStackFrames = 40
+
+type fileLine struct {
+ file string
+ line int
+}
+
+// A stackKey is a representation of a stack frame for use as a map key.
+//
+// The fileLine type is used as PC values seem to vary across collections, even
+// for the same call stack.
+type stackKey [maxStackFrames]fileLine
+
+var stackCache = struct {
+ sync.Mutex
+ entries map[stackKey][]uintptr
+}{entries: map[stackKey][]uintptr{}}
+
+func makeStackKey(pcs []uintptr) stackKey {
+ frames := runtime.CallersFrames(pcs)
+ var key stackKey
+ keySlice := key[:0]
+ for {
+ frame, more := frames.Next()
+ keySlice = append(keySlice, fileLine{frame.File, frame.Line})
+
+ if !more || len(keySlice) == len(key) {
+ break
+ }
+ }
+ return key
+}
+
+func recordStack() []uintptr {
+ pcs := make([]uintptr, maxStackFrames)
+ n := runtime.Callers(1, pcs)
+ if n == 0 {
+ // No pcs available. Stop now.
+ //
+ // This can happen if the first argument to runtime.Callers
+ // is large.
+ return nil
+ }
+ pcs = pcs[:n]
+ key := makeStackKey(pcs)
+ stackCache.Lock()
+ v, ok := stackCache.entries[key]
+ if !ok {
+ // Reallocate to prevent pcs from escaping.
+ v = append([]uintptr(nil), pcs...)
+ stackCache.entries[key] = v
+ }
+ stackCache.Unlock()
+ return v
+}
+
+func formatStack(pcs []uintptr) string {
+ frames := runtime.CallersFrames(pcs)
+ var trace bytes.Buffer
+ for {
+ frame, more := frames.Next()
+ fmt.Fprintf(&trace, "%s:%d: %s\n", frame.File, frame.Line, frame.Function)
+
+ if !more {
+ break
+ }
+ }
+ return trace.String()
+}
+
+func (r *AtomicRefCount) finalize() {
+ var note string
+ switch LeakMode(atomic.LoadUint32(&leakMode)) {
+ case NoLeakChecking:
+ return
+ case uninitializedLeakChecking:
+ note = "(Leak checker uninitialized): "
+ }
+ if n := r.ReadRefs(); n != 0 {
+ msg := fmt.Sprintf("%sAtomicRefCount %p owned by %q garbage collected with ref count of %d (want 0)", note, r, r.name, n)
+ if len(r.stack) != 0 {
+ msg += ":\nCaller:\n" + formatStack(r.stack)
+ }
+ log.Warningf(msg)
+ }
+}
+
+// EnableLeakCheck checks for reference leaks when the AtomicRefCount gets
+// garbage collected.
+//
+// This function adds a finalizer to the AtomicRefCount, so the AtomicRefCount
+// must be at the beginning of its parent.
+//
+// name is a friendly name that will be listed as the owner of the
+// AtomicRefCount in logs. It should be the name of the parent type, including
+// package.
+func (r *AtomicRefCount) EnableLeakCheck(name string) {
+ if name == "" {
+ panic("invalid name")
+ }
+ switch LeakMode(atomic.LoadUint32(&leakMode)) {
+ case NoLeakChecking:
+ return
+ case LeaksLogTraces:
+ r.stack = recordStack()
+ }
+ r.name = name
+ runtime.SetFinalizer(r, (*AtomicRefCount).finalize)
+}
+
// ReadRefs returns the current number of references. The returned count is
// inherently racy and is unsafe to use without external synchronization.
func (r *AtomicRefCount) ReadRefs() int64 {
@@ -208,6 +365,8 @@ func (r *AtomicRefCount) ReadRefs() int64 {
//
// The sanity check here is limited to real references, since if they have
// dropped beneath zero then the object should have been destroyed.
+//
+//go:nosplit
func (r *AtomicRefCount) IncRef() {
if v := atomic.AddInt64(&r.refCount, 1); v <= 0 {
panic("Incrementing non-positive ref count")
@@ -222,6 +381,8 @@ func (r *AtomicRefCount) IncRef() {
// To do this safely without a loop, a speculative reference is first acquired
// on the object. This allows multiple concurrent TryIncRef calls to
// distinguish other TryIncRef calls from genuine references held.
+//
+//go:nosplit
func (r *AtomicRefCount) TryIncRef() bool {
const speculativeRef = 1 << 32
v := atomic.AddInt64(&r.refCount, speculativeRef)
@@ -263,6 +424,7 @@ func (r *AtomicRefCount) dropWeakRef(w *WeakRef) {
// B: DecRef [real decrease]
// A: TryIncRef [transform speculative to real]
//
+//go:nosplit
func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) {
switch v := atomic.AddInt64(&r.refCount, -1); {
case v < -1:
@@ -298,6 +460,8 @@ func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) {
}
// DecRef decrements this object's reference count.
+//
+//go:nosplit
func (r *AtomicRefCount) DecRef() {
r.DecRefWithDestructor(nil)
}
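
Putting the new leak checker together: a reference-counted type embeds AtomicRefCount as its first field, calls EnableLeakCheck with its own type name, and the process configures the mode once via SetLeakMode. A minimal sketch under those assumptions; conn, newConn and "main.conn" are hypothetical and not part of this change:

package main

import "gvisor.dev/gvisor/pkg/refs"

// conn is a hypothetical reference-counted type. Per the EnableLeakCheck
// comment above, the AtomicRefCount must be the first field of its parent,
// since the finalizer is set on the embedded counter.
type conn struct {
	refs.AtomicRefCount

	// ... other fields ...
}

func newConn() *conn {
	c := &conn{}
	// The name should be the parent type, including package, so that leak
	// reports identify the owner.
	c.EnableLeakCheck("main.conn")
	return c
}

func main() {
	// Configure the checker before allocating tracked objects. With
	// LeaksLogTraces, EnableLeakCheck also records the allocation stack.
	refs.SetLeakMode(refs.LeaksLogWarning)

	c := newConn()
	c.IncRef()
	c.DecRef()
	// If c were garbage collected while references were still held, the
	// finalizer installed by EnableLeakCheck would log a warning naming
	// "main.conn".
}
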
diff --git a/pkg/refs/refs_state_autogen.go b/pkg/refs/refs_state_autogen.go
index 959b6603f..1f69f0c0c 100755
--- a/pkg/refs/refs_state_autogen.go
+++ b/pkg/refs/refs_state_autogen.go
@@ -24,11 +24,15 @@ func (x *AtomicRefCount) beforeSave() {}
func (x *AtomicRefCount) save(m state.Map) {
x.beforeSave()
m.Save("refCount", &x.refCount)
+ m.Save("name", &x.name)
+ m.Save("stack", &x.stack)
}
func (x *AtomicRefCount) afterLoad() {}
func (x *AtomicRefCount) load(m state.Map) {
m.Load("refCount", &x.refCount)
+ m.Load("name", &x.name)
+ m.Load("stack", &x.stack)
}
func (x *savedReference) beforeSave() {}
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index 66a506584..6ae60c5cb 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -28,7 +28,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/host"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/usage"
@@ -123,9 +122,8 @@ func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID
// TTYFileOperations that wraps the TTY is also returned.
func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, error) {
// Import file descriptors.
- l := limits.NewLimitSet()
- fdm := proc.Kernel.NewFDMap()
- defer fdm.DecRef()
+ fdTable := proc.Kernel.NewFDTable()
+ defer fdTable.DecRef()
// No matter what happens, we should close all files in the FilePayload
// before returning. Any files that are imported will be duped.
@@ -149,9 +147,9 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
WorkingDirectory: args.WorkingDirectory,
Root: args.Root,
Credentials: creds,
- FDMap: fdm,
+ FDTable: fdTable,
Umask: 0022,
- Limits: l,
+ Limits: limits.NewLimitSet(),
MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
UTSNamespace: proc.Kernel.RootUTSNamespace(),
IPCNamespace: proc.Kernel.RootIPCNamespace(),
@@ -212,7 +210,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
}
// Add the file to the FD map.
- if err := fdm.NewFDAt(kdefs.FD(appFD), appFile, kernel.FDFlags{}, l); err != nil {
+ if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
return nil, 0, nil, err
}
}
diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go
deleted file mode 100644
index 3b8d6ca89..000000000
--- a/pkg/sentry/fs/ashmem/area.go
+++ /dev/null
@@ -1,308 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package ashmem
-
-import (
- "sync"
-
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
- "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
- "gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/usage"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
- "gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/waiter"
-)
-
-const (
- // namePrefix is the name prefix assumed and forced by the Linux implementation.
- namePrefix = "dev/ashmem"
-
- // nameLen is the maximum name length.
- nameLen = 256
-)
-
-// Area implements fs.FileOperations.
-//
-// +stateify savable
-type Area struct {
- fsutil.FileNoFsync `state:"nosave"`
- fsutil.FileNoSplice `state:"nosave"`
- fsutil.FileNoopFlush `state:"nosave"`
- fsutil.FileNotDirReaddir `state:"nosave"`
- fsutil.FileUseInodeUnstableAttr `state:"nosave"`
- waiter.AlwaysReady `state:"nosave"`
-
- ad *Device
-
- // mu protects fields below.
- mu sync.Mutex `state:"nosave"`
- tmpfsFile *fs.File
- name string
- size uint64
- perms usermem.AccessType
- pb *PinBoard
-}
-
-// Release implements fs.FileOperations.Release.
-func (a *Area) Release() {
- a.mu.Lock()
- defer a.mu.Unlock()
- if a.tmpfsFile != nil {
- a.tmpfsFile.DecRef()
- a.tmpfsFile = nil
- }
-}
-
-// Seek implements fs.FileOperations.Seek.
-func (a *Area) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) {
- a.mu.Lock()
- defer a.mu.Unlock()
- if a.size == 0 {
- return 0, syserror.EINVAL
- }
- if a.tmpfsFile == nil {
- return 0, syserror.EBADF
- }
- return a.tmpfsFile.FileOperations.Seek(ctx, file, whence, offset)
-}
-
-// Read implements fs.FileOperations.Read.
-func (a *Area) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
- a.mu.Lock()
- defer a.mu.Unlock()
- if a.size == 0 {
- return 0, nil
- }
- if a.tmpfsFile == nil {
- return 0, syserror.EBADF
- }
- return a.tmpfsFile.FileOperations.Read(ctx, file, dst, offset)
-}
-
-// Write implements fs.FileOperations.Write.
-func (a *Area) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
- return 0, syserror.ENOSYS
-}
-
-// ConfigureMMap implements fs.FileOperations.ConfigureMMap.
-func (a *Area) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error {
- a.mu.Lock()
- defer a.mu.Unlock()
- if a.size == 0 {
- return syserror.EINVAL
- }
-
- if !a.perms.SupersetOf(opts.Perms) {
- return syserror.EPERM
- }
- opts.MaxPerms = opts.MaxPerms.Intersect(a.perms)
-
- if a.tmpfsFile == nil {
- tmpfsInodeOps := tmpfs.NewInMemoryFile(ctx, usage.Tmpfs, fs.UnstableAttr{})
- tmpfsInode := fs.NewInode(ctx, tmpfsInodeOps, fs.NewPseudoMountSource(ctx), fs.StableAttr{})
- dirent := fs.NewDirent(ctx, tmpfsInode, namePrefix+"/"+a.name)
- tmpfsFile, err := tmpfsInode.GetFile(ctx, dirent, fs.FileFlags{Read: true, Write: true})
- // Drop the extra reference on the Dirent.
- dirent.DecRef()
-
- if err != nil {
- return err
- }
-
- // Truncate to the size set by ASHMEM_SET_SIZE ioctl.
- err = tmpfsInodeOps.Truncate(ctx, tmpfsInode, int64(a.size))
- if err != nil {
- return err
- }
- a.tmpfsFile = tmpfsFile
- a.pb = NewPinBoard()
- }
-
- return a.tmpfsFile.ConfigureMMap(ctx, opts)
-}
-
-// Ioctl implements fs.FileOperations.Ioctl.
-func (a *Area) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
- // Switch on ioctl request.
- switch args[1].Uint() {
- case linux.AshmemSetNameIoctl:
- name, err := usermem.CopyStringIn(ctx, io, args[2].Pointer(), nameLen-1, usermem.IOOpts{
- AddressSpaceActive: true,
- })
- if err != nil {
- return 0, err
- }
-
- a.mu.Lock()
- defer a.mu.Unlock()
-
- // Cannot set name for already mapped ashmem.
- if a.tmpfsFile != nil {
- return 0, syserror.EINVAL
- }
- a.name = name
- return 0, nil
-
- case linux.AshmemGetNameIoctl:
- a.mu.Lock()
- var local []byte
- if a.name != "" {
- nameLen := len([]byte(a.name))
- local = make([]byte, nameLen, nameLen+1)
- copy(local, []byte(a.name))
- local = append(local, 0)
- } else {
- nameLen := len([]byte(namePrefix))
- local = make([]byte, nameLen, nameLen+1)
- copy(local, []byte(namePrefix))
- local = append(local, 0)
- }
- a.mu.Unlock()
-
- if _, err := io.CopyOut(ctx, args[2].Pointer(), local, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
- return 0, syserror.EFAULT
- }
- return 0, nil
-
- case linux.AshmemSetSizeIoctl:
- a.mu.Lock()
- defer a.mu.Unlock()
-
- // Cannot set size for already mapped ashmem.
- if a.tmpfsFile != nil {
- return 0, syserror.EINVAL
- }
- a.size = uint64(args[2].SizeT())
- return 0, nil
-
- case linux.AshmemGetSizeIoctl:
- return uintptr(a.size), nil
-
- case linux.AshmemPinIoctl, linux.AshmemUnpinIoctl, linux.AshmemGetPinStatusIoctl:
- // Locking and unlocking is ok since once tmpfsFile is set, it won't be nil again
- // even after unmapping! Unlocking is needed in order to avoid a deadlock on
- // usermem.CopyObjectIn.
-
- // Cannot execute pin-related ioctls before mapping.
- a.mu.Lock()
- if a.tmpfsFile == nil {
- a.mu.Unlock()
- return 0, syserror.EINVAL
- }
- a.mu.Unlock()
-
- var pin linux.AshmemPin
- _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pin, usermem.IOOpts{
- AddressSpaceActive: true,
- })
- if err != nil {
- return 0, syserror.EFAULT
- }
-
- a.mu.Lock()
- defer a.mu.Unlock()
- return a.pinOperation(pin, args[1].Uint())
-
- case linux.AshmemPurgeAllCachesIoctl:
- return 0, nil
-
- case linux.AshmemSetProtMaskIoctl:
- prot := uint64(args[2].ModeT())
- perms := usermem.AccessType{
- Read: prot&linux.PROT_READ != 0,
- Write: prot&linux.PROT_WRITE != 0,
- Execute: prot&linux.PROT_EXEC != 0,
- }
-
- a.mu.Lock()
- defer a.mu.Unlock()
-
- // Can only narrow prot mask.
- if !a.perms.SupersetOf(perms) {
- return 0, syserror.EINVAL
- }
-
- // TODO(b/30946773,gvisor.dev/issue/153): If personality flag
- // READ_IMPLIES_EXEC is set, set PROT_EXEC if PORT_READ is set.
-
- a.perms = perms
- return 0, nil
-
- case linux.AshmemGetProtMaskIoctl:
- return uintptr(a.perms.Prot()), nil
- default:
- // Ioctls irrelevant to Ashmem.
- return 0, syserror.EINVAL
- }
-}
-
-// pinOperation should only be called while holding a.mu.
-func (a *Area) pinOperation(pin linux.AshmemPin, op uint32) (uintptr, error) {
- // Page-align a.size for checks.
- pageAlignedSize, ok := usermem.Addr(a.size).RoundUp()
- if !ok {
- return 0, syserror.EINVAL
- }
- // Len 0 means everything onward.
- if pin.Len == 0 {
- pin.Len = uint32(pageAlignedSize) - pin.Offset
- }
- // Both Offset and Len have to be page-aligned.
- if pin.Offset%uint32(usermem.PageSize) != 0 {
- return 0, syserror.EINVAL
- }
- if pin.Len%uint32(usermem.PageSize) != 0 {
- return 0, syserror.EINVAL
- }
- // Adding Offset and Len must not cause an uint32 overflow.
- if end := pin.Offset + pin.Len; end < pin.Offset {
- return 0, syserror.EINVAL
- }
- // Pin range must not exceed a's size.
- if uint32(pageAlignedSize) < pin.Offset+pin.Len {
- return 0, syserror.EINVAL
- }
- // Handle each operation.
- r := RangeFromAshmemPin(pin)
- switch op {
- case linux.AshmemPinIoctl:
- if a.pb.PinRange(r) {
- return linux.AshmemWasPurged, nil
- }
- return linux.AshmemNotPurged, nil
-
- case linux.AshmemUnpinIoctl:
- // TODO(b/30946773): Implement purge on unpin.
- a.pb.UnpinRange(r)
- return 0, nil
-
- case linux.AshmemGetPinStatusIoctl:
- if a.pb.RangePinnedStatus(r) {
- return linux.AshmemIsPinned, nil
- }
- return linux.AshmemIsUnpinned, nil
-
- default:
- panic("unreachable")
- }
-
-}
diff --git a/pkg/sentry/fs/ashmem/ashmem_state_autogen.go b/pkg/sentry/fs/ashmem/ashmem_state_autogen.go
deleted file mode 100755
index 13defb033..000000000
--- a/pkg/sentry/fs/ashmem/ashmem_state_autogen.go
+++ /dev/null
@@ -1,123 +0,0 @@
-// automatically generated by stateify.
-
-package ashmem
-
-import (
- "gvisor.dev/gvisor/pkg/state"
-)
-
-func (x *Area) beforeSave() {}
-func (x *Area) save(m state.Map) {
- x.beforeSave()
- m.Save("ad", &x.ad)
- m.Save("tmpfsFile", &x.tmpfsFile)
- m.Save("name", &x.name)
- m.Save("size", &x.size)
- m.Save("perms", &x.perms)
- m.Save("pb", &x.pb)
-}
-
-func (x *Area) afterLoad() {}
-func (x *Area) load(m state.Map) {
- m.Load("ad", &x.ad)
- m.Load("tmpfsFile", &x.tmpfsFile)
- m.Load("name", &x.name)
- m.Load("size", &x.size)
- m.Load("perms", &x.perms)
- m.Load("pb", &x.pb)
-}
-
-func (x *Device) beforeSave() {}
-func (x *Device) save(m state.Map) {
- x.beforeSave()
- m.Save("InodeSimpleAttributes", &x.InodeSimpleAttributes)
-}
-
-func (x *Device) afterLoad() {}
-func (x *Device) load(m state.Map) {
- m.Load("InodeSimpleAttributes", &x.InodeSimpleAttributes)
-}
-
-func (x *PinBoard) beforeSave() {}
-func (x *PinBoard) save(m state.Map) {
- x.beforeSave()
- m.Save("Set", &x.Set)
-}
-
-func (x *PinBoard) afterLoad() {}
-func (x *PinBoard) load(m state.Map) {
- m.Load("Set", &x.Set)
-}
-
-func (x *Range) beforeSave() {}
-func (x *Range) save(m state.Map) {
- x.beforeSave()
- m.Save("Start", &x.Start)
- m.Save("End", &x.End)
-}
-
-func (x *Range) afterLoad() {}
-func (x *Range) load(m state.Map) {
- m.Load("Start", &x.Start)
- m.Load("End", &x.End)
-}
-
-func (x *Set) beforeSave() {}
-func (x *Set) save(m state.Map) {
- x.beforeSave()
- var root *SegmentDataSlices = x.saveRoot()
- m.SaveValue("root", root)
-}
-
-func (x *Set) afterLoad() {}
-func (x *Set) load(m state.Map) {
- m.LoadValue("root", new(*SegmentDataSlices), func(y interface{}) { x.loadRoot(y.(*SegmentDataSlices)) })
-}
-
-func (x *node) beforeSave() {}
-func (x *node) save(m state.Map) {
- x.beforeSave()
- m.Save("nrSegments", &x.nrSegments)
- m.Save("parent", &x.parent)
- m.Save("parentIndex", &x.parentIndex)
- m.Save("hasChildren", &x.hasChildren)
- m.Save("keys", &x.keys)
- m.Save("values", &x.values)
- m.Save("children", &x.children)
-}
-
-func (x *node) afterLoad() {}
-func (x *node) load(m state.Map) {
- m.Load("nrSegments", &x.nrSegments)
- m.Load("parent", &x.parent)
- m.Load("parentIndex", &x.parentIndex)
- m.Load("hasChildren", &x.hasChildren)
- m.Load("keys", &x.keys)
- m.Load("values", &x.values)
- m.Load("children", &x.children)
-}
-
-func (x *SegmentDataSlices) beforeSave() {}
-func (x *SegmentDataSlices) save(m state.Map) {
- x.beforeSave()
- m.Save("Start", &x.Start)
- m.Save("End", &x.End)
- m.Save("Values", &x.Values)
-}
-
-func (x *SegmentDataSlices) afterLoad() {}
-func (x *SegmentDataSlices) load(m state.Map) {
- m.Load("Start", &x.Start)
- m.Load("End", &x.End)
- m.Load("Values", &x.Values)
-}
-
-func init() {
- state.Register("ashmem.Area", (*Area)(nil), state.Fns{Save: (*Area).save, Load: (*Area).load})
- state.Register("ashmem.Device", (*Device)(nil), state.Fns{Save: (*Device).save, Load: (*Device).load})
- state.Register("ashmem.PinBoard", (*PinBoard)(nil), state.Fns{Save: (*PinBoard).save, Load: (*PinBoard).load})
- state.Register("ashmem.Range", (*Range)(nil), state.Fns{Save: (*Range).save, Load: (*Range).load})
- state.Register("ashmem.Set", (*Set)(nil), state.Fns{Save: (*Set).save, Load: (*Set).load})
- state.Register("ashmem.node", (*node)(nil), state.Fns{Save: (*node).save, Load: (*node).load})
- state.Register("ashmem.SegmentDataSlices", (*SegmentDataSlices)(nil), state.Fns{Save: (*SegmentDataSlices).save, Load: (*SegmentDataSlices).load})
-}
diff --git a/pkg/sentry/fs/ashmem/device.go b/pkg/sentry/fs/ashmem/device.go
deleted file mode 100644
index 776f54abe..000000000
--- a/pkg/sentry/fs/ashmem/device.go
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package ashmem implements Android ashmem module (Anonymus Shared Memory).
-package ashmem
-
-import (
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
-)
-
-// Device implements fs.InodeOperations.
-//
-// +stateify savable
-type Device struct {
- fsutil.InodeGenericChecker `state:"nosave"`
- fsutil.InodeNoExtendedAttributes `state:"nosave"`
- fsutil.InodeNoopAllocate `state:"nosave"`
- fsutil.InodeNoopRelease `state:"nosave"`
- fsutil.InodeNoopTruncate `state:"nosave"`
- fsutil.InodeNoopWriteOut `state:"nosave"`
- fsutil.InodeNotDirectory `state:"nosave"`
- fsutil.InodeNotMappable `state:"nosave"`
- fsutil.InodeNotSocket `state:"nosave"`
- fsutil.InodeNotSymlink `state:"nosave"`
- fsutil.InodeVirtual `state:"nosave"`
-
- fsutil.InodeSimpleAttributes
-}
-
-var _ fs.InodeOperations = (*Device)(nil)
-
-// NewDevice creates and initializes a Device structure.
-func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) *Device {
- return &Device{
- InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fp, linux.ANON_INODE_FS_MAGIC),
- }
-}
-
-// GetFile implements fs.InodeOperations.GetFile.
-func (ad *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
- return fs.NewFile(ctx, d, flags, &Area{
- ad: ad,
- tmpfsFile: nil,
- perms: usermem.AnyAccess,
- }), nil
-}
diff --git a/pkg/sentry/fs/ashmem/pin_board.go b/pkg/sentry/fs/ashmem/pin_board.go
deleted file mode 100644
index c5400dd94..000000000
--- a/pkg/sentry/fs/ashmem/pin_board.go
+++ /dev/null
@@ -1,127 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package ashmem
-
-import "gvisor.dev/gvisor/pkg/abi/linux"
-
-const maxUint64 = ^uint64(0)
-
-// setFunctions implements segment.Functions generated from segment.Functions for
-// uint64 Key and noValue Value. For more information, see the build file and
-// segment set implementation at pkg/segment/set.go.
-type setFunctions struct{}
-
-// noValue is a type of range attached value, which is irrelevant here.
-type noValue struct{}
-
-// MinKey implements segment.Functions.MinKey.
-func (setFunctions) MinKey() uint64 {
- return 0
-}
-
-// MaxKey implements segment.Functions.MaxKey.
-func (setFunctions) MaxKey() uint64 {
- return maxUint64
-}
-
-// ClearValue implements segment.Functions.ClearValue.
-func (setFunctions) ClearValue(*noValue) {
- return
-}
-
-// Merge implements segment.Functions.Merge.
-func (setFunctions) Merge(Range, noValue, Range, noValue) (noValue, bool) {
- return noValue{}, true
-}
-
-// Split implements segment.Functions.Split.
-func (setFunctions) Split(Range, noValue, uint64) (noValue, noValue) {
- return noValue{}, noValue{}
-}
-
-// PinBoard represents a set of pinned ranges in ashmem.
-//
-// segment.Set is used for implementation where segments represent
-// ranges of pinned bytes, while gaps represent ranges of unpinned
-// bytes. All ranges are page-aligned.
-//
-// +stateify savable
-type PinBoard struct {
- Set
-}
-
-// NewPinBoard creates a new pin board with all pages pinned.
-func NewPinBoard() *PinBoard {
- var pb PinBoard
- pb.PinRange(Range{0, maxUint64})
- return &pb
-}
-
-// PinRange pins all pages in the specified range and returns true
-// if there are any newly pinned pages.
-func (pb *PinBoard) PinRange(r Range) bool {
- pinnedPages := false
- for gap := pb.LowerBoundGap(r.Start); gap.Ok() && gap.Start() < r.End; {
- common := gap.Range().Intersect(r)
- if common.Length() == 0 {
- gap = gap.NextGap()
- continue
- }
- pinnedPages = true
- gap = pb.Insert(gap, common, noValue{}).NextGap()
- }
- return pinnedPages
-}
-
-// UnpinRange unpins all pages in the specified range.
-func (pb *PinBoard) UnpinRange(r Range) {
- for seg := pb.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; {
- common := seg.Range().Intersect(r)
- if common.Length() == 0 {
- seg = seg.NextSegment()
- continue
- }
- seg = pb.RemoveRange(common).NextSegment()
- }
-}
-
-// RangePinnedStatus returns false if there's at least one unpinned page in the
-// specified range.
-func (pb *PinBoard) RangePinnedStatus(r Range) bool {
- for gap := pb.LowerBoundGap(r.Start); gap.Ok() && gap.Start() < r.End; {
- common := gap.Range().Intersect(r)
- if common.Length() == 0 {
- gap = gap.NextGap()
- continue
- }
- return false
- }
- return true
-}
-
-// RangeFromAshmemPin converts ashmem's original pin structure
-// to Range.
-func RangeFromAshmemPin(ap linux.AshmemPin) Range {
- if ap.Len == 0 {
- return Range{
- uint64(ap.Offset),
- maxUint64,
- }
- }
- return Range{
- uint64(ap.Offset),
- uint64(ap.Offset) + uint64(ap.Len),
- }
-}
diff --git a/pkg/sentry/fs/ashmem/uint64_range.go b/pkg/sentry/fs/ashmem/uint64_range.go
deleted file mode 100755
index d71a10b16..000000000
--- a/pkg/sentry/fs/ashmem/uint64_range.go
+++ /dev/null
@@ -1,62 +0,0 @@
-package ashmem
-
-// A Range represents a contiguous range of T.
-//
-// +stateify savable
-type Range struct {
- // Start is the inclusive start of the range.
- Start uint64
-
- // End is the exclusive end of the range.
- End uint64
-}
-
-// WellFormed returns true if r.Start <= r.End. All other methods on a Range
-// require that the Range is well-formed.
-func (r Range) WellFormed() bool {
- return r.Start <= r.End
-}
-
-// Length returns the length of the range.
-func (r Range) Length() uint64 {
- return r.End - r.Start
-}
-
-// Contains returns true if r contains x.
-func (r Range) Contains(x uint64) bool {
- return r.Start <= x && x < r.End
-}
-
-// Overlaps returns true if r and r2 overlap.
-func (r Range) Overlaps(r2 Range) bool {
- return r.Start < r2.End && r2.Start < r.End
-}
-
-// IsSupersetOf returns true if r is a superset of r2; that is, the range r2 is
-// contained within r.
-func (r Range) IsSupersetOf(r2 Range) bool {
- return r.Start <= r2.Start && r.End >= r2.End
-}
-
-// Intersect returns a range consisting of the intersection between r and r2.
-// If r and r2 do not overlap, Intersect returns a range with unspecified
-// bounds, but for which Length() == 0.
-func (r Range) Intersect(r2 Range) Range {
- if r.Start < r2.Start {
- r.Start = r2.Start
- }
- if r.End > r2.End {
- r.End = r2.End
- }
- if r.End < r.Start {
- r.End = r.Start
- }
- return r
-}
-
-// CanSplitAt returns true if it is legal to split a segment spanning the range
-// r at x; that is, splitting at x would produce two ranges, both of which have
-// non-zero length.
-func (r Range) CanSplitAt(x uint64) bool {
- return r.Contains(x) && r.Start < x
-}
diff --git a/pkg/sentry/fs/ashmem/uint64_set.go b/pkg/sentry/fs/ashmem/uint64_set.go
deleted file mode 100755
index a4860175a..000000000
--- a/pkg/sentry/fs/ashmem/uint64_set.go
+++ /dev/null
@@ -1,1270 +0,0 @@
-package ashmem
-
-import (
- "bytes"
- "fmt"
-)
-
-const (
- // minDegree is the minimum degree of an internal node in a Set B-tree.
- //
- // - Any non-root node has at least minDegree-1 segments.
- //
- // - Any non-root internal (non-leaf) node has at least minDegree children.
- //
- // - The root node may have fewer than minDegree-1 segments, but it may
- // only have 0 segments if the tree is empty.
- //
- // Our implementation requires minDegree >= 3. Higher values of minDegree
- // usually improve performance, but increase memory usage for small sets.
- minDegree = 3
-
- maxDegree = 2 * minDegree
-)
-
-// A Set is a mapping of segments with non-overlapping Range keys. The zero
-// value for a Set is an empty set. Set values are not safely movable nor
-// copyable. Set is thread-compatible.
-//
-// +stateify savable
-type Set struct {
- root node `state:".(*SegmentDataSlices)"`
-}
-
-// IsEmpty returns true if the set contains no segments.
-func (s *Set) IsEmpty() bool {
- return s.root.nrSegments == 0
-}
-
-// IsEmptyRange returns true iff no segments in the set overlap the given
-// range. This is semantically equivalent to s.SpanRange(r) == 0, but may be
-// more efficient.
-func (s *Set) IsEmptyRange(r Range) bool {
- switch {
- case r.Length() < 0:
- panic(fmt.Sprintf("invalid range %v", r))
- case r.Length() == 0:
- return true
- }
- _, gap := s.Find(r.Start)
- if !gap.Ok() {
- return false
- }
- return r.End <= gap.End()
-}
-
-// Span returns the total size of all segments in the set.
-func (s *Set) Span() uint64 {
- var sz uint64
- for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
- sz += seg.Range().Length()
- }
- return sz
-}
-
-// SpanRange returns the total size of the intersection of segments in the set
-// with the given range.
-func (s *Set) SpanRange(r Range) uint64 {
- switch {
- case r.Length() < 0:
- panic(fmt.Sprintf("invalid range %v", r))
- case r.Length() == 0:
- return 0
- }
- var sz uint64
- for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() {
- sz += seg.Range().Intersect(r).Length()
- }
- return sz
-}
-
-// FirstSegment returns the first segment in the set. If the set is empty,
-// FirstSegment returns a terminal iterator.
-func (s *Set) FirstSegment() Iterator {
- if s.root.nrSegments == 0 {
- return Iterator{}
- }
- return s.root.firstSegment()
-}
-
-// LastSegment returns the last segment in the set. If the set is empty,
-// LastSegment returns a terminal iterator.
-func (s *Set) LastSegment() Iterator {
- if s.root.nrSegments == 0 {
- return Iterator{}
- }
- return s.root.lastSegment()
-}
-
-// FirstGap returns the first gap in the set.
-func (s *Set) FirstGap() GapIterator {
- n := &s.root
- for n.hasChildren {
- n = n.children[0]
- }
- return GapIterator{n, 0}
-}
-
-// LastGap returns the last gap in the set.
-func (s *Set) LastGap() GapIterator {
- n := &s.root
- for n.hasChildren {
- n = n.children[n.nrSegments]
- }
- return GapIterator{n, n.nrSegments}
-}
-
-// Find returns the segment or gap whose range contains the given key. If a
-// segment is found, the returned Iterator is non-terminal and the
-// returned GapIterator is terminal. Otherwise, the returned Iterator is
-// terminal and the returned GapIterator is non-terminal.
-func (s *Set) Find(key uint64) (Iterator, GapIterator) {
- n := &s.root
- for {
-
- lower := 0
- upper := n.nrSegments
- for lower < upper {
- i := lower + (upper-lower)/2
- if r := n.keys[i]; key < r.End {
- if key >= r.Start {
- return Iterator{n, i}, GapIterator{}
- }
- upper = i
- } else {
- lower = i + 1
- }
- }
- i := lower
- if !n.hasChildren {
- return Iterator{}, GapIterator{n, i}
- }
- n = n.children[i]
- }
-}
-
-// FindSegment returns the segment whose range contains the given key. If no
-// such segment exists, FindSegment returns a terminal iterator.
-func (s *Set) FindSegment(key uint64) Iterator {
- seg, _ := s.Find(key)
- return seg
-}
-
-// LowerBoundSegment returns the segment with the lowest range that contains a
-// key greater than or equal to min. If no such segment exists,
-// LowerBoundSegment returns a terminal iterator.
-func (s *Set) LowerBoundSegment(min uint64) Iterator {
- seg, gap := s.Find(min)
- if seg.Ok() {
- return seg
- }
- return gap.NextSegment()
-}
-
-// UpperBoundSegment returns the segment with the highest range that contains a
-// key less than or equal to max. If no such segment exists, UpperBoundSegment
-// returns a terminal iterator.
-func (s *Set) UpperBoundSegment(max uint64) Iterator {
- seg, gap := s.Find(max)
- if seg.Ok() {
- return seg
- }
- return gap.PrevSegment()
-}
-
-// FindGap returns the gap containing the given key. If no such gap exists
-// (i.e. the set contains a segment containing that key), FindGap returns a
-// terminal iterator.
-func (s *Set) FindGap(key uint64) GapIterator {
- _, gap := s.Find(key)
- return gap
-}
-
-// LowerBoundGap returns the gap with the lowest range that is greater than or
-// equal to min.
-func (s *Set) LowerBoundGap(min uint64) GapIterator {
- seg, gap := s.Find(min)
- if gap.Ok() {
- return gap
- }
- return seg.NextGap()
-}
-
-// UpperBoundGap returns the gap with the highest range that is less than or
-// equal to max.
-func (s *Set) UpperBoundGap(max uint64) GapIterator {
- seg, gap := s.Find(max)
- if gap.Ok() {
- return gap
- }
- return seg.PrevGap()
-}
-
-// Add inserts the given segment into the set and returns true. If the new
-// segment can be merged with adjacent segments, Add will do so. If the new
-// segment would overlap an existing segment, Add returns false. If Add
-// succeeds, all existing iterators are invalidated.
-func (s *Set) Add(r Range, val noValue) bool {
- if r.Length() <= 0 {
- panic(fmt.Sprintf("invalid segment range %v", r))
- }
- gap := s.FindGap(r.Start)
- if !gap.Ok() {
- return false
- }
- if r.End > gap.End() {
- return false
- }
- s.Insert(gap, r, val)
- return true
-}
-
-// AddWithoutMerging inserts the given segment into the set and returns true.
-// If it would overlap an existing segment, AddWithoutMerging does nothing and
-// returns false. If AddWithoutMerging succeeds, all existing iterators are
-// invalidated.
-func (s *Set) AddWithoutMerging(r Range, val noValue) bool {
- if r.Length() <= 0 {
- panic(fmt.Sprintf("invalid segment range %v", r))
- }
- gap := s.FindGap(r.Start)
- if !gap.Ok() {
- return false
- }
- if r.End > gap.End() {
- return false
- }
- s.InsertWithoutMergingUnchecked(gap, r, val)
- return true
-}
-
-// Insert inserts the given segment into the given gap. If the new segment can
-// be merged with adjacent segments, Insert will do so. Insert returns an
-// iterator to the segment containing the inserted value (which may have been
-// merged with other values). All existing iterators (including gap, but not
-// including the returned iterator) are invalidated.
-//
-// If the gap cannot accommodate the segment, or if r is invalid, Insert panics.
-//
-// Insert is semantically equivalent to a InsertWithoutMerging followed by a
-// Merge, but may be more efficient. Note that there is no unchecked variant of
-// Insert since Insert must retrieve and inspect gap's predecessor and
-// successor segments regardless.
-func (s *Set) Insert(gap GapIterator, r Range, val noValue) Iterator {
- if r.Length() <= 0 {
- panic(fmt.Sprintf("invalid segment range %v", r))
- }
- prev, next := gap.PrevSegment(), gap.NextSegment()
- if prev.Ok() && prev.End() > r.Start {
- panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range()))
- }
- if next.Ok() && next.Start() < r.End {
- panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range()))
- }
- if prev.Ok() && prev.End() == r.Start {
- if mval, ok := (setFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok {
- prev.SetEndUnchecked(r.End)
- prev.SetValue(mval)
- if next.Ok() && next.Start() == r.End {
- val = mval
- if mval, ok := (setFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok {
- prev.SetEndUnchecked(next.End())
- prev.SetValue(mval)
- return s.Remove(next).PrevSegment()
- }
- }
- return prev
- }
- }
- if next.Ok() && next.Start() == r.End {
- if mval, ok := (setFunctions{}).Merge(r, val, next.Range(), next.Value()); ok {
- next.SetStartUnchecked(r.Start)
- next.SetValue(mval)
- return next
- }
- }
- return s.InsertWithoutMergingUnchecked(gap, r, val)
-}
-
-// InsertWithoutMerging inserts the given segment into the given gap and
-// returns an iterator to the inserted segment. All existing iterators
-// (including gap, but not including the returned iterator) are invalidated.
-//
-// If the gap cannot accommodate the segment, or if r is invalid,
-// InsertWithoutMerging panics.
-func (s *Set) InsertWithoutMerging(gap GapIterator, r Range, val noValue) Iterator {
- if r.Length() <= 0 {
- panic(fmt.Sprintf("invalid segment range %v", r))
- }
- if gr := gap.Range(); !gr.IsSupersetOf(r) {
- panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr))
- }
- return s.InsertWithoutMergingUnchecked(gap, r, val)
-}
-
-// InsertWithoutMergingUnchecked inserts the given segment into the given gap
-// and returns an iterator to the inserted segment. All existing iterators
-// (including gap, but not including the returned iterator) are invalidated.
-//
-// Preconditions: r.Start >= gap.Start(); r.End <= gap.End().
-func (s *Set) InsertWithoutMergingUnchecked(gap GapIterator, r Range, val noValue) Iterator {
- gap = gap.node.rebalanceBeforeInsert(gap)
- copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments])
- copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments])
- gap.node.keys[gap.index] = r
- gap.node.values[gap.index] = val
- gap.node.nrSegments++
- return Iterator{gap.node, gap.index}
-}
-
-// Remove removes the given segment and returns an iterator to the vacated gap.
-// All existing iterators (including seg, but not including the returned
-// iterator) are invalidated.
-func (s *Set) Remove(seg Iterator) GapIterator {
-
- if seg.node.hasChildren {
-
- victim := seg.PrevSegment()
-
- seg.SetRangeUnchecked(victim.Range())
- seg.SetValue(victim.Value())
- return s.Remove(victim).NextGap()
- }
- copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments])
- copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments])
- setFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1])
- seg.node.nrSegments--
- return seg.node.rebalanceAfterRemove(GapIterator{seg.node, seg.index})
-}
-
-// RemoveAll removes all segments from the set. All existing iterators are
-// invalidated.
-func (s *Set) RemoveAll() {
- s.root = node{}
-}
-
-// RemoveRange removes all segments in the given range. An iterator to the
-// newly formed gap is returned, and all existing iterators are invalidated.
-func (s *Set) RemoveRange(r Range) GapIterator {
- seg, gap := s.Find(r.Start)
- if seg.Ok() {
- seg = s.Isolate(seg, r)
- gap = s.Remove(seg)
- }
- for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() {
- seg = s.Isolate(seg, r)
- gap = s.Remove(seg)
- }
- return gap
-}
-
-// Merge attempts to merge two neighboring segments. If successful, Merge
-// returns an iterator to the merged segment, and all existing iterators are
-// invalidated. Otherwise, Merge returns a terminal iterator.
-//
-// If first is not the predecessor of second, Merge panics.
-func (s *Set) Merge(first, second Iterator) Iterator {
- if first.NextSegment() != second {
- panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range()))
- }
- return s.MergeUnchecked(first, second)
-}
-
-// MergeUnchecked attempts to merge two neighboring segments. If successful,
-// MergeUnchecked returns an iterator to the merged segment, and all existing
-// iterators are invalidated. Otherwise, MergeUnchecked returns a terminal
-// iterator.
-//
-// Precondition: first is the predecessor of second: first.NextSegment() ==
-// second, first == second.PrevSegment().
-func (s *Set) MergeUnchecked(first, second Iterator) Iterator {
- if first.End() == second.Start() {
- if mval, ok := (setFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok {
-
- first.SetEndUnchecked(second.End())
- first.SetValue(mval)
- return s.Remove(second).PrevSegment()
- }
- }
- return Iterator{}
-}
-
-// MergeAll attempts to merge all adjacent segments in the set. All existing
-// iterators are invalidated.
-func (s *Set) MergeAll() {
- seg := s.FirstSegment()
- if !seg.Ok() {
- return
- }
- next := seg.NextSegment()
- for next.Ok() {
- if mseg := s.MergeUnchecked(seg, next); mseg.Ok() {
- seg, next = mseg, mseg.NextSegment()
- } else {
- seg, next = next, next.NextSegment()
- }
- }
-}
-
-// MergeRange attempts to merge all adjacent segments that contain a key in the
-// specific range. All existing iterators are invalidated.
-func (s *Set) MergeRange(r Range) {
- seg := s.LowerBoundSegment(r.Start)
- if !seg.Ok() {
- return
- }
- next := seg.NextSegment()
- for next.Ok() && next.Range().Start < r.End {
- if mseg := s.MergeUnchecked(seg, next); mseg.Ok() {
- seg, next = mseg, mseg.NextSegment()
- } else {
- seg, next = next, next.NextSegment()
- }
- }
-}
-
-// MergeAdjacent attempts to merge the segment containing r.Start with its
-// predecessor, and the segment containing r.End-1 with its successor.
-func (s *Set) MergeAdjacent(r Range) {
- first := s.FindSegment(r.Start)
- if first.Ok() {
- if prev := first.PrevSegment(); prev.Ok() {
- s.Merge(prev, first)
- }
- }
- last := s.FindSegment(r.End - 1)
- if last.Ok() {
- if next := last.NextSegment(); next.Ok() {
- s.Merge(last, next)
- }
- }
-}
-
-// Split splits the given segment at the given key and returns iterators to the
-// two resulting segments. All existing iterators (including seg, but not
-// including the returned iterators) are invalidated.
-//
-// If the segment cannot be split at split (because split is at the start or
-// end of the segment's range, so splitting would produce a segment with zero
-// length, or because split falls outside the segment's range altogether),
-// Split panics.
-func (s *Set) Split(seg Iterator, split uint64) (Iterator, Iterator) {
- if !seg.Range().CanSplitAt(split) {
- panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split))
- }
- return s.SplitUnchecked(seg, split)
-}
-
-// SplitUnchecked splits the given segment at the given key and returns
-// iterators to the two resulting segments. All existing iterators (including
-// seg, but not including the returned iterators) are invalidated.
-//
-// Preconditions: seg.Start() < split < seg.End().
-func (s *Set) SplitUnchecked(seg Iterator, split uint64) (Iterator, Iterator) {
- val1, val2 := (setFunctions{}).Split(seg.Range(), seg.Value(), split)
- end2 := seg.End()
- seg.SetEndUnchecked(split)
- seg.SetValue(val1)
- seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), Range{split, end2}, val2)
-
- return seg2.PrevSegment(), seg2
-}
-
-// SplitAt splits the segment straddling split, if one exists. SplitAt returns
-// true if a segment was split and false otherwise. If SplitAt splits a
-// segment, all existing iterators are invalidated.
-func (s *Set) SplitAt(split uint64) bool {
- if seg := s.FindSegment(split); seg.Ok() && seg.Range().CanSplitAt(split) {
- s.SplitUnchecked(seg, split)
- return true
- }
- return false
-}
-
-// Isolate ensures that the given segment's range does not escape r by
-// splitting at r.Start and r.End if necessary, and returns an updated iterator
-// to the bounded segment. All existing iterators (including seg, but not
-// including the returned iterators) are invalidated.
-func (s *Set) Isolate(seg Iterator, r Range) Iterator {
- if seg.Range().CanSplitAt(r.Start) {
- _, seg = s.SplitUnchecked(seg, r.Start)
- }
- if seg.Range().CanSplitAt(r.End) {
- seg, _ = s.SplitUnchecked(seg, r.End)
- }
- return seg
-}
-
-// ApplyContiguous applies a function to a contiguous range of segments,
-// splitting if necessary. The function is applied until the first gap is
-// encountered, at which point the gap is returned. If the function is applied
-// across the entire range, a terminal gap is returned. All existing iterators
-// are invalidated.
-//
-// N.B. The Iterator must not be invalidated by the function.
-func (s *Set) ApplyContiguous(r Range, fn func(seg Iterator)) GapIterator {
- seg, gap := s.Find(r.Start)
- if !seg.Ok() {
- return gap
- }
- for {
- seg = s.Isolate(seg, r)
- fn(seg)
- if seg.End() >= r.End {
- return GapIterator{}
- }
- gap = seg.NextGap()
- if !gap.IsEmpty() {
- return gap
- }
- seg = gap.NextSegment()
- if !seg.Ok() {
-
- return GapIterator{}
- }
- }
-}
-
-// +stateify savable
-type node struct {
- // An internal binary tree node looks like:
- //
- // K
- // / \
- // Cl Cr
- //
- // where all keys in the subtree rooted by Cl (the left subtree) are less
- // than K (the key of the parent node), and all keys in the subtree rooted
- // by Cr (the right subtree) are greater than K.
- //
- // An internal B-tree node's indexes work out to look like:
- //
- // K0 K1 K2 ... Kn-1
- // / \/ \/ \ ... / \
- // C0 C1 C2 C3 ... Cn-1 Cn
- //
- // where n is nrSegments.
- nrSegments int
-
- // parent is a pointer to this node's parent. If this node is root, parent
- // is nil.
- parent *node
-
- // parentIndex is the index of this node in parent.children.
- parentIndex int
-
- // Flag for internal nodes that is technically redundant with "children[0]
- // != nil", but is stored in the first cache line. "hasChildren" rather
- // than "isLeaf" because false must be the correct value for an empty root.
- hasChildren bool
-
- // Nodes store keys and values in separate arrays to maximize locality in
- // the common case (scanning keys for lookup).
- keys [maxDegree - 1]Range
- values [maxDegree - 1]noValue
- children [maxDegree]*node
-}
-
-// firstSegment returns the first segment in the subtree rooted by n.
-//
-// Preconditions: n.nrSegments != 0.
-func (n *node) firstSegment() Iterator {
- for n.hasChildren {
- n = n.children[0]
- }
- return Iterator{n, 0}
-}
-
-// lastSegment returns the last segment in the subtree rooted by n.
-//
-// Preconditions: n.nrSegments != 0.
-func (n *node) lastSegment() Iterator {
- for n.hasChildren {
- n = n.children[n.nrSegments]
- }
- return Iterator{n, n.nrSegments - 1}
-}
-
-func (n *node) prevSibling() *node {
- if n.parent == nil || n.parentIndex == 0 {
- return nil
- }
- return n.parent.children[n.parentIndex-1]
-}
-
-func (n *node) nextSibling() *node {
- if n.parent == nil || n.parentIndex == n.parent.nrSegments {
- return nil
- }
- return n.parent.children[n.parentIndex+1]
-}
-
-// rebalanceBeforeInsert splits n and its ancestors if they are full, as
-// required for insertion, and returns an updated iterator to the position
-// represented by gap.
-func (n *node) rebalanceBeforeInsert(gap GapIterator) GapIterator {
- if n.parent != nil {
- gap = n.parent.rebalanceBeforeInsert(gap)
- }
- if n.nrSegments < maxDegree-1 {
- return gap
- }
- if n.parent == nil {
-
- left := &node{
- nrSegments: minDegree - 1,
- parent: n,
- parentIndex: 0,
- hasChildren: n.hasChildren,
- }
- right := &node{
- nrSegments: minDegree - 1,
- parent: n,
- parentIndex: 1,
- hasChildren: n.hasChildren,
- }
- copy(left.keys[:minDegree-1], n.keys[:minDegree-1])
- copy(left.values[:minDegree-1], n.values[:minDegree-1])
- copy(right.keys[:minDegree-1], n.keys[minDegree:])
- copy(right.values[:minDegree-1], n.values[minDegree:])
- n.keys[0], n.values[0] = n.keys[minDegree-1], n.values[minDegree-1]
- zeroValueSlice(n.values[1:])
- if n.hasChildren {
- copy(left.children[:minDegree], n.children[:minDegree])
- copy(right.children[:minDegree], n.children[minDegree:])
- zeroNodeSlice(n.children[2:])
- for i := 0; i < minDegree; i++ {
- left.children[i].parent = left
- left.children[i].parentIndex = i
- right.children[i].parent = right
- right.children[i].parentIndex = i
- }
- }
- n.nrSegments = 1
- n.hasChildren = true
- n.children[0] = left
- n.children[1] = right
- if gap.node != n {
- return gap
- }
- if gap.index < minDegree {
- return GapIterator{left, gap.index}
- }
- return GapIterator{right, gap.index - minDegree}
- }
-
- copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments])
- copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments])
- n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[minDegree-1], n.values[minDegree-1]
- copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1])
- for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ {
- n.parent.children[i].parentIndex = i
- }
- sibling := &node{
- nrSegments: minDegree - 1,
- parent: n.parent,
- parentIndex: n.parentIndex + 1,
- hasChildren: n.hasChildren,
- }
- n.parent.children[n.parentIndex+1] = sibling
- n.parent.nrSegments++
- copy(sibling.keys[:minDegree-1], n.keys[minDegree:])
- copy(sibling.values[:minDegree-1], n.values[minDegree:])
- zeroValueSlice(n.values[minDegree-1:])
- if n.hasChildren {
- copy(sibling.children[:minDegree], n.children[minDegree:])
- zeroNodeSlice(n.children[minDegree:])
- for i := 0; i < minDegree; i++ {
- sibling.children[i].parent = sibling
- sibling.children[i].parentIndex = i
- }
- }
- n.nrSegments = minDegree - 1
-
- if gap.node != n {
- return gap
- }
- if gap.index < minDegree {
- return gap
- }
- return GapIterator{sibling, gap.index - minDegree}
-}
-
-// rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient
-// (contain fewer segments than required by B-tree invariants), as required for
-// removal, and returns an updated iterator to the position represented by gap.
-//
-// Precondition: n is the only node in the tree that may currently violate a
-// B-tree invariant.
-func (n *node) rebalanceAfterRemove(gap GapIterator) GapIterator {
- for {
- if n.nrSegments >= minDegree-1 {
- return gap
- }
- if n.parent == nil {
-
- return gap
- }
-
- if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= minDegree {
- copy(n.keys[1:], n.keys[:n.nrSegments])
- copy(n.values[1:], n.values[:n.nrSegments])
- n.keys[0] = n.parent.keys[n.parentIndex-1]
- n.values[0] = n.parent.values[n.parentIndex-1]
- n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1]
- n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1]
- setFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1])
- if n.hasChildren {
- copy(n.children[1:], n.children[:n.nrSegments+1])
- n.children[0] = sibling.children[sibling.nrSegments]
- sibling.children[sibling.nrSegments] = nil
- n.children[0].parent = n
- n.children[0].parentIndex = 0
- for i := 1; i < n.nrSegments+2; i++ {
- n.children[i].parentIndex = i
- }
- }
- n.nrSegments++
- sibling.nrSegments--
- if gap.node == sibling && gap.index == sibling.nrSegments {
- return GapIterator{n, 0}
- }
- if gap.node == n {
- return GapIterator{n, gap.index + 1}
- }
- return gap
- }
- if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= minDegree {
- n.keys[n.nrSegments] = n.parent.keys[n.parentIndex]
- n.values[n.nrSegments] = n.parent.values[n.parentIndex]
- n.parent.keys[n.parentIndex] = sibling.keys[0]
- n.parent.values[n.parentIndex] = sibling.values[0]
- copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:])
- copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:])
- setFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1])
- if n.hasChildren {
- n.children[n.nrSegments+1] = sibling.children[0]
- copy(sibling.children[:sibling.nrSegments], sibling.children[1:])
- sibling.children[sibling.nrSegments] = nil
- n.children[n.nrSegments+1].parent = n
- n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1
- for i := 0; i < sibling.nrSegments; i++ {
- sibling.children[i].parentIndex = i
- }
- }
- n.nrSegments++
- sibling.nrSegments--
- if gap.node == sibling {
- if gap.index == 0 {
- return GapIterator{n, n.nrSegments}
- }
- return GapIterator{sibling, gap.index - 1}
- }
- return gap
- }
-
- p := n.parent
- if p.nrSegments == 1 {
-
- left, right := p.children[0], p.children[1]
- p.nrSegments = left.nrSegments + right.nrSegments + 1
- p.hasChildren = left.hasChildren
- p.keys[left.nrSegments] = p.keys[0]
- p.values[left.nrSegments] = p.values[0]
- copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments])
- copy(p.values[:left.nrSegments], left.values[:left.nrSegments])
- copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments])
- copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments])
- if left.hasChildren {
- copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1])
- copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1])
- for i := 0; i < p.nrSegments+1; i++ {
- p.children[i].parent = p
- p.children[i].parentIndex = i
- }
- } else {
- p.children[0] = nil
- p.children[1] = nil
- }
- if gap.node == left {
- return GapIterator{p, gap.index}
- }
- if gap.node == right {
- return GapIterator{p, gap.index + left.nrSegments + 1}
- }
- return gap
- }
- // Merge n and either sibling, along with the segment separating the
- // two, into whichever of the two nodes comes first. This is the
- // reverse of the non-root splitting case in
- // node.rebalanceBeforeInsert.
- var left, right *node
- if n.parentIndex > 0 {
- left = n.prevSibling()
- right = n
- } else {
- left = n
- right = n.nextSibling()
- }
-
- if gap.node == right {
- gap = GapIterator{left, gap.index + left.nrSegments + 1}
- }
- left.keys[left.nrSegments] = p.keys[left.parentIndex]
- left.values[left.nrSegments] = p.values[left.parentIndex]
- copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments])
- copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments])
- if left.hasChildren {
- copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1])
- for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ {
- left.children[i].parent = left
- left.children[i].parentIndex = i
- }
- }
- left.nrSegments += right.nrSegments + 1
- copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments])
- copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments])
- setFunctions{}.ClearValue(&p.values[p.nrSegments-1])
- copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1])
- for i := 0; i < p.nrSegments; i++ {
- p.children[i].parentIndex = i
- }
- p.children[p.nrSegments] = nil
- p.nrSegments--
-
- n = p
- }
-}
-
-// An Iterator is conceptually one of:
-//
-// - A pointer to a segment in a set; or
-//
-// - A terminal iterator, which is a sentinel indicating that the end of
-// iteration has been reached.
-//
-// Iterators are copyable values and are meaningfully equality-comparable. The
-// zero value of Iterator is a terminal iterator.
-//
-// Unless otherwise specified, any mutation of a set invalidates all existing
-// iterators into the set.
-type Iterator struct {
- // node is the node containing the iterated segment. If the iterator is
- // terminal, node is nil.
- node *node
-
- // index is the index of the segment in node.keys/values.
- index int
-}
-
-// Ok returns true if the iterator is not terminal. All other methods are only
-// valid for non-terminal iterators.
-func (seg Iterator) Ok() bool {
- return seg.node != nil
-}
-
-// Range returns the iterated segment's range key.
-func (seg Iterator) Range() Range {
- return seg.node.keys[seg.index]
-}
-
-// Start is equivalent to Range().Start, but should be preferred if only the
-// start of the range is needed.
-func (seg Iterator) Start() uint64 {
- return seg.node.keys[seg.index].Start
-}
-
-// End is equivalent to Range().End, but should be preferred if only the end of
-// the range is needed.
-func (seg Iterator) End() uint64 {
- return seg.node.keys[seg.index].End
-}
-
-// SetRangeUnchecked mutates the iterated segment's range key. This operation
-// does not invalidate any iterators.
-//
-// Preconditions:
-//
-// - r.Length() > 0.
-//
-// - The new range must not overlap an existing one: If seg.NextSegment().Ok(),
-// then r.end <= seg.NextSegment().Start(); if seg.PrevSegment().Ok(), then
-// r.start >= seg.PrevSegment().End().
-func (seg Iterator) SetRangeUnchecked(r Range) {
- seg.node.keys[seg.index] = r
-}
-
-// SetRange mutates the iterated segment's range key. If the new range would
-// cause the iterated segment to overlap another segment, or if the new range
-// is invalid, SetRange panics. This operation does not invalidate any
-// iterators.
-func (seg Iterator) SetRange(r Range) {
- if r.Length() <= 0 {
- panic(fmt.Sprintf("invalid segment range %v", r))
- }
- if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() {
- panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range()))
- }
- if next := seg.NextSegment(); next.Ok() && r.End > next.Start() {
- panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range()))
- }
- seg.SetRangeUnchecked(r)
-}
-
-// SetStartUnchecked mutates the iterated segment's start. This operation does
-// not invalidate any iterators.
-//
-// Preconditions: The new start must be valid: start < seg.End(); if
-// seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End().
-func (seg Iterator) SetStartUnchecked(start uint64) {
- seg.node.keys[seg.index].Start = start
-}
-
-// SetStart mutates the iterated segment's start. If the new start value would
-// cause the iterated segment to overlap another segment, or would result in an
-// invalid range, SetStart panics. This operation does not invalidate any
-// iterators.
-func (seg Iterator) SetStart(start uint64) {
- if start >= seg.End() {
- panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range()))
- }
- if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() {
- panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range()))
- }
- seg.SetStartUnchecked(start)
-}
-
-// SetEndUnchecked mutates the iterated segment's end. This operation does not
-// invalidate any iterators.
-//
-// Preconditions: The new end must be valid: end > seg.Start(); if
-// seg.NextSegment().Ok(), then end <= seg.NextSegment().Start().
-func (seg Iterator) SetEndUnchecked(end uint64) {
- seg.node.keys[seg.index].End = end
-}
-
-// SetEnd mutates the iterated segment's end. If the new end value would cause
-// the iterated segment to overlap another segment, or would result in an
-// invalid range, SetEnd panics. This operation does not invalidate any
-// iterators.
-func (seg Iterator) SetEnd(end uint64) {
- if end <= seg.Start() {
- panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range()))
- }
- if next := seg.NextSegment(); next.Ok() && end > next.Start() {
- panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range()))
- }
- seg.SetEndUnchecked(end)
-}
-
-// Value returns a copy of the iterated segment's value.
-func (seg Iterator) Value() noValue {
- return seg.node.values[seg.index]
-}
-
-// ValuePtr returns a pointer to the iterated segment's value. The pointer is
-// invalidated if the iterator is invalidated. This operation does not
-// invalidate any iterators.
-func (seg Iterator) ValuePtr() *noValue {
- return &seg.node.values[seg.index]
-}
-
-// SetValue mutates the iterated segment's value. This operation does not
-// invalidate any iterators.
-func (seg Iterator) SetValue(val noValue) {
- seg.node.values[seg.index] = val
-}
-
-// PrevSegment returns the iterated segment's predecessor. If there is no
-// preceding segment, PrevSegment returns a terminal iterator.
-func (seg Iterator) PrevSegment() Iterator {
- if seg.node.hasChildren {
- return seg.node.children[seg.index].lastSegment()
- }
- if seg.index > 0 {
- return Iterator{seg.node, seg.index - 1}
- }
- if seg.node.parent == nil {
- return Iterator{}
- }
- return segmentBeforePosition(seg.node.parent, seg.node.parentIndex)
-}
-
-// NextSegment returns the iterated segment's successor. If there is no
-// succeeding segment, NextSegment returns a terminal iterator.
-func (seg Iterator) NextSegment() Iterator {
- if seg.node.hasChildren {
- return seg.node.children[seg.index+1].firstSegment()
- }
- if seg.index < seg.node.nrSegments-1 {
- return Iterator{seg.node, seg.index + 1}
- }
- if seg.node.parent == nil {
- return Iterator{}
- }
- return segmentAfterPosition(seg.node.parent, seg.node.parentIndex)
-}
-
-// PrevGap returns the gap immediately before the iterated segment.
-func (seg Iterator) PrevGap() GapIterator {
- if seg.node.hasChildren {
-
- return seg.node.children[seg.index].lastSegment().NextGap()
- }
- return GapIterator{seg.node, seg.index}
-}
-
-// NextGap returns the gap immediately after the iterated segment.
-func (seg Iterator) NextGap() GapIterator {
- if seg.node.hasChildren {
- return seg.node.children[seg.index+1].firstSegment().PrevGap()
- }
- return GapIterator{seg.node, seg.index + 1}
-}
-
-// PrevNonEmpty returns the iterated segment's predecessor if it is adjacent,
-// or the gap before the iterated segment otherwise. If seg.Start() ==
-// Functions.MinKey(), PrevNonEmpty will return two terminal iterators.
-// Otherwise, exactly one of the iterators returned by PrevNonEmpty will be
-// non-terminal.
-func (seg Iterator) PrevNonEmpty() (Iterator, GapIterator) {
- gap := seg.PrevGap()
- if gap.Range().Length() != 0 {
- return Iterator{}, gap
- }
- return gap.PrevSegment(), GapIterator{}
-}
-
-// NextNonEmpty returns the iterated segment's successor if it is adjacent, or
-// the gap after the iterated segment otherwise. If seg.End() ==
-// Functions.MaxKey(), NextNonEmpty will return two terminal iterators.
-// Otherwise, exactly one of the iterators returned by NextNonEmpty will be
-// non-terminal.
-func (seg Iterator) NextNonEmpty() (Iterator, GapIterator) {
- gap := seg.NextGap()
- if gap.Range().Length() != 0 {
- return Iterator{}, gap
- }
- return gap.NextSegment(), GapIterator{}
-}
-
-// A GapIterator is conceptually one of:
-//
-// - A pointer to a position between two segments, before the first segment, or
-// after the last segment in a set, called a *gap*; or
-//
-// - A terminal iterator, which is a sentinel indicating that the end of
-// iteration has been reached.
-//
-// Note that the gap between two adjacent segments exists (iterators to it are
-// non-terminal), but has a length of zero. GapIterator.IsEmpty returns true
-// for such gaps. An empty set contains a single gap, spanning the entire range
-// of the set's keys.
-//
-// GapIterators are copyable values and are meaningfully equality-comparable.
-// The zero value of GapIterator is a terminal iterator.
-//
-// Unless otherwise specified, any mutation of a set invalidates all existing
-// iterators into the set.
-type GapIterator struct {
- // The representation of a GapIterator is identical to that of an Iterator,
- // except that index corresponds to positions between segments in the same
- // way as for node.children (see comment for node.nrSegments).
- node *node
- index int
-}
-
-// Ok returns true if the iterator is not terminal. All other methods are only
-// valid for non-terminal iterators.
-func (gap GapIterator) Ok() bool {
- return gap.node != nil
-}
-
-// Range returns the range spanned by the iterated gap.
-func (gap GapIterator) Range() Range {
- return Range{gap.Start(), gap.End()}
-}
-
-// Start is equivalent to Range().Start, but should be preferred if only the
-// start of the range is needed.
-func (gap GapIterator) Start() uint64 {
- if ps := gap.PrevSegment(); ps.Ok() {
- return ps.End()
- }
- return setFunctions{}.MinKey()
-}
-
-// End is equivalent to Range().End, but should be preferred if only the end of
-// the range is needed.
-func (gap GapIterator) End() uint64 {
- if ns := gap.NextSegment(); ns.Ok() {
- return ns.Start()
- }
- return setFunctions{}.MaxKey()
-}
-
-// IsEmpty returns true if the iterated gap is empty (that is, the "gap" is
-// between two adjacent segments.)
-func (gap GapIterator) IsEmpty() bool {
- return gap.Range().Length() == 0
-}
-
-// PrevSegment returns the segment immediately before the iterated gap. If no
-// such segment exists, PrevSegment returns a terminal iterator.
-func (gap GapIterator) PrevSegment() Iterator {
- return segmentBeforePosition(gap.node, gap.index)
-}
-
-// NextSegment returns the segment immediately after the iterated gap. If no
-// such segment exists, NextSegment returns a terminal iterator.
-func (gap GapIterator) NextSegment() Iterator {
- return segmentAfterPosition(gap.node, gap.index)
-}
-
-// PrevGap returns the iterated gap's predecessor. If no such gap exists,
-// PrevGap returns a terminal iterator.
-func (gap GapIterator) PrevGap() GapIterator {
- seg := gap.PrevSegment()
- if !seg.Ok() {
- return GapIterator{}
- }
- return seg.PrevGap()
-}
-
-// NextGap returns the iterated gap's successor. If no such gap exists, NextGap
-// returns a terminal iterator.
-func (gap GapIterator) NextGap() GapIterator {
- seg := gap.NextSegment()
- if !seg.Ok() {
- return GapIterator{}
- }
- return seg.NextGap()
-}
-
-// segmentBeforePosition returns the predecessor segment of the position given
-// by n.children[i], which may or may not contain a child. If no such segment
-// exists, segmentBeforePosition returns a terminal iterator.
-func segmentBeforePosition(n *node, i int) Iterator {
- for i == 0 {
- if n.parent == nil {
- return Iterator{}
- }
- n, i = n.parent, n.parentIndex
- }
- return Iterator{n, i - 1}
-}
-
-// segmentAfterPosition returns the successor segment of the position given by
-// n.children[i], which may or may not contain a child. If no such segment
-// exists, segmentAfterPosition returns a terminal iterator.
-func segmentAfterPosition(n *node, i int) Iterator {
- for i == n.nrSegments {
- if n.parent == nil {
- return Iterator{}
- }
- n, i = n.parent, n.parentIndex
- }
- return Iterator{n, i}
-}
-
-func zeroValueSlice(slice []noValue) {
-
- for i := range slice {
- setFunctions{}.ClearValue(&slice[i])
- }
-}
-
-func zeroNodeSlice(slice []*node) {
- for i := range slice {
- slice[i] = nil
- }
-}
-
-// String stringifies a Set for debugging.
-func (s *Set) String() string {
- return s.root.String()
-}
-
-// String stringifies a node (and all of its children) for debugging.
-func (n *node) String() string {
- var buf bytes.Buffer
- n.writeDebugString(&buf, "")
- return buf.String()
-}
-
-func (n *node) writeDebugString(buf *bytes.Buffer, prefix string) {
- if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) {
- buf.WriteString(prefix)
- buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren))
- }
- for i := 0; i < n.nrSegments; i++ {
- if child := n.children[i]; child != nil {
- cprefix := fmt.Sprintf("%s- % 3d ", prefix, i)
- if child.parent != n || child.parentIndex != i {
- buf.WriteString(cprefix)
- buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i))
- }
- child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i))
- }
- buf.WriteString(prefix)
- buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i]))
- }
- if child := n.children[n.nrSegments]; child != nil {
- child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments))
- }
-}
-
-// SegmentDataSlices represents segments from a set as slices of start, end, and
-// values. SegmentDataSlices is primarily used as an intermediate representation
-// for save/restore and the layout here is optimized for that.
-//
-// +stateify savable
-type SegmentDataSlices struct {
- Start []uint64
- End []uint64
- Values []noValue
-}
-
-// ExportSortedSlices returns a copy of all segments in the given set, in
-// ascending key order.
-func (s *Set) ExportSortedSlices() *SegmentDataSlices {
- var sds SegmentDataSlices
- for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
- sds.Start = append(sds.Start, seg.Start())
- sds.End = append(sds.End, seg.End())
- sds.Values = append(sds.Values, seg.Value())
- }
- sds.Start = sds.Start[:len(sds.Start):len(sds.Start)]
- sds.End = sds.End[:len(sds.End):len(sds.End)]
- sds.Values = sds.Values[:len(sds.Values):len(sds.Values)]
- return &sds
-}
-
-// ImportSortedSlices initializes the given set from the given slices.
-//
-// Preconditions: s must be empty. sds must represent a valid set (the segments
-// in sds must have valid lengths that do not overlap). The segments in sds
-// must be sorted in ascending key order.
-func (s *Set) ImportSortedSlices(sds *SegmentDataSlices) error {
- if !s.IsEmpty() {
- return fmt.Errorf("cannot import into non-empty set %v", s)
- }
- gap := s.FirstGap()
- for i := range sds.Start {
- r := Range{sds.Start[i], sds.End[i]}
- if !gap.Range().IsSupersetOf(r) {
- return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: [%d, %d) => %v", sds.Start[i], sds.End[i], sds.Values[i])
- }
- gap = s.InsertWithoutMerging(gap, r, sds.Values[i]).NextGap()
- }
- return nil
-}
-func (s *Set) saveRoot() *SegmentDataSlices {
- return s.ExportSortedSlices()
-}
-
-func (s *Set) loadRoot(sds *SegmentDataSlices) {
- if err := s.ImportSortedSlices(sds); err != nil {
- panic(err)
- }
-}
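The doc comments in the deleted file above (Merge, Split, Isolate, Iterator, GapIterator) describe the generated set's API contract. As a minimal usage sketch, assuming the code sits in the same package as the generated set so that Set, Range, and Iterator are in scope, and with a hypothetical helper name:

// visitOverlapping is a hypothetical helper (not part of the deleted file)
// showing the read-only iteration idiom: LowerBoundSegment returns the first
// segment with keys at or above r.Start, and NextSegment walks the rest in
// ascending key order. fn must not mutate the set, since any mutation
// invalidates existing iterators.
func visitOverlapping(s *Set, r Range, fn func(seg Iterator)) {
	for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() {
		fn(seg)
	}
}

Mutating walks instead go through Isolate or ApplyContiguous, as documented above, so that segments straddling r are split before being modified.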
diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go
deleted file mode 100644
index eef54d787..000000000
--- a/pkg/sentry/fs/binder/binder.go
+++ /dev/null
@@ -1,260 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package binder implements the Android Binder IPC module.
-package binder
-
-import (
- "sync"
-
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
- "gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
- "gvisor.dev/gvisor/pkg/sentry/usage"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
- "gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/waiter"
-)
-
-const (
- currentProtocolVersion = 8
-
- // mmapSizeLimit is the upper limit for mapped memory size in Binder.
- mmapSizeLimit = 4 * 1024 * 1024 // 4MB
-)
-
-// Device implements fs.InodeOperations.
-//
-// +stateify savable
-type Device struct {
- fsutil.InodeGenericChecker `state:"nosave"`
- fsutil.InodeNoExtendedAttributes `state:"nosave"`
- fsutil.InodeNoopAllocate `state:"nosave"`
- fsutil.InodeNoopRelease `state:"nosave"`
- fsutil.InodeNoopTruncate `state:"nosave"`
- fsutil.InodeNoopWriteOut `state:"nosave"`
- fsutil.InodeNotDirectory `state:"nosave"`
- fsutil.InodeNotMappable `state:"nosave"`
- fsutil.InodeNotSocket `state:"nosave"`
- fsutil.InodeNotSymlink `state:"nosave"`
- fsutil.InodeVirtual `state:"nosave"`
-
- fsutil.InodeSimpleAttributes
-}
-
-var _ fs.InodeOperations = (*Device)(nil)
-
-// NewDevice creates and initializes a Device structure.
-func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) *Device {
- return &Device{
- InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fp, 0),
- }
-}
-
-// GetFile implements fs.InodeOperations.GetFile.
-//
-// TODO(b/30946773): Add functionality to GetFile: Additional fields will be
-// needed in the Device structure, initialize them here. Also, Device will need
-// to keep track of the created Procs in order to implement BINDER_READ_WRITE
-// ioctl.
-func (bd *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
- return fs.NewFile(ctx, d, flags, &Proc{
- bd: bd,
- task: kernel.TaskFromContext(ctx),
- mfp: pgalloc.MemoryFileProviderFromContext(ctx),
- }), nil
-}
-
-// Proc implements fs.FileOperations and fs.IoctlGetter.
-//
-// +stateify savable
-type Proc struct {
- fsutil.FileNoFsync `state:"nosave"`
- fsutil.FileNoSplice `state:"nosave"`
- fsutil.FileNotDirReaddir `state:"nosave"`
- fsutil.FileUseInodeUnstableAttr `state:"nosave"`
- waiter.AlwaysReady `state:"nosave"`
-
- bd *Device
- task *kernel.Task
- mfp pgalloc.MemoryFileProvider
-
- // mu protects mapped.
- mu sync.Mutex `state:"nosave"`
-
- // mapped is memory allocated from mfp.MemoryFile() by AddMapping.
- mapped platform.FileRange
-}
-
-// Release implements fs.FileOperations.Release.
-func (bp *Proc) Release() {
- bp.mu.Lock()
- defer bp.mu.Unlock()
- if bp.mapped.Length() != 0 {
- bp.mfp.MemoryFile().DecRef(bp.mapped)
- }
-}
-
-// Seek implements fs.FileOperations.Seek.
-//
-// Binder doesn't support seek operation (unless in debug mode).
-func (bp *Proc) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) {
- return offset, syserror.EOPNOTSUPP
-}
-
-// Read implements fs.FileOperations.Read.
-//
-// Binder doesn't support read operation (unless in debug mode).
-func (bp *Proc) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
- return 0, syserror.EOPNOTSUPP
-}
-
-// Write implements fs.FileOperations.Write.
-//
-// Binder doesn't support write operation.
-func (bp *Proc) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
- return 0, syserror.EOPNOTSUPP
-}
-
-// Flush implements fs.FileOperations.Flush.
-//
-// TODO(b/30946773): Implement.
-func (bp *Proc) Flush(ctx context.Context, file *fs.File) error {
- return nil
-}
-
-// ConfigureMMap implements fs.FileOperations.ConfigureMMap.
-func (bp *Proc) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error {
- // Compare drivers/android/binder.c:binder_mmap().
- if caller := kernel.TaskFromContext(ctx); caller != bp.task {
- return syserror.EINVAL
- }
- if opts.Length > mmapSizeLimit {
- opts.Length = mmapSizeLimit
- }
- opts.MaxPerms.Write = false
-
- // TODO(b/30946773): Binder sets VM_DONTCOPY, preventing the created vma
- // from being copied across fork(), but we don't support this yet. As
- // a result, MMs containing a Binder mapping cannot be forked (MM.Fork will
- // fail when AddMapping returns EBUSY).
-
- return fsutil.GenericConfigureMMap(file, bp, opts)
-}
-
-// Ioctl implements fs.FileOperations.Ioctl.
-//
-// TODO(b/30946773): Implement.
-func (bp *Proc) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
- // Switch on ioctl request.
- switch uint32(args[1].Int()) {
- case linux.BinderVersionIoctl:
- ver := &linux.BinderVersion{
- ProtocolVersion: currentProtocolVersion,
- }
- // Copy result to user-space.
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ver, usermem.IOOpts{
- AddressSpaceActive: true,
- })
- return 0, err
- case linux.BinderWriteReadIoctl:
- // TODO(b/30946773): Implement.
- fallthrough
- case linux.BinderSetIdleTimeoutIoctl:
- // TODO(b/30946773): Implement.
- fallthrough
- case linux.BinderSetMaxThreadsIoctl:
- // TODO(b/30946773): Implement.
- fallthrough
- case linux.BinderSetIdlePriorityIoctl:
- // TODO(b/30946773): Implement.
- fallthrough
- case linux.BinderSetContextMgrIoctl:
- // TODO(b/30946773): Implement.
- fallthrough
- case linux.BinderThreadExitIoctl:
- // TODO(b/30946773): Implement.
- return 0, syserror.ENOSYS
- default:
- // Ioctls irrelevant to Binder.
- return 0, syserror.EINVAL
- }
-}
-
-// AddMapping implements memmap.Mappable.AddMapping.
-func (bp *Proc) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, _ bool) error {
- bp.mu.Lock()
- defer bp.mu.Unlock()
- if bp.mapped.Length() != 0 {
- // mmap has been called before, which binder_mmap() doesn't like.
- return syserror.EBUSY
- }
- // Binder only allocates and maps a single page up-front
- // (drivers/android/binder.c:binder_mmap() => binder_update_page_range()).
- fr, err := bp.mfp.MemoryFile().Allocate(usermem.PageSize, usage.Anonymous)
- if err != nil {
- return err
- }
- bp.mapped = fr
- return nil
-}
-
-// RemoveMapping implements memmap.Mappable.RemoveMapping.
-func (*Proc) RemoveMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) {
- // Nothing to do. Notably, we don't free bp.mapped to allow another mmap.
-}
-
-// CopyMapping implements memmap.Mappable.CopyMapping.
-func (bp *Proc) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, _ bool) error {
- // Nothing to do. Notably, this is one case where CopyMapping isn't
- // equivalent to AddMapping, as AddMapping would return EBUSY.
- return nil
-}
-
-// Translate implements memmap.Mappable.Translate.
-func (bp *Proc) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
- // TODO(b/30946773): In addition to the page initially allocated and mapped
- // in AddMapping (Linux: binder_mmap), Binder allocates and maps pages for
- // each transaction (Linux: binder_ioctl => binder_ioctl_write_read =>
- // binder_thread_write => binder_transaction => binder_alloc_buf =>
- // binder_update_page_range). Since we don't actually implement
- // BinderWriteReadIoctl (Linux: BINDER_WRITE_READ), we only ever have the
- // first page.
- var err error
- if required.End > usermem.PageSize {
- err = &memmap.BusError{syserror.EFAULT}
- }
- if required.Start == 0 {
- return []memmap.Translation{
- {
- Source: memmap.MappableRange{0, usermem.PageSize},
- File: bp.mfp.MemoryFile(),
- Offset: bp.mapped.Start,
- Perms: usermem.AnyAccess,
- },
- }, err
- }
- return nil, err
-}
-
-// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
-func (bp *Proc) InvalidateUnsavable(ctx context.Context) error {
- return nil
-}
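The Proc.Ioctl method deleted above follows the sentry's usual fs.FileOperations.Ioctl shape. A minimal sketch of that dispatch pattern, under stated assumptions (fooDevice and fooGetVersion are hypothetical and not part of gVisor; the import paths are the ones used by the deleted file):

package foo

import (
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/context"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/usermem"
	"gvisor.dev/gvisor/pkg/syserror"
)

// fooDevice and fooGetVersion are hypothetical; only the dispatch shape is
// taken from Proc.Ioctl above.
type fooDevice struct{}

const fooGetVersion = 0x7001

// Ioctl switches on the request number in args[1] and treats args[2] as the
// userspace argument pointer.
func (d *fooDevice) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
	switch uint32(args[1].Int()) {
	case fooGetVersion:
		ver := int32(1)
		// Copy the result out to the calling task's address space.
		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &ver, usermem.IOOpts{
			AddressSpaceActive: true,
		})
		return 0, err
	default:
		// Requests this device does not understand.
		return 0, syserror.EINVAL
	}
}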
diff --git a/pkg/sentry/fs/binder/binder_state_autogen.go b/pkg/sentry/fs/binder/binder_state_autogen.go
deleted file mode 100755
index 1f321e3b6..000000000
--- a/pkg/sentry/fs/binder/binder_state_autogen.go
+++ /dev/null
@@ -1,40 +0,0 @@
-// automatically generated by stateify.
-
-package binder
-
-import (
- "gvisor.dev/gvisor/pkg/state"
-)
-
-func (x *Device) beforeSave() {}
-func (x *Device) save(m state.Map) {
- x.beforeSave()
- m.Save("InodeSimpleAttributes", &x.InodeSimpleAttributes)
-}
-
-func (x *Device) afterLoad() {}
-func (x *Device) load(m state.Map) {
- m.Load("InodeSimpleAttributes", &x.InodeSimpleAttributes)
-}
-
-func (x *Proc) beforeSave() {}
-func (x *Proc) save(m state.Map) {
- x.beforeSave()
- m.Save("bd", &x.bd)
- m.Save("task", &x.task)
- m.Save("mfp", &x.mfp)
- m.Save("mapped", &x.mapped)
-}
-
-func (x *Proc) afterLoad() {}
-func (x *Proc) load(m state.Map) {
- m.Load("bd", &x.bd)
- m.Load("task", &x.task)
- m.Load("mfp", &x.mfp)
- m.Load("mapped", &x.mapped)
-}
-
-func init() {
- state.Register("binder.Device", (*Device)(nil), state.Fns{Save: (*Device).save, Load: (*Device).load})
- state.Register("binder.Proc", (*Proc)(nil), state.Fns{Save: (*Proc).save, Load: (*Proc).load})
-}
diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go
index fb6c30ff0..d4bbd9807 100644
--- a/pkg/sentry/fs/dev/dev.go
+++ b/pkg/sentry/fs/dev/dev.go
@@ -20,8 +20,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/sentry/fs/ashmem"
- "gvisor.dev/gvisor/pkg/sentry/fs/binder"
"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
"gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -81,7 +79,7 @@ func newSymlink(ctx context.Context, target string, msrc *fs.MountSource) *fs.In
}
// New returns the root node of a device filesystem.
-func New(ctx context.Context, msrc *fs.MountSource, binderEnabled bool, ashmemEnabled bool) *fs.Inode {
+func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
contents := map[string]*fs.Inode{
"fd": newSymlink(ctx, "/proc/self/fd", msrc),
"stdin": newSymlink(ctx, "/proc/self/fd/0", msrc),
@@ -118,16 +116,6 @@ func New(ctx context.Context, msrc *fs.MountSource, binderEnabled bool, ashmemEn
"ptmx": newSymlink(ctx, "pts/ptmx", msrc),
}
- if binderEnabled {
- binder := binder.NewDevice(ctx, fs.RootOwner, fs.FilePermsFromMode(0666))
- contents["binder"] = newCharacterDevice(ctx, binder, msrc)
- }
-
- if ashmemEnabled {
- ashmem := ashmem.NewDevice(ctx, fs.RootOwner, fs.FilePermsFromMode(0666))
- contents["ashmem"] = newCharacterDevice(ctx, ashmem, msrc)
- }
-
iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
return fs.NewInode(ctx, iops, msrc, fs.StableAttr{
DeviceID: devDevice.DeviceID(),
diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go
index cbc2c2f9b..55f8af704 100644
--- a/pkg/sentry/fs/dev/fs.go
+++ b/pkg/sentry/fs/dev/fs.go
@@ -15,19 +15,10 @@
package dev
import (
- "strconv"
-
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/syserror"
)
-// Optional key containing boolean flag which specifies if Android Binder IPC should be enabled.
-const binderEnabledKey = "binder_enabled"
-
-// Optional key containing boolean flag which specifies if Android ashmem should be enabled.
-const ashmemEnabledKey = "ashmem_enabled"
-
// filesystem is a devtmpfs.
//
// +stateify savable
@@ -67,33 +58,7 @@ func (*filesystem) Flags() fs.FilesystemFlags {
// Mount returns a devtmpfs root that can be positioned in the vfs.
func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) {
- // device is always ignored.
// devtmpfs backed by ramfs ignores bad options. See fs/ramfs/inode.c:ramfs_parse_options.
// -> we should consider parsing the mode and backing devtmpfs by this.
-
- // Parse generic comma-separated key=value options.
- options := fs.GenericMountSourceOptions(data)
-
- // binderEnabledKey is optional and binder is disabled by default.
- binderEnabled := false
- if beStr, exists := options[binderEnabledKey]; exists {
- var err error
- binderEnabled, err = strconv.ParseBool(beStr)
- if err != nil {
- return nil, syserror.EINVAL
- }
- }
-
- // ashmemEnabledKey is optional and ashmem is disabled by default.
- ashmemEnabled := false
- if aeStr, exists := options[ashmemEnabledKey]; exists {
- var err error
- ashmemEnabled, err = strconv.ParseBool(aeStr)
- if err != nil {
- return nil, syserror.EINVAL
- }
- }
-
- // Construct the devtmpfs root.
- return New(ctx, fs.NewNonCachingMountSource(ctx, f, flags), binderEnabled, ashmemEnabled), nil
+ return New(ctx, fs.NewNonCachingMountSource(ctx, f, flags)), nil
}
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index 28651e58b..fbca06761 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -229,11 +229,13 @@ func newDirent(inode *Inode, name string) *Dirent {
if inode != nil {
inode.MountSource.IncDirentRefs()
}
- return &Dirent{
+ d := Dirent{
Inode: inode,
name: name,
children: make(map[string]*refs.WeakRef),
}
+ d.EnableLeakCheck("fs.Dirent")
+ return &d
}
// NewNegativeDirent returns a new root negative Dirent. Otherwise same as NewDirent.
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index 8e1f5674d..bb8117f89 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -130,14 +130,15 @@ type File struct {
// to false respectively.
func NewFile(ctx context.Context, dirent *Dirent, flags FileFlags, fops FileOperations) *File {
dirent.IncRef()
- f := &File{
+ f := File{
UniqueID: uniqueid.GlobalFromContext(ctx),
Dirent: dirent,
FileOperations: fops,
flags: flags,
}
f.mu.Init()
- return f
+ f.EnableLeakCheck("fs.File")
+ return &f
}
// DecRef destroys the File when it is no longer referenced.
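The NewFile change above, like the dirent.go hunk before it and several hunks below (mount.go, mounts.go, tty/terminal.go, host/socket.go), switches from returning a struct literal to constructing a value, arming EnableLeakCheck, and returning its address. A minimal sketch of the pattern, assuming a hypothetical widget type that gets EnableLeakCheck from an embedded refs.AtomicRefCount, as these fs types appear to:

package example

import "gvisor.dev/gvisor/pkg/refs"

// widget is a hypothetical reference-counted type; the types touched in this
// change obtain EnableLeakCheck and DecRef from an embedded or contained
// refs.AtomicRefCount in the same way.
type widget struct {
	refs.AtomicRefCount

	name string
}

// newWidget constructs the value first so that EnableLeakCheck can tag it
// with a descriptive name; when leak checking is enabled, a missing final
// DecRef can then be reported as "example.widget" rather than anonymously.
func newWidget(name string) *widget {
	w := widget{name: name}
	w.EnableLeakCheck("example.widget")
	return &w
}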
diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go
index c6cbd5631..9820f0b13 100644
--- a/pkg/sentry/fs/file_overlay.go
+++ b/pkg/sentry/fs/file_overlay.go
@@ -347,13 +347,14 @@ func (*overlayFileOperations) ConfigureMMap(ctx context.Context, file *File, opt
// preventing us from saving a proper inode mapping for the
// file.
file.IncRef()
- id := &overlayMappingIdentity{
+ id := overlayMappingIdentity{
id: opts.MappingIdentity,
overlayFile: file,
}
+ id.EnableLeakCheck("fs.overlayMappingIdentity")
// Swap out the old MappingIdentity for the wrapped one.
- opts.MappingIdentity = id
+ opts.MappingIdentity = &id
return nil
}
diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go
index b87c4f150..27eeae3d9 100644
--- a/pkg/sentry/fs/gofer/handles.go
+++ b/pkg/sentry/fs/gofer/handles.go
@@ -79,11 +79,12 @@ func newHandles(ctx context.Context, file contextFile, flags fs.FileFlags) (*han
newFile.close(ctx)
return nil, err
}
- h := &handles{
+ h := handles{
File: newFile,
Host: hostFile,
}
- return h, nil
+ h.EnableLeakCheck("gofer.handles")
+ return &h, nil
}
type handleReadWriter struct {
diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go
index b91386909..8c17603f8 100644
--- a/pkg/sentry/fs/gofer/path.go
+++ b/pkg/sentry/fs/gofer/path.go
@@ -145,16 +145,17 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string
defer d.DecRef()
// Construct the new file, caching the handles if allowed.
- h := &handles{
+ h := handles{
File: newFile,
Host: hostFile,
}
+ h.EnableLeakCheck("gofer.handles")
if iops.fileState.canShareHandles() {
iops.fileState.handlesMu.Lock()
- iops.fileState.setSharedHandlesLocked(flags, h)
+ iops.fileState.setSharedHandlesLocked(flags, &h)
iops.fileState.handlesMu.Unlock()
}
- return NewFile(ctx, d, name, flags, iops, h), nil
+ return NewFile(ctx, d, name, flags, iops, &h), nil
}
// CreateLink uses Create to create a symlink between oldname and newname.
diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go
index 9f7660ed1..69d08a627 100644
--- a/pkg/sentry/fs/gofer/session.go
+++ b/pkg/sentry/fs/gofer/session.go
@@ -241,7 +241,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
}
// Construct the session.
- s := &session{
+ s := session{
connID: dev,
msize: o.msize,
version: o.version,
@@ -250,13 +250,14 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
superBlockFlags: superBlockFlags,
mounter: mounter,
}
+ s.EnableLeakCheck("gofer.session")
if o.privateunixsocket {
s.endpoints = newEndpointMaps()
}
// Construct the MountSource with the session and superBlockFlags.
- m := fs.NewMountSource(ctx, s, filesystem, superBlockFlags)
+ m := fs.NewMountSource(ctx, &s, filesystem, superBlockFlags)
// Given that gofer files can consume host FDs, restrict the number
// of files that can be held by the cache.
@@ -290,7 +291,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
return nil, err
}
- sattr, iops := newInodeOperations(ctx, s, s.attach, qid, valid, attr, false)
+ sattr, iops := newInodeOperations(ctx, &s, s.attach, qid, valid, attr, false)
return fs.NewInode(ctx, iops, m, sattr), nil
}
diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go
index 29a79441e..d045e04ff 100644
--- a/pkg/sentry/fs/gofer/session_state.go
+++ b/pkg/sentry/fs/gofer/session_state.go
@@ -111,5 +111,4 @@ func (s *session) afterLoad() {
panic("failed to restore endpoint maps: " + err.Error())
}
}
-
}
diff --git a/pkg/sentry/fs/host/host_state_autogen.go b/pkg/sentry/fs/host/host_state_autogen.go
index 9611da42a..f0e1c4b88 100755
--- a/pkg/sentry/fs/host/host_state_autogen.go
+++ b/pkg/sentry/fs/host/host_state_autogen.go
@@ -95,17 +95,17 @@ func (x *inodeFileState) load(m state.Map) {
func (x *ConnectedEndpoint) save(m state.Map) {
x.beforeSave()
+ m.Save("ref", &x.ref)
m.Save("queue", &x.queue)
m.Save("path", &x.path)
- m.Save("ref", &x.ref)
m.Save("srfd", &x.srfd)
m.Save("stype", &x.stype)
}
func (x *ConnectedEndpoint) load(m state.Map) {
+ m.Load("ref", &x.ref)
m.Load("queue", &x.queue)
m.Load("path", &x.path)
- m.Load("ref", &x.ref)
m.LoadWait("srfd", &x.srfd)
m.Load("stype", &x.stype)
m.AfterLoad(x.afterLoad)
diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go
index 7fedc88bc..44c4ee5f2 100644
--- a/pkg/sentry/fs/host/socket.go
+++ b/pkg/sentry/fs/host/socket.go
@@ -47,12 +47,12 @@ const maxSendBufferSize = 8 << 20
//
// +stateify savable
type ConnectedEndpoint struct {
- queue *waiter.Queue
- path string
-
// ref keeps track of references to a connectedEndpoint.
ref refs.AtomicRefCount
+ queue *waiter.Queue
+ path string
+
// If srfd >= 0, it is the host FD that file was imported from.
srfd int `state:"wait"`
@@ -133,6 +133,8 @@ func NewConnectedEndpoint(ctx context.Context, file *fd.FD, queue *waiter.Queue,
// AtomicRefCounters start off with a single reference. We need two.
e.ref.IncRef()
+ e.ref.EnableLeakCheck("host.ConnectedEndpoint")
+
return &e, nil
}
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index e4aae1135..f4ddfa406 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -86,12 +86,14 @@ type LockCtx struct {
// NewInode takes a reference on msrc.
func NewInode(ctx context.Context, iops InodeOperations, msrc *MountSource, sattr StableAttr) *Inode {
msrc.IncRef()
- return &Inode{
+ i := Inode{
InodeOperations: iops,
StableAttr: sattr,
Watches: newWatches(),
MountSource: msrc,
}
+ i.EnableLeakCheck("fs.Inode")
+ return &i
}
// DecRef drops a reference on the Inode.
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index b247fa514..24b769cfc 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -567,6 +567,12 @@ func overlayCheck(ctx context.Context, o *overlayEntry, p PermMask) error {
if o.upper != nil {
err = o.upper.check(ctx, p)
} else {
+ if p.Write {
+ // Since writes will be redirected to the upper filesystem, the lower
+ // filesystem need not be writable, but must be readable for copy-up.
+ p.Write = false
+ p.Read = true
+ }
err = o.lower.check(ctx, p)
}
o.copyMu.RUnlock()
diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go
index 912495528..7a9692800 100644
--- a/pkg/sentry/fs/mount.go
+++ b/pkg/sentry/fs/mount.go
@@ -138,12 +138,14 @@ func NewMountSource(ctx context.Context, mops MountSourceOperations, filesystem
if filesystem != nil {
fsType = filesystem.Name()
}
- return &MountSource{
+ msrc := MountSource{
MountSourceOperations: mops,
Flags: flags,
FilesystemType: fsType,
fscache: NewDirentCache(DefaultDirentCacheSize),
}
+ msrc.EnableLeakCheck("fs.MountSource")
+ return &msrc
}
// DirentRefs returns the current mount direntRefs.
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index 281364dfc..693ffc760 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -181,12 +181,14 @@ func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error
d: newRootMount(1, d),
}
- return &MountNamespace{
+ mns := MountNamespace{
userns: creds.UserNamespace,
root: d,
mounts: mnts,
mountID: 2,
- }, nil
+ }
+ mns.EnableLeakCheck("fs.MountNamespace")
+ return &mns, nil
}
// UserNamespace returns the user namespace associated with this mount manager.
@@ -661,6 +663,11 @@ func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name s
}
defer d.DecRef()
+ // Check that it is a regular file.
+ if !IsRegular(d.Inode.StableAttr) {
+ continue
+ }
+
// Check whether we can read and execute the found file.
if err := d.Inode.CheckPermission(ctx, PermMask{Read: true, Execute: true}); err != nil {
log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err)
diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go
index ea7aded9a..bee421d76 100644
--- a/pkg/sentry/fs/proc/fds.go
+++ b/pkg/sentry/fs/proc/fds.go
@@ -25,7 +25,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/proc/device"
"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -42,8 +41,8 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF
var file *fs.File
var fdFlags kernel.FDFlags
t.WithMuLocked(func(t *kernel.Task) {
- if fdm := t.FDMap(); fdm != nil {
- file, fdFlags = fdm.GetDescriptor(kdefs.FD(n))
+ if fdTable := t.FDTable(); fdTable != nil {
+ file, fdFlags = fdTable.Get(int32(n))
}
})
if file == nil {
@@ -56,36 +55,31 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF
// toDentAttr callback for each to get a DentAttr, which it then emits. This is
// a helper for implementing fs.InodeOperations.Readdir.
func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) {
- var fds kernel.FDs
+ var fds []int32
t.WithMuLocked(func(t *kernel.Task) {
- if fdm := t.FDMap(); fdm != nil {
- fds = fdm.GetFDs()
+ if fdTable := t.FDTable(); fdTable != nil {
+ fds = fdTable.GetFDs()
}
})
- fdInts := make([]int, 0, len(fds))
- for _, fd := range fds {
- fdInts = append(fdInts, int(fd))
- }
-
- // Find the fd to start at.
- idx := sort.SearchInts(fdInts, int(offset))
- if idx == len(fdInts) {
+ // Find the appropriate starting point.
+ idx := sort.Search(len(fds), func(i int) bool { return fds[i] >= int32(offset) })
+ if idx == len(fds) {
return offset, nil
}
- fdInts = fdInts[idx:]
+ fds = fds[idx:]
- var fd int
- for _, fd = range fdInts {
+ // Serialize all FDs.
+ for _, fd := range fds {
name := strconv.FormatUint(uint64(fd), 10)
- if err := c.DirEmit(name, toDentAttr(fd)); err != nil {
+ if err := c.DirEmit(name, toDentAttr(int(fd))); err != nil {
// Returned offset is the next fd to serialize.
return int64(fd), err
}
}
// We serialized them all. Next offset should be higher than last
// serialized fd.
- return int64(fd + 1), nil
+ return int64(fds[len(fds)-1] + 1), nil
}
// fd implements fs.InodeOperations for a file in /proc/TID/fd/.
@@ -154,9 +148,9 @@ func (f *fd) Close() error {
type fdDir struct {
ramfs.Dir
- // We hold a reference on the task's fdmap but only keep an indirect
- // task pointer to avoid Dirent loading circularity caused by fdmap's
- // potential back pointers into the dirent tree.
+ // We hold a reference on the task's FDTable but only keep an indirect
+ // task pointer to avoid Dirent loading circularity caused by the
+ // table's back pointers into the dirent tree.
t *kernel.Task
}
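The readDescriptors rewrite above assumes fdTable.GetFDs() yields FDs in ascending order, which is what lets sort.Search resume readdir at the first FD greater than or equal to the offset. A small standalone sketch of that lookup (the FD values are made up):

package main

import (
	"fmt"
	"sort"
)

func main() {
	// Hypothetical FD list, assumed ascending as in readDescriptors above.
	fds := []int32{0, 1, 2, 7, 9}
	offset := int64(3)

	// First index whose FD is >= offset; len(fds) if there is none.
	idx := sort.Search(len(fds), func(i int) bool { return fds[i] >= int32(offset) })
	fmt.Println(fds[idx:]) // [7 9]
}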
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index b2e36aeee..ef0ca3301 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -580,8 +580,8 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) (
var fds int
var vss, rss, data uint64
s.t.WithMuLocked(func(t *kernel.Task) {
- if fdm := t.FDMap(); fdm != nil {
- fds = fdm.Size()
+ if fdTable := t.FDTable(); fdTable != nil {
+ fds = fdTable.Size()
}
if mm := t.MemoryManager(); mm != nil {
vss = mm.VirtualMemorySize()
diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go
index 8290f2530..b7cecb2ed 100644
--- a/pkg/sentry/fs/tty/terminal.go
+++ b/pkg/sentry/fs/tty/terminal.go
@@ -38,9 +38,11 @@ type Terminal struct {
func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal {
termios := linux.DefaultSlaveTermios
- return &Terminal{
+ t := Terminal{
d: d,
n: n,
ld: newLineDiscipline(termios),
}
+ t.EnableLeakCheck("tty.Terminal")
+ return &t
}
diff --git a/pkg/sentry/kernel/auth/atomicptr_credentials.go b/pkg/sentry/kernel/auth/atomicptr_credentials_unsafe.go
index 4535c958f..4535c958f 100755
--- a/pkg/sentry/kernel/auth/atomicptr_credentials.go
+++ b/pkg/sentry/kernel/auth/atomicptr_credentials_unsafe.go
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 33c7dccae..9c0a4e1b4 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -26,7 +26,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/anon"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -61,7 +60,7 @@ const (
// +stateify savable
type FileIdentifier struct {
File *fs.File `state:"wait"`
- Fd kdefs.FD
+ Fd int32
}
// pollEntry holds all the state associated with an event poll entry, that is,
diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go
deleted file mode 100644
index 786936a7d..000000000
--- a/pkg/sentry/kernel/fd_map.go
+++ /dev/null
@@ -1,364 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package kernel
-
-import (
- "bytes"
- "fmt"
- "sort"
- "sync"
- "sync/atomic"
- "syscall"
-
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/refs"
- "gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/sentry/fs/lock"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
- "gvisor.dev/gvisor/pkg/sentry/limits"
-)
-
-// FDs is an ordering of FD's that can be made stable.
-type FDs []kdefs.FD
-
-func (f FDs) Len() int {
- return len(f)
-}
-
-func (f FDs) Swap(i, j int) {
- f[i], f[j] = f[j], f[i]
-}
-
-func (f FDs) Less(i, j int) bool {
- return f[i] < f[j]
-}
-
-// FDFlags define flags for an individual descriptor.
-//
-// +stateify savable
-type FDFlags struct {
- // CloseOnExec indicates the descriptor should be closed on exec.
- CloseOnExec bool
-}
-
-// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags
-// representation.
-func (f FDFlags) ToLinuxFileFlags() (mask uint) {
- if f.CloseOnExec {
- mask |= linux.O_CLOEXEC
- }
- return
-}
-
-// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags
-// representation.
-func (f FDFlags) ToLinuxFDFlags() (mask uint) {
- if f.CloseOnExec {
- mask |= linux.FD_CLOEXEC
- }
- return
-}
-
-// descriptor holds the details about a file descriptor, namely a pointer the
-// file itself and the descriptor flags.
-//
-// +stateify savable
-type descriptor struct {
- file *fs.File
- flags FDFlags
-}
-
-// FDMap is used to manage File references and flags.
-//
-// +stateify savable
-type FDMap struct {
- refs.AtomicRefCount
- k *Kernel
- files map[kdefs.FD]descriptor
- mu sync.RWMutex `state:"nosave"`
- uid uint64
-}
-
-// ID returns a unique identifier for this FDMap.
-func (f *FDMap) ID() uint64 {
- return f.uid
-}
-
-// NewFDMap allocates a new FDMap that may be used by tasks in k.
-func (k *Kernel) NewFDMap() *FDMap {
- return &FDMap{
- k: k,
- files: make(map[kdefs.FD]descriptor),
- uid: atomic.AddUint64(&k.fdMapUids, 1),
- }
-}
-
-// destroy removes all of the file descriptors from the map.
-func (f *FDMap) destroy() {
- f.RemoveIf(func(*fs.File, FDFlags) bool {
- return true
- })
-}
-
-// DecRef implements RefCounter.DecRef with destructor f.destroy.
-func (f *FDMap) DecRef() {
- f.DecRefWithDestructor(f.destroy)
-}
-
-// Size returns the number of file descriptor slots currently allocated.
-func (f *FDMap) Size() int {
- f.mu.RLock()
- defer f.mu.RUnlock()
-
- return len(f.files)
-}
-
-// String is a stringer for FDMap.
-func (f *FDMap) String() string {
- f.mu.RLock()
- defer f.mu.RUnlock()
-
- var b bytes.Buffer
- for k, v := range f.files {
- n, _ := v.file.Dirent.FullName(nil /* root */)
- b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", k, n))
- }
- return b.String()
-}
-
-// NewFDFrom allocates a new FD guaranteed to be the lowest number available
-// greater than or equal to from. This property is important as Unix programs
-// tend to count on this allocation order.
-func (f *FDMap) NewFDFrom(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) (kdefs.FD, error) {
- if fd < 0 {
- // Don't accept negative FDs.
- return 0, syscall.EINVAL
- }
-
- f.mu.Lock()
- defer f.mu.Unlock()
-
- // Finds the lowest fd not in the handles map.
- lim := limitSet.Get(limits.NumberOfFiles)
- for i := fd; lim.Cur == limits.Infinity || i < kdefs.FD(lim.Cur); i++ {
- if _, ok := f.files[i]; !ok {
- file.IncRef()
- f.files[i] = descriptor{file, flags}
- return i, nil
- }
- }
-
- return -1, syscall.EMFILE
-}
-
-// NewFDAt sets the file reference for the given FD. If there is an
-// active reference for that FD, the ref count for that existing reference
-// is decremented.
-func (f *FDMap) NewFDAt(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) error {
- if fd < 0 {
- // Don't accept negative FDs.
- return syscall.EBADF
- }
-
- // In this one case we do not do a defer of the Unlock. The
- // reason is that we must have done all the work needed for
- // discarding any old open file before we return to the
- // caller. In other words, the DecRef(), below, must have
- // completed by the time we return to the caller to ensure
- // side effects are, in fact, effected. A classic example is
- // dup2(fd1, fd2); if fd2 was already open, it must be closed,
- // and we don't want to resume the caller until it is; we have
- // to block on the DecRef(). Hence we can not just do a 'go
- // oldfile.DecRef()', since there would be no guarantee that
- // it would be done before we the caller resumed. Since we
- // must wait for the DecRef() to finish, and that could take
- // time, it's best to first call f.muUnlock beore so we are
- // not blocking other uses of this FDMap on the DecRef() call.
- f.mu.Lock()
- oldDesc, oldExists := f.files[fd]
- lim := limitSet.Get(limits.NumberOfFiles).Cur
- // if we're closing one then the effective limit is one
- // more than the actual limit.
- if oldExists && lim != limits.Infinity {
- lim++
- }
- if lim != limits.Infinity && fd >= kdefs.FD(lim) {
- f.mu.Unlock()
- return syscall.EMFILE
- }
-
- file.IncRef()
- f.files[fd] = descriptor{file, flags}
- f.mu.Unlock()
-
- if oldExists {
- oldDesc.file.DecRef()
- }
- return nil
-}
-
-// SetFlags sets the flags for the given file descriptor, if it is valid.
-func (f *FDMap) SetFlags(fd kdefs.FD, flags FDFlags) {
- f.mu.Lock()
- defer f.mu.Unlock()
-
- desc, ok := f.files[fd]
- if !ok {
- return
- }
-
- f.files[fd] = descriptor{desc.file, flags}
-}
-
-// GetDescriptor returns a reference to the file and the flags for the FD. It
-// bumps its reference count as well. It returns nil if there is no File
-// for the FD, i.e. if the FD is invalid. The caller must use DecRef
-// when they are done.
-func (f *FDMap) GetDescriptor(fd kdefs.FD) (*fs.File, FDFlags) {
- f.mu.RLock()
- defer f.mu.RUnlock()
-
- if desc, ok := f.files[fd]; ok {
- desc.file.IncRef()
- return desc.file, desc.flags
- }
- return nil, FDFlags{}
-}
-
-// GetFile returns a reference to the File for the FD and bumps
-// its reference count as well. It returns nil if there is no File
-// for the FD, i.e. if the FD is invalid. The caller must use DecRef
-// when they are done.
-func (f *FDMap) GetFile(fd kdefs.FD) *fs.File {
- f.mu.RLock()
- if desc, ok := f.files[fd]; ok {
- desc.file.IncRef()
- f.mu.RUnlock()
- return desc.file
- }
- f.mu.RUnlock()
- return nil
-}
-
-// fds returns an ordering of FDs.
-func (f *FDMap) fds() FDs {
- fds := make(FDs, 0, len(f.files))
- for fd := range f.files {
- fds = append(fds, fd)
- }
- sort.Sort(fds)
- return fds
-}
-
-// GetFDs returns a list of valid fds.
-func (f *FDMap) GetFDs() FDs {
- f.mu.RLock()
- defer f.mu.RUnlock()
- return f.fds()
-}
-
-// GetRefs returns a stable slice of references to all files and bumps the
-// reference count on each. The caller must use DecRef on each reference when
-// they're done using the slice.
-func (f *FDMap) GetRefs() []*fs.File {
- f.mu.RLock()
- defer f.mu.RUnlock()
-
- fds := f.fds()
- fs := make([]*fs.File, 0, len(fds))
- for _, fd := range fds {
- desc := f.files[fd]
- desc.file.IncRef()
- fs = append(fs, desc.file)
- }
- return fs
-}
-
-// Fork returns an independent FDMap pointing to the same descriptors.
-func (f *FDMap) Fork() *FDMap {
- f.mu.RLock()
- defer f.mu.RUnlock()
-
- clone := f.k.NewFDMap()
-
- // Grab a extra reference for every file.
- for fd, desc := range f.files {
- desc.file.IncRef()
- clone.files[fd] = desc
- }
-
- // That's it!
- return clone
-}
-
-// unlock releases all file locks held by this FDMap's uid. Must only be
-// called on a non-nil *fs.File.
-func (f *FDMap) unlock(file *fs.File) {
- id := lock.UniqueID(f.ID())
- file.Dirent.Inode.LockCtx.Posix.UnlockRegion(id, lock.LockRange{0, lock.LockEOF})
-}
-
-// inotifyFileClose generates the appropriate inotify events for f being closed.
-func inotifyFileClose(f *fs.File) {
- var ev uint32
- d := f.Dirent
-
- if fs.IsDir(d.Inode.StableAttr) {
- ev |= linux.IN_ISDIR
- }
-
- if f.Flags().Write {
- ev |= linux.IN_CLOSE_WRITE
- } else {
- ev |= linux.IN_CLOSE_NOWRITE
- }
-
- d.InotifyEvent(ev, 0)
-}
-
-// Remove removes an FD from the FDMap, and returns (File, true) if a File
-// one was found. Callers are expected to decrement the reference count on
-// the File. Otherwise returns (nil, false).
-func (f *FDMap) Remove(fd kdefs.FD) (*fs.File, bool) {
- f.mu.Lock()
- desc := f.files[fd]
- delete(f.files, fd)
- f.mu.Unlock()
- if desc.file != nil {
- f.unlock(desc.file)
- inotifyFileClose(desc.file)
- return desc.file, true
- }
- return nil, false
-}
-
-// RemoveIf removes all FDs where cond is true.
-func (f *FDMap) RemoveIf(cond func(*fs.File, FDFlags) bool) {
- var removed []*fs.File
- f.mu.Lock()
- for fd, desc := range f.files {
- if desc.file != nil && cond(desc.file, desc.flags) {
- delete(f.files, fd)
- removed = append(removed, desc.file)
- }
- }
- f.mu.Unlock()
-
- for _, file := range removed {
- f.unlock(file)
- inotifyFileClose(file)
- file.DecRef()
- }
-}
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
new file mode 100644
index 000000000..1f3a57dc1
--- /dev/null
+++ b/pkg/sentry/kernel/fd_table.go
@@ -0,0 +1,380 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "bytes"
+ "fmt"
+ "math"
+ "sync"
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+)
+
+// FDFlags define flags for an individual descriptor.
+//
+// +stateify savable
+type FDFlags struct {
+ // CloseOnExec indicates the descriptor should be closed on exec.
+ CloseOnExec bool
+}
+
+// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags
+// representation.
+func (f FDFlags) ToLinuxFileFlags() (mask uint) {
+ if f.CloseOnExec {
+ mask |= linux.O_CLOEXEC
+ }
+ return
+}
+
+// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags
+// representation.
+func (f FDFlags) ToLinuxFDFlags() (mask uint) {
+ if f.CloseOnExec {
+ mask |= linux.FD_CLOEXEC
+ }
+ return
+}
+
+// descriptor holds the details about a file descriptor, namely a pointer to
+// the file itself and the descriptor flags.
+//
+// Note that this is immutable and can only be changed via operations on the
+// descriptorTable.
+//
+// +stateify savable
+type descriptor struct {
+ file *fs.File
+ flags FDFlags
+}
+
+// FDTable is used to manage File references and flags.
+//
+// +stateify savable
+type FDTable struct {
+ refs.AtomicRefCount
+ k *Kernel
+
+ // uid is a unique identifier.
+ uid uint64
+
+ // mu protects below.
+ mu sync.Mutex `state:"nosave"`
+
+ // used contains the number of non-nil entries.
+ used int32
+
+ // descriptorTable holds descriptors.
+ descriptorTable `state:".(map[int32]descriptor)"`
+}
+
+func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
+ m := make(map[int32]descriptor)
+ f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+ m[fd] = descriptor{
+ file: file,
+ flags: flags,
+ }
+ })
+ return m
+}
+
+func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
+ f.init() // Initialize table.
+ for fd, d := range m {
+ f.set(fd, d.file, d.flags)
+
+ // Note that we do _not_ need to acquire an extra table
+ // reference here. The table reference will already be
+ // accounted for in the file, so we drop the reference taken by
+ // set above.
+ d.file.DecRef()
+ }
+}
+
+// drop drops the table reference.
+func (f *FDTable) drop(file *fs.File) {
+ // Release locks.
+ file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lock.UniqueID(f.uid), lock.LockRange{0, lock.LockEOF})
+
+ // Send inotify events.
+ d := file.Dirent
+ var ev uint32
+ if fs.IsDir(d.Inode.StableAttr) {
+ ev |= linux.IN_ISDIR
+ }
+ if file.Flags().Write {
+ ev |= linux.IN_CLOSE_WRITE
+ } else {
+ ev |= linux.IN_CLOSE_NOWRITE
+ }
+ d.InotifyEvent(ev, 0)
+
+ // Drop the table reference.
+ file.DecRef()
+}
+
+// ID returns a unique identifier for this FDTable.
+func (f *FDTable) ID() uint64 {
+ return f.uid
+}
+
+// NewFDTable allocates a new FDTable that may be used by tasks in k.
+func (k *Kernel) NewFDTable() *FDTable {
+ f := &FDTable{
+ k: k,
+ uid: atomic.AddUint64(&k.fdMapUids, 1),
+ }
+ f.init()
+ return f
+}
+
+// destroy removes all of the file descriptors from the map.
+func (f *FDTable) destroy() {
+ f.RemoveIf(func(*fs.File, FDFlags) bool {
+ return true
+ })
+}
+
+// DecRef implements RefCounter.DecRef with destructor f.destroy.
+func (f *FDTable) DecRef() {
+ f.DecRefWithDestructor(f.destroy)
+}
+
+// Size returns the number of file descriptor slots currently allocated.
+func (f *FDTable) Size() int {
+ size := atomic.LoadInt32(&f.used)
+ return int(size)
+}
+
+// forEach iterates over all non-nil files.
+//
+// It is the caller's responsibility to acquire an appropriate lock.
+func (f *FDTable) forEach(fn func(fd int32, file *fs.File, flags FDFlags)) {
+ fd := int32(0)
+ for {
+ file, flags, ok := f.get(fd)
+ if !ok {
+ break
+ }
+ if file != nil {
+ if !file.TryIncRef() {
+ continue // Race caught.
+ }
+ fn(int32(fd), file, flags)
+ file.DecRef()
+ }
+ fd++
+ }
+}
+
+// String is a stringer for FDTable.
+func (f *FDTable) String() string {
+ var b bytes.Buffer
+ f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+ n, _ := file.Dirent.FullName(nil /* root */)
+ b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n))
+ })
+ return b.String()
+}
+
+// NewFDs allocates new FDs guaranteed to be the lowest number available
+// greater than or equal to the fd parameter. All files will share the given
+// flags. Success is guaranteed to be all or none.
+func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags FDFlags) (fds []int32, err error) {
+ if fd < 0 {
+ // Don't accept negative FDs.
+ return nil, syscall.EINVAL
+ }
+
+ // Default limit.
+ end := int32(math.MaxInt32)
+
+ // Ensure we don't get past the provided limit.
+ if limitSet := limits.FromContext(ctx); limitSet != nil {
+ lim := limitSet.Get(limits.NumberOfFiles)
+ if lim.Cur != limits.Infinity {
+ end = int32(lim.Cur)
+ }
+ if fd >= end {
+ return nil, syscall.EMFILE
+ }
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ // Install all entries.
+ for i := fd; i < end && len(fds) < len(files); i++ {
+ if d, _, _ := f.get(i); d == nil {
+ f.set(i, files[len(fds)], flags) // Set the descriptor.
+ fds = append(fds, i) // Record the file descriptor.
+ }
+ }
+
+ // Failure? Unwind existing FDs.
+ if len(fds) < len(files) {
+ for _, i := range fds {
+ f.set(i, nil, FDFlags{}) // Zap entry.
+ }
+ return nil, syscall.EMFILE
+ }
+
+ return fds, nil
+}
+
+// NewFDAt sets the file reference for the given FD. If there is an active
+// reference for that FD, the ref count for that existing reference is
+// decremented.
+func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error {
+ if fd < 0 {
+ // Don't accept negative FDs.
+ return syscall.EBADF
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ // Check the limit for the provided file.
+ if limitSet := limits.FromContext(ctx); limitSet != nil {
+ if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
+ return syscall.EMFILE
+ }
+ }
+
+ // Install the entry.
+ f.set(fd, file, flags)
+ return nil
+}
+
+// SetFlags sets the flags for the given file descriptor.
+//
+// An error is returned if the FD is invalid or has no file associated with it.
+func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
+ if fd < 0 {
+ // Don't accept negative FDs.
+ return syscall.EBADF
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ file, _, _ := f.get(fd)
+ if file == nil {
+ // No file found.
+ return syscall.EBADF
+ }
+
+ // Update the flags.
+ f.set(fd, file, flags)
+ return nil
+}
+
+// Get returns a reference to the file and the flags for the FD or nil if no
+// file is defined for the given fd.
+//
+// N.B. Callers are required to use DecRef when they are done.
+//
+//go:nosplit
+func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) {
+ if fd < 0 {
+ return nil, FDFlags{}
+ }
+
+ for {
+ file, flags, _ := f.get(fd)
+ if file != nil {
+ if !file.TryIncRef() {
+ continue // Race caught.
+ }
+ // Reference acquired.
+ return file, flags
+ }
+ // No file available.
+ return nil, FDFlags{}
+ }
+}
+
+// GetFDs returns a list of valid fds.
+func (f *FDTable) GetFDs() []int32 {
+ fds := make([]int32, 0, f.used)
+ f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+ fds = append(fds, fd)
+ })
+ return fds
+}
+
+// GetRefs returns a stable slice of references to all files and bumps the
+// reference count on each. The caller must use DecRef on each reference when
+// they're done using the slice.
+func (f *FDTable) GetRefs() []*fs.File {
+ files := make([]*fs.File, 0, f.Size())
+ f.forEach(func(_ int32, file *fs.File, flags FDFlags) {
+ file.IncRef() // Acquire a reference for caller.
+ files = append(files, file)
+ })
+ return files
+}
+
+// Fork returns an independent FDTable.
+func (f *FDTable) Fork() *FDTable {
+ clone := f.k.NewFDTable()
+
+ f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+ // The set function here will acquire an appropriate table
+ // reference for the clone. We don't need anything else.
+ clone.set(fd, file, flags)
+ })
+ return clone
+}
+
+// Remove removes an FD from the table and returns a non-nil file iff successful.
+//
+// N.B. Callers are required to use DecRef when they are done.
+func (f *FDTable) Remove(fd int32) *fs.File {
+ if fd < 0 {
+ return nil
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ orig, _, _ := f.get(fd)
+ if orig != nil {
+ orig.IncRef() // Reference for caller.
+ f.set(fd, nil, FDFlags{}) // Zap entry.
+ }
+ return orig
+}
+
+// RemoveIf removes all FDs where cond is true.
+func (f *FDTable) RemoveIf(cond func(*fs.File, FDFlags) bool) {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+ if cond(file, flags) {
+ f.set(fd, nil, FDFlags{}) // Clear from table.
+ }
+ })
+}
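
As a rough orientation for reviewers, a hedged sketch of driving the new FDTable API directly, outside of a Task. It assumes ctx carries a limits.LimitSet and file is an open *fs.File supplied by the caller; error handling is abbreviated:

    package fdsketch

    import (
        "gvisor.dev/gvisor/pkg/sentry/context"
        "gvisor.dev/gvisor/pkg/sentry/fs"
        "gvisor.dev/gvisor/pkg/sentry/kernel"
    )

    // fdTableSketch is illustrative only.
    func fdTableSketch(ctx context.Context, k *kernel.Kernel, file *fs.File) {
        table := k.NewFDTable()
        defer table.DecRef()

        // Install the file at the lowest free descriptor >= 0.
        fds, err := table.NewFDs(ctx, 0, []*fs.File{file}, kernel.FDFlags{CloseOnExec: true})
        if err != nil {
            return
        }

        // Get acquires a reference that the caller must drop.
        if f, flags := table.Get(fds[0]); f != nil {
            _ = flags.CloseOnExec
            f.DecRef()
        }

        // Remove returns the file with a reference held for the caller.
        if f := table.Remove(fds[0]); f != nil {
            f.DecRef()
        }
    }
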
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
new file mode 100644
index 000000000..e009df974
--- /dev/null
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -0,0 +1,103 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "sync/atomic"
+ "unsafe"
+
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+)
+
+type descriptorTable struct {
+ // slice is a *[]unsafe.Pointer, where each element is actually a
+ // *descriptor object, updated atomically.
+ //
+ // Changes to the slice itself require holding FDTable.mu.
+ slice unsafe.Pointer `state:".(map[int32]*descriptor)"`
+}
+
+// init initializes the table.
+func (f *FDTable) init() {
+ var slice []unsafe.Pointer // Empty slice.
+ atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
+}
+
+// get gets a file entry.
+//
+// The boolean indicates whether this was in range.
+//
+//go:nosplit
+func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) {
+ slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
+ if fd >= int32(len(slice)) {
+ return nil, FDFlags{}, false
+ }
+ d := (*descriptor)(atomic.LoadPointer(&slice[fd]))
+ if d == nil {
+ return nil, FDFlags{}, true
+ }
+ return d.file, d.flags, true
+}
+
+// set sets an entry.
+//
+// This handles accounting changes, as well as acquiring and releasing the
+// reference needed by the table iff the file is different.
+//
+// Precondition: mu must be held.
+func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) {
+ slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
+
+ // Grow the table as required.
+ if last := int32(len(slice)); fd >= last {
+ end := fd + 1
+ if end < 2*last {
+ end = 2 * last
+ }
+ slice = append(slice, make([]unsafe.Pointer, end-last)...)
+ atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
+ }
+
+ // Create the new element.
+ var d *descriptor
+ if file != nil {
+ d = &descriptor{
+ file: file,
+ flags: flags,
+ }
+ }
+
+ // Update the single element.
+ orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(d)))
+
+ // Acquire a table reference.
+ if file != nil && (orig == nil || file != orig.file) {
+ file.IncRef()
+ }
+
+ // Drop the table reference.
+ if orig != nil && file != orig.file {
+ f.drop(orig.file)
+ }
+
+ // Adjust used.
+ switch {
+ case orig == nil && file != nil:
+ atomic.AddInt32(&f.used, 1)
+ case orig != nil && file == nil:
+ atomic.AddInt32(&f.used, -1)
+ }
+}
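
The core idea above is a lock-free read path over an atomically published slice: readers load the slice header and then an element pointer, while writers (holding FDTable.mu) grow the slice and swap elements in place. A standalone sketch of that idea with plain ints, with simplified growth and illustrative names:

    package lockfree

    import (
        "sync"
        "sync/atomic"
        "unsafe"
    )

    // table mirrors descriptorTable: readers are lock-free, writers hold mu.
    type table struct {
        mu    sync.Mutex     // serializes writers (grow + set)
        slice unsafe.Pointer // *[]unsafe.Pointer, each element a *int
    }

    func newTable() *table {
        var s []unsafe.Pointer // empty slice, like FDTable.init
        t := &table{}
        atomic.StorePointer(&t.slice, unsafe.Pointer(&s))
        return t
    }

    // get is safe to call concurrently with set.
    func (t *table) get(i int) *int {
        s := *(*[]unsafe.Pointer)(atomic.LoadPointer(&t.slice))
        if i >= len(s) {
            return nil
        }
        return (*int)(atomic.LoadPointer(&s[i]))
    }

    // set must not race with itself, hence mu.
    func (t *table) set(i int, v *int) {
        t.mu.Lock()
        defer t.mu.Unlock()

        s := *(*[]unsafe.Pointer)(atomic.LoadPointer(&t.slice))
        if i >= len(s) {
            // Grow and publish a new slice header; growth is simplified
            // here (no doubling).
            s = append(s, make([]unsafe.Pointer, i+1-len(s))...)
            atomic.StorePointer(&t.slice, unsafe.Pointer(&s))
        }
        atomic.SwapPointer(&s[i], unsafe.Pointer(v))
    }
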
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index 938239aeb..ded27d668 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -51,11 +51,13 @@ type FSContext struct {
func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext {
root.IncRef()
cwd.IncRef()
- return &FSContext{
+ f := FSContext{
root: root,
cwd: cwd,
umask: umask,
}
+ f.EnableLeakCheck("kernel.FSContext")
+ return &f
}
// destroy is the destructor for an FSContext.
diff --git a/pkg/sentry/kernel/futex/atomicptr_bucket.go b/pkg/sentry/kernel/futex/atomicptr_bucket_unsafe.go
index d3fdf09b0..d3fdf09b0 100755
--- a/pkg/sentry/kernel/futex/atomicptr_bucket.go
+++ b/pkg/sentry/kernel/futex/atomicptr_bucket_unsafe.go
diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go
index 3bd5c04af..278cc8143 100644
--- a/pkg/sentry/kernel/futex/futex.go
+++ b/pkg/sentry/kernel/futex/futex.go
@@ -729,14 +729,14 @@ func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool
}
b := m.lockBucket(&k)
- err = m.unlockPILocked(t, addr, tid, b)
+ err = m.unlockPILocked(t, addr, tid, b, &k)
k.release()
b.mu.Unlock()
return err
}
-func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket) error {
+func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket, key *Key) error {
cur, err := t.LoadUint32(addr)
if err != nil {
return err
@@ -746,7 +746,22 @@ func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *buc
return syserror.EPERM
}
- if b.waiters.Empty() {
+ var next *Waiter // Who's the next owner?
+ var next2 *Waiter // Who's the one after that?
+ for w := b.waiters.Front(); w != nil; w = w.Next() {
+ if !w.key.matches(key) {
+ continue
+ }
+
+ if next == nil {
+ next = w
+ } else {
+ next2 = w
+ break
+ }
+ }
+
+ if next == nil {
// It's safe to set 0 because there are no waiters, no new owner, and the
// executing task is the current owner (no owner died bit).
prev, err := t.CompareAndSwapUint32(addr, cur, 0)
@@ -761,12 +776,10 @@ func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *buc
return nil
}
- next := b.waiters.Front()
-
// Set next owner's TID, waiters if there are any. Resets owner died bit, if
// set, because the executing task takes over as the owner.
val := next.tid
- if next.Next() != nil {
+ if next2 != nil {
val |= linux.FUTEX_WAITERS
}
diff --git a/pkg/sentry/kernel/kdefs/kdefs_state_autogen.go b/pkg/sentry/kernel/kdefs/kdefs_state_autogen.go
deleted file mode 100755
index cef77125b..000000000
--- a/pkg/sentry/kernel/kdefs/kdefs_state_autogen.go
+++ /dev/null
@@ -1,4 +0,0 @@
-// automatically generated by stateify.
-
-package kdefs
-
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 47dadc43a..38b49cba2 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -155,7 +155,7 @@ type Kernel struct {
// cpuClockTicker increments cpuClock.
cpuClockTicker *ktime.Timer `state:"nosave"`
- // fdMapUids is an ever-increasing counter for generating FDMap uids.
+ // fdMapUids is an ever-increasing counter for generating FDTable uids.
//
// fdMapUids is mutable, and is accessed using atomic memory operations.
fdMapUids uint64
@@ -400,8 +400,8 @@ func (k *Kernel) flushMountSourceRefs() error {
// There may be some open FDs whose filesystems have been unmounted. We
// must flush those as well.
- return k.tasks.forEachFDPaused(func(desc descriptor) error {
- desc.file.Dirent.Inode.MountSource.FlushDirentRefs()
+ return k.tasks.forEachFDPaused(func(file *fs.File) error {
+ file.Dirent.Inode.MountSource.FlushDirentRefs()
return nil
})
}
@@ -410,35 +410,35 @@ func (k *Kernel) flushMountSourceRefs() error {
// task.
//
// Precondition: Must be called with the kernel paused.
-func (ts *TaskSet) forEachFDPaused(f func(descriptor) error) error {
+func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) {
ts.mu.RLock()
defer ts.mu.RUnlock()
for t := range ts.Root.tids {
// We can skip locking Task.mu here since the kernel is paused.
- if t.fds == nil {
+ if t.fdTable == nil {
continue
}
- for _, desc := range t.fds.files {
- if err := f(desc); err != nil {
- return err
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+ if lastErr := f(file); lastErr != nil && err == nil {
+ err = lastErr
}
- }
+ })
}
- return nil
+ return err
}
func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
- return ts.forEachFDPaused(func(desc descriptor) error {
- if flags := desc.file.Flags(); !flags.Write {
+ return ts.forEachFDPaused(func(file *fs.File) error {
+ if flags := file.Flags(); !flags.Write {
return nil
}
- if sattr := desc.file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) {
+ if sattr := file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) {
return nil
}
// Here we need all metadata synced.
- syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
+ syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
if err := fs.SaveFileFsyncError(syncErr); err != nil {
- name, _ := desc.file.Dirent.FullName(nil /* root */)
+ name, _ := file.Dirent.FullName(nil /* root */)
// Wrap this error in ErrSaveRejection
// so that it will trigger a save
// error, rather than a panic. This
@@ -483,14 +483,12 @@ func (ts *TaskSet) unregisterEpollWaiters() {
defer ts.mu.RUnlock()
for t := range ts.Root.tids {
// We can skip locking Task.mu here since the kernel is paused.
- if fdmap := t.fds; fdmap != nil {
- for _, desc := range fdmap.files {
- if desc.file != nil {
- if e, ok := desc.file.FileOperations.(*epoll.EventPoll); ok {
- e.UnregisterEpollWaiters()
- }
+ if t.fdTable != nil {
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+ if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
+ e.UnregisterEpollWaiters()
}
- }
+ })
}
}
}
@@ -538,6 +536,8 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error {
}
log.Infof("Memory load took [%s].", time.Since(memoryStart))
+ log.Infof("Overall load took [%s]", time.Since(loadStart))
+
// Ensure that all pending asynchronous work is complete:
// - namedpipe opening
// - inode file opening
@@ -602,9 +602,9 @@ type CreateProcessArgs struct {
// Credentials is the initial credentials.
Credentials *auth.Credentials
- // FDMap is the initial set of file descriptors. If CreateProcess succeeds,
- // it takes a reference on FDMap.
- FDMap *FDMap
+ // FDTable is the initial set of file descriptors. If CreateProcess succeeds,
+ // it takes a reference on FDTable.
+ FDTable *FDTable
// Umask is the initial umask.
Umask uint
@@ -789,9 +789,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
return nil, 0, errors.New(se.String())
}
- // Take a reference on the FDMap, which will be transferred to
+ // Take a reference on the FDTable, which will be transferred to
// TaskSet.NewTask().
- args.FDMap.IncRef()
+ args.FDTable.IncRef()
// Create the task.
config := &TaskConfig{
@@ -799,7 +799,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
ThreadGroup: tg,
TaskContext: tc,
FSContext: newFSContext(root, wd, args.Umask),
- FDMap: args.FDMap,
+ FDTable: args.FDTable,
Credentials: args.Credentials,
AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores),
UTSNamespace: args.UTSNamespace,
@@ -871,7 +871,7 @@ func (k *Kernel) pauseTimeLocked() {
}
// By precondition, nothing else can be interacting with PIDNamespace.tids
- // or FDMap.files, so we can iterate them without synchronization. (We
+ // or FDTable.files, so we can iterate them without synchronization. (We
// can't hold the TaskSet mutex when pausing thread group timers because
// thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
// mutex, while holding the Timer mutex.)
@@ -882,14 +882,14 @@ func (k *Kernel) pauseTimeLocked() {
it.PauseTimer()
}
}
- // This means we'll iterate FDMaps shared by multiple tasks repeatedly,
+ // This means we'll iterate FDTables shared by multiple tasks repeatedly,
// but ktime.Timer.Pause is idempotent so this is harmless.
- if fdm := t.fds; fdm != nil {
- for _, desc := range fdm.files {
- if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok {
+ if t.fdTable != nil {
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+ if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
tfd.PauseTimer()
}
- }
+ })
}
}
k.timekeeper.PauseUpdates()
@@ -914,12 +914,12 @@ func (k *Kernel) resumeTimeLocked() {
it.ResumeTimer()
}
}
- if fdm := t.fds; fdm != nil {
- for _, desc := range fdm.files {
- if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok {
+ if t.fdTable != nil {
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+ if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
tfd.ResumeTimer()
}
- }
+ })
}
}
}
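
For callers of CreateProcess, the rename mostly means swapping NewFDMap for NewFDTable. A hedged sketch of the caller side, assuming the remaining CreateProcessArgs fields (Credentials, Umask, and the rest) are filled in elsewhere:

    package procsketch

    import "gvisor.dev/gvisor/pkg/sentry/kernel"

    // startProcessSketch is illustrative only; it shows the FDTable
    // reference handling described in the comments above.
    func startProcessSketch(k *kernel.Kernel, args kernel.CreateProcessArgs) error {
        fdTable := k.NewFDTable()
        defer fdTable.DecRef() // drop the constructor's reference when done

        // CreateProcess takes its own reference on FDTable if it succeeds,
        // so the table outlives this function on the success path.
        args.FDTable = fdTable
        _, _, err := k.CreateProcess(args)
        return err
    }
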
diff --git a/pkg/sentry/kernel/kernel_state_autogen.go b/pkg/sentry/kernel/kernel_state_autogen.go
index 56c59e76c..b24036d33 100755
--- a/pkg/sentry/kernel/kernel_state_autogen.go
+++ b/pkg/sentry/kernel/kernel_state_autogen.go
@@ -61,21 +61,24 @@ func (x *descriptor) load(m state.Map) {
m.Load("flags", &x.flags)
}
-func (x *FDMap) beforeSave() {}
-func (x *FDMap) save(m state.Map) {
+func (x *FDTable) beforeSave() {}
+func (x *FDTable) save(m state.Map) {
x.beforeSave()
+ var descriptorTable map[int32]descriptor = x.saveDescriptorTable()
+ m.SaveValue("descriptorTable", descriptorTable)
m.Save("AtomicRefCount", &x.AtomicRefCount)
m.Save("k", &x.k)
- m.Save("files", &x.files)
m.Save("uid", &x.uid)
+ m.Save("used", &x.used)
}
-func (x *FDMap) afterLoad() {}
-func (x *FDMap) load(m state.Map) {
+func (x *FDTable) afterLoad() {}
+func (x *FDTable) load(m state.Map) {
m.Load("AtomicRefCount", &x.AtomicRefCount)
m.Load("k", &x.k)
- m.Load("files", &x.files)
m.Load("uid", &x.uid)
+ m.Load("used", &x.used)
+ m.LoadValue("descriptorTable", new(map[int32]descriptor), func(y interface{}) { x.loadDescriptorTable(y.(map[int32]descriptor)) })
}
func (x *FSContext) beforeSave() {}
@@ -536,8 +539,8 @@ func (x *Task) save(m state.Map) {
m.Save("k", &x.k)
m.Save("containerID", &x.containerID)
m.Save("tc", &x.tc)
- m.Save("fsc", &x.fsc)
- m.Save("fds", &x.fds)
+ m.Save("fsContext", &x.fsContext)
+ m.Save("fdTable", &x.fdTable)
m.Save("vforkParent", &x.vforkParent)
m.Save("exitState", &x.exitState)
m.Save("exitTracerNotified", &x.exitTracerNotified)
@@ -592,8 +595,8 @@ func (x *Task) load(m state.Map) {
m.Load("k", &x.k)
m.Load("containerID", &x.containerID)
m.Load("tc", &x.tc)
- m.Load("fsc", &x.fsc)
- m.Load("fds", &x.fds)
+ m.Load("fsContext", &x.fsContext)
+ m.Load("fdTable", &x.fdTable)
m.Load("vforkParent", &x.vforkParent)
m.Load("exitState", &x.exitState)
m.Load("exitTracerNotified", &x.exitTracerNotified)
@@ -1118,7 +1121,7 @@ func init() {
state.Register("kernel.AbstractSocketNamespace", (*AbstractSocketNamespace)(nil), state.Fns{Save: (*AbstractSocketNamespace).save, Load: (*AbstractSocketNamespace).load})
state.Register("kernel.FDFlags", (*FDFlags)(nil), state.Fns{Save: (*FDFlags).save, Load: (*FDFlags).load})
state.Register("kernel.descriptor", (*descriptor)(nil), state.Fns{Save: (*descriptor).save, Load: (*descriptor).load})
- state.Register("kernel.FDMap", (*FDMap)(nil), state.Fns{Save: (*FDMap).save, Load: (*FDMap).load})
+ state.Register("kernel.FDTable", (*FDTable)(nil), state.Fns{Save: (*FDTable).save, Load: (*FDTable).load})
state.Register("kernel.FSContext", (*FSContext)(nil), state.Fns{Save: (*FSContext).save, Load: (*FSContext).load})
state.Register("kernel.IPCNamespace", (*IPCNamespace)(nil), state.Fns{Save: (*IPCNamespace).save, Load: (*IPCNamespace).load})
state.Register("kernel.Kernel", (*Kernel)(nil), state.Fns{Save: (*Kernel).save, Load: (*Kernel).load})
diff --git a/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go b/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo_unsafe.go
index be6b07629..895abb129 100755
--- a/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go
+++ b/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo_unsafe.go
@@ -1,11 +1,12 @@
package kernel
import (
+ "unsafe"
+
"fmt"
"gvisor.dev/gvisor/third_party/gvsync"
"reflect"
"strings"
- "unsafe"
)
// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index 355984140..81fcd8258 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -294,6 +294,7 @@ func (tg *ThreadGroup) createSession() error {
id: SessionID(id),
leader: tg,
}
+ s.refs.EnableLeakCheck("kernel.Session")
// Create a new ProcessGroup, belonging to that Session.
// This also has a single reference (assigned below).
@@ -307,6 +308,7 @@ func (tg *ThreadGroup) createSession() error {
session: s,
ancestors: 0,
}
+ pg.refs.EnableLeakCheck("kernel.ProcessGroup")
// Tie them and return the result.
s.processGroups.PushBack(pg)
@@ -378,11 +380,13 @@ func (tg *ThreadGroup) CreateProcessGroup() error {
// We manually adjust the ancestors if the parent is in the same
// session.
tg.processGroup.session.incRef()
- pg := &ProcessGroup{
+ pg := ProcessGroup{
id: ProcessGroupID(id),
originator: tg,
session: tg.processGroup.session,
}
+ pg.refs.EnableLeakCheck("kernel.ProcessGroup")
+
if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session {
pg.ancestors++
}
@@ -390,20 +394,20 @@ func (tg *ThreadGroup) CreateProcessGroup() error {
// Assign the new process group; adjust children.
oldParentPG := tg.parentPG()
tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) {
- childTG.processGroup.incRefWithParent(pg)
+ childTG.processGroup.incRefWithParent(&pg)
childTG.processGroup.decRefWithParent(oldParentPG)
})
tg.processGroup.decRefWithParent(oldParentPG)
- tg.processGroup = pg
+ tg.processGroup = &pg
// Add the new process group to the session.
- pg.session.processGroups.PushBack(pg)
+ pg.session.processGroups.PushBack(&pg)
// Ensure this translation is added to all namespaces.
for ns := tg.pidns; ns != nil; ns = ns.parent {
local := ns.tgids[tg]
- ns.pgids[pg] = ProcessGroupID(local)
- ns.processGroups[ProcessGroupID(local)] = pg
+ ns.pgids[&pg] = ProcessGroupID(local)
+ ns.processGroups[ProcessGroupID(local)] = &pg
}
return nil
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 3e9fe70e2..5bd610f68 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -224,6 +224,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi
creatorPID: pid,
changeTime: ktime.NowFromContext(ctx),
}
+ shm.EnableLeakCheck("kernel.Shm")
// Find the next available ID.
for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go
index 175d1b247..8227ecf1d 100644
--- a/pkg/sentry/kernel/syslog.go
+++ b/pkg/sentry/kernel/syslog.go
@@ -67,6 +67,7 @@ func (s *syslog) Log() []byte {
"Creating process schedule...",
"Generating random numbers by fair dice roll...",
"Rewriting operating system in Javascript...",
+ "Reticulating splines...",
"Consulting tar man page...",
"Forking spaghetti code...",
"Checking naughty and nice process list...",
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 2e3a39d3b..e91f82bb3 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -236,15 +236,15 @@ type Task struct {
// tc is protected by mu, and is owned by the task goroutine.
tc TaskContext
- // fsc is the task's filesystem context.
+ // fsContext is the task's filesystem context.
//
- // fsc is protected by mu, and is owned by the task goroutine.
- fsc *FSContext
+ // fsContext is protected by mu, and is owned by the task goroutine.
+ fsContext *FSContext
- // fds is the task's file descriptor table.
+ // fdTable is the task's file descriptor table.
//
- // fds is protected by mu, and is owned by the task goroutine.
- fds *FDMap
+ // fdTable is protected by mu, and is owned by the task goroutine.
+ fdTable *FDTable
// If vforkParent is not nil, it is the task that created this task with
// vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when
@@ -602,7 +602,7 @@ func (t *Task) Value(key interface{}) interface{} {
case context.CtxThreadGroupID:
return int32(t.ThreadGroup().ID())
case fs.CtxRoot:
- return t.fsc.RootDirectory()
+ return t.fsContext.RootDirectory()
case fs.CtxDirentCacheLimiter:
return t.k.DirentCacheLimiter
case inet.CtxStack:
@@ -668,7 +668,7 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
func (t *Task) IsChrooted() bool {
realRoot := t.tg.mounts.Root()
defer realRoot.DecRef()
- root := t.fsc.RootDirectory()
+ root := t.fsContext.RootDirectory()
if root != nil {
defer root.DecRef()
}
@@ -689,16 +689,55 @@ func (t *Task) TaskContext() *TaskContext {
// Precondition: The caller must be running on the task goroutine, or t.mu must
// be locked.
func (t *Task) FSContext() *FSContext {
- return t.fsc
+ return t.fsContext
}
-// FDMap returns t's FDMap. FDMap does not take an additional reference on the
-// returned FDMap.
+// FDTable returns t's FDTable. FDTable does not take an additional reference
+// on the returned FDTable.
//
// Precondition: The caller must be running on the task goroutine, or t.mu must
// be locked.
-func (t *Task) FDMap() *FDMap {
- return t.fds
+func (t *Task) FDTable() *FDTable {
+ return t.fdTable
+}
+
+// GetFile is a convenience wrapper for t.FDTable().Get.
+//
+// Precondition: same as FDTable.
+func (t *Task) GetFile(fd int32) *fs.File {
+ f, _ := t.fdTable.Get(fd)
+ return f
+}
+
+// NewFDs is a convenience wrapper for t.FDTable().NewFDs.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error) {
+ return t.fdTable.NewFDs(t, fd, files, flags)
+}
+
+// NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error) {
+ fds, err := t.fdTable.NewFDs(t, fd, []*fs.File{file}, flags)
+ if err != nil {
+ return 0, err
+ }
+ return fds[0], nil
+}
+
+// NewFDAt is a convenience wrapper for t.FDTable().NewFDAt.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error {
+ return t.fdTable.NewFDAt(t, fd, file, flags)
}
// WithMuLocked executes f with t.mu locked.
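
The wrappers above let syscall-level code drop the explicit limits plumbing that FDMap required. A sketch of a dup-style call site using them; the behavior is illustrative, not a claim about any particular syscall implementation:

    package kernelsketch

    import (
        "syscall"

        "gvisor.dev/gvisor/pkg/sentry/kernel"
    )

    // dupSketch duplicates oldfd to the lowest free slot at or above minfd.
    // Illustrative only; error values are simplified.
    func dupSketch(t *kernel.Task, oldfd, minfd int32) (int32, error) {
        file := t.GetFile(oldfd)
        if file == nil {
            return 0, syscall.EBADF
        }
        defer file.DecRef()

        // NewFDFrom passes the task as the context, so RLIMIT_NOFILE is
        // enforced without plumbing a limits.LimitSet at the call site.
        return t.NewFDFrom(minfd, file, kernel.FDFlags{})
    }
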
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index b5cc3860d..0916fd658 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -214,20 +214,20 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
}
}
- var fsc *FSContext
+ var fsContext *FSContext
if opts.NewFSContext {
- fsc = t.fsc.Fork()
+ fsContext = t.fsContext.Fork()
} else {
- fsc = t.fsc
- fsc.IncRef()
+ fsContext = t.fsContext
+ fsContext.IncRef()
}
- var fds *FDMap
+ var fdTable *FDTable
if opts.NewFiles {
- fds = t.fds.Fork()
+ fdTable = t.fdTable.Fork()
} else {
- fds = t.fds
- fds.IncRef()
+ fdTable = t.fdTable
+ fdTable.IncRef()
}
pidns := t.tg.pidns
@@ -251,8 +251,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
ThreadGroup: tg,
SignalMask: t.SignalMask(),
TaskContext: tc,
- FSContext: fsc,
- FDMap: fds,
+ FSContext: fsContext,
+ FDTable: fdTable,
Credentials: creds,
Niceness: t.Niceness(),
NetworkNamespaced: t.netns,
@@ -485,22 +485,22 @@ func (t *Task) Unshare(opts *SharingOptions) error {
// namespace"
t.ipcns = NewIPCNamespace(creds.UserNamespace)
}
- var oldfds *FDMap
+ var oldFDTable *FDTable
if opts.NewFiles {
- oldfds = t.fds
- t.fds = oldfds.Fork()
+ oldFDTable = t.fdTable
+ t.fdTable = oldFDTable.Fork()
}
- var oldfsc *FSContext
+ var oldFSContext *FSContext
if opts.NewFSContext {
- oldfsc = t.fsc
- t.fsc = oldfsc.Fork()
+ oldFSContext = t.fsContext
+ t.fsContext = oldFSContext.Fork()
}
t.mu.Unlock()
- if oldfds != nil {
- oldfds.DecRef()
+ if oldFDTable != nil {
+ oldFDTable.DecRef()
}
- if oldfsc != nil {
- oldfsc.DecRef()
+ if oldFSContext != nil {
+ oldFSContext.DecRef()
}
return nil
}
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index cd85acaef..17a089b90 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -195,7 +195,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
t.tg.pidns.owner.mu.Unlock()
// Remove FDs with the CloseOnExec flag set.
- t.fds.RemoveIf(func(file *fs.File, flags FDFlags) bool {
+ t.fdTable.RemoveIf(func(file *fs.File, flags FDFlags) bool {
return flags.CloseOnExec
})
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index b97d65185..535f03e50 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -265,8 +265,8 @@ func (*runExitMain) execute(t *Task) taskRunState {
// Releasing the MM unblocks a blocked CLONE_VFORK parent.
t.unstopVforkParent()
- t.fsc.DecRef()
- t.fds.DecRef()
+ t.fsContext.DecRef()
+ t.fdTable.DecRef()
// If this is the last task to exit from the thread group, release the
// thread group's resources.
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index cf48663b6..a29e9b9eb 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -63,7 +63,7 @@ func (t *Task) DebugDumpState() {
if mm := t.MemoryManager(); mm != nil {
t.Debugf("Mappings:\n%s", mm)
}
- t.Debugf("FDMap:\n%s", t.fds)
+ t.Debugf("FDTable:\n%s", t.fdTable)
}
// debugDumpRegisters logs register state at log level debug.
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 72caae537..a88bf3951 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -52,9 +52,10 @@ type TaskConfig struct {
// succeeds.
FSContext *FSContext
- // FDMap is the FDMap of the new task. A reference must be held on FDMap,
- // which is transferred to TaskSet.NewTask whether or not it succeeds.
- FDMap *FDMap
+ // FDTable is the FDTable of the new task. A reference must be held on
+ // FDTable, which is transferred to TaskSet.NewTask whether or not it
+ // succeeds.
+ FDTable *FDTable
// Credentials is the Credentials of the new task.
Credentials *auth.Credentials
@@ -90,7 +91,7 @@ func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) {
if err != nil {
cfg.TaskContext.release()
cfg.FSContext.DecRef()
- cfg.FDMap.DecRef()
+ cfg.FDTable.DecRef()
return nil, err
}
return t, nil
@@ -112,8 +113,8 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
signalMask: cfg.SignalMask,
signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable},
tc: *tc,
- fsc: cfg.FSContext,
- fds: cfg.FDMap,
+ fsContext: cfg.FSContext,
+ fdTable: cfg.FDTable,
p: cfg.Kernel.Platform.NewContext(),
k: cfg.Kernel,
ptraceTracees: make(map[*Task]struct{}),
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index edfdac2a7..baa12d9a0 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -54,7 +54,7 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in
// openPath opens name for loading.
//
// openPath returns the fs.Dirent and an *fs.File for name, which is not
-// installed in the Task FDMap. The caller takes ownership of both.
+// installed in the Task FDTable. The caller takes ownership of both.
//
// name must be a readable, executable, regular file.
func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, name string) (*fs.Dirent, *fs.File, error) {
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
index c9a942023..1b746d030 100644
--- a/pkg/sentry/mm/aio_context.go
+++ b/pkg/sentry/mm/aio_context.go
@@ -213,7 +213,9 @@ func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) {
if err != nil {
return nil, err
}
- return &aioMappable{mfp: mfp, fr: fr}, nil
+ m := aioMappable{mfp: mfp, fr: fr}
+ m.EnableLeakCheck("mm.aioMappable")
+ return &m, nil
}
// DecRef implements refs.RefCounter.DecRef.
diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go
index 2f8651a0a..ea2d7af74 100644
--- a/pkg/sentry/mm/special_mappable.go
+++ b/pkg/sentry/mm/special_mappable.go
@@ -45,7 +45,9 @@ type SpecialMappable struct {
//
// Preconditions: fr.Length() != 0.
func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *SpecialMappable {
- return &SpecialMappable{mfp: mfp, fr: fr, name: name}
+ m := SpecialMappable{mfp: mfp, fr: fr, name: name}
+ m.EnableLeakCheck("mm.SpecialMappable")
+ return &m
}
// DecRef implements refs.RefCounter.DecRef.
diff --git a/pkg/sentry/kernel/kdefs/kdefs.go b/pkg/sentry/platform/kvm/filters.go
index 304da2032..7d949f1dd 100644
--- a/pkg/sentry/kernel/kdefs/kdefs.go
+++ b/pkg/sentry/platform/kvm/filters.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,9 +12,22 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Package kdefs defines common kernel definitions.
-//
-package kdefs
+package kvm
+
+import (
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/seccomp"
+)
-// FD is a File Descriptor.
-type FD int32
+// SyscallFilters returns syscalls made exclusively by the KVM platform.
+func (*KVM) SyscallFilters() seccomp.SyscallRules {
+ return seccomp.SyscallRules{
+ syscall.SYS_ARCH_PRCTL: {},
+ syscall.SYS_IOCTL: {},
+ syscall.SYS_MMAP: {},
+ syscall.SYS_RT_SIGSUSPEND: {},
+ syscall.SYS_RT_SIGTIMEDWAIT: {},
+ 0xffffffffffffffff: {}, // KVM uses syscall -1 to transition to host.
+ }
+}
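
Since SyscallRules is written as a map literal here, platform-specific filters can presumably be folded into the sentry's base rule set before seccomp installation. The merge helper below is an assumption for illustration, not an API introduced by this change; it also assumes no syscall number needs rules combined from both sets:

    package filtersketch

    import "gvisor.dev/gvisor/pkg/seccomp"

    // mergeRules copies platform-specific rules into a base set, relying
    // only on seccomp.SyscallRules being map-like, as the literals above
    // suggest.
    func mergeRules(base, platform seccomp.SyscallRules) seccomp.SyscallRules {
        for sysno, rules := range platform {
            base[sysno] = rules
        }
        return base
    }
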
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index b49d7f3c4..ee4cd2f4d 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -141,3 +141,17 @@ func (k *KVM) NewContext() platform.Context {
machine: k.machine,
}
}
+
+type constructor struct{}
+
+func (*constructor) New(f *os.File) (platform.Platform, error) {
+ return New(f)
+}
+
+func (*constructor) OpenDevice() (*os.File, error) {
+ return OpenDevice()
+}
+
+func init() {
+ platform.Register("kvm", &constructor{})
+}
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index 7d92e16cc..679087e25 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -426,7 +426,12 @@ func (c *vCPU) unlock() {
// Normal state.
case vCPUUser | vCPUGuest | vCPUWaiter:
// Force a transition: this must trigger a notification when we
- // return from guest mode.
+ // return from guest mode. We must clear vCPUWaiter here
+ // anyway, because BounceToKernel will force a transition only
+ // from ring3 to ring0, which will not clear this bit. Halt may
+ // work around the issue, but if there is no exception or
+ // syscall in this period, BounceToKernel will hang.
+ atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
c.notify()
case vCPUUser | vCPUWaiter:
// Waiting for the lock to be released; the responsibility is
diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go
index eccbe2336..ec22dbf87 100644
--- a/pkg/sentry/platform/platform.go
+++ b/pkg/sentry/platform/platform.go
@@ -19,8 +19,10 @@ package platform
import (
"fmt"
+ "os"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/seccomp"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -93,6 +95,9 @@ type Platform interface {
// Platforms for which this does not hold may panic if PreemptAllCPUs is
// called.
PreemptAllCPUs() error
+
+ // SyscallFilters returns syscalls made exclusively by this platform.
+ SyscallFilters() seccomp.SyscallRules
}
// NoCPUPreemptionDetection implements Platform.DetectsCPUPreemption and
@@ -347,3 +352,26 @@ type File interface {
func (fr FileRange) String() string {
return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End)
}
+
+// Constructor represents a platform type.
+type Constructor interface {
+ New(deviceFile *os.File) (Platform, error)
+ OpenDevice() (*os.File, error)
+}
+
+// platforms contains all available platform types.
+var platforms = map[string]Constructor{}
+
+// Register registers a new platform type.
+func Register(name string, platform Constructor) {
+ platforms[name] = platform
+}
+
+// Lookup looks up the platform constructor by name.
+func Lookup(name string) (Constructor, error) {
+ p, ok := platforms[name]
+ if !ok {
+ return nil, fmt.Errorf("unknown platform: %v", name)
+ }
+ return p, nil
+}
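
A sketch of how the new registry might be consumed by a caller that selects a platform by name; the error wrapping is illustrative:

    package platformsketch

    import (
        "fmt"

        "gvisor.dev/gvisor/pkg/sentry/platform"
    )

    // newPlatform resolves a platform by name, opens its device (which may
    // legitimately be nil, as for ptrace), and constructs it. Sketch only.
    func newPlatform(name string) (platform.Platform, error) {
        ctor, err := platform.Lookup(name)
        if err != nil {
            return nil, err
        }
        deviceFile, err := ctor.OpenDevice()
        if err != nil {
            return nil, fmt.Errorf("opening device for %q: %v", name, err)
        }
        return ctor.New(deviceFile)
    }
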
diff --git a/pkg/abi/linux/binder.go b/pkg/sentry/platform/ptrace/filters.go
index 63b08324a..1e07cfd0d 100644
--- a/pkg/abi/linux/binder.go
+++ b/pkg/sentry/platform/ptrace/filters.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,9 +12,22 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-package linux
+package ptrace
-// BinderVersion structure is used for BINDER_VERSION ioctl.
-type BinderVersion struct {
- ProtocolVersion int32
+import (
+ "syscall"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// SyscallFilters returns syscalls made exclusively by the ptrace platform.
+func (*PTrace) SyscallFilters() seccomp.SyscallRules {
+ return seccomp.SyscallRules{
+ unix.SYS_GETCPU: {},
+ unix.SYS_SCHED_SETAFFINITY: {},
+ syscall.SYS_PTRACE: {},
+ syscall.SYS_TGKILL: {},
+ syscall.SYS_WAIT4: {},
+ }
}
diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go
index ee7e0640c..6fd30ed25 100644
--- a/pkg/sentry/platform/ptrace/ptrace.go
+++ b/pkg/sentry/platform/ptrace/ptrace.go
@@ -45,6 +45,7 @@
package ptrace
import (
+ "os"
"sync"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -236,3 +237,17 @@ func (p *PTrace) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan s
func (*PTrace) NewContext() platform.Context {
return &context{}
}
+
+type constructor struct{}
+
+func (*constructor) New(*os.File) (platform.Platform, error) {
+ return New()
+}
+
+func (*constructor) OpenDevice() (*os.File, error) {
+ return nil, nil
+}
+
+func init() {
+ platform.Register("ptrace", &constructor{})
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index dc3475494..87ded0bbd 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -306,7 +306,7 @@ func (s *subprocess) createStub() (*thread, error) {
arch.SyscallArgument{Value: 0},
arch.SyscallArgument{Value: 0})
if err != nil {
- return nil, err
+ return nil, fmt.Errorf("creating stub process: %v", err)
}
// Wait for child to enter group-stop, so we don't stop its
@@ -325,7 +325,7 @@ func (s *subprocess) createStub() (*thread, error) {
arch.SyscallArgument{Value: 0},
arch.SyscallArgument{Value: 0})
if err != nil {
- return nil, err
+ return nil, fmt.Errorf("waiting on stub process: %v", err)
}
childT := &thread{
diff --git a/pkg/sentry/platform/ring0/defs_impl.go b/pkg/sentry/platform/ring0/defs_impl.go
index 8efc3825f..d4bfc5a4a 100755
--- a/pkg/sentry/platform/ring0/defs_impl.go
+++ b/pkg/sentry/platform/ring0/defs_impl.go
@@ -1,14 +1,14 @@
package ring0
import (
+ "syscall"
+
"fmt"
"gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
"io"
"reflect"
- "syscall"
-
- "gvisor.dev/gvisor/pkg/sentry/usermem"
)
var (
diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go
index 3577b5127..0feff8778 100644
--- a/pkg/sentry/platform/ring0/kernel_amd64.go
+++ b/pkg/sentry/platform/ring0/kernel_amd64.go
@@ -70,6 +70,14 @@ func (c *CPU) init() {
c.tss.ist1Lo = uint32(stackAddr)
c.tss.ist1Hi = uint32(stackAddr >> 32)
+ // Set the I/O bitmap base address beyond the last byte in the TSS
+ // to block access to the entire I/O address range.
+ //
+ // From section 18.5.2 "I/O Permission Bit Map" of the Intel SDM, Vol. 1:
+ // I/O addresses not spanned by the map are treated as if they had set
+ // bits in the map.
+ c.tss.ioPerm = tssLimit + 1
+
// Permanently set the kernel segments.
c.registers.Cs = uint64(Kcode)
c.registers.Ds = uint64(Kdata)
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index b646dc258..4f4a20dfe 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -23,7 +23,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
@@ -63,7 +62,7 @@ type RightsFiles []*fs.File
func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) {
files := make(RightsFiles, 0, len(fds))
for _, fd := range fds {
- file, _ := t.FDMap().GetDescriptor(kdefs.FD(fd))
+ file := t.GetFile(fd)
if file == nil {
files.Release()
return nil, syserror.EBADF
@@ -109,7 +108,9 @@ func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) ([]int32
files, trunc := rights.Files(t, max)
fds := make([]int32, 0, len(files))
for i := 0; i < max && len(files) > 0; i++ {
- fd, err := t.FDMap().NewFDFrom(0, files[0], kernel.FDFlags{cloexec}, t.ThreadGroup().Limits())
+ fd, err := t.NewFDFrom(0, files[0], kernel.FDFlags{
+ CloseOnExec: cloexec,
+ })
files[0].DecRef()
files = files[1:]
if err != nil {
@@ -315,8 +316,7 @@ func PackTimestamp(t *kernel.Task, timestamp int64, buf []byte) []byte {
// Parse parses a raw socket control message into portable objects.
func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (transport.ControlMessages, error) {
var (
- fds linux.ControlMessageRights
-
+ fds linux.ControlMessageRights
haveCreds bool
creds linux.ControlMessageCredentials
)
diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go
index 2a38e370a..9d1bcfd41 100644
--- a/pkg/sentry/socket/epsocket/epsocket.go
+++ b/pkg/sentry/socket/epsocket/epsocket.go
@@ -40,7 +40,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/socket"
@@ -286,14 +285,14 @@ func bytesToIPAddress(addr []byte) tcpip.Address {
// GetAddress reads an sockaddr struct from the given address and converts it
// to the FullAddress format. It supports AF_UNIX, AF_INET and AF_INET6
// addresses.
-func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) {
+func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syserr.Error) {
// Make sure we have at least 2 bytes for the address family.
if len(addr) < 2 {
return tcpip.FullAddress{}, syserr.ErrInvalidArgument
}
family := usermem.ByteOrder.Uint16(addr)
- if family != uint16(sfamily) {
+ if family != uint16(sfamily) && (!strict && family != linux.AF_UNSPEC) {
return tcpip.FullAddress{}, syserr.ErrAddressFamilyNotSupported
}
@@ -318,7 +317,7 @@ func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) {
case linux.AF_INET:
var a linux.SockAddrInet
if len(addr) < sockAddrInetSize {
- return tcpip.FullAddress{}, syserr.ErrBadAddress
+ return tcpip.FullAddress{}, syserr.ErrInvalidArgument
}
binary.Unmarshal(addr[:sockAddrInetSize], usermem.ByteOrder, &a)
@@ -331,7 +330,7 @@ func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) {
case linux.AF_INET6:
var a linux.SockAddrInet6
if len(addr) < sockAddrInet6Size {
- return tcpip.FullAddress{}, syserr.ErrBadAddress
+ return tcpip.FullAddress{}, syserr.ErrInvalidArgument
}
binary.Unmarshal(addr[:sockAddrInet6Size], usermem.ByteOrder, &a)
@@ -344,6 +343,9 @@ func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) {
}
return out, nil
+ case linux.AF_UNSPEC:
+ return tcpip.FullAddress{}, nil
+
default:
return tcpip.FullAddress{}, syserr.ErrAddressFamilyNotSupported
}
@@ -466,7 +468,7 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
// Connect implements the linux syscall connect(2) for sockets backed by
// tpcip.Endpoint.
func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
- addr, err := GetAddress(s.family, sockaddr)
+ addr, err := GetAddress(s.family, sockaddr, false /* strict */)
if err != nil {
return err
}
@@ -499,7 +501,7 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
// Bind implements the linux syscall bind(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
- addr, err := GetAddress(s.family, sockaddr)
+ addr, err := GetAddress(s.family, sockaddr, true /* strict */)
if err != nil {
return err
}
@@ -537,7 +539,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *wait
// Accept implements the linux syscall accept(2) for sockets backed by
// tcpip.Endpoint.
-func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) {
+func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
// Issue the accept request to get the new endpoint.
ep, wq, terr := s.Endpoint.Accept()
if terr != nil {
@@ -575,10 +577,9 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
}
}
- fdFlags := kernel.FDFlags{
+ fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
- }
- fd, e := t.FDMap().NewFDFrom(0, ns, fdFlags, t.ThreadGroup().Limits())
+ })
t.Kernel().RecordSocket(ns)
@@ -1924,7 +1925,7 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
var addr *tcpip.FullAddress
if len(to) > 0 {
- addrBuf, err := GetAddress(s.family, to)
+ addrBuf, err := GetAddress(s.family, to, true /* strict */)
if err != nil {
return 0, err
}
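Note on the epsocket.go hunks above: GetAddress gains a strict flag. Strict callers (bind(2), sendmsg(2), and the unix/strace helpers further down) accept only the socket's own family, while connect(2) parses non-strictly so that an AF_UNSPEC sockaddr is also accepted and yields the zero FullAddress, matching Linux's use of AF_UNSPEC in connect(2) to dissociate a datagram socket. Undersized sockaddr buffers now fail with ErrInvalidArgument (EINVAL) rather than ErrBadAddress. A small hypothetical wrapper showing the two modes (not part of this change):

	package example // hypothetical illustration of the strict flag added above

	import (
		"gvisor.dev/gvisor/pkg/abi/linux"
		"gvisor.dev/gvisor/pkg/sentry/socket/epsocket"
		"gvisor.dev/gvisor/pkg/syserr"
		"gvisor.dev/gvisor/pkg/tcpip"
	)

	// parseInetAddr mirrors the call sites above: bind-style callers pass
	// strict=true, connect-style callers pass strict=false so AF_UNSPEC is
	// allowed through as an empty tcpip.FullAddress.
	func parseInetAddr(sockaddr []byte, forBind bool) (tcpip.FullAddress, *syserr.Error) {
		return epsocket.GetAddress(linux.AF_INET, sockaddr, forBind /* strict */)
	}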
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index c63f3aacf..7f69406b7 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -26,7 +26,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/socket"
@@ -190,7 +189,7 @@ func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
}
// Accept implements socket.Socket.Accept.
-func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) {
+func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
var peerAddr []byte
var peerAddrlen uint32
var peerAddrPtr *byte
@@ -236,11 +235,11 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
}
defer f.DecRef()
- fdFlags := kernel.FDFlags{
+ kfd, kerr := t.NewFDFrom(0, f, kernel.FDFlags{
CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0,
- }
- kfd, kerr := t.FDMap().NewFDFrom(0, f, fdFlags, t.ThreadGroup().Limits())
+ })
t.Kernel().RecordSocket(f)
+
return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr)
}
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index ecc1e2d53..f3d6c1e9b 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -27,7 +27,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/socket/netlink/port"
@@ -272,7 +271,7 @@ func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr
}
// Accept implements socket.Socket.Accept.
-func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) {
+func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
// Netlink sockets never support accept.
return 0, nil, 0, syserr.ErrNotSupported
}
diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go
index cc7b964ea..ccaaddbfc 100644
--- a/pkg/sentry/socket/rpcinet/socket.go
+++ b/pkg/sentry/socket/rpcinet/socket.go
@@ -26,7 +26,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn"
@@ -286,7 +285,7 @@ func rpcAccept(t *kernel.Task, fd uint32, peer bool) (*pb.AcceptResponse_ResultP
}
// Accept implements socket.Socket.Accept.
-func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) {
+func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
payload, se := rpcAccept(t, s.fd, peerRequested)
// Check if we need to block.
@@ -336,10 +335,9 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
})
defer file.DecRef()
- fdFlags := kernel.FDFlags{
+ fd, err := t.NewFDFrom(0, file, kernel.FDFlags{
CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
- }
- fd, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits())
+ })
if err != nil {
return 0, nil, 0, syserr.FromError(err)
}
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 933120f34..0efa58a58 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -27,7 +27,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -53,7 +52,7 @@ type Socket interface {
// Accept implements the accept4(2) linux syscall.
// Returns fd, real peer address length and error. Real peer address
// length is only set if len(peer) > 0.
- Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error)
+ Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error)
// Bind implements the bind(2) linux syscall.
Bind(t *kernel.Task, sockaddr []byte) *syserr.Error
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index e4c416233..73d2df15d 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -143,7 +143,9 @@ func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (E
}
q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit}
+ q1.EnableLeakCheck("transport.queue")
q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit}
+ q2.EnableLeakCheck("transport.queue")
if stype == linux.SOCK_STREAM {
a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}}
@@ -294,12 +296,14 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn
}
readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit}
+ readQueue.EnableLeakCheck("transport.queue")
ne.connected = &connectedEndpoint{
endpoint: ce,
writeQueue: readQueue,
}
writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit}
+ writeQueue.EnableLeakCheck("transport.queue")
if e.stype == linux.SOCK_STREAM {
ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}}
} else {
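Note on the connectioned.go hunks above: each newly allocated transport.queue is registered with the refs leak checker under the name "transport.queue", so a queue that gets garbage collected while references are still outstanding is reported. A minimal sketch of the pattern, assuming the queue embeds refs.AtomicRefCount as the refcounter changes in this diff suggest (hypothetical type, not part of this change):

	package example // hypothetical illustration of the leak-check registration above

	import "gvisor.dev/gvisor/pkg/refs"

	// counted stands in for a reference-counted object such as
	// transport.queue or unix.SocketOperations.
	type counted struct {
		refs.AtomicRefCount
	}

	// newCounted registers the object with the leak checker under a
	// descriptive name; how a leak is reported depends on the configured
	// leak-checking mode.
	func newCounted() *counted {
		c := &counted{}
		c.EnableLeakCheck("example.counted")
		return c
	}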
diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go
index e987519f0..c7f7c5b16 100644
--- a/pkg/sentry/socket/unix/transport/connectionless.go
+++ b/pkg/sentry/socket/unix/transport/connectionless.go
@@ -41,7 +41,9 @@ var (
// NewConnectionless creates a new unbound dgram endpoint.
func NewConnectionless(ctx context.Context) Endpoint {
ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}}
- ep.receiver = &queueReceiver{readQueue: &queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit}}
+ q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit}
+ q.EnableLeakCheck("transport.queue")
+ ep.receiver = &queueReceiver{readQueue: &q}
return ep
}
@@ -52,29 +54,24 @@ func (e *connectionlessEndpoint) isBound() bool {
// Close puts the endpoint in a closed state and frees all resources associated
// with it.
-//
-// The socket will be a fresh state after a call to close and may be reused.
-// That is, close may be used to "unbind" or "disconnect" the socket in error
-// paths.
func (e *connectionlessEndpoint) Close() {
e.Lock()
- var r Receiver
- if e.Connected() {
- e.receiver.CloseRecv()
- r = e.receiver
- e.receiver = nil
-
+ if e.connected != nil {
e.connected.Release()
e.connected = nil
}
+
if e.isBound() {
e.path = ""
}
+
+ e.receiver.CloseRecv()
+ r := e.receiver
+ e.receiver = nil
e.Unlock()
- if r != nil {
- r.CloseNotify()
- r.Release()
- }
+
+ r.CloseNotify()
+ r.Release()
}
// BidirectionalConnect implements BoundEndpoint.BidirectionalConnect.
@@ -137,6 +134,9 @@ func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoi
}
e.Lock()
+ if e.connected != nil {
+ e.connected.Release()
+ }
e.connected = connected
e.Unlock()
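Note on the connectionless.go hunks above: the receiver is set at construction and never nil, so Close now shuts it down unconditionally, and Connect releases any previously connected peer before replacing it. State is detached while holding the endpoint lock, but CloseNotify/Release run after the lock is dropped. A stripped-down sketch of that ordering with a hypothetical type (not part of this change):

	package example // hypothetical illustration of the detach-then-notify ordering above

	import "sync"

	// notifier is the subset of the receiver behaviour the sketch needs.
	type notifier interface {
		CloseRecv()
		CloseNotify()
		Release()
	}

	// endpoint stands in for connectionlessEndpoint: receiver is always
	// non-nil once constructed.
	type endpoint struct {
		mu       sync.Mutex
		receiver notifier
	}

	func (e *endpoint) Close() {
		e.mu.Lock()
		e.receiver.CloseRecv() // stop accepting data while locked
		r := e.receiver
		e.receiver = nil
		e.mu.Unlock()

		// Notify waiters and drop the reference outside the lock so that
		// wakeup callbacks do not run with the endpoint lock held.
		r.CloseNotify()
		r.Release()
	}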
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 6190de0c5..637168714 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -28,7 +28,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/socket/control"
@@ -69,10 +68,13 @@ func New(ctx context.Context, endpoint transport.Endpoint, stype linux.SockType)
// NewWithDirent creates a new unix socket using an existing dirent.
func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, stype linux.SockType, flags fs.FileFlags) *fs.File {
- return fs.NewFile(ctx, d, flags, &SocketOperations{
+ s := SocketOperations{
ep: ep,
stype: stype,
- })
+ }
+ s.EnableLeakCheck("unix.SocketOperations")
+
+ return fs.NewFile(ctx, d, flags, &s)
}
// DecRef implements RefCounter.DecRef.
@@ -108,7 +110,7 @@ func (s *SocketOperations) Endpoint() transport.Endpoint {
// extractPath extracts and validates the address.
func extractPath(sockaddr []byte) (string, *syserr.Error) {
- addr, err := epsocket.GetAddress(linux.AF_UNIX, sockaddr)
+ addr, err := epsocket.GetAddress(linux.AF_UNIX, sockaddr, true /* strict */)
if err != nil {
return "", err
}
@@ -191,7 +193,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *
// Accept implements the linux syscall accept(2) for sockets backed by
// a transport.Endpoint.
-func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) {
+func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
// Issue the accept request to get the new endpoint.
ep, err := s.ep.Accept()
if err != nil {
@@ -226,10 +228,9 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
}
}
- fdFlags := kernel.FDFlags{
+ fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
- }
- fd, e := t.FDMap().NewFDFrom(0, ns, fdFlags, t.ThreadGroup().Limits())
+ })
if e != nil {
return 0, nil, 0, syserr.FromError(e)
}
diff --git a/pkg/sentry/strace/poll.go b/pkg/sentry/strace/poll.go
index 57cf6b139..5187594a7 100644
--- a/pkg/sentry/strace/poll.go
+++ b/pkg/sentry/strace/poll.go
@@ -21,7 +21,6 @@ import (
"gvisor.dev/gvisor/pkg/abi"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
"gvisor.dev/gvisor/pkg/sentry/usermem"
)
@@ -50,7 +49,7 @@ func pollFD(t *kernel.Task, pfd *linux.PollFD, post bool) string {
if post {
revents = PollEventSet.Parse(uint64(pfd.REvents))
}
- return fmt.Sprintf("{FD: %s, Events: %s, REvents: %s}", fd(t, kdefs.FD(pfd.FD)), PollEventSet.Parse(uint64(pfd.Events)), revents)
+ return fmt.Sprintf("{FD: %s, Events: %s, REvents: %s}", fd(t, pfd.FD), PollEventSet.Parse(uint64(pfd.Events)), revents)
}
func pollFDs(t *kernel.Task, addr usermem.Addr, nfds uint, post bool) string {
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index f9cf2eb21..386b40af7 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -332,7 +332,7 @@ func sockAddr(t *kernel.Task, addr usermem.Addr, length uint32) string {
switch family {
case linux.AF_INET, linux.AF_INET6, linux.AF_UNIX:
- fa, err := epsocket.GetAddress(int(family), b)
+ fa, err := epsocket.GetAddress(int(family), b, true /* strict */)
if err != nil {
return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err)
}
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index 86e9c5690..311389547 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -31,7 +31,6 @@ import (
"gvisor.dev/gvisor/pkg/seccomp"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
pb "gvisor.dev/gvisor/pkg/sentry/strace/strace_go_proto"
slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -133,7 +132,7 @@ func path(t *kernel.Task, addr usermem.Addr) string {
return fmt.Sprintf("%#x %s", addr, path)
}
-func fd(t *kernel.Task, fd kdefs.FD) string {
+func fd(t *kernel.Task, fd int32) string {
root := t.FSContext().RootDirectory()
if root != nil {
defer root.DecRef()
@@ -151,7 +150,7 @@ func fd(t *kernel.Task, fd kdefs.FD) string {
return fmt.Sprintf("AT_FDCWD %s", name)
}
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
// Cast FD to uint64 to avoid printing negative hex.
return fmt.Sprintf("%#x (bad FD)", uint64(fd))
@@ -375,7 +374,7 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo
}
switch i.format[arg] {
case FD:
- output = append(output, fd(t, kdefs.FD(args[arg].Int())))
+ output = append(output, fd(t, args[arg].Int()))
case WriteBuffer:
output = append(output, dump(t, args[arg].Pointer(), args[arg+1].SizeT(), maximumBlobSize))
case WriteIOVec:
diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go
index c710ec9e3..87dcad18b 100644
--- a/pkg/sentry/syscalls/epoll.go
+++ b/pkg/sentry/syscalls/epoll.go
@@ -15,25 +15,23 @@
package syscalls
import (
- "syscall"
"time"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
// CreateEpoll implements the epoll_create(2) linux syscall.
-func CreateEpoll(t *kernel.Task, closeOnExec bool) (kdefs.FD, error) {
+func CreateEpoll(t *kernel.Task, closeOnExec bool) (int32, error) {
file := epoll.NewEventPoll(t)
defer file.DecRef()
- flags := kernel.FDFlags{
+ fd, err := t.NewFDFrom(0, file, kernel.FDFlags{
CloseOnExec: closeOnExec,
- }
- fd, err := t.FDMap().NewFDFrom(0, file, flags, t.ThreadGroup().Limits())
+ })
if err != nil {
return 0, err
}
@@ -42,25 +40,25 @@ func CreateEpoll(t *kernel.Task, closeOnExec bool) (kdefs.FD, error) {
}
// AddEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_ADD.
-func AddEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error {
+func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error {
// Get epoll from the file descriptor.
- epollfile := t.FDMap().GetFile(epfd)
+ epollfile := t.GetFile(epfd)
if epollfile == nil {
- return syscall.EBADF
+ return syserror.EBADF
}
defer epollfile.DecRef()
// Get the target file id.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return syscall.EBADF
+ return syserror.EBADF
}
defer file.DecRef()
// Extract the epollPoll operations.
e, ok := epollfile.FileOperations.(*epoll.EventPoll)
if !ok {
- return syscall.EBADF
+ return syserror.EBADF
}
// Try to add the entry.
@@ -68,25 +66,25 @@ func AddEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags
}
// UpdateEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_MOD.
-func UpdateEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error {
+func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error {
// Get epoll from the file descriptor.
- epollfile := t.FDMap().GetFile(epfd)
+ epollfile := t.GetFile(epfd)
if epollfile == nil {
- return syscall.EBADF
+ return syserror.EBADF
}
defer epollfile.DecRef()
// Get the target file id.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return syscall.EBADF
+ return syserror.EBADF
}
defer file.DecRef()
// Extract the epollPoll operations.
e, ok := epollfile.FileOperations.(*epoll.EventPoll)
if !ok {
- return syscall.EBADF
+ return syserror.EBADF
}
// Try to update the entry.
@@ -94,25 +92,25 @@ func UpdateEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFl
}
// RemoveEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_DEL.
-func RemoveEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD) error {
+func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error {
// Get epoll from the file descriptor.
- epollfile := t.FDMap().GetFile(epfd)
+ epollfile := t.GetFile(epfd)
if epollfile == nil {
- return syscall.EBADF
+ return syserror.EBADF
}
defer epollfile.DecRef()
// Get the target file id.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return syscall.EBADF
+ return syserror.EBADF
}
defer file.DecRef()
// Extract the epollPoll operations.
e, ok := epollfile.FileOperations.(*epoll.EventPoll)
if !ok {
- return syscall.EBADF
+ return syserror.EBADF
}
// Try to remove the entry.
@@ -120,18 +118,18 @@ func RemoveEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD) error {
}
// WaitEpoll implements the epoll_wait(2) linux syscall.
-func WaitEpoll(t *kernel.Task, fd kdefs.FD, max int, timeout int) ([]epoll.Event, error) {
+func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]epoll.Event, error) {
// Get epoll from the file descriptor.
- epollfile := t.FDMap().GetFile(fd)
+ epollfile := t.GetFile(fd)
if epollfile == nil {
- return nil, syscall.EBADF
+ return nil, syserror.EBADF
}
defer epollfile.DecRef()
// Extract the epollPoll operations.
e, ok := epollfile.FileOperations.(*epoll.EventPoll)
if !ok {
- return nil, syscall.EBADF
+ return nil, syserror.EBADF
}
// Try to read events and return right away if we got them or if the
@@ -164,7 +162,7 @@ func WaitEpoll(t *kernel.Task, fd kdefs.FD, max int, timeout int) ([]epoll.Event
}
if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
- if err == syscall.ETIMEDOUT {
+ if err == syserror.ETIMEDOUT {
return nil, nil
}
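Note on the syscalls/epoll.go hunks above: the epoll helpers now take int32 descriptors, resolve them with t.GetFile, and return syserror values instead of raw syscall errnos, with WaitEpoll comparing against syserror.ETIMEDOUT accordingly. The prologue repeated in AddEpoll/UpdateEpoll/RemoveEpoll/WaitEpoll could be factored as in this hypothetical sketch (not part of this change):

	package example // hypothetical illustration of the shared epoll FD prologue above

	import (
		"gvisor.dev/gvisor/pkg/sentry/kernel"
		"gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
		"gvisor.dev/gvisor/pkg/syserror"
	)

	// withEventPoll resolves epfd, checks that it really is an epoll file,
	// and runs fn while the file reference is still held.
	func withEventPoll(t *kernel.Task, epfd int32, fn func(*epoll.EventPoll) error) error {
		epollfile := t.GetFile(epfd)
		if epollfile == nil {
			return syserror.EBADF
		}
		defer epollfile.DecRef()

		e, ok := epollfile.FileOperations.(*epoll.EventPoll)
		if !ok {
			return syserror.EBADF
		}
		return fn(e)
	}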
diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go
index ac3905c5c..264301bfa 100644
--- a/pkg/sentry/syscalls/linux/error.go
+++ b/pkg/sentry/syscalls/linux/error.go
@@ -17,7 +17,6 @@ package linux
import (
"io"
"sync"
- "syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/log"
@@ -54,7 +53,7 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin
// Do not consume the error and return it as EFBIG.
// Simultaneously send a SIGXFSZ per setrlimit(2).
t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t))
- return syscall.EFBIG
+ return syserror.EFBIG
case syserror.ErrInterrupted:
// The syscall was interrupted. Return nil if it completed
// partially, otherwise return the error code that the syscall
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 7f18b1ac8..51db2d8f7 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -16,8 +16,6 @@
package linux
import (
- "syscall"
-
"gvisor.dev/gvisor/pkg/abi"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -50,335 +48,335 @@ var AMD64 = &kernel.SyscallTable{
Table: map[uintptr]kernel.Syscall{
0: syscalls.Supported("read", Read),
1: syscalls.Supported("write", Write),
- 2: syscalls.Supported("open", Open),
+ 2: syscalls.PartiallySupported("open", Open, "Options O_DIRECT, O_NOATIME, O_PATH, O_TMPFILE, O_SYNC are not supported.", nil),
3: syscalls.Supported("close", Close),
- 4: syscalls.Undocumented("stat", Stat),
- 5: syscalls.Undocumented("fstat", Fstat),
- 6: syscalls.Undocumented("lstat", Lstat),
- 7: syscalls.Undocumented("poll", Poll),
- 8: syscalls.Undocumented("lseek", Lseek),
- 9: syscalls.Undocumented("mmap", Mmap),
- 10: syscalls.Undocumented("mprotect", Mprotect),
- 11: syscalls.Undocumented("munmap", Munmap),
- 12: syscalls.Undocumented("brk", Brk),
- 13: syscalls.Undocumented("rt_sigaction", RtSigaction),
- 14: syscalls.Undocumented("rt_sigprocmask", RtSigprocmask),
- 15: syscalls.Undocumented("rt_sigreturn", RtSigreturn),
- 16: syscalls.Undocumented("ioctl", Ioctl),
- 17: syscalls.Undocumented("pread64", Pread64),
- 18: syscalls.Undocumented("pwrite64", Pwrite64),
- 19: syscalls.Undocumented("readv", Readv),
- 20: syscalls.Undocumented("writev", Writev),
- 21: syscalls.Undocumented("access", Access),
- 22: syscalls.Undocumented("pipe", Pipe),
- 23: syscalls.Undocumented("select", Select),
- 24: syscalls.Undocumented("sched_yield", SchedYield),
- 25: syscalls.Undocumented("mremap", Mremap),
- 26: syscalls.Undocumented("msync", Msync),
- 27: syscalls.Undocumented("mincore", Mincore),
- 28: syscalls.Undocumented("madvise", Madvise),
- 29: syscalls.Undocumented("shmget", Shmget),
- 30: syscalls.Undocumented("shmat", Shmat),
- 31: syscalls.Undocumented("shmctl", Shmctl),
- 32: syscalls.Undocumented("dup", Dup),
- 33: syscalls.Undocumented("dup2", Dup2),
- 34: syscalls.Undocumented("pause", Pause),
- 35: syscalls.Undocumented("nanosleep", Nanosleep),
- 36: syscalls.Undocumented("getitimer", Getitimer),
- 37: syscalls.Undocumented("alarm", Alarm),
- 38: syscalls.Undocumented("setitimer", Setitimer),
- 39: syscalls.Undocumented("getpid", Getpid),
- 40: syscalls.Undocumented("sendfile", Sendfile),
- 41: syscalls.Undocumented("socket", Socket),
- 42: syscalls.Undocumented("connect", Connect),
- 43: syscalls.Undocumented("accept", Accept),
- 44: syscalls.Undocumented("sendto", SendTo),
- 45: syscalls.Undocumented("recvfrom", RecvFrom),
- 46: syscalls.Undocumented("sendmsg", SendMsg),
- 47: syscalls.Undocumented("recvmsg", RecvMsg),
- 48: syscalls.Undocumented("shutdown", Shutdown),
- 49: syscalls.Undocumented("bind", Bind),
- 50: syscalls.Undocumented("listen", Listen),
- 51: syscalls.Undocumented("getsockname", GetSockName),
- 52: syscalls.Undocumented("getpeername", GetPeerName),
- 53: syscalls.Undocumented("socketpair", SocketPair),
- 54: syscalls.Undocumented("setsockopt", SetSockOpt),
- 55: syscalls.Undocumented("getsockopt", GetSockOpt),
- 56: syscalls.Undocumented("clone", Clone),
- 57: syscalls.Undocumented("fork", Fork),
- 58: syscalls.Undocumented("vfork", Vfork),
- 59: syscalls.Undocumented("execve", Execve),
- 60: syscalls.Undocumented("exit", Exit),
- 61: syscalls.Undocumented("wait4", Wait4),
- 62: syscalls.Undocumented("kill", Kill),
- 63: syscalls.Undocumented("uname", Uname),
- 64: syscalls.Undocumented("semget", Semget),
- 65: syscalls.Undocumented("semop", Semop),
- 66: syscalls.Undocumented("semctl", Semctl),
- 67: syscalls.Undocumented("shmdt", Shmdt),
- 68: syscalls.ErrorWithEvent("msgget", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
- 69: syscalls.ErrorWithEvent("msgsnd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
- 70: syscalls.ErrorWithEvent("msgrcv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
- 71: syscalls.ErrorWithEvent("msgctl", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
- 72: syscalls.Undocumented("fcntl", Fcntl),
- 73: syscalls.Undocumented("flock", Flock),
- 74: syscalls.Undocumented("fsync", Fsync),
- 75: syscalls.Undocumented("fdatasync", Fdatasync),
- 76: syscalls.Undocumented("truncate", Truncate),
- 77: syscalls.Undocumented("ftruncate", Ftruncate),
- 78: syscalls.Undocumented("getdents", Getdents),
- 79: syscalls.Undocumented("getcwd", Getcwd),
- 80: syscalls.Undocumented("chdir", Chdir),
- 81: syscalls.Undocumented("fchdir", Fchdir),
- 82: syscalls.Undocumented("rename", Rename),
- 83: syscalls.Undocumented("mkdir", Mkdir),
- 84: syscalls.Undocumented("rmdir", Rmdir),
- 85: syscalls.Undocumented("creat", Creat),
- 86: syscalls.Undocumented("link", Link),
- 87: syscalls.Undocumented("link", Unlink),
- 88: syscalls.Undocumented("symlink", Symlink),
- 89: syscalls.Undocumented("readlink", Readlink),
- 90: syscalls.Undocumented("chmod", Chmod),
- 91: syscalls.Undocumented("fchmod", Fchmod),
- 92: syscalls.Undocumented("chown", Chown),
- 93: syscalls.Undocumented("fchown", Fchown),
- 94: syscalls.Undocumented("lchown", Lchown),
- 95: syscalls.Undocumented("umask", Umask),
- 96: syscalls.Undocumented("gettimeofday", Gettimeofday),
- 97: syscalls.Undocumented("getrlimit", Getrlimit),
- 98: syscalls.Undocumented("getrusage", Getrusage),
- 99: syscalls.Undocumented("sysinfo", Sysinfo),
- 100: syscalls.Undocumented("times", Times),
- 101: syscalls.Undocumented("ptrace", Ptrace),
- 102: syscalls.Undocumented("getuid", Getuid),
- 103: syscalls.Undocumented("syslog", Syslog),
- 104: syscalls.Undocumented("getgid", Getgid),
- 105: syscalls.Undocumented("setuid", Setuid),
- 106: syscalls.Undocumented("setgid", Setgid),
- 107: syscalls.Undocumented("geteuid", Geteuid),
- 108: syscalls.Undocumented("getegid", Getegid),
- 109: syscalls.Undocumented("setpgid", Setpgid),
- 110: syscalls.Undocumented("getppid", Getppid),
- 111: syscalls.Undocumented("getpgrp", Getpgrp),
- 112: syscalls.Undocumented("setsid", Setsid),
- 113: syscalls.Undocumented("setreuid", Setreuid),
- 114: syscalls.Undocumented("setregid", Setregid),
- 115: syscalls.Undocumented("getgroups", Getgroups),
- 116: syscalls.Undocumented("setgroups", Setgroups),
- 117: syscalls.Undocumented("setresuid", Setresuid),
- 118: syscalls.Undocumented("getresuid", Getresuid),
- 119: syscalls.Undocumented("setresgid", Setresgid),
- 120: syscalls.Undocumented("setresgid", Getresgid),
- 121: syscalls.Undocumented("getpgid", Getpgid),
- 122: syscalls.ErrorWithEvent("setfsuid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
- 123: syscalls.ErrorWithEvent("setfsgid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
- 124: syscalls.Undocumented("getsid", Getsid),
- 125: syscalls.Undocumented("capget", Capget),
- 126: syscalls.Undocumented("capset", Capset),
- 127: syscalls.Undocumented("rt_sigpending", RtSigpending),
- 128: syscalls.Undocumented("rt_sigtimedwait", RtSigtimedwait),
- 129: syscalls.Undocumented("rt_sigqueueinfo", RtSigqueueinfo),
- 130: syscalls.Undocumented("rt_sigsuspend", RtSigsuspend),
- 131: syscalls.Undocumented("sigaltstack", Sigaltstack),
- 132: syscalls.Undocumented("utime", Utime),
- 133: syscalls.Undocumented("mknod", Mknod),
- 134: syscalls.Error("uselib", syscall.ENOSYS, "Obsolete", nil),
- 135: syscalls.ErrorWithEvent("personality", syscall.EINVAL, "Unable to change personality.", nil),
- 136: syscalls.ErrorWithEvent("ustat", syscall.ENOSYS, "Needs filesystem support.", nil),
- 137: syscalls.Undocumented("statfs", Statfs),
- 138: syscalls.Undocumented("fstatfs", Fstatfs),
- 139: syscalls.ErrorWithEvent("sysfs", syscall.ENOSYS, "", []string{"gvisor.dev/issue/165"}),
- 140: syscalls.Undocumented("getpriority", Getpriority),
- 141: syscalls.Undocumented("setpriority", Setpriority),
+ 4: syscalls.Supported("stat", Stat),
+ 5: syscalls.Supported("fstat", Fstat),
+ 6: syscalls.Supported("lstat", Lstat),
+ 7: syscalls.Supported("poll", Poll),
+ 8: syscalls.Supported("lseek", Lseek),
+ 9: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil),
+ 10: syscalls.Supported("mprotect", Mprotect),
+ 11: syscalls.Supported("munmap", Munmap),
+ 12: syscalls.Supported("brk", Brk),
+ 13: syscalls.Supported("rt_sigaction", RtSigaction),
+ 14: syscalls.Supported("rt_sigprocmask", RtSigprocmask),
+ 15: syscalls.Supported("rt_sigreturn", RtSigreturn),
+ 16: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil),
+ 17: syscalls.Supported("pread64", Pread64),
+ 18: syscalls.Supported("pwrite64", Pwrite64),
+ 19: syscalls.Supported("readv", Readv),
+ 20: syscalls.Supported("writev", Writev),
+ 21: syscalls.Supported("access", Access),
+ 22: syscalls.Supported("pipe", Pipe),
+ 23: syscalls.Supported("select", Select),
+ 24: syscalls.Supported("sched_yield", SchedYield),
+ 25: syscalls.Supported("mremap", Mremap),
+ 26: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil),
+ 27: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil),
+ 28: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil),
+ 29: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil),
+ 30: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil),
+ 31: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil),
+ 32: syscalls.Supported("dup", Dup),
+ 33: syscalls.Supported("dup2", Dup2),
+ 34: syscalls.Supported("pause", Pause),
+ 35: syscalls.Supported("nanosleep", Nanosleep),
+ 36: syscalls.Supported("getitimer", Getitimer),
+ 37: syscalls.Supported("alarm", Alarm),
+ 38: syscalls.Supported("setitimer", Setitimer),
+ 39: syscalls.Supported("getpid", Getpid),
+ 40: syscalls.Supported("sendfile", Sendfile),
+ 41: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil),
+ 42: syscalls.Supported("connect", Connect),
+ 43: syscalls.Supported("accept", Accept),
+ 44: syscalls.Supported("sendto", SendTo),
+ 45: syscalls.Supported("recvfrom", RecvFrom),
+ 46: syscalls.Supported("sendmsg", SendMsg),
+ 47: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil),
+ 48: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil),
+ 49: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil),
+ 50: syscalls.Supported("listen", Listen),
+ 51: syscalls.Supported("getsockname", GetSockName),
+ 52: syscalls.Supported("getpeername", GetPeerName),
+ 53: syscalls.Supported("socketpair", SocketPair),
+ 54: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil),
+ 55: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil),
+ 56: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil),
+ 57: syscalls.Supported("fork", Fork),
+ 58: syscalls.Supported("vfork", Vfork),
+ 59: syscalls.Supported("execve", Execve),
+ 60: syscalls.Supported("exit", Exit),
+ 61: syscalls.Supported("wait4", Wait4),
+ 62: syscalls.Supported("kill", Kill),
+ 63: syscalls.Supported("uname", Uname),
+ 64: syscalls.Supported("semget", Semget),
+ 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
+ 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
+ 67: syscalls.Supported("shmdt", Shmdt),
+ 68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 70: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 71: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil),
+ 73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil),
+ 74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil),
+ 75: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil),
+ 76: syscalls.Supported("truncate", Truncate),
+ 77: syscalls.Supported("ftruncate", Ftruncate),
+ 78: syscalls.Supported("getdents", Getdents),
+ 79: syscalls.Supported("getcwd", Getcwd),
+ 80: syscalls.Supported("chdir", Chdir),
+ 81: syscalls.Supported("fchdir", Fchdir),
+ 82: syscalls.Supported("rename", Rename),
+ 83: syscalls.Supported("mkdir", Mkdir),
+ 84: syscalls.Supported("rmdir", Rmdir),
+ 85: syscalls.Supported("creat", Creat),
+ 86: syscalls.Supported("link", Link),
+ 87: syscalls.Supported("unlink", Unlink),
+ 88: syscalls.Supported("symlink", Symlink),
+ 89: syscalls.Supported("readlink", Readlink),
+ 90: syscalls.Supported("chmod", Chmod),
+ 91: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil),
+ 92: syscalls.Supported("chown", Chown),
+ 93: syscalls.Supported("fchown", Fchown),
+ 94: syscalls.Supported("lchown", Lchown),
+ 95: syscalls.Supported("umask", Umask),
+ 96: syscalls.Supported("gettimeofday", Gettimeofday),
+ 97: syscalls.Supported("getrlimit", Getrlimit),
+ 98: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil),
+ 99: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil),
+ 100: syscalls.Supported("times", Times),
+ 101: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil),
+ 102: syscalls.Supported("getuid", Getuid),
+ 103: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil),
+ 104: syscalls.Supported("getgid", Getgid),
+ 105: syscalls.Supported("setuid", Setuid),
+ 106: syscalls.Supported("setgid", Setgid),
+ 107: syscalls.Supported("geteuid", Geteuid),
+ 108: syscalls.Supported("getegid", Getegid),
+ 109: syscalls.Supported("setpgid", Setpgid),
+ 110: syscalls.Supported("getppid", Getppid),
+ 111: syscalls.Supported("getpgrp", Getpgrp),
+ 112: syscalls.Supported("setsid", Setsid),
+ 113: syscalls.Supported("setreuid", Setreuid),
+ 114: syscalls.Supported("setregid", Setregid),
+ 115: syscalls.Supported("getgroups", Getgroups),
+ 116: syscalls.Supported("setgroups", Setgroups),
+ 117: syscalls.Supported("setresuid", Setresuid),
+ 118: syscalls.Supported("getresuid", Getresuid),
+ 119: syscalls.Supported("setresgid", Setresgid),
+ 120: syscalls.Supported("getresgid", Getresgid),
+ 121: syscalls.Supported("getpgid", Getpgid),
+ 122: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+ 123: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+ 124: syscalls.Supported("getsid", Getsid),
+ 125: syscalls.Supported("capget", Capget),
+ 126: syscalls.Supported("capset", Capset),
+ 127: syscalls.Supported("rt_sigpending", RtSigpending),
+ 128: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait),
+ 129: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo),
+ 130: syscalls.Supported("rt_sigsuspend", RtSigsuspend),
+ 131: syscalls.Supported("sigaltstack", Sigaltstack),
+ 132: syscalls.Supported("utime", Utime),
+ 133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil),
+ 134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil),
+ 135: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil),
+ 136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil),
+ 137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil),
+ 138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil),
+ 139: syscalls.ErrorWithEvent("sysfs", syserror.ENOSYS, "", []string{"gvisor.dev/issue/165"}),
+ 140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil),
+ 141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil),
142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil),
- 143: syscalls.Undocumented("sched_getparam", SchedGetparam),
- 144: syscalls.Undocumented("sched_setscheduler", SchedSetscheduler),
- 145: syscalls.Undocumented("sched_getscheduler", SchedGetscheduler),
- 146: syscalls.Undocumented("sched_get_priority_max", SchedGetPriorityMax),
- 147: syscalls.Undocumented("sched_get_priority_min", SchedGetPriorityMin),
- 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syscall.EPERM, "", nil),
- 149: syscalls.Undocumented("mlock", Mlock),
- 150: syscalls.Undocumented("munlock", Munlock),
- 151: syscalls.Undocumented("mlockall", Mlockall),
- 152: syscalls.Undocumented("munlockall", Munlockall),
+ 143: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil),
+ 144: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil),
+ 145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil),
+ 146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil),
+ 147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil),
+ 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil),
+ 149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+ 150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+ 151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+ 152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil),
- 154: syscalls.Error("modify_ldt", syscall.EPERM, "", nil),
- 155: syscalls.Error("pivot_root", syscall.EPERM, "", nil),
- 156: syscalls.Error("sysctl", syscall.EPERM, `syscall is "worthless"`, nil),
- 157: syscalls.Undocumented("prctl", Prctl),
- 158: syscalls.Undocumented("arch_prctl", ArchPrctl),
+ 154: syscalls.Error("modify_ldt", syserror.EPERM, "", nil),
+ 155: syscalls.Error("pivot_root", syserror.EPERM, "", nil),
+ 156: syscalls.Error("sysctl", syserror.EPERM, "Deprecated. Use /proc/sys instead.", nil),
+ 157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil),
+ 158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil),
159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil),
- 160: syscalls.Undocumented("setrlimit", Setrlimit),
- 161: syscalls.Undocumented("chroot", Chroot),
- 162: syscalls.Undocumented("sync", Sync),
+ 160: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil),
+ 161: syscalls.Supported("chroot", Chroot),
+ 162: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil),
163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil),
164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil),
- 165: syscalls.Undocumented("mount", Mount),
- 166: syscalls.Undocumented("umount2", Umount2),
+ 165: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil),
+ 166: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil),
167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil),
168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil),
169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil),
- 170: syscalls.Undocumented("sethostname", Sethostname),
- 171: syscalls.Undocumented("setdomainname", Setdomainname),
+ 170: syscalls.Supported("sethostname", Sethostname),
+ 171: syscalls.Supported("setdomainname", Setdomainname),
172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil),
173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil),
174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil),
175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil),
176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil),
- 177: syscalls.Error("get_kernel_syms", syscall.ENOSYS, "Not supported in > 2.6", nil),
- 178: syscalls.Error("query_module", syscall.ENOSYS, "Not supported in > 2.6", nil),
+ 177: syscalls.Error("get_kernel_syms", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil),
+ 178: syscalls.Error("query_module", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil),
179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations
- 180: syscalls.Error("nfsservctl", syscall.ENOSYS, "Does not exist > 3.1", nil),
- 181: syscalls.Error("getpmsg", syscall.ENOSYS, "Not implemented in Linux", nil),
- 182: syscalls.Error("putpmsg", syscall.ENOSYS, "Not implemented in Linux", nil),
- 183: syscalls.Error("afs_syscall", syscall.ENOSYS, "Not implemented in Linux", nil),
- 184: syscalls.Error("tuxcall", syscall.ENOSYS, "Not implemented in Linux", nil),
- 185: syscalls.Error("security", syscall.ENOSYS, "Not implemented in Linux", nil),
- 186: syscalls.Undocumented("gettid", Gettid),
- 187: syscalls.ErrorWithEvent("readahead", syscall.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341)
- 188: syscalls.ErrorWithEvent("setxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 189: syscalls.ErrorWithEvent("lsetxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 190: syscalls.ErrorWithEvent("fsetxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 191: syscalls.ErrorWithEvent("getxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 192: syscalls.ErrorWithEvent("lgetxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 193: syscalls.ErrorWithEvent("fgetxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 194: syscalls.ErrorWithEvent("listxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 195: syscalls.ErrorWithEvent("llistxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 196: syscalls.ErrorWithEvent("flistxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 197: syscalls.ErrorWithEvent("removexattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 198: syscalls.ErrorWithEvent("lremovexattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 199: syscalls.ErrorWithEvent("fremovexattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 200: syscalls.Undocumented("tkill", Tkill),
- 201: syscalls.Undocumented("time", Time),
- 202: syscalls.Undocumented("futex", Futex),
- 203: syscalls.Undocumented("sched_setaffinity", SchedSetaffinity),
- 204: syscalls.Undocumented("sched_getaffinity", SchedGetaffinity),
- 205: syscalls.Error("set_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
- 206: syscalls.Undocumented("io_setup", IoSetup),
- 207: syscalls.Undocumented("io_destroy", IoDestroy),
- 208: syscalls.Undocumented("io_getevents", IoGetevents),
- 209: syscalls.Undocumented("io_submit", IoSubmit),
- 210: syscalls.Undocumented("io_cancel", IoCancel),
- 211: syscalls.Error("get_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
+ 180: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil),
+ 181: syscalls.Error("getpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil),
+ 182: syscalls.Error("putpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil),
+ 183: syscalls.Error("afs_syscall", syserror.ENOSYS, "Not implemented in Linux.", nil),
+ 184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil),
+ 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
+ 186: syscalls.Supported("gettid", Gettid),
+ 187: syscalls.ErrorWithEvent("readahead", syserror.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341)
+ 188: syscalls.Error("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 200: syscalls.Supported("tkill", Tkill),
+ 201: syscalls.Supported("time", Time),
+ 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
+ 203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil),
+ 204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil),
+ 205: syscalls.Error("set_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
+ 206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 211: syscalls.Error("get_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
- 213: syscalls.Undocumented("epoll_create", EpollCreate),
- 214: syscalls.ErrorWithEvent("epoll_ctl_old", syscall.ENOSYS, "Deprecated", nil),
- 215: syscalls.ErrorWithEvent("epoll_wait_old", syscall.ENOSYS, "Deprecated", nil),
- 216: syscalls.ErrorWithEvent("remap_file_pages", syscall.ENOSYS, "Deprecated since 3.16", nil),
- 217: syscalls.Undocumented("getdents64", Getdents64),
- 218: syscalls.Undocumented("set_tid_address", SetTidAddress),
- 219: syscalls.Undocumented("restart_syscall", RestartSyscall),
- 220: syscalls.ErrorWithEvent("semtimedop", syscall.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920)
- 221: syscalls.Undocumented("fadvise64", Fadvise64),
- 222: syscalls.Undocumented("timer_create", TimerCreate),
- 223: syscalls.Undocumented("timer_settime", TimerSettime),
- 224: syscalls.Undocumented("timer_gettime", TimerGettime),
- 225: syscalls.Undocumented("timer_getoverrun", TimerGetoverrun),
- 226: syscalls.Undocumented("timer_delete", TimerDelete),
- 227: syscalls.Undocumented("clock_settime", ClockSettime),
- 228: syscalls.Undocumented("clock_gettime", ClockGettime),
- 229: syscalls.Undocumented("clock_getres", ClockGetres),
- 230: syscalls.Undocumented("clock_nanosleep", ClockNanosleep),
- 231: syscalls.Undocumented("exit_group", ExitGroup),
- 232: syscalls.Undocumented("epoll_wait", EpollWait),
- 233: syscalls.Undocumented("epoll_ctl", EpollCtl),
- 234: syscalls.Undocumented("tgkill", Tgkill),
- 235: syscalls.Undocumented("utimes", Utimes),
- 236: syscalls.Error("vserver", syscall.ENOSYS, "Not implemented by Linux", nil),
+ 213: syscalls.Supported("epoll_create", EpollCreate),
+ 214: syscalls.ErrorWithEvent("epoll_ctl_old", syserror.ENOSYS, "Deprecated.", nil),
+ 215: syscalls.ErrorWithEvent("epoll_wait_old", syserror.ENOSYS, "Deprecated.", nil),
+ 216: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil),
+ 217: syscalls.Supported("getdents64", Getdents64),
+ 218: syscalls.Supported("set_tid_address", SetTidAddress),
+ 219: syscalls.Supported("restart_syscall", RestartSyscall),
+ 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920)
+ 221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil),
+ 222: syscalls.Supported("timer_create", TimerCreate),
+ 223: syscalls.Supported("timer_settime", TimerSettime),
+ 224: syscalls.Supported("timer_gettime", TimerGettime),
+ 225: syscalls.Supported("timer_getoverrun", TimerGetoverrun),
+ 226: syscalls.Supported("timer_delete", TimerDelete),
+ 227: syscalls.Supported("clock_settime", ClockSettime),
+ 228: syscalls.Supported("clock_gettime", ClockGettime),
+ 229: syscalls.Supported("clock_getres", ClockGetres),
+ 230: syscalls.Supported("clock_nanosleep", ClockNanosleep),
+ 231: syscalls.Supported("exit_group", ExitGroup),
+ 232: syscalls.Supported("epoll_wait", EpollWait),
+ 233: syscalls.Supported("epoll_ctl", EpollCtl),
+ 234: syscalls.Supported("tgkill", Tgkill),
+ 235: syscalls.Supported("utimes", Utimes),
+ 236: syscalls.Error("vserver", syserror.ENOSYS, "Not implemented by Linux", nil),
237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}),
- 238: syscalls.Undocumented("set_mempolicy", SetMempolicy),
- 239: syscalls.Undocumented("get_mempolicy", GetMempolicy),
- 240: syscalls.ErrorWithEvent("mq_open", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
- 241: syscalls.ErrorWithEvent("mq_unlink", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
- 242: syscalls.ErrorWithEvent("mq_timedsend", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
- 243: syscalls.ErrorWithEvent("mq_timedreceive", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
- 244: syscalls.ErrorWithEvent("mq_notify", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
- 245: syscalls.ErrorWithEvent("mq_getsetattr", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil),
+ 239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil),
+ 240: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 241: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 242: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 243: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 244: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil),
- 247: syscalls.Undocumented("waitid", Waitid),
- 248: syscalls.Error("add_key", syscall.EACCES, "Not available to user", nil),
- 249: syscalls.Error("request_key", syscall.EACCES, "Not available to user", nil),
- 250: syscalls.Error("keyctl", syscall.EACCES, "Not available to user", nil),
+ 247: syscalls.Supported("waitid", Waitid),
+ 248: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil),
+ 249: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil),
+ 250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil),
251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
- 253: syscalls.Undocumented("inotify_init", InotifyInit),
- 254: syscalls.Undocumented("inotify_add_watch", InotifyAddWatch),
- 255: syscalls.Undocumented("inotify_rm_watch", InotifyRmWatch),
+ 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil),
+ 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
+ 255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil),
- 257: syscalls.Undocumented("openat", Openat),
- 258: syscalls.Undocumented("mkdirat", Mkdirat),
- 259: syscalls.Undocumented("mknodat", Mknodat),
- 260: syscalls.Undocumented("fchownat", Fchownat),
- 261: syscalls.Undocumented("futimesat", Futimesat),
- 262: syscalls.Undocumented("fstatat", Fstatat),
- 263: syscalls.Undocumented("unlinkat", Unlinkat),
- 264: syscalls.Undocumented("renameat", Renameat),
- 265: syscalls.Undocumented("linkat", Linkat),
- 266: syscalls.Undocumented("symlinkat", Symlinkat),
- 267: syscalls.Undocumented("readlinkat", Readlinkat),
- 268: syscalls.Undocumented("fchmodat", Fchmodat),
- 269: syscalls.Undocumented("faccessat", Faccessat),
- 270: syscalls.Undocumented("pselect", Pselect),
- 271: syscalls.Undocumented("ppoll", Ppoll),
- 272: syscalls.Undocumented("unshare", Unshare),
- 273: syscalls.Error("set_robust_list", syscall.ENOSYS, "Obsolete", nil),
- 274: syscalls.Error("get_robust_list", syscall.ENOSYS, "Obsolete", nil),
- 275: syscalls.PartiallySupported("splice", Splice, "Stub implementation", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
- 276: syscalls.ErrorWithEvent("tee", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
- 277: syscalls.Undocumented("sync_file_range", SyncFileRange),
- 278: syscalls.ErrorWithEvent("vmsplice", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
- 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly)
- 280: syscalls.Undocumented("utimensat", Utimensat),
- 281: syscalls.Undocumented("epoll_pwait", EpollPwait),
- 282: syscalls.ErrorWithEvent("signalfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426)
- 283: syscalls.Undocumented("timerfd_create", TimerfdCreate),
- 284: syscalls.Undocumented("eventfd", Eventfd),
- 285: syscalls.Undocumented("fallocate", Fallocate),
- 286: syscalls.Undocumented("timerfd_settime", TimerfdSettime),
- 287: syscalls.Undocumented("timerfd_gettime", TimerfdGettime),
- 288: syscalls.Undocumented("accept4", Accept4),
- 289: syscalls.ErrorWithEvent("signalfd4", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426)
- 290: syscalls.Undocumented("eventfd2", Eventfd2),
- 291: syscalls.Undocumented("epoll_create1", EpollCreate1),
- 292: syscalls.Undocumented("dup3", Dup3),
- 293: syscalls.Undocumented("pipe2", Pipe2),
- 294: syscalls.Undocumented("inotify_init1", InotifyInit1),
- 295: syscalls.Undocumented("preadv", Preadv),
- 296: syscalls.Undocumented("pwritev", Pwritev),
- 297: syscalls.Undocumented("rt_tgsigqueueinfo", RtTgsigqueueinfo),
- 298: syscalls.ErrorWithEvent("perf_event_open", syscall.ENODEV, "No support for perf counters", nil),
- 299: syscalls.Undocumented("recvmmsg", RecvMMsg),
- 300: syscalls.ErrorWithEvent("fanotify_init", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
- 301: syscalls.ErrorWithEvent("fanotify_mark", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
- 302: syscalls.Undocumented("prlimit64", Prlimit64),
- 303: syscalls.ErrorWithEvent("name_to_handle_at", syscall.EOPNOTSUPP, "Needs filesystem support", nil),
- 304: syscalls.ErrorWithEvent("open_by_handle_at", syscall.EOPNOTSUPP, "Needs filesystem support", nil),
+ 257: syscalls.Supported("openat", Openat),
+ 258: syscalls.Supported("mkdirat", Mkdirat),
+ 259: syscalls.Supported("mknodat", Mknodat),
+ 260: syscalls.Supported("fchownat", Fchownat),
+ 261: syscalls.Supported("futimesat", Futimesat),
+ 262: syscalls.Supported("fstatat", Fstatat),
+ 263: syscalls.Supported("unlinkat", Unlinkat),
+ 264: syscalls.Supported("renameat", Renameat),
+ 265: syscalls.Supported("linkat", Linkat),
+ 266: syscalls.Supported("symlinkat", Symlinkat),
+ 267: syscalls.Supported("readlinkat", Readlinkat),
+ 268: syscalls.Supported("fchmodat", Fchmodat),
+ 269: syscalls.Supported("faccessat", Faccessat),
+ 270: syscalls.Supported("pselect", Pselect),
+ 271: syscalls.Supported("ppoll", Ppoll),
+ 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil),
+ 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+ 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+ 275: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+ 276: syscalls.ErrorWithEvent("tee", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+ 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil),
+ 278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+ 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly)
+ 280: syscalls.Supported("utimensat", Utimensat),
+ 281: syscalls.Supported("epoll_pwait", EpollPwait),
+ 282: syscalls.ErrorWithEvent("signalfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426)
+ 283: syscalls.Supported("timerfd_create", TimerfdCreate),
+ 284: syscalls.Supported("eventfd", Eventfd),
+ 285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil),
+ 286: syscalls.Supported("timerfd_settime", TimerfdSettime),
+ 287: syscalls.Supported("timerfd_gettime", TimerfdGettime),
+ 288: syscalls.Supported("accept4", Accept4),
+ 289: syscalls.ErrorWithEvent("signalfd4", syserror.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426)
+ 290: syscalls.Supported("eventfd2", Eventfd2),
+ 291: syscalls.Supported("epoll_create1", EpollCreate1),
+ 292: syscalls.Supported("dup3", Dup3),
+ 293: syscalls.Supported("pipe2", Pipe2),
+ 294: syscalls.Supported("inotify_init1", InotifyInit1),
+ 295: syscalls.Supported("preadv", Preadv),
+ 296: syscalls.Supported("pwritev", Pwritev),
+ 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo),
+ 298: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil),
+ 299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil),
+ 300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+ 301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+ 302: syscalls.Supported("prlimit64", Prlimit64),
+ 303: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
+ 304: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil),
- 306: syscalls.Undocumented("syncfs", Syncfs),
- 307: syscalls.Undocumented("sendmmsg", SendMMsg),
- 308: syscalls.ErrorWithEvent("setns", syscall.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995)
- 309: syscalls.Undocumented("getcpu", Getcpu),
- 310: syscalls.ErrorWithEvent("process_vm_readv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
- 311: syscalls.ErrorWithEvent("process_vm_writev", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+ 306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil),
+ 307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil),
+ 308: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995)
+ 309: syscalls.Supported("getcpu", Getcpu),
+ 310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+ 311: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil),
313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil),
- 314: syscalls.ErrorWithEvent("sched_setattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
- 315: syscalls.ErrorWithEvent("sched_getattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
- 316: syscalls.ErrorWithEvent("renameat2", syscall.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772)
- 317: syscalls.Undocumented("seccomp", Seccomp),
- 318: syscalls.Undocumented("getrandom", GetRandom),
- 319: syscalls.Undocumented("memfd_create", MemfdCreate),
+ 314: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+ 315: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+ 316: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772)
+ 317: syscalls.Supported("seccomp", Seccomp),
+ 318: syscalls.Supported("getrandom", GetRandom),
+ 319: syscalls.Supported("memfd_create", MemfdCreate),
320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil),
321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
- 322: syscalls.ErrorWithEvent("execveat", syscall.ENOSYS, "", []string{"gvisor.dev/issue/265"}), // TODO(b/118901836)
- 323: syscalls.ErrorWithEvent("userfaultfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
- 324: syscalls.ErrorWithEvent("membarrier", syscall.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(b/118904897)
- 325: syscalls.Undocumented("mlock2", Mlock2),
+ 322: syscalls.ErrorWithEvent("execveat", syserror.ENOSYS, "", []string{"gvisor.dev/issue/265"}), // TODO(b/118901836)
+ 323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
+ 324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(b/118904897)
+ 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
// Syscalls after 325 are "backports" from versions of Linux after 4.4.
- 326: syscalls.ErrorWithEvent("copy_file_range", syscall.ENOSYS, "", nil),
- 327: syscalls.Undocumented("preadv2", Preadv2),
- 328: syscalls.Undocumented("pwritev2", Pwritev2),
+ 326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
+ 327: syscalls.Supported("preadv2", Preadv2),
+ 328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
332: syscalls.Supported("statx", Statx),
},
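As a rough sketch (not part of the diff), the entry constructors used in the table above compose as follows. The constructor calls are copied verbatim from table entries; the package clause, the blank-identifier slice, and the interface{} wrapper are illustrative assumptions added only so the fragment stands alone.

package linux

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/syscalls"
	"gvisor.dev/gvisor/pkg/syserror"
)

// Each constructor records the support level of one syscall entry:
// fully supported, partially supported with a note, unimplemented with an
// errno plus an unimplemented-syscall event, or gated on a capability.
var _ = []interface{}{
	syscalls.Supported("dup3", Dup3),
	syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil),
	syscalls.ErrorWithEvent("tee", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}),
	syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
}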
diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go
index 00b7e7cf2..333013d8c 100644
--- a/pkg/sentry/syscalls/linux/sigset.go
+++ b/pkg/sentry/syscalls/linux/sigset.go
@@ -15,8 +15,6 @@
package linux
import (
- "syscall"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -27,7 +25,7 @@ import (
// STOP are clear.
func copyInSigSet(t *kernel.Task, sigSetAddr usermem.Addr, size uint) (linux.SignalSet, error) {
if size != linux.SignalSetSize {
- return 0, syscall.EINVAL
+ return 0, syserror.EINVAL
}
b := t.CopyScratchBuffer(8)
if _, err := t.CopyInBytes(sigSetAddr, b); err != nil {
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go
index 7081d1a45..f56411bfe 100644
--- a/pkg/sentry/syscalls/linux/sys_aio.go
+++ b/pkg/sentry/syscalls/linux/sys_aio.go
@@ -21,7 +21,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/eventfd"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -55,7 +54,7 @@ type ioCallback struct {
OpCode uint16
ReqPrio int16
- FD uint32
+ FD int32
Buf uint64
Bytes uint64
@@ -65,7 +64,7 @@ type ioCallback struct {
Flags uint32
// eventfd to signal if IOCB_FLAG_RESFD is set in flags.
- ResFD uint32
+ ResFD int32
}
// ioEvent describes an I/O result.
@@ -292,7 +291,7 @@ func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioC
// submitCallback processes a single callback.
func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Addr) error {
- file := t.FDMap().GetFile(kdefs.FD(cb.FD))
+ file := t.GetFile(cb.FD)
if file == nil {
// File not found.
return syserror.EBADF
@@ -302,7 +301,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad
// Was there an eventFD? Extract it.
var eventFile *fs.File
if cb.Flags&_IOCB_FLAG_RESFD != 0 {
- eventFile = t.FDMap().GetFile(kdefs.FD(cb.ResFD))
+ eventFile = t.GetFile(cb.ResFD)
if eventFile == nil {
// Bad FD.
return syserror.EBADF
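The hunk above captures the FD migration that recurs throughout this change: descriptors are plain int32 values and files come straight from the task. A minimal sketch of that lookup pattern, using only the t.GetFile and syserror calls visible above; the helper name getFDFile and the note about DecRef are assumptions.

package linux

import (
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/syserror"
)

// getFDFile resolves a raw int32 descriptor to its file, mirroring
// submitCallback above. The returned file carries a reference that the
// caller is expected to DecRef when done.
func getFDFile(t *kernel.Task, fd int32) (*fs.File, error) {
	file := t.GetFile(fd)
	if file == nil {
		return nil, syserror.EBADF
	}
	return file, nil
}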
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
index 14a61cfa5..4a2b9f061 100644
--- a/pkg/sentry/syscalls/linux/sys_epoll.go
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -15,12 +15,10 @@
package linux
import (
- "syscall"
-
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/sentry/syscalls"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
@@ -30,11 +28,11 @@ import (
// EpollCreate1 implements the epoll_create1(2) linux syscall.
func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
flags := args[0].Int()
- if flags & ^syscall.EPOLL_CLOEXEC != 0 {
+ if flags & ^linux.EPOLL_CLOEXEC != 0 {
return 0, nil, syserror.EINVAL
}
- closeOnExec := flags&syscall.EPOLL_CLOEXEC != 0
+ closeOnExec := flags&linux.EPOLL_CLOEXEC != 0
fd, err := syscalls.CreateEpoll(t, closeOnExec)
if err != nil {
return 0, nil, err
@@ -61,46 +59,43 @@ func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
// EpollCtl implements the epoll_ctl(2) linux syscall.
func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- epfd := kdefs.FD(args[0].Int())
+ epfd := args[0].Int()
op := args[1].Int()
- fd := kdefs.FD(args[2].Int())
+ fd := args[2].Int()
eventAddr := args[3].Pointer()
// Capture the event state if needed.
flags := epoll.EntryFlags(0)
mask := waiter.EventMask(0)
var data [2]int32
- if op != syscall.EPOLL_CTL_DEL {
- var e syscall.EpollEvent
+ if op != linux.EPOLL_CTL_DEL {
+ var e linux.EpollEvent
if _, err := t.CopyIn(eventAddr, &e); err != nil {
return 0, nil, err
}
- if e.Events&syscall.EPOLLONESHOT != 0 {
+ if e.Events&linux.EPOLLONESHOT != 0 {
flags |= epoll.OneShot
}
- // syscall.EPOLLET is incorrectly generated as a negative number
- // in Go, see https://github.com/golang/go/issues/5328 for
- // details.
- if e.Events&-syscall.EPOLLET != 0 {
+ if e.Events&linux.EPOLLET != 0 {
flags |= epoll.EdgeTriggered
}
mask = waiter.EventMaskFromLinux(e.Events)
data[0] = e.Fd
- data[1] = e.Pad
+ data[1] = e.Data
}
// Perform the requested operations.
switch op {
- case syscall.EPOLL_CTL_ADD:
+ case linux.EPOLL_CTL_ADD:
// See fs/eventpoll.c.
mask |= waiter.EventHUp | waiter.EventErr
return 0, nil, syscalls.AddEpoll(t, epfd, fd, flags, mask, data)
- case syscall.EPOLL_CTL_DEL:
+ case linux.EPOLL_CTL_DEL:
return 0, nil, syscalls.RemoveEpoll(t, epfd, fd)
- case syscall.EPOLL_CTL_MOD:
+ case linux.EPOLL_CTL_MOD:
// Same as EPOLL_CTL_ADD.
mask |= waiter.EventHUp | waiter.EventErr
return 0, nil, syscalls.UpdateEpoll(t, epfd, fd, flags, mask, data)
@@ -132,7 +127,7 @@ func copyOutEvents(t *kernel.Task, addr usermem.Addr, e []epoll.Event) error {
// EpollWait implements the epoll_wait(2) linux syscall.
func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- epfd := kdefs.FD(args[0].Int())
+ epfd := args[0].Int()
eventsAddr := args[1].Pointer()
maxEvents := int(args[2].Int())
timeout := int(args[3].Int())
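A small sketch of the event-flag translation that EpollCtl now performs with abi/linux constants, reusing only the types and constants visible in the hunk above; the helper name entryFlags is an assumption.

package linux

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
)

// entryFlags converts epoll event bits into epoll entry flags, as EpollCtl
// does above. linux.EPOLLET is a plain positive constant, so the old
// negative syscall.EPOLLET workaround (golang.org/issue/5328) is no longer
// needed.
func entryFlags(e linux.EpollEvent) epoll.EntryFlags {
	flags := epoll.EntryFlags(0)
	if e.Events&linux.EPOLLONESHOT != 0 {
		flags |= epoll.OneShot
	}
	if e.Events&linux.EPOLLET != 0 {
		flags |= epoll.EdgeTriggered
	}
	return flags
}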
diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go
index 7dbe84884..8a34c4e99 100644
--- a/pkg/sentry/syscalls/linux/sys_eventfd.go
+++ b/pkg/sentry/syscalls/linux/sys_eventfd.go
@@ -15,12 +15,11 @@
package linux
import (
- "syscall"
-
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/eventfd"
+ "gvisor.dev/gvisor/pkg/syserror"
)
const (
@@ -38,7 +37,7 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
allOps := uint(EFD_SEMAPHORE | EFD_NONBLOCK | EFD_CLOEXEC)
if flags & ^allOps != 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
event := eventfd.New(t, uint64(initVal), flags&EFD_SEMAPHORE != 0)
@@ -47,10 +46,9 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
})
defer event.DecRef()
- fd, err := t.FDMap().NewFDFrom(0, event, kernel.FDFlags{
+ fd, err := t.NewFDFrom(0, event, kernel.FDFlags{
CloseOnExec: flags&EFD_CLOEXEC != 0,
- },
- t.ThreadGroup().Limits())
+ })
if err != nil {
return 0, nil, err
}
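The eventfd2 hunk shows the new descriptor-installation call: kernel.Task.NewFDFrom no longer takes a limits argument. A hedged sketch of that pattern; installFile is an invented helper name, and it is an assumption that the rlimit check now lives behind the task/FD-table API rather than at each call site.

package linux

import (
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
)

// installFile installs file into the calling task's descriptor table and
// returns the chosen FD in syscall-return form, as Eventfd2 does above.
func installFile(t *kernel.Task, file *fs.File, closeOnExec bool) (uintptr, error) {
	fd, err := t.NewFDFrom(0, file, kernel.FDFlags{
		CloseOnExec: closeOnExec,
	})
	if err != nil {
		return 0, err
	}
	return uintptr(fd), nil
}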
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 3ef7441c2..2e00a91ce 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -26,7 +26,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/fasync"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -34,7 +33,7 @@ import (
)
// fileOpAt performs an operation on the second last component in the path.
-func fileOpAt(t *kernel.Task, dirFD kdefs.FD, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error) error {
+func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error) error {
// Extract the last component.
dir, name := fs.SplitLast(path)
if dir == "/" {
@@ -60,7 +59,7 @@ func fileOpAt(t *kernel.Task, dirFD kdefs.FD, path string, fn func(root *fs.Dire
}
// fileOpOn performs an operation on the last entry of the path.
-func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error) error {
+func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error) error {
var (
d *fs.Dirent // The file.
wd *fs.Dirent // The working directory (if required.)
@@ -78,7 +77,7 @@ func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func
rel = wd
} else {
// Need to extract the given FD.
- f = t.FDMap().GetFile(dirFD)
+ f = t.GetFile(dirFD)
if f == nil {
return syserror.EBADF
}
@@ -131,7 +130,7 @@ func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string
return path, dirPath, nil
}
-func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd uintptr, err error) {
+func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uintptr, err error) {
path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
return 0, err
@@ -184,8 +183,9 @@ func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd u
defer file.DecRef()
// Success.
- fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}
- newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits())
+ newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{
+ CloseOnExec: flags&linux.O_CLOEXEC != 0,
+ })
if err != nil {
return err
}
@@ -201,7 +201,7 @@ func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd u
return fd, err // Use result in frame.
}
-func mknodAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error {
+func mknodAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
return err
@@ -285,7 +285,7 @@ func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Mknodat implements the linux syscall mknodat(2).
func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
path := args[1].Pointer()
mode := linux.FileMode(args[2].ModeT())
// We don't need this argument until we support creation of device nodes.
@@ -294,7 +294,7 @@ func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
return 0, nil, mknodAt(t, dirFD, path, mode)
}
-func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) {
+func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) {
path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
return 0, err
@@ -326,6 +326,7 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod
if err != nil {
break
}
+ defer found.DecRef()
// We found something (possibly a symlink). If the
// O_EXCL flag was passed, then we can immediately
@@ -346,31 +347,41 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod
}
// Try to resolve the symlink directly to a Dirent.
- resolved, err := found.Inode.Getlink(t)
- if err == nil || err != fs.ErrResolveViaReadlink {
+ var resolved *fs.Dirent
+ resolved, err = found.Inode.Getlink(t)
+ if err == nil {
// No more resolution necessary.
- found.DecRef()
- found = resolved
+ defer resolved.DecRef()
break
}
+ if err != fs.ErrResolveViaReadlink {
+ return err
+ }
+
+ // Are we able to resolve further?
+ if remainingTraversals == 0 {
+ return syscall.ELOOP
+ }
// Resolve the symlink to a path via Readlink.
- path, err := found.Inode.Readlink(t)
+ var path string
+ path, err = found.Inode.Readlink(t)
if err != nil {
break
}
remainingTraversals--
// Get the new parent from the target path.
+ var newParent *fs.Dirent
newParentPath, newName := fs.SplitLast(path)
- newParent, err := t.MountNamespace().FindInode(t, root, parent, newParentPath, &remainingTraversals)
+ newParent, err = t.MountNamespace().FindInode(t, root, parent, newParentPath, &remainingTraversals)
if err != nil {
break
}
+ defer newParent.DecRef()
// Repeat the process with the parent and name of the
// symlink target.
- parent.DecRef()
parent = newParent
name = newName
}
@@ -378,9 +389,6 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod
var newFile *fs.File
switch err {
case nil:
- // The file existed.
- defer found.DecRef()
-
// Like sys_open, check for a few things about the
// filesystem before trying to get a reference to the
// fs.File. The same constraints on Check apply.
@@ -423,8 +431,9 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod
}
// Success.
- fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}
- newFD, err := t.FDMap().NewFDFrom(0, newFile, fdFlags, t.ThreadGroup().Limits())
+ newFD, err := t.NewFDFrom(0, newFile, kernel.FDFlags{
+ CloseOnExec: flags&linux.O_CLOEXEC != 0,
+ })
if err != nil {
return err
}
@@ -458,7 +467,7 @@ func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
// Openat implements linux syscall openat(2).
func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
addr := args[1].Pointer()
flags := uint(args[2].Uint())
if flags&linux.O_CREAT != 0 {
@@ -497,7 +506,7 @@ func (ac accessContext) Value(key interface{}) interface{} {
}
}
-func accessAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, resolve bool, mode uint) error {
+func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, resolve bool, mode uint) error {
const rOK = 4
const wOK = 2
const xOK = 1
@@ -552,7 +561,7 @@ func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Faccessat implements linux syscall faccessat(2).
func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
addr := args[1].Pointer()
mode := args[2].ModeT()
flags := args[3].Int()
@@ -562,10 +571,10 @@ func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
// Ioctl implements linux syscall ioctl(2).
func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
request := int(args[1].Int())
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -574,12 +583,12 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Shared flags between file and socket.
switch request {
case linux.FIONCLEX:
- t.FDMap().SetFlags(fd, kernel.FDFlags{
+ t.FDTable().SetFlags(fd, kernel.FDFlags{
CloseOnExec: false,
})
return 0, nil, nil
case linux.FIOCLEX:
- t.FDMap().SetFlags(fd, kernel.FDFlags{
+ t.FDTable().SetFlags(fd, kernel.FDFlags{
CloseOnExec: true,
})
return 0, nil, nil
@@ -723,9 +732,9 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Fchdir implements the linux syscall fchdir(2).
func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -747,44 +756,47 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Close implements linux syscall close(2).
func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
- file, ok := t.FDMap().Remove(fd)
- if !ok {
+ // Note that Remove provides a reference on the file that we may use to
+ // flush. It is still active until we drop the final reference below
+ // (and other reference-holding operations complete).
+ file := t.FDTable().Remove(fd)
+ if file == nil {
return 0, nil, syserror.EBADF
}
defer file.DecRef()
err := file.Flush(t)
- return 0, nil, handleIOError(t, false /* partial */, err, syscall.EINTR, "close", file)
+ return 0, nil, handleIOError(t, false /* partial */, err, syserror.EINTR, "close", file)
}
// Dup implements linux syscall dup(2).
func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
defer file.DecRef()
- newfd, err := t.FDMap().NewFDFrom(0, file, kernel.FDFlags{}, t.ThreadGroup().Limits())
+ newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{})
if err != nil {
return 0, nil, syserror.EMFILE
}
- return uintptr(newfd), nil, nil
+ return uintptr(newFD), nil, nil
}
// Dup2 implements linux syscall dup2(2).
func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- oldfd := kdefs.FD(args[0].Int())
- newfd := kdefs.FD(args[1].Int())
+ oldfd := args[0].Int()
+ newfd := args[1].Int()
// If oldfd is a valid file descriptor, and newfd has the same value as oldfd,
// then dup2() does nothing, and returns newfd.
if oldfd == newfd {
- oldFile := t.FDMap().GetFile(oldfd)
+ oldFile := t.GetFile(oldfd)
if oldFile == nil {
return 0, nil, syserror.EBADF
}
@@ -800,21 +812,21 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
// Dup3 implements linux syscall dup3(2).
func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- oldfd := kdefs.FD(args[0].Int())
- newfd := kdefs.FD(args[1].Int())
+ oldfd := args[0].Int()
+ newfd := args[1].Int()
flags := args[2].Uint()
if oldfd == newfd {
return 0, nil, syserror.EINVAL
}
- oldFile := t.FDMap().GetFile(oldfd)
+ oldFile := t.GetFile(oldfd)
if oldFile == nil {
return 0, nil, syserror.EBADF
}
defer oldFile.DecRef()
- err := t.FDMap().NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}, t.ThreadGroup().Limits())
+ err := t.NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0})
if err != nil {
return 0, nil, err
}
@@ -857,10 +869,10 @@ func fSetOwn(t *kernel.Task, file *fs.File, who int32) {
// Fcntl implements linux syscall fcntl(2).
func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
cmd := args[1].Int()
- file, flags := t.FDMap().GetDescriptor(fd)
+ file, flags := t.FDTable().Get(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -868,9 +880,10 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
switch cmd {
case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
- from := kdefs.FD(args[2].Int())
- fdFlags := kernel.FDFlags{CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC}
- fd, err := t.FDMap().NewFDFrom(from, file, fdFlags, t.ThreadGroup().Limits())
+ from := args[2].Int()
+ fd, err := t.NewFDFrom(from, file, kernel.FDFlags{
+ CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC,
+ })
if err != nil {
return 0, nil, err
}
@@ -879,7 +892,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return uintptr(flags.ToLinuxFDFlags()), nil, nil
case linux.F_SETFD:
flags := args[2].Uint()
- t.FDMap().SetFlags(fd, kernel.FDFlags{
+ t.FDTable().SetFlags(fd, kernel.FDFlags{
CloseOnExec: flags&linux.FD_CLOEXEC != 0,
})
case linux.F_GETFL:
@@ -897,7 +910,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Copy in the lock request.
flockAddr := args[2].Pointer()
- var flock syscall.Flock_t
+ var flock linux.Flock
if _, err := t.CopyIn(flockAddr, &flock); err != nil {
return 0, nil, err
}
@@ -940,17 +953,17 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, err
}
- // The lock uid is that of the Task's FDMap.
- lockUniqueID := lock.UniqueID(t.FDMap().ID())
+ // The lock uid is that of the Task's FDTable.
+ lockUniqueID := lock.UniqueID(t.FDTable().ID())
// These locks don't block; execute the non-blocking operation using the inode's lock
// context directly.
switch flock.Type {
- case syscall.F_RDLCK:
+ case linux.F_RDLCK:
if !file.Flags().Read {
return 0, nil, syserror.EBADF
}
- if cmd == syscall.F_SETLK {
+ if cmd == linux.F_SETLK {
// Non-blocking lock, provide a nil lock.Blocker.
if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) {
return 0, nil, syserror.EAGAIN
@@ -962,11 +975,11 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
}
return 0, nil, nil
- case syscall.F_WRLCK:
+ case linux.F_WRLCK:
if !file.Flags().Write {
return 0, nil, syserror.EBADF
}
- if cmd == syscall.F_SETLK {
+ if cmd == linux.F_SETLK {
// Non-blocking lock, provide a nil lock.Blocker.
if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) {
return 0, nil, syserror.EAGAIN
@@ -978,7 +991,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
}
return 0, nil, nil
- case syscall.F_UNLCK:
+ case linux.F_UNLCK:
file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lockUniqueID, rng)
return 0, nil, nil
default:
@@ -1031,7 +1044,7 @@ const (
// Fadvise64 implements linux syscall fadvise64(2).
// This implementation currently ignores the provided advice.
func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
length := args[2].Int64()
advice := args[3].Int()
@@ -1040,7 +1053,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, syserror.EINVAL
}
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -1066,7 +1079,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, nil
}
-func mkdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error {
+func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
path, _, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
return err
@@ -1111,14 +1124,14 @@ func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Mkdirat implements linux syscall mkdirat(2).
func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
addr := args[1].Pointer()
mode := linux.FileMode(args[2].ModeT())
return 0, nil, mkdirAt(t, dirFD, addr, mode)
}
-func rmdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error {
+func rmdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
path, _, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
return err
@@ -1158,7 +1171,7 @@ func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, rmdirAt(t, linux.AT_FDCWD, addr)
}
-func symlinkAt(t *kernel.Task, dirFD kdefs.FD, newAddr usermem.Addr, oldAddr usermem.Addr) error {
+func symlinkAt(t *kernel.Task, dirFD int32, newAddr usermem.Addr, oldAddr usermem.Addr) error {
newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */)
if err != nil {
return err
@@ -1201,7 +1214,7 @@ func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
// Symlinkat implements linux syscall symlinkat(2).
func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
oldAddr := args[0].Pointer()
- dirFD := kdefs.FD(args[1].Int())
+ dirFD := args[1].Int()
newAddr := args[2].Pointer()
return 0, nil, symlinkAt(t, dirFD, newAddr, oldAddr)
@@ -1243,7 +1256,7 @@ func mayLinkAt(t *kernel.Task, target *fs.Inode) error {
// linkAt creates a hard link to the target specified by oldDirFD and oldAddr,
// specified by newDirFD and newAddr. If resolve is true, then the symlinks
// will be followed when evaluating the target.
-func linkAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr, resolve, allowEmpty bool) error {
+func linkAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr, resolve, allowEmpty bool) error {
oldPath, _, err := copyInPath(t, oldAddr, allowEmpty)
if err != nil {
return err
@@ -1257,7 +1270,7 @@ func linkAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kd
}
if allowEmpty && oldPath == "" {
- target := t.FDMap().GetFile(oldDirFD)
+ target := t.GetFile(oldDirFD)
if target == nil {
return syserror.EBADF
}
@@ -1319,9 +1332,9 @@ func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
// Linkat implements linux syscall linkat(2).
func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- oldDirFD := kdefs.FD(args[0].Int())
+ oldDirFD := args[0].Int()
oldAddr := args[1].Pointer()
- newDirFD := kdefs.FD(args[2].Int())
+ newDirFD := args[2].Int()
newAddr := args[3].Pointer()
// man linkat(2):
@@ -1346,7 +1359,7 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty)
}
-func readlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) {
+func readlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) {
path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
return 0, err
@@ -1396,7 +1409,7 @@ func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// Readlinkat implements linux syscall readlinkat(2).
func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
addr := args[1].Pointer()
bufAddr := args[2].Pointer()
size := args[3].SizeT()
@@ -1405,7 +1418,7 @@ func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return n, nil, err
}
-func unlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error {
+func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
return err
@@ -1435,7 +1448,7 @@ func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Unlinkat implements linux syscall unlinkat(2).
func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
addr := args[1].Pointer()
flags := args[2].Uint()
if flags&linux.AT_REMOVEDIR != 0 {
@@ -1463,7 +1476,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur {
t.SendSignal(&arch.SignalInfo{
- Signo: int32(syscall.SIGXFSZ),
+ Signo: int32(linux.SIGXFSZ),
Code: arch.SignalInfoUser,
})
return 0, nil, syserror.EFBIG
@@ -1496,10 +1509,10 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// Ftruncate implements linux syscall ftruncate(2).
func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
length := args[1].Int64()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -1523,7 +1536,7 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur {
t.SendSignal(&arch.SignalInfo{
- Signo: int32(syscall.SIGXFSZ),
+ Signo: int32(linux.SIGXFSZ),
Code: arch.SignalInfoUser,
})
return 0, nil, syserror.EFBIG
@@ -1614,7 +1627,7 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error {
return nil
}
-func chownAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error {
+func chownAt(t *kernel.Task, fd int32, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error {
path, _, err := copyInPath(t, addr, allowEmpty)
if err != nil {
return err
@@ -1622,7 +1635,7 @@ func chownAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, resolve, allowEmpty
if path == "" {
// Annoying. What's wrong with fchown?
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return syserror.EBADF
}
@@ -1656,11 +1669,11 @@ func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Fchown implements linux syscall fchown(2).
func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
uid := auth.UID(args[1].Uint())
gid := auth.GID(args[2].Uint())
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -1671,7 +1684,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Fchownat implements Linux syscall fchownat(2).
func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
addr := args[1].Pointer()
uid := auth.UID(args[2].Uint())
gid := auth.GID(args[3].Uint())
@@ -1701,7 +1714,7 @@ func chmod(t *kernel.Task, d *fs.Dirent, mode linux.FileMode) error {
return nil
}
-func chmodAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, mode linux.FileMode) error {
+func chmodAt(t *kernel.Task, fd int32, addr usermem.Addr, mode linux.FileMode) error {
path, _, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
return err
@@ -1722,10 +1735,10 @@ func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Fchmod implements linux syscall fchmod(2).
func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
mode := linux.FileMode(args[1].ModeT())
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -1736,7 +1749,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Fchmodat implements linux syscall fchmodat(2).
func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
mode := linux.FileMode(args[2].ModeT())
@@ -1752,7 +1765,7 @@ func defaultSetToSystemTimeSpec() fs.TimeSpec {
}
}
-func utimes(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error {
+func utimes(t *kernel.Task, dirFD int32, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error {
setTimestamp := func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
// Does the task own the file?
if !d.Inode.CheckOwnership(t) {
@@ -1785,7 +1798,7 @@ func utimes(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, ts fs.TimeSpec, r
// Linux returns EINVAL in this case. See utimes.c.
return syserror.EINVAL
}
- f := t.FDMap().GetFile(dirFD)
+ f := t.GetFile(dirFD)
if f == nil {
return syserror.EBADF
}
@@ -1813,7 +1826,7 @@ func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// No timesAddr argument will be interpreted as current system time.
ts := defaultSetToSystemTimeSpec()
if timesAddr != 0 {
- var times syscall.Utimbuf
+ var times linux.Utime
if _, err := t.CopyIn(timesAddr, &times); err != nil {
return 0, nil, err
}
@@ -1853,7 +1866,7 @@ func timespecIsValid(ts linux.Timespec) bool {
// Utimensat implements linux syscall utimensat(2).
func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
pathnameAddr := args[1].Pointer()
timesAddr := args[2].Pointer()
flags := args[3].Int()
@@ -1888,7 +1901,7 @@ func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
// Futimesat implements linux syscall futimesat(2).
func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
pathnameAddr := args[1].Pointer()
timesAddr := args[2].Pointer()
@@ -1912,7 +1925,7 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true)
}
-func renameAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr) error {
+func renameAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr) error {
newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */)
if err != nil {
return err
@@ -1960,21 +1973,21 @@ func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Renameat implements linux syscall renameat(2).
func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- oldDirFD := kdefs.FD(args[0].Int())
+ oldDirFD := args[0].Int()
oldPathAddr := args[1].Pointer()
- newDirFD := kdefs.FD(args[2].Int())
+ newDirFD := args[2].Int()
newPathAddr := args[3].Pointer()
return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr)
}
// Fallocate implements linux system call fallocate(2).
func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
mode := args[1].Int64()
offset := args[2].Int64()
length := args[3].Int64()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -2005,7 +2018,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
}
if uint64(size) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur {
t.SendSignal(&arch.SignalInfo{
- Signo: int32(syscall.SIGXFSZ),
+ Signo: int32(linux.SIGXFSZ),
Code: arch.SignalInfoUser,
})
return 0, nil, syserror.EFBIG
@@ -2023,10 +2036,10 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
// Flock implements linux syscall flock(2).
func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
operation := args[1].Int()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
// flock(2): EBADF fd is not an open file descriptor.
return 0, nil, syserror.EBADF
@@ -2133,8 +2146,9 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
defer dirent.DecRef()
defer file.DecRef()
- fdFlags := kernel.FDFlags{CloseOnExec: cloExec}
- newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits())
+ newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{
+ CloseOnExec: cloExec,
+ })
if err != nil {
return 0, nil, err
}
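Among the mostly mechanical FD changes in sys_file.go, Close is the one with a behavioral nuance worth restating: FDTable().Remove returns the file itself (or nil) instead of a (file, ok) pair, and that returned reference is what keeps the file alive through the flush. A compact sketch using only calls shown in the hunk; closeFD is an invented name.

package linux

import (
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/syserror"
)

// closeFD removes fd from the table, flushes the file, and drops the
// reference returned by Remove, mirroring Close above.
func closeFD(t *kernel.Task, fd int32) error {
	file := t.FDTable().Remove(fd)
	if file == nil {
		return syserror.EBADF
	}
	defer file.DecRef()
	return file.Flush(t)
}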
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
index dea872672..63e2c5a5d 100644
--- a/pkg/sentry/syscalls/linux/sys_getdents.go
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -17,20 +17,19 @@ package linux
import (
"bytes"
"io"
- "syscall"
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
)
// Getdents implements linux syscall getdents(2) for 64bit systems.
func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
size := int(args[2].Uint())
@@ -46,7 +45,7 @@ func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// Getdents64 implements linux syscall getdents64(2).
func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
size := int(args[2].Uint())
@@ -62,8 +61,8 @@ func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
// getdents implements the core of getdents(2)/getdents64(2).
// f is the syscall implementation dirent serialization function.
-func getdents(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) {
- dir := t.FDMap().GetFile(fd)
+func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) {
+ dir := t.GetFile(fd)
if dir == nil {
return 0, syserror.EBADF
}
@@ -83,7 +82,7 @@ func getdents(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, size int, f func(*
switch err := handleIOError(t, ds.Written() > 0, rerr, kernel.ERESTARTSYS, "getdents", dir); err {
case nil:
- dir.Dirent.InotifyEvent(syscall.IN_ACCESS, 0)
+ dir.Dirent.InotifyEvent(linux.IN_ACCESS, 0)
return uintptr(ds.Written()), nil
case io.EOF:
return 0, nil
@@ -147,21 +146,21 @@ func smallestDirent64(a arch.Context) uint {
func toType(nodeType fs.InodeType) uint8 {
switch nodeType {
case fs.RegularFile, fs.SpecialFile:
- return syscall.DT_REG
+ return linux.DT_REG
case fs.Symlink:
- return syscall.DT_LNK
+ return linux.DT_LNK
case fs.Directory, fs.SpecialDirectory:
- return syscall.DT_DIR
+ return linux.DT_DIR
case fs.Pipe:
- return syscall.DT_FIFO
+ return linux.DT_FIFO
case fs.CharacterDevice:
- return syscall.DT_CHR
+ return linux.DT_CHR
case fs.BlockDevice:
- return syscall.DT_BLK
+ return linux.DT_BLK
case fs.Socket:
- return syscall.DT_SOCK
+ return linux.DT_SOCK
default:
- return syscall.DT_UNKNOWN
+ return linux.DT_UNKNOWN
}
}
diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go
index 9cfa660fa..b2c7b3444 100644
--- a/pkg/sentry/syscalls/linux/sys_inotify.go
+++ b/pkg/sentry/syscalls/linux/sys_inotify.go
@@ -15,14 +15,12 @@
package linux
import (
- "syscall"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/anon"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
+ "gvisor.dev/gvisor/pkg/syserror"
)
const allFlags = int(linux.IN_NONBLOCK | linux.IN_CLOEXEC)
@@ -32,7 +30,7 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
flags := int(args[0].Int())
if flags&^allFlags != 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
dirent := fs.NewDirent(t, anon.NewInode(t), "inotify")
@@ -44,9 +42,9 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
n := fs.NewFile(t, dirent, fileFlags, fs.NewInotify(t))
defer n.DecRef()
- fd, err := t.FDMap().NewFDFrom(0, n, kernel.FDFlags{
+ fd, err := t.NewFDFrom(0, n, kernel.FDFlags{
CloseOnExec: flags&linux.IN_CLOEXEC != 0,
- }, t.ThreadGroup().Limits())
+ })
if err != nil {
return 0, nil, err
@@ -63,18 +61,18 @@ func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
// fdToInotify resolves an fd to an inotify object. If successful, the file will
// have an extra ref and the caller is responsible for releasing the ref.
-func fdToInotify(t *kernel.Task, fd kdefs.FD) (*fs.Inotify, *fs.File, error) {
- file := t.FDMap().GetFile(fd)
+func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) {
+ file := t.GetFile(fd)
if file == nil {
// Invalid fd.
- return nil, nil, syscall.EBADF
+ return nil, nil, syserror.EBADF
}
ino, ok := file.FileOperations.(*fs.Inotify)
if !ok {
// Not an inotify fd.
file.DecRef()
- return nil, nil, syscall.EINVAL
+ return nil, nil, syserror.EINVAL
}
return ino, file, nil
@@ -82,7 +80,7 @@ func fdToInotify(t *kernel.Task, fd kdefs.FD) (*fs.Inotify, *fs.File, error) {
// InotifyAddWatch implements the inotify_add_watch() syscall.
func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
mask := args[2].Uint()
@@ -93,7 +91,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern
// "EINVAL: The given event mask contains no valid events."
// -- inotify_add_watch(2)
if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
ino, file, err := fdToInotify(t, fd)
@@ -110,11 +108,11 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern
err = fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, dirent *fs.Dirent, _ uint) error {
// "IN_ONLYDIR: Only watch pathname if it is a directory." -- inotify(7)
if onlyDir := mask&linux.IN_ONLYDIR != 0; onlyDir && !fs.IsDir(dirent.Inode.StableAttr) {
- return syscall.ENOTDIR
+ return syserror.ENOTDIR
}
// Copy out to the return frame.
- fd = kdefs.FD(ino.AddWatch(dirent, mask))
+ fd = ino.AddWatch(dirent, mask)
return nil
})
@@ -123,7 +121,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern
// InotifyRmWatch implements the inotify_rm_watch() syscall.
func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
wd := args[1].Int()
ino, file, err := fdToInotify(t, fd)
diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go
index a3813b818..297e920c4 100644
--- a/pkg/sentry/syscalls/linux/sys_lseek.go
+++ b/pkg/sentry/syscalls/linux/sys_lseek.go
@@ -18,17 +18,16 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/syserror"
)
// Lseek implements linux syscall lseek(2).
func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
offset := args[1].Int64()
whence := args[2].Int()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index d831833bc..58a05b5bb 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -20,7 +20,6 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -40,7 +39,7 @@ func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
prot := args[2].Int()
flags := args[3].Int()
- fd := kdefs.FD(args[4].Int())
+ fd := args[4].Int()
fixed := flags&linux.MAP_FIXED != 0
private := flags&linux.MAP_PRIVATE != 0
shared := flags&linux.MAP_SHARED != 0
@@ -80,7 +79,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if !anon {
// Convert the passed FD to a file reference.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go
index 7c1bea43d..418d7fa5f 100644
--- a/pkg/sentry/syscalls/linux/sys_pipe.go
+++ b/pkg/sentry/syscalls/linux/sys_pipe.go
@@ -15,20 +15,19 @@
package linux
import (
- "syscall"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/syserror"
)
// pipe2 implements the actual system call with flags.
func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 {
- return 0, syscall.EINVAL
+ return 0, syserror.EINVAL
}
r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize)
@@ -38,25 +37,18 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
w.SetFlags(linuxToFlags(flags).Settable())
defer w.DecRef()
- rfd, err := t.FDMap().NewFDFrom(0, r, kernel.FDFlags{
- CloseOnExec: flags&linux.O_CLOEXEC != 0},
- t.ThreadGroup().Limits())
- if err != nil {
- return 0, err
- }
-
- wfd, err := t.FDMap().NewFDFrom(0, w, kernel.FDFlags{
- CloseOnExec: flags&linux.O_CLOEXEC != 0},
- t.ThreadGroup().Limits())
+ fds, err := t.NewFDs(0, []*fs.File{r, w}, kernel.FDFlags{
+ CloseOnExec: flags&linux.O_CLOEXEC != 0,
+ })
if err != nil {
- t.FDMap().Remove(rfd)
return 0, err
}
- if _, err := t.CopyOut(addr, []kdefs.FD{rfd, wfd}); err != nil {
- t.FDMap().Remove(rfd)
- t.FDMap().Remove(wfd)
- return 0, syscall.EFAULT
+ if _, err := t.CopyOut(addr, fds); err != nil {
+ // The files are not closed in this case; the exact semantics of
+ // this error case are not well defined, but the descriptors could
+ // have already been observed by user space.
+ return 0, syserror.EFAULT
}
return 0, nil
}
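pipe2 now allocates both descriptors in one call. A sketch of that shape, built only from the calls visible above; installPipeFDs is an invented name, and the removal of the old rollback code suggests, but does not prove, that NewFDs cleans up after itself on failure.

package linux

import (
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/usermem"
)

// installPipeFDs installs both pipe ends with a single NewFDs call and
// copies the resulting descriptor pair out to userspace, as pipe2 does above.
func installPipeFDs(t *kernel.Task, addr usermem.Addr, r, w *fs.File, closeOnExec bool) error {
	fds, err := t.NewFDs(0, []*fs.File{r, w}, kernel.FDFlags{
		CloseOnExec: closeOnExec,
	})
	if err != nil {
		return err
	}
	_, err = t.CopyOut(addr, fds)
	return err
}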
diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go
index ef6211218..7a13beac2 100644
--- a/pkg/sentry/syscalls/linux/sys_poll.go
+++ b/pkg/sentry/syscalls/linux/sys_poll.go
@@ -21,7 +21,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -64,7 +63,7 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan
return
}
- file := t.FDMap().GetFile(kdefs.FD(pfd.FD))
+ file := t.GetFile(pfd.FD)
if file == nil {
pfd.REvents = linux.POLLNVAL
return
@@ -265,7 +264,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
// immediately to ensure we don't leak. Note, another thread
// might be about to close fd. This is racy, but that's
// OK. Linux is racy in the same way.
- file := t.FDMap().GetFile(kdefs.FD(fd))
+ file := t.GetFile(fd)
if file == nil {
return 0, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
index 9d70881fd..98db32d77 100644
--- a/pkg/sentry/syscalls/linux/sys_prctl.go
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -16,15 +16,14 @@ package linux
import (
"fmt"
- "syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/sentry/mm"
+ "gvisor.dev/gvisor/pkg/syserror"
)
// Prctl implements linux syscall prctl(2).
@@ -37,7 +36,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.PR_SET_PDEATHSIG:
sig := linux.Signal(args[1].Int())
if sig != 0 && !sig.IsValid() {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
t.SetParentDeathSignal(sig)
return 0, nil, nil
@@ -68,7 +67,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
d = mm.UserDumpable
default:
// N.B. Userspace may not pass SUID_DUMP_ROOT.
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
t.MemoryManager().SetDumpability(d)
return 0, nil, nil
@@ -89,7 +88,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
} else if val == 1 {
t.SetKeepCaps(true)
} else {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
return 0, nil, nil
@@ -97,7 +96,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.PR_SET_NAME:
addr := args[1].Pointer()
name, err := t.CopyInString(addr, linux.TASK_COMM_LEN-1)
- if err != nil && err != syscall.ENAMETOOLONG {
+ if err != nil && err != syserror.ENAMETOOLONG {
return 0, nil, err
}
t.SetName(name)
@@ -117,22 +116,22 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.PR_SET_MM:
if !t.HasCapability(linux.CAP_SYS_RESOURCE) {
- return 0, nil, syscall.EPERM
+ return 0, nil, syserror.EPERM
}
switch args[1].Int() {
case linux.PR_SET_MM_EXE_FILE:
- fd := kdefs.FD(args[2].Int())
+ fd := args[2].Int()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return 0, nil, syscall.EBADF
+ return 0, nil, syserror.EBADF
}
defer file.DecRef()
// Are they trying to set exe to a non-file?
if !fs.IsFile(file.Dirent.Inode.StableAttr) {
- return 0, nil, syscall.EBADF
+ return 0, nil, syserror.EBADF
}
// Set the underlying executable.
@@ -154,12 +153,12 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
t.Kernel().EmitUnimplementedEvent(t)
fallthrough
default:
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
case linux.PR_SET_NO_NEW_PRIVS:
if args[1].Int() != 1 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
// no_new_privs is assumed to always be set. See
// kernel.Task.updateCredsForExec.
@@ -167,14 +166,14 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.PR_GET_NO_NEW_PRIVS:
if args[1].Int() != 0 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
return 1, nil, nil
case linux.PR_SET_SECCOMP:
if args[1].Int() != linux.SECCOMP_MODE_FILTER {
// Unsupported mode.
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
return 0, nil, seccomp(t, linux.SECCOMP_SET_MODE_FILTER, 0, args[2].Pointer())
@@ -185,7 +184,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.PR_CAPBSET_READ:
cp := linux.Capability(args[1].Uint64())
if !cp.Ok() {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
var rv uintptr
if auth.CapabilitySetOf(cp)&t.Credentials().BoundingCaps != 0 {
@@ -196,7 +195,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.PR_CAPBSET_DROP:
cp := linux.Capability(args[1].Uint64())
if !cp.Ok() {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
return 0, nil, t.DropBoundingCapability(cp)
@@ -221,7 +220,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
t.Kernel().EmitUnimplementedEvent(t)
fallthrough
default:
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
return 0, nil, nil
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
index a1965f490..b2474e60d 100644
--- a/pkg/sentry/syscalls/linux/sys_read.go
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -21,7 +21,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -39,11 +38,11 @@ const (
// they can do large reads all at once. Bug for bug. Same for other read
// calls below.
func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
size := args[2].SizeT()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -75,12 +74,12 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
// Pread64 implements linux syscall pread64(2).
func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
size := args[2].SizeT()
offset := args[3].Int64()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -122,11 +121,11 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
// Readv implements linux syscall readv(2).
func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
iovcnt := int(args[2].Int())
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -152,12 +151,12 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Preadv implements linux syscall preadv(2).
func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
iovcnt := int(args[2].Int())
offset := args[3].Int64()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -201,13 +200,13 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
// splits the offset argument into a high/low value for compatibility with
// 32-bit architectures. The flags argument is the 5th argument.
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
iovcnt := int(args[2].Int())
offset := args[3].Int64()
flags := int(args[5].Int())
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
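Every handler in sys_read.go (and most of the files below) now shares the same prologue: the raw int32 from the syscall argument goes straight to kernel.Task.GetFile, and a nil result maps to EBADF. A hedged skeleton of that prologue follows; the handler name doRead and its elided body are purely illustrative.

package linux

import (
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/syserror"
)

// doRead is an illustrative skeleton; only the descriptor handling mirrors
// the handlers in this diff.
func doRead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int() // plain int32, no kdefs.FD wrapper anymore

	file := t.GetFile(fd) // nil, not an error, when fd is not open
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef() // GetFile returns a referenced file; release it

	// ... perform the actual I/O against file here (elided) ...
	return 0, nil, nil
}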
diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go
index 434bbb322..99f6993f5 100644
--- a/pkg/sentry/syscalls/linux/sys_sched.go
+++ b/pkg/sentry/syscalls/linux/sys_sched.go
@@ -15,11 +15,10 @@
package linux
import (
- "syscall"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
)
const (
@@ -37,13 +36,13 @@ func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
pid := args[0].Int()
param := args[1].Pointer()
if param == 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
if pid < 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil {
- return 0, nil, syscall.ESRCH
+ return 0, nil, syserror.ESRCH
}
r := SchedParam{schedPriority: onlyPriority}
if _, err := t.CopyOut(param, r); err != nil {
@@ -57,10 +56,10 @@ func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
func SchedGetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
pid := args[0].Int()
if pid < 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil {
- return 0, nil, syscall.ESRCH
+ return 0, nil, syserror.ESRCH
}
return onlyScheduler, nil, nil
}
@@ -71,20 +70,20 @@ func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ke
policy := args[1].Int()
param := args[2].Pointer()
if pid < 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
if policy != onlyScheduler {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil {
- return 0, nil, syscall.ESRCH
+ return 0, nil, syserror.ESRCH
}
var r SchedParam
if _, err := t.CopyIn(param, &r); err != nil {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
if r.schedPriority != onlyPriority {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
return 0, nil, nil
}
diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go
index 4885b5e40..18510ead8 100644
--- a/pkg/sentry/syscalls/linux/sys_seccomp.go
+++ b/pkg/sentry/syscalls/linux/sys_seccomp.go
@@ -15,13 +15,12 @@
package linux
import (
- "syscall"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bpf"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/syserror"
)
// userSockFprog is equivalent to Linux's struct sock_fprog on amd64.
@@ -43,7 +42,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error {
// We only support SECCOMP_SET_MODE_FILTER at the moment.
if mode != linux.SECCOMP_SET_MODE_FILTER {
// Unsupported mode.
- return syscall.EINVAL
+ return syserror.EINVAL
}
tsync := flags&linux.SECCOMP_FILTER_FLAG_TSYNC != 0
@@ -51,7 +50,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error {
// The only flag we support now is SECCOMP_FILTER_FLAG_TSYNC.
if flags&^linux.SECCOMP_FILTER_FLAG_TSYNC != 0 {
// Unsupported flag.
- return syscall.EINVAL
+ return syserror.EINVAL
}
var fprog userSockFprog
@@ -65,7 +64,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error {
compiledFilter, err := bpf.Compile(filter)
if err != nil {
t.Debugf("Invalid seccomp-bpf filter: %v", err)
- return syscall.EINVAL
+ return syserror.EINVAL
}
return t.AppendSyscallFilter(compiledFilter, tsync)
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index ccdb079bb..195734257 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -15,7 +15,6 @@
package linux
import (
- "syscall"
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -23,7 +22,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/socket/control"
@@ -132,7 +130,7 @@ func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader
// from the untrusted address space range.
func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) {
if addrlen > maxAddrLen {
- return nil, syscall.EINVAL
+ return nil, syserror.EINVAL
}
addrBuf := make([]byte, addrlen)
@@ -154,7 +152,7 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user
}
if int32(bufLen) < 0 {
- return syscall.EINVAL
+ return syserror.EINVAL
}
// Write the length unconditionally.
@@ -187,7 +185,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Check and initialize the flags.
if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
// Create the new socket.
@@ -200,9 +198,9 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
})
defer s.DecRef()
- fd, err := t.FDMap().NewFDFrom(0, s, kernel.FDFlags{
+ fd, err := t.NewFDFrom(0, s, kernel.FDFlags{
CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
- }, t.ThreadGroup().Limits())
+ })
if err != nil {
return 0, nil, err
}
@@ -219,15 +217,12 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
// Check and initialize the flags.
if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
fileFlags := fs.SettableFileFlags{
NonBlocking: stype&linux.SOCK_NONBLOCK != 0,
}
- fdFlags := kernel.FDFlags{
- CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
- }
// Create the socket pair.
s1, s2, e := socket.Pair(t, domain, linux.SockType(stype&0xf), protocol)
@@ -240,20 +235,16 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
defer s2.DecRef()
// Create the FDs for the sockets.
- fd1, err := t.FDMap().NewFDFrom(0, s1, fdFlags, t.ThreadGroup().Limits())
- if err != nil {
- return 0, nil, err
- }
- fd2, err := t.FDMap().NewFDFrom(0, s2, fdFlags, t.ThreadGroup().Limits())
+ fds, err := t.NewFDs(0, []*fs.File{s1, s2}, kernel.FDFlags{
+ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
+ })
if err != nil {
- t.FDMap().Remove(fd1)
return 0, nil, err
}
// Copy the file descriptors out.
- if _, err := t.CopyOut(socks, []int32{int32(fd1), int32(fd2)}); err != nil {
- t.FDMap().Remove(fd1)
- t.FDMap().Remove(fd2)
+ if _, err := t.CopyOut(socks, fds); err != nil {
+ // Note that we don't close files here; see pipe(2) also.
return 0, nil, err
}
@@ -262,21 +253,21 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
// Connect implements the linux syscall connect(2).
func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
addrlen := args[2].Uint()
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return 0, nil, syscall.EBADF
+ return 0, nil, syserror.EBADF
}
defer file.DecRef()
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
if !ok {
- return 0, nil, syscall.ENOTSOCK
+ return 0, nil, syserror.ENOTSOCK
}
// Capture address and call syscall implementation.
@@ -291,23 +282,23 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
// accept is the implementation of the accept syscall. It is called by accept
// and accept4 syscall handlers.
-func accept(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) {
+func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) {
// Check that no unsupported flags are passed in.
if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
- return 0, syscall.EINVAL
+ return 0, syserror.EINVAL
}
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return 0, syscall.EBADF
+ return 0, syserror.EBADF
}
defer file.DecRef()
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
if !ok {
- return 0, syscall.ENOTSOCK
+ return 0, syserror.ENOTSOCK
}
// Call the syscall implementation for this socket, then copy the
@@ -322,7 +313,7 @@ func accept(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, addrLen usermem.Addr
if peerRequested {
// NOTE(magi): Linux does not give you an error if it can't
// write the data back out so neither do we.
- if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syscall.EINVAL {
+ if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syserror.EINVAL {
return 0, err
}
}
@@ -331,7 +322,7 @@ func accept(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, addrLen usermem.Addr
// Accept4 implements the linux syscall accept4(2).
func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
addrlen := args[2].Pointer()
flags := int(args[3].Int())
@@ -342,7 +333,7 @@ func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
// Accept implements the linux syscall accept(2).
func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
addrlen := args[2].Pointer()
@@ -352,21 +343,21 @@ func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Bind implements the linux syscall bind(2).
func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
addrlen := args[2].Uint()
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return 0, nil, syscall.EBADF
+ return 0, nil, syserror.EBADF
}
defer file.DecRef()
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
if !ok {
- return 0, nil, syscall.ENOTSOCK
+ return 0, nil, syserror.ENOTSOCK
}
// Capture address and call syscall implementation.
@@ -380,20 +371,20 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
// Listen implements the linux syscall listen(2).
func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
backlog := args[1].Int()
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return 0, nil, syscall.EBADF
+ return 0, nil, syserror.EBADF
}
defer file.DecRef()
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
if !ok {
- return 0, nil, syscall.ENOTSOCK
+ return 0, nil, syserror.ENOTSOCK
}
// Per Linux, the backlog is silently capped to reasonable values.
@@ -409,27 +400,27 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Shutdown implements the linux syscall shutdown(2).
func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
how := args[1].Int()
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return 0, nil, syscall.EBADF
+ return 0, nil, syserror.EBADF
}
defer file.DecRef()
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
if !ok {
- return 0, nil, syscall.ENOTSOCK
+ return 0, nil, syserror.ENOTSOCK
}
// Validate how, then call syscall implementation.
switch how {
case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR:
default:
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
return 0, nil, s.Shutdown(t, int(how)).ToError()
@@ -437,23 +428,23 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// GetSockOpt implements the linux syscall getsockopt(2).
func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
level := args[1].Int()
name := args[2].Int()
optValAddr := args[3].Pointer()
optLenAddr := args[4].Pointer()
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return 0, nil, syscall.EBADF
+ return 0, nil, syserror.EBADF
}
defer file.DecRef()
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
if !ok {
- return 0, nil, syscall.ENOTSOCK
+ return 0, nil, syserror.ENOTSOCK
}
// Read the length if present. Reject negative values.
@@ -464,7 +455,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
if optLen < 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
}
@@ -521,30 +512,30 @@ func getSockOpt(t *kernel.Task, s socket.Socket, level, name, len int) (interfac
//
// Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket.
func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
level := args[1].Int()
name := args[2].Int()
optValAddr := args[3].Pointer()
optLen := args[4].Int()
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return 0, nil, syscall.EBADF
+ return 0, nil, syserror.EBADF
}
defer file.DecRef()
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
if !ok {
- return 0, nil, syscall.ENOTSOCK
+ return 0, nil, syserror.ENOTSOCK
}
if optLen <= 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
if optLen > maxOptLen {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
buf := t.CopyScratchBuffer(int(optLen))
if _, err := t.CopyIn(optValAddr, &buf); err != nil {
@@ -561,21 +552,21 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
// GetSockName implements the linux syscall getsockname(2).
func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
addrlen := args[2].Pointer()
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return 0, nil, syscall.EBADF
+ return 0, nil, syserror.EBADF
}
defer file.DecRef()
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
if !ok {
- return 0, nil, syscall.ENOTSOCK
+ return 0, nil, syserror.ENOTSOCK
}
// Get the socket name and copy it to the caller.
@@ -589,21 +580,21 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
// GetPeerName implements the linux syscall getpeername(2).
func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
addrlen := args[2].Pointer()
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return 0, nil, syscall.EBADF
+ return 0, nil, syserror.EBADF
}
defer file.DecRef()
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
if !ok {
- return 0, nil, syscall.ENOTSOCK
+ return 0, nil, syserror.ENOTSOCK
}
// Get the socket peer name and copy it to the caller.
@@ -617,31 +608,31 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
// RecvMsg implements the linux syscall recvmsg(2).
func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
msgPtr := args[1].Pointer()
flags := args[2].Int()
if t.Arch().Width() != 8 {
// We only handle 64-bit for now.
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return 0, nil, syscall.EBADF
+ return 0, nil, syserror.EBADF
}
defer file.DecRef()
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
if !ok {
- return 0, nil, syscall.ENOTSOCK
+ return 0, nil, syserror.ENOTSOCK
}
// Reject flags that we don't handle yet.
if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
if file.Flags().NonBlocking {
@@ -663,7 +654,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
// RecvMMsg implements the linux syscall recvmmsg(2).
func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
msgPtr := args[1].Pointer()
vlen := args[2].Uint()
flags := args[3].Int()
@@ -671,25 +662,25 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if t.Arch().Width() != 8 {
// We only handle 64-bit for now.
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
// Reject flags that we don't handle yet.
if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return 0, nil, syscall.EBADF
+ return 0, nil, syserror.EBADF
}
defer file.DecRef()
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
if !ok {
- return 0, nil, syscall.ENOTSOCK
+ return 0, nil, syserror.ENOTSOCK
}
if file.Flags().NonBlocking {
@@ -704,7 +695,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
return 0, nil, err
}
if !ts.Valid() {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration())
haveDeadline = true
@@ -724,7 +715,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
for i := uint64(0); i < uint64(vlen); i++ {
mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
if !ok {
- return 0, nil, syscall.EFAULT
+ return 0, nil, syserror.EFAULT
}
var n uintptr
if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil {
@@ -734,7 +725,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// Copy the received length to the caller.
lp, ok := mp.AddLength(messageHeader64Len)
if !ok {
- return 0, nil, syscall.EFAULT
+ return 0, nil, syserror.EFAULT
}
if _, err = t.CopyOut(lp, uint32(n)); err != nil {
break
@@ -756,7 +747,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
}
if msg.IovLen > linux.UIO_MAXIOV {
- return 0, syscall.EMSGSIZE
+ return 0, syserror.EMSGSIZE
}
dst, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
AddressSpaceActive: true,
@@ -767,7 +758,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
// FIXME(b/63594852): Pretend we have an empty error queue.
if flags&linux.MSG_ERRQUEUE != 0 {
- return 0, syscall.EAGAIN
+ return 0, syserror.EAGAIN
}
// Fast path when no control message nor name buffers are provided.
@@ -792,7 +783,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
}
if msg.ControlLen > maxControlLen {
- return 0, syscall.ENOBUFS
+ return 0, syserror.ENOBUFS
}
n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen)
if e != nil {
@@ -842,27 +833,27 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
// recvFrom is the implementation of the recvfrom syscall. It is called by
// recvfrom and recv syscall handlers.
-func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) {
+func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) {
if int(bufLen) < 0 {
- return 0, syscall.EINVAL
+ return 0, syserror.EINVAL
}
// Reject flags that we don't handle yet.
if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 {
- return 0, syscall.EINVAL
+ return 0, syserror.EINVAL
}
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return 0, syscall.EBADF
+ return 0, syserror.EBADF
}
defer file.DecRef()
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
if !ok {
- return 0, syscall.ENOTSOCK
+ return 0, syserror.ENOTSOCK
}
if file.Flags().NonBlocking {
@@ -903,7 +894,7 @@ func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, f
// RecvFrom implements the linux syscall recvfrom(2).
func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
bufPtr := args[1].Pointer()
bufLen := args[2].Uint64()
flags := args[3].Int()
@@ -916,31 +907,31 @@ func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// SendMsg implements the linux syscall sendmsg(2).
func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
msgPtr := args[1].Pointer()
flags := args[2].Int()
if t.Arch().Width() != 8 {
// We only handle 64-bit for now.
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return 0, nil, syscall.EBADF
+ return 0, nil, syserror.EBADF
}
defer file.DecRef()
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
if !ok {
- return 0, nil, syscall.ENOTSOCK
+ return 0, nil, syserror.ENOTSOCK
}
// Reject flags that we don't handle yet.
if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
if file.Flags().NonBlocking {
@@ -953,32 +944,32 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
// SendMMsg implements the linux syscall sendmmsg(2).
func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
msgPtr := args[1].Pointer()
vlen := args[2].Uint()
flags := args[3].Int()
if t.Arch().Width() != 8 {
// We only handle 64-bit for now.
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return 0, nil, syscall.EBADF
+ return 0, nil, syserror.EBADF
}
defer file.DecRef()
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
if !ok {
- return 0, nil, syscall.ENOTSOCK
+ return 0, nil, syserror.ENOTSOCK
}
// Reject flags that we don't handle yet.
if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
if file.Flags().NonBlocking {
@@ -990,7 +981,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
for i := uint64(0); i < uint64(vlen); i++ {
mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
if !ok {
- return 0, nil, syscall.EFAULT
+ return 0, nil, syserror.EFAULT
}
var n uintptr
if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil {
@@ -1000,7 +991,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// Copy the received length to the caller.
lp, ok := mp.AddLength(messageHeader64Len)
if !ok {
- return 0, nil, syscall.EFAULT
+ return 0, nil, syserror.EFAULT
}
if _, err = t.CopyOut(lp, uint32(n)); err != nil {
break
@@ -1025,7 +1016,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme
if msg.ControlLen > 0 {
// Put an upper bound to prevent large allocations.
if msg.ControlLen > maxControlLen {
- return 0, syscall.ENOBUFS
+ return 0, syserror.ENOBUFS
}
controlData = make([]byte, msg.ControlLen)
if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil {
@@ -1045,7 +1036,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme
// Read data then call the sendmsg implementation.
if msg.IovLen > linux.UIO_MAXIOV {
- return 0, syscall.EMSGSIZE
+ return 0, syserror.EMSGSIZE
}
src, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
AddressSpaceActive: true,
@@ -1079,23 +1070,23 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme
// sendTo is the implementation of the sendto syscall. It is called by sendto
// and send syscall handlers.
-func sendTo(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) {
+func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) {
bl := int(bufLen)
if bl < 0 {
- return 0, syscall.EINVAL
+ return 0, syserror.EINVAL
}
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
- return 0, syscall.EBADF
+ return 0, syserror.EBADF
}
defer file.DecRef()
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
if !ok {
- return 0, syscall.ENOTSOCK
+ return 0, syserror.ENOTSOCK
}
if file.Flags().NonBlocking {
@@ -1135,7 +1126,7 @@ func sendTo(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, fla
// SendTo implements the linux syscall sendto(2).
func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
bufPtr := args[1].Pointer()
bufLen := args[2].Uint64()
flags := args[3].Int()
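The socket handlers above all repeat the same prologue after the migration: look the descriptor up with GetFile, then require that the file's FileOperations implement socket.Socket. Here is a sketch of that shared shape as a helper; sockFromFD is a hypothetical name, not something this change introduces.

package linux

import (
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/socket"
	"gvisor.dev/gvisor/pkg/syserror"
)

// sockFromFD resolves fd to an open file and asserts that it is a socket.
// On success the caller owns a reference to the returned file and must
// DecRef it when done.
func sockFromFD(t *kernel.Task, fd int32) (*fs.File, socket.Socket, error) {
	file := t.GetFile(fd)
	if file == nil {
		return nil, nil, syserror.EBADF
	}
	s, ok := file.FileOperations.(socket.Socket)
	if !ok {
		file.DecRef() // don't leak the reference on the error path
		return nil, nil, syserror.ENOTSOCK
	}
	return file, s, nil
}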
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index b6517313f..a7c98efcb 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -19,7 +19,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -81,8 +80,8 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB
// Sendfile implements linux system call sendfile(2).
func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- outFD := kdefs.FD(args[0].Int())
- inFD := kdefs.FD(args[1].Int())
+ outFD := args[0].Int()
+ inFD := args[1].Int()
offsetAddr := args[2].Pointer()
count := int64(args[3].SizeT())
@@ -92,13 +91,13 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
}
// Get files.
- outFile := t.FDMap().GetFile(outFD)
+ outFile := t.GetFile(outFD)
if outFile == nil {
return 0, nil, syserror.EBADF
}
defer outFile.DecRef()
- inFile := t.FDMap().GetFile(inFD)
+ inFile := t.GetFile(inFD)
if inFile == nil {
return 0, nil, syserror.EBADF
}
@@ -163,9 +162,9 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// Splice implements splice(2).
func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- inFD := kdefs.FD(args[0].Int())
+ inFD := args[0].Int()
inOffset := args[1].Pointer()
- outFD := kdefs.FD(args[2].Int())
+ outFD := args[2].Int()
outOffset := args[3].Pointer()
count := int64(args[4].SizeT())
flags := args[5].Int()
@@ -182,13 +181,13 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
nonBlocking := (flags & linux.SPLICE_F_NONBLOCK) != 0
// Get files.
- outFile := t.FDMap().GetFile(outFD)
+ outFile := t.GetFile(outFD)
if outFile == nil {
return 0, nil, syserror.EBADF
}
defer outFile.DecRef()
- inFile := t.FDMap().GetFile(inFD)
+ inFile := t.GetFile(inFD)
if inFile == nil {
return 0, nil, syserror.EBADF
}
@@ -251,8 +250,8 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Tee implements tee(2).
func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- inFD := kdefs.FD(args[0].Int())
- outFD := kdefs.FD(args[1].Int())
+ inFD := args[0].Int()
+ outFD := args[1].Int()
count := int64(args[2].SizeT())
flags := args[3].Int()
@@ -265,13 +264,13 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
nonBlocking := (flags & linux.SPLICE_F_NONBLOCK) != 0
// Get files.
- outFile := t.FDMap().GetFile(outFD)
+ outFile := t.GetFile(outFD)
if outFile == nil {
return 0, nil, syserror.EBADF
}
defer outFile.DecRef()
- inFile := t.FDMap().GetFile(inFD)
+ inFile := t.GetFile(inFD)
if inFile == nil {
return 0, nil, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index 9a5657254..5556bc276 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -20,7 +20,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -42,7 +41,7 @@ func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
// Fstatat implements linux syscall newfstatat, i.e. fstatat(2).
func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
statAddr := args[2].Pointer()
flags := args[3].Int()
@@ -54,7 +53,7 @@ func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if path == "" {
// Annoying. What's wrong with fstat?
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -93,10 +92,10 @@ func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Fstat implements linux syscall fstat(2).
func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
statAddr := args[1].Pointer()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -178,7 +177,7 @@ func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs
// Statx implements linux syscall statx(2).
func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
pathAddr := args[1].Pointer()
flags := args[2].Int()
mask := args[3].Uint()
@@ -197,7 +196,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
if path == "" {
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -285,10 +284,10 @@ func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Fstatfs implements linux syscall fstatfs(2).
func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
statfsAddr := args[1].Pointer()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go
index 37225735f..3e55235bd 100644
--- a/pkg/sentry/syscalls/linux/sys_sync.go
+++ b/pkg/sentry/syscalls/linux/sys_sync.go
@@ -19,7 +19,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -32,9 +31,9 @@ func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
// Syncfs implements linux system call syncfs(2).
func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -47,9 +46,9 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Fsync implements linux syscall fsync(2).
func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -63,9 +62,9 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
//
// At the moment, it just calls Fsync, which is a big hammer, but correct.
func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -79,6 +78,7 @@ func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
var err error
+ fd := args[0].Int()
offset := args[1].Int64()
nbytes := args[2].Int64()
uflags := args[3].Uint()
@@ -97,8 +97,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
nbytes = fs.FileMaxOffset
}
- fd := kdefs.FD(args[0].Int())
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 9e037bd7b..595eb9155 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -123,29 +123,29 @@ func ExitGroup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
func clone(t *kernel.Task, flags int, stack usermem.Addr, parentTID usermem.Addr, childTID usermem.Addr, tls usermem.Addr) (uintptr, *kernel.SyscallControl, error) {
opts := kernel.CloneOptions{
SharingOptions: kernel.SharingOptions{
- NewAddressSpace: flags&syscall.CLONE_VM == 0,
- NewSignalHandlers: flags&syscall.CLONE_SIGHAND == 0,
- NewThreadGroup: flags&syscall.CLONE_THREAD == 0,
+ NewAddressSpace: flags&linux.CLONE_VM == 0,
+ NewSignalHandlers: flags&linux.CLONE_SIGHAND == 0,
+ NewThreadGroup: flags&linux.CLONE_THREAD == 0,
TerminationSignal: linux.Signal(flags & exitSignalMask),
- NewPIDNamespace: flags&syscall.CLONE_NEWPID == syscall.CLONE_NEWPID,
- NewUserNamespace: flags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER,
- NewNetworkNamespace: flags&syscall.CLONE_NEWNET == syscall.CLONE_NEWNET,
- NewFiles: flags&syscall.CLONE_FILES == 0,
- NewFSContext: flags&syscall.CLONE_FS == 0,
- NewUTSNamespace: flags&syscall.CLONE_NEWUTS == syscall.CLONE_NEWUTS,
- NewIPCNamespace: flags&syscall.CLONE_NEWIPC == syscall.CLONE_NEWIPC,
+ NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID,
+ NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER,
+ NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET,
+ NewFiles: flags&linux.CLONE_FILES == 0,
+ NewFSContext: flags&linux.CLONE_FS == 0,
+ NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS,
+ NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC,
},
Stack: stack,
- SetTLS: flags&syscall.CLONE_SETTLS == syscall.CLONE_SETTLS,
+ SetTLS: flags&linux.CLONE_SETTLS == linux.CLONE_SETTLS,
TLS: tls,
- ChildClearTID: flags&syscall.CLONE_CHILD_CLEARTID == syscall.CLONE_CHILD_CLEARTID,
- ChildSetTID: flags&syscall.CLONE_CHILD_SETTID == syscall.CLONE_CHILD_SETTID,
+ ChildClearTID: flags&linux.CLONE_CHILD_CLEARTID == linux.CLONE_CHILD_CLEARTID,
+ ChildSetTID: flags&linux.CLONE_CHILD_SETTID == linux.CLONE_CHILD_SETTID,
ChildTID: childTID,
- ParentSetTID: flags&syscall.CLONE_PARENT_SETTID == syscall.CLONE_PARENT_SETTID,
+ ParentSetTID: flags&linux.CLONE_PARENT_SETTID == linux.CLONE_PARENT_SETTID,
ParentTID: parentTID,
- Vfork: flags&syscall.CLONE_VFORK == syscall.CLONE_VFORK,
- Untraced: flags&syscall.CLONE_UNTRACED == syscall.CLONE_UNTRACED,
- InheritTracer: flags&syscall.CLONE_PTRACE == syscall.CLONE_PTRACE,
+ Vfork: flags&linux.CLONE_VFORK == linux.CLONE_VFORK,
+ Untraced: flags&linux.CLONE_UNTRACED == linux.CLONE_UNTRACED,
+ InheritTracer: flags&linux.CLONE_PTRACE == linux.CLONE_PTRACE,
}
ntid, ctrl, err := t.Clone(&opts)
return uintptr(ntid), ctrl, err
@@ -168,7 +168,7 @@ func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
func Fork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
// "A call to fork() is equivalent to a call to clone(2) specifying flags
// as just SIGCHLD." - fork(2)
- return clone(t, int(syscall.SIGCHLD), 0, 0, 0, 0)
+ return clone(t, int(linux.SIGCHLD), 0, 0, 0, 0)
}
// Vfork implements Linux syscall vfork(2).
@@ -178,7 +178,7 @@ func Vfork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
//
// CLONE_VM | CLONE_VFORK | SIGCHLD
// """ - vfork(2)
- return clone(t, syscall.CLONE_VM|syscall.CLONE_VFORK|int(syscall.SIGCHLD), 0, 0, 0, 0)
+ return clone(t, linux.CLONE_VM|linux.CLONE_VFORK|int(linux.SIGCHLD), 0, 0, 0, 0)
}
// parseCommonWaitOptions applies the options common to wait4 and waitid to
@@ -193,7 +193,7 @@ func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error {
wopts.NonCloneTasks = true
wopts.CloneTasks = true
default:
- return syscall.EINVAL
+ return syserror.EINVAL
}
if options&linux.WCONTINUED != 0 {
wopts.Events |= kernel.EventGroupContinue
@@ -210,7 +210,7 @@ func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error {
// wait4 waits for the given child process to exit.
func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusageAddr usermem.Addr) (uintptr, error) {
if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 {
- return 0, syscall.EINVAL
+ return 0, syserror.EINVAL
}
wopts := kernel.WaitOptions{
Events: kernel.EventExit | kernel.EventTraceeStop,
@@ -291,10 +291,10 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
rusageAddr := args[4].Pointer()
if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 {
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
wopts := kernel.WaitOptions{
Events: kernel.EventTraceeStop,
@@ -307,7 +307,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
case linux.P_PGID:
wopts.SpecificPGID = kernel.ProcessGroupID(id)
default:
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
if err := parseCommonWaitOptions(&wopts, options); err != nil {
@@ -347,12 +347,12 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, nil
}
si := arch.SignalInfo{
- Signo: int32(syscall.SIGCHLD),
+ Signo: int32(linux.SIGCHLD),
}
si.SetPid(int32(wr.TID))
si.SetUid(int32(wr.UID))
// TODO(b/73541790): convert kernel.ExitStatus to functions and make
- // WaitResult.Status a linux.WaitStatus
+ // WaitResult.Status a linux.WaitStatus.
s := syscall.WaitStatus(wr.Status)
switch {
case s.Exited():
@@ -374,7 +374,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
}
case s.Continued():
si.Code = arch.CLD_CONTINUED
- si.SetStatus(int32(syscall.SIGCONT))
+ si.SetStatus(int32(linux.SIGCONT))
default:
t.Warningf("waitid got incomprehensible wait status %d", s)
}
@@ -395,16 +395,16 @@ func SetTidAddress(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
func Unshare(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
flags := args[0].Int()
opts := kernel.SharingOptions{
- NewAddressSpace: flags&syscall.CLONE_VM == syscall.CLONE_VM,
- NewSignalHandlers: flags&syscall.CLONE_SIGHAND == syscall.CLONE_SIGHAND,
- NewThreadGroup: flags&syscall.CLONE_THREAD == syscall.CLONE_THREAD,
- NewPIDNamespace: flags&syscall.CLONE_NEWPID == syscall.CLONE_NEWPID,
- NewUserNamespace: flags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER,
- NewNetworkNamespace: flags&syscall.CLONE_NEWNET == syscall.CLONE_NEWNET,
- NewFiles: flags&syscall.CLONE_FILES == syscall.CLONE_FILES,
- NewFSContext: flags&syscall.CLONE_FS == syscall.CLONE_FS,
- NewUTSNamespace: flags&syscall.CLONE_NEWUTS == syscall.CLONE_NEWUTS,
- NewIPCNamespace: flags&syscall.CLONE_NEWIPC == syscall.CLONE_NEWIPC,
+ NewAddressSpace: flags&linux.CLONE_VM == linux.CLONE_VM,
+ NewSignalHandlers: flags&linux.CLONE_SIGHAND == linux.CLONE_SIGHAND,
+ NewThreadGroup: flags&linux.CLONE_THREAD == linux.CLONE_THREAD,
+ NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID,
+ NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER,
+ NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET,
+ NewFiles: flags&linux.CLONE_FILES == linux.CLONE_FILES,
+ NewFSContext: flags&linux.CLONE_FS == linux.CLONE_FS,
+ NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS,
+ NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC,
}
// "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2)
if opts.NewPIDNamespace {
@@ -623,7 +623,7 @@ func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
who := kernel.ThreadID(args[1].Int())
switch which {
- case syscall.PRIO_PROCESS:
+ case linux.PRIO_PROCESS:
// Look for who, return ESRCH if not found.
var task *kernel.Task
if who == 0 {
@@ -633,7 +633,7 @@ func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
}
if task == nil {
- return 0, nil, syscall.ESRCH
+ return 0, nil, syserror.ESRCH
}
// From kernel/sys.c:getpriority:
@@ -641,13 +641,13 @@ func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
// will not return the normal nice-value, but a negated
// value that has been offset by 20"
return uintptr(20 - task.Niceness()), nil, nil
- case syscall.PRIO_USER:
+ case linux.PRIO_USER:
fallthrough
- case syscall.PRIO_PGRP:
+ case linux.PRIO_PGRP:
// PRIO_USER and PRIO_PGRP have no further implementation yet.
return 0, nil, nil
default:
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
}
@@ -669,7 +669,7 @@ func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
}
switch which {
- case syscall.PRIO_PROCESS:
+ case linux.PRIO_PROCESS:
// Look for who, return ESRCH if not found.
var task *kernel.Task
if who == 0 {
@@ -679,17 +679,17 @@ func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
}
if task == nil {
- return 0, nil, syscall.ESRCH
+ return 0, nil, syserror.ESRCH
}
task.SetNiceness(niceval)
- case syscall.PRIO_USER:
+ case linux.PRIO_USER:
fallthrough
- case syscall.PRIO_PGRP:
+ case linux.PRIO_PGRP:
// PRIO_USER and PRIO_PGRP have no further implementation yet.
return 0, nil, nil
default:
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
return 0, nil, nil
diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go
index ca5ccb7c3..d4134207b 100644
--- a/pkg/sentry/syscalls/linux/sys_timer.go
+++ b/pkg/sentry/syscalls/linux/sys_timer.go
@@ -15,13 +15,13 @@
package linux
import (
- "syscall"
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/syserror"
)
const nsecPerSec = int64(time.Second)
@@ -47,7 +47,7 @@ func copyItimerValIn(t *kernel.Task, addr usermem.Addr) (linux.ItimerVal, error)
return itv, nil
default:
- return linux.ItimerVal{}, syscall.ENOSYS
+ return linux.ItimerVal{}, syserror.ENOSYS
}
}
@@ -65,7 +65,7 @@ func copyItimerValOut(t *kernel.Task, addr usermem.Addr, itv *linux.ItimerVal) e
_, err := t.CopyOut(addr, itv)
return err
default:
- return syscall.ENOSYS
+ return syserror.ENOSYS
}
}
diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go
index ea6d44315..1ce5ce4c3 100644
--- a/pkg/sentry/syscalls/linux/sys_timerfd.go
+++ b/pkg/sentry/syscalls/linux/sys_timerfd.go
@@ -20,7 +20,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -49,9 +48,9 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
NonBlocking: flags&linux.TFD_NONBLOCK != 0,
})
- fd, err := t.FDMap().NewFDFrom(0, f, kernel.FDFlags{
+ fd, err := t.NewFDFrom(0, f, kernel.FDFlags{
CloseOnExec: flags&linux.TFD_CLOEXEC != 0,
- }, t.ThreadGroup().Limits())
+ })
if err != nil {
return 0, nil, err
}
@@ -61,7 +60,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
// TimerfdSettime implements Linux syscall timerfd_settime(2).
func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
flags := args[1].Int()
newValAddr := args[2].Pointer()
oldValAddr := args[3].Pointer()
@@ -70,7 +69,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
return 0, nil, syserror.EINVAL
}
- f := t.FDMap().GetFile(fd)
+ f := t.GetFile(fd)
if f == nil {
return 0, nil, syserror.EBADF
}
@@ -101,10 +100,10 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
// TimerfdGettime implements Linux syscall timerfd_gettime(2).
func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
curValAddr := args[1].Pointer()
- f := t.FDMap().GetFile(fd)
+ f := t.GetFile(fd)
if f == nil {
return 0, nil, syserror.EBADF
}
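timerfd_create shows the single-file form of the same API change: kernel.Task.NewFDFrom drops the explicit ThreadGroup().Limits() argument, presumably because the descriptor-limit check now lives behind the Task method itself. A minimal sketch of the new call shape, assuming only what the hunks above show; installOne is an illustrative name only.

package linux

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
)

// installOne installs a single file at a free descriptor at or above 0 (the
// first argument), honoring TFD_CLOEXEC, and returns the new descriptor.
func installOne(t *kernel.Task, f *fs.File, flags int32) (int32, error) {
	fd, err := t.NewFDFrom(0, f, kernel.FDFlags{
		CloseOnExec: flags&linux.TFD_CLOEXEC != 0,
	})
	if err != nil {
		return 0, err
	}
	return fd, nil
}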
diff --git a/pkg/sentry/syscalls/linux/sys_tls.go b/pkg/sentry/syscalls/linux/sys_tls.go
index e3d1c6201..b3eb96a1c 100644
--- a/pkg/sentry/syscalls/linux/sys_tls.go
+++ b/pkg/sentry/syscalls/linux/sys_tls.go
@@ -17,11 +17,10 @@
package linux
import (
- "syscall"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
)
// ArchPrctl implements linux syscall arch_prctl(2).
@@ -39,14 +38,14 @@ func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
case linux.ARCH_SET_FS:
fsbase := args[1].Uint64()
if !t.Arch().SetTLS(uintptr(fsbase)) {
- return 0, nil, syscall.EPERM
+ return 0, nil, syserror.EPERM
}
case linux.ARCH_GET_GS, linux.ARCH_SET_GS:
t.Kernel().EmitUnimplementedEvent(t)
fallthrough
default:
- return 0, nil, syscall.EINVAL
+ return 0, nil, syserror.EINVAL
}
return 0, nil, nil
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
index 3a5bf9ac4..5278c96a6 100644
--- a/pkg/sentry/syscalls/linux/sys_write.go
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -21,7 +21,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -39,11 +38,11 @@ const (
// Write implements linux syscall write(2).
func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
size := args[2].SizeT()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -75,12 +74,12 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Pwrite64 implements linux syscall pwrite64(2).
func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
size := args[2].SizeT()
offset := args[3].Int64()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -122,11 +121,11 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// Writev implements linux syscall writev(2).
func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
iovcnt := int(args[2].Int())
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -152,12 +151,12 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Pwritev implements linux syscall pwritev(2).
func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
iovcnt := int(args[2].Int())
offset := args[3].Int64()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -202,7 +201,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// splits the offset argument into a high/low value for compatibility with
// 32-bit architectures. The flags argument is the 5th argument.
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
iovcnt := int(args[2].Int())
offset := args[3].Int64()
@@ -212,7 +211,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
return 0, nil, syserror.EACCES
}
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go
index 9ba0eba7a..4ff8f9234 100644
--- a/pkg/sentry/syscalls/linux/timespec.go
+++ b/pkg/sentry/syscalls/linux/timespec.go
@@ -15,7 +15,6 @@
package linux
import (
- "syscall"
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -70,7 +69,7 @@ func copyTimevalIn(t *kernel.Task, addr usermem.Addr) (linux.Timeval, error) {
tv.Usec = int64(usermem.ByteOrder.Uint64(in[8:]))
return tv, nil
default:
- return linux.Timeval{}, syscall.ENOSYS
+ return linux.Timeval{}, syserror.ENOSYS
}
}
@@ -84,7 +83,7 @@ func copyTimevalOut(t *kernel.Task, addr usermem.Addr, tv *linux.Timeval) error
_, err := t.CopyOutBytes(addr, out)
return err
default:
- return syscall.ENOSYS
+ return syserror.ENOSYS
}
}
@@ -104,7 +103,7 @@ func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.D
return 0, err
}
if !timespec.Valid() {
- return 0, syscall.EINVAL
+ return 0, syserror.EINVAL
}
timeout = time.Duration(timespec.ToNsecCapped())
}
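
The timespec helpers now return syserror values as well; the validate-then-cap step of copyTimespecInToDuration is otherwise unchanged. A small sketch of just that step, assuming linux.Timespec's Valid and ToNsecCapped methods as referenced above (the helper itself is illustrative):

package linux

import (
	"time"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/syserror"
)

// timespecToDuration mirrors the conversion done by copyTimespecInToDuration:
// reject invalid timespecs with EINVAL, then cap the value to a Duration.
func timespecToDuration(ts linux.Timespec) (time.Duration, error) {
	if !ts.Valid() {
		return 0, syserror.EINVAL
	}
	return time.Duration(ts.ToNsecCapped()), nil
}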
diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go
index a5f3d8407..f88055676 100644
--- a/pkg/sentry/syscalls/syscalls.go
+++ b/pkg/sentry/syscalls/syscalls.go
@@ -26,7 +26,6 @@ package syscalls
import (
"fmt"
- "syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -40,16 +39,7 @@ func Supported(name string, fn kernel.SyscallFn) kernel.Syscall {
Name: name,
Fn: fn,
SupportLevel: kernel.SupportFull,
- Note: "Full Support",
- }
-}
-
-// Undocumented returns a syscall that is undocumented.
-func Undocumented(name string, fn kernel.SyscallFn) kernel.Syscall {
- return kernel.Syscall{
- Name: name,
- Fn: fn,
- SupportLevel: kernel.SupportUndocumented,
+ Note: "Fully Supported.",
}
}
@@ -65,7 +55,7 @@ func PartiallySupported(name string, fn kernel.SyscallFn, note string, urls []st
}
// Error returns a syscall handler that will always give the passed error.
-func Error(name string, err syscall.Errno, note string, urls []string) kernel.Syscall {
+func Error(name string, err error, note string, urls []string) kernel.Syscall {
if note != "" {
note = note + "; "
}
@@ -75,14 +65,14 @@ func Error(name string, err syscall.Errno, note string, urls []string) kernel.Sy
return 0, nil, err
},
SupportLevel: kernel.SupportUnimplemented,
- Note: fmt.Sprintf("%sReturns %q", note, err.Error()),
+ Note: fmt.Sprintf("%sReturns %q.", note, err.Error()),
URLs: urls,
}
}
// ErrorWithEvent gives a syscall function that sends an unimplemented
// syscall event via the event channel and returns the passed error.
-func ErrorWithEvent(name string, err syscall.Errno, note string, urls []string) kernel.Syscall {
+func ErrorWithEvent(name string, err error, note string, urls []string) kernel.Syscall {
if note != "" {
note = note + "; "
}
@@ -93,7 +83,7 @@ func ErrorWithEvent(name string, err syscall.Errno, note string, urls []string)
return 0, nil, err
},
SupportLevel: kernel.SupportUnimplemented,
- Note: fmt.Sprintf("%sReturns %q", note, err.Error()),
+ Note: fmt.Sprintf("%sReturns %q.", note, err.Error()),
URLs: urls,
}
}
@@ -115,7 +105,7 @@ func CapError(name string, c linux.Capability, note string, urls []string) kerne
return 0, nil, syserror.ENOSYS
},
SupportLevel: kernel.SupportUnimplemented,
- Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise", note, syserror.EPERM, c.String(), syserror.ENOSYS),
+ Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise.", note, syserror.EPERM, c.String(), syserror.ENOSYS),
URLs: urls,
}
}
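
With Error and ErrorWithEvent now taking a plain error, unimplemented syscalls can be stubbed directly with syserror values. A hedged sketch of building such an entry with the helpers above; the syscall name and note are illustrative only:

package syscalls

import (
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/syserror"
)

// exampleStub is illustrative only: it builds a table entry that reports
// SupportUnimplemented, emits an unimplemented-syscall event, and returns
// ENOSYS to the application.
func exampleStub() kernel.Syscall {
	return ErrorWithEvent("example_syscall", syserror.ENOSYS, "Stubbed out in this sketch.", nil)
}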
diff --git a/pkg/sentry/time/seqatomic_parameters.go b/pkg/sentry/time/seqatomic_parameters_unsafe.go
index b4fb0a7f0..f6560d0bb 100755
--- a/pkg/sentry/time/seqatomic_parameters.go
+++ b/pkg/sentry/time/seqatomic_parameters_unsafe.go
@@ -1,11 +1,12 @@
package time
import (
+ "unsafe"
+
"fmt"
"gvisor.dev/gvisor/third_party/gvsync"
"reflect"
"strings"
- "unsafe"
)
// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race
diff --git a/pkg/syserr/netstack.go b/pkg/syserr/netstack.go
index 965808a27..8ff922c69 100644
--- a/pkg/syserr/netstack.go
+++ b/pkg/syserr/netstack.go
@@ -49,43 +49,44 @@ var (
)
var netstackErrorTranslations = map[*tcpip.Error]*Error{
- tcpip.ErrUnknownProtocol: ErrUnknownProtocol,
- tcpip.ErrUnknownNICID: ErrUnknownNICID,
- tcpip.ErrUnknownDevice: ErrUnknownDevice,
- tcpip.ErrUnknownProtocolOption: ErrUnknownProtocolOption,
- tcpip.ErrDuplicateNICID: ErrDuplicateNICID,
- tcpip.ErrDuplicateAddress: ErrDuplicateAddress,
- tcpip.ErrNoRoute: ErrNoRoute,
- tcpip.ErrBadLinkEndpoint: ErrBadLinkEndpoint,
- tcpip.ErrAlreadyBound: ErrAlreadyBound,
- tcpip.ErrInvalidEndpointState: ErrInvalidEndpointState,
- tcpip.ErrAlreadyConnecting: ErrAlreadyConnecting,
- tcpip.ErrAlreadyConnected: ErrAlreadyConnected,
- tcpip.ErrNoPortAvailable: ErrNoPortAvailable,
- tcpip.ErrPortInUse: ErrPortInUse,
- tcpip.ErrBadLocalAddress: ErrBadLocalAddress,
- tcpip.ErrClosedForSend: ErrClosedForSend,
- tcpip.ErrClosedForReceive: ErrClosedForReceive,
- tcpip.ErrWouldBlock: ErrWouldBlock,
- tcpip.ErrConnectionRefused: ErrConnectionRefused,
- tcpip.ErrTimeout: ErrTimeout,
- tcpip.ErrAborted: ErrAborted,
- tcpip.ErrConnectStarted: ErrConnectStarted,
- tcpip.ErrDestinationRequired: ErrDestinationRequired,
- tcpip.ErrNotSupported: ErrNotSupported,
- tcpip.ErrQueueSizeNotSupported: ErrQueueSizeNotSupported,
- tcpip.ErrNotConnected: ErrNotConnected,
- tcpip.ErrConnectionReset: ErrConnectionReset,
- tcpip.ErrConnectionAborted: ErrConnectionAborted,
- tcpip.ErrNoSuchFile: ErrNoSuchFile,
- tcpip.ErrInvalidOptionValue: ErrInvalidOptionValue,
- tcpip.ErrNoLinkAddress: ErrHostDown,
- tcpip.ErrBadAddress: ErrBadAddress,
- tcpip.ErrNetworkUnreachable: ErrNetworkUnreachable,
- tcpip.ErrMessageTooLong: ErrMessageTooLong,
- tcpip.ErrNoBufferSpace: ErrNoBufferSpace,
- tcpip.ErrBroadcastDisabled: ErrBroadcastDisabled,
- tcpip.ErrNotPermitted: ErrNotPermittedNet,
+ tcpip.ErrUnknownProtocol: ErrUnknownProtocol,
+ tcpip.ErrUnknownNICID: ErrUnknownNICID,
+ tcpip.ErrUnknownDevice: ErrUnknownDevice,
+ tcpip.ErrUnknownProtocolOption: ErrUnknownProtocolOption,
+ tcpip.ErrDuplicateNICID: ErrDuplicateNICID,
+ tcpip.ErrDuplicateAddress: ErrDuplicateAddress,
+ tcpip.ErrNoRoute: ErrNoRoute,
+ tcpip.ErrBadLinkEndpoint: ErrBadLinkEndpoint,
+ tcpip.ErrAlreadyBound: ErrAlreadyBound,
+ tcpip.ErrInvalidEndpointState: ErrInvalidEndpointState,
+ tcpip.ErrAlreadyConnecting: ErrAlreadyConnecting,
+ tcpip.ErrAlreadyConnected: ErrAlreadyConnected,
+ tcpip.ErrNoPortAvailable: ErrNoPortAvailable,
+ tcpip.ErrPortInUse: ErrPortInUse,
+ tcpip.ErrBadLocalAddress: ErrBadLocalAddress,
+ tcpip.ErrClosedForSend: ErrClosedForSend,
+ tcpip.ErrClosedForReceive: ErrClosedForReceive,
+ tcpip.ErrWouldBlock: ErrWouldBlock,
+ tcpip.ErrConnectionRefused: ErrConnectionRefused,
+ tcpip.ErrTimeout: ErrTimeout,
+ tcpip.ErrAborted: ErrAborted,
+ tcpip.ErrConnectStarted: ErrConnectStarted,
+ tcpip.ErrDestinationRequired: ErrDestinationRequired,
+ tcpip.ErrNotSupported: ErrNotSupported,
+ tcpip.ErrQueueSizeNotSupported: ErrQueueSizeNotSupported,
+ tcpip.ErrNotConnected: ErrNotConnected,
+ tcpip.ErrConnectionReset: ErrConnectionReset,
+ tcpip.ErrConnectionAborted: ErrConnectionAborted,
+ tcpip.ErrNoSuchFile: ErrNoSuchFile,
+ tcpip.ErrInvalidOptionValue: ErrInvalidOptionValue,
+ tcpip.ErrNoLinkAddress: ErrHostDown,
+ tcpip.ErrBadAddress: ErrBadAddress,
+ tcpip.ErrNetworkUnreachable: ErrNetworkUnreachable,
+ tcpip.ErrMessageTooLong: ErrMessageTooLong,
+ tcpip.ErrNoBufferSpace: ErrNoBufferSpace,
+ tcpip.ErrBroadcastDisabled: ErrBroadcastDisabled,
+ tcpip.ErrNotPermitted: ErrNotPermittedNet,
+ tcpip.ErrAddressFamilyNotSupported: ErrAddressFamilyNotSupported,
}
// TranslateNetstackError converts an error from the tcpip package to a sentry
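
With the new map entry, netstack's ErrAddressFamilyNotSupported now has a defined sentry translation. A hedged sketch of the lookup, assuming TranslateNetstackError keeps the *tcpip.Error to *syserr.Error signature implied by this map; the wrapper function is illustrative:

package syserr

import "gvisor.dev/gvisor/pkg/tcpip"

// translateExample shows the newly mapped error flowing through
// TranslateNetstackError to its sentry counterpart.
func translateExample() *Error {
	return TranslateNetstackError(tcpip.ErrAddressFamilyNotSupported)
}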
diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go
index 345653544..1987e89cc 100644
--- a/pkg/syserror/syserror.go
+++ b/pkg/syserror/syserror.go
@@ -48,6 +48,7 @@ var (
EMSGSIZE = error(syscall.EMSGSIZE)
ENAMETOOLONG = error(syscall.ENAMETOOLONG)
ENOATTR = ENODATA
+ ENOBUFS = error(syscall.ENOBUFS)
ENODATA = error(syscall.ENODATA)
ENODEV = error(syscall.ENODEV)
ENOENT = error(syscall.ENOENT)
@@ -59,6 +60,7 @@ var (
ENOSYS = error(syscall.ENOSYS)
ENOTDIR = error(syscall.ENOTDIR)
ENOTEMPTY = error(syscall.ENOTEMPTY)
+ ENOTSOCK = error(syscall.ENOTSOCK)
ENOTSUP = error(syscall.ENOTSUP)
ENOTTY = error(syscall.ENOTTY)
ENXIO = error(syscall.ENXIO)
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 966b3acfd..c61f96fb0 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -66,43 +66,44 @@ func (e *Error) IgnoreStats() bool {
// Errors that can be returned by the network stack.
var (
- ErrUnknownProtocol = &Error{msg: "unknown protocol"}
- ErrUnknownNICID = &Error{msg: "unknown nic id"}
- ErrUnknownDevice = &Error{msg: "unknown device"}
- ErrUnknownProtocolOption = &Error{msg: "unknown option for protocol"}
- ErrDuplicateNICID = &Error{msg: "duplicate nic id"}
- ErrDuplicateAddress = &Error{msg: "duplicate address"}
- ErrNoRoute = &Error{msg: "no route"}
- ErrBadLinkEndpoint = &Error{msg: "bad link layer endpoint"}
- ErrAlreadyBound = &Error{msg: "endpoint already bound", ignoreStats: true}
- ErrInvalidEndpointState = &Error{msg: "endpoint is in invalid state"}
- ErrAlreadyConnecting = &Error{msg: "endpoint is already connecting", ignoreStats: true}
- ErrAlreadyConnected = &Error{msg: "endpoint is already connected", ignoreStats: true}
- ErrNoPortAvailable = &Error{msg: "no ports are available"}
- ErrPortInUse = &Error{msg: "port is in use"}
- ErrBadLocalAddress = &Error{msg: "bad local address"}
- ErrClosedForSend = &Error{msg: "endpoint is closed for send"}
- ErrClosedForReceive = &Error{msg: "endpoint is closed for receive"}
- ErrWouldBlock = &Error{msg: "operation would block", ignoreStats: true}
- ErrConnectionRefused = &Error{msg: "connection was refused"}
- ErrTimeout = &Error{msg: "operation timed out"}
- ErrAborted = &Error{msg: "operation aborted"}
- ErrConnectStarted = &Error{msg: "connection attempt started", ignoreStats: true}
- ErrDestinationRequired = &Error{msg: "destination address is required"}
- ErrNotSupported = &Error{msg: "operation not supported"}
- ErrQueueSizeNotSupported = &Error{msg: "queue size querying not supported"}
- ErrNotConnected = &Error{msg: "endpoint not connected"}
- ErrConnectionReset = &Error{msg: "connection reset by peer"}
- ErrConnectionAborted = &Error{msg: "connection aborted"}
- ErrNoSuchFile = &Error{msg: "no such file"}
- ErrInvalidOptionValue = &Error{msg: "invalid option value specified"}
- ErrNoLinkAddress = &Error{msg: "no remote link address"}
- ErrBadAddress = &Error{msg: "bad address"}
- ErrNetworkUnreachable = &Error{msg: "network is unreachable"}
- ErrMessageTooLong = &Error{msg: "message too long"}
- ErrNoBufferSpace = &Error{msg: "no buffer space available"}
- ErrBroadcastDisabled = &Error{msg: "broadcast socket option disabled"}
- ErrNotPermitted = &Error{msg: "operation not permitted"}
+ ErrUnknownProtocol = &Error{msg: "unknown protocol"}
+ ErrUnknownNICID = &Error{msg: "unknown nic id"}
+ ErrUnknownDevice = &Error{msg: "unknown device"}
+ ErrUnknownProtocolOption = &Error{msg: "unknown option for protocol"}
+ ErrDuplicateNICID = &Error{msg: "duplicate nic id"}
+ ErrDuplicateAddress = &Error{msg: "duplicate address"}
+ ErrNoRoute = &Error{msg: "no route"}
+ ErrBadLinkEndpoint = &Error{msg: "bad link layer endpoint"}
+ ErrAlreadyBound = &Error{msg: "endpoint already bound", ignoreStats: true}
+ ErrInvalidEndpointState = &Error{msg: "endpoint is in invalid state"}
+ ErrAlreadyConnecting = &Error{msg: "endpoint is already connecting", ignoreStats: true}
+ ErrAlreadyConnected = &Error{msg: "endpoint is already connected", ignoreStats: true}
+ ErrNoPortAvailable = &Error{msg: "no ports are available"}
+ ErrPortInUse = &Error{msg: "port is in use"}
+ ErrBadLocalAddress = &Error{msg: "bad local address"}
+ ErrClosedForSend = &Error{msg: "endpoint is closed for send"}
+ ErrClosedForReceive = &Error{msg: "endpoint is closed for receive"}
+ ErrWouldBlock = &Error{msg: "operation would block", ignoreStats: true}
+ ErrConnectionRefused = &Error{msg: "connection was refused"}
+ ErrTimeout = &Error{msg: "operation timed out"}
+ ErrAborted = &Error{msg: "operation aborted"}
+ ErrConnectStarted = &Error{msg: "connection attempt started", ignoreStats: true}
+ ErrDestinationRequired = &Error{msg: "destination address is required"}
+ ErrNotSupported = &Error{msg: "operation not supported"}
+ ErrQueueSizeNotSupported = &Error{msg: "queue size querying not supported"}
+ ErrNotConnected = &Error{msg: "endpoint not connected"}
+ ErrConnectionReset = &Error{msg: "connection reset by peer"}
+ ErrConnectionAborted = &Error{msg: "connection aborted"}
+ ErrNoSuchFile = &Error{msg: "no such file"}
+ ErrInvalidOptionValue = &Error{msg: "invalid option value specified"}
+ ErrNoLinkAddress = &Error{msg: "no remote link address"}
+ ErrBadAddress = &Error{msg: "bad address"}
+ ErrNetworkUnreachable = &Error{msg: "network is unreachable"}
+ ErrMessageTooLong = &Error{msg: "message too long"}
+ ErrNoBufferSpace = &Error{msg: "no buffer space available"}
+ ErrBroadcastDisabled = &Error{msg: "broadcast socket option disabled"}
+ ErrNotPermitted = &Error{msg: "operation not permitted"}
+ ErrAddressFamilyNotSupported = &Error{msg: "address family not supported by protocol"}
)
// Errors related to Subnet
@@ -339,6 +340,10 @@ type Endpoint interface {
// get the actual result. The first call to Connect after the socket has
// connected returns nil. Calling connect again results in ErrAlreadyConnected.
// Anything else -- the attempt to connect failed.
+ //
+ // If address.Addr is empty, the Endpoint must be disconnected if
+ // disconnection is supported; otherwise ErrAddressFamilyNotSupported
+ // must be returned.
Connect(address FullAddress) *Error
// Shutdown closes the read and/or write end of the endpoint connection
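
The comment added to Connect is the contract the endpoint changes below implement: an empty address means an AF_UNSPEC-style disconnect where the transport supports it, and ErrAddressFamilyNotSupported otherwise. A caller-side sketch using only types from this package; the helper itself is illustrative:

package tcpip

// disconnectIfSupported asks the endpoint to disconnect by passing an
// empty FullAddress; transports that cannot disconnect (ICMP, raw, TCP
// below) return ErrAddressFamilyNotSupported instead.
func disconnectIfSupported(ep Endpoint) *Error {
	return ep.Connect(FullAddress{})
}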
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 33cefd937..ab9e80747 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -422,6 +422,11 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
e.mu.Lock()
defer e.mu.Unlock()
+ if addr.Addr == "" {
+ // AF_UNSPEC isn't supported.
+ return tcpip.ErrAddressFamilyNotSupported
+ }
+
nicid := addr.NIC
localPort := uint16(0)
switch e.state {
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 73599dd9d..42aded77f 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -298,6 +298,11 @@ func (ep *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
ep.mu.Lock()
defer ep.mu.Unlock()
+ if addr.Addr == "" {
+ // AF_UNSPEC isn't supported.
+ return tcpip.ErrAddressFamilyNotSupported
+ }
+
if ep.closed {
return tcpip.ErrInvalidEndpointState
}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index ee60ebf58..cb40fea94 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1271,6 +1271,11 @@ func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocol
// Connect connects the endpoint to its peer.
func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
+ if addr.Addr == "" && addr.Port == 0 {
+ // AF_UNSPEC isn't supported.
+ return tcpip.ErrAddressFamilyNotSupported
+ }
+
return e.connect(addr, true, true)
}
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index ec61a3886..b93959034 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -342,6 +342,7 @@ func loadError(s string) *tcpip.Error {
tcpip.ErrNoBufferSpace,
tcpip.ErrBroadcastDisabled,
tcpip.ErrNotPermitted,
+ tcpip.ErrAddressFamilyNotSupported,
}
messageToError = make(map[string]*tcpip.Error)
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 99fdfb795..cb0ea83a6 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -698,8 +698,44 @@ func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (t
return netProto, nil
}
+func (e *endpoint) disconnect() *tcpip.Error {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ if e.state != stateConnected {
+ return nil
+ }
+ id := stack.TransportEndpointID{}
+ // Exclude ephemerally bound endpoints.
+ if e.bindNICID != 0 || e.id.LocalAddress == "" {
+ var err *tcpip.Error
+ id = stack.TransportEndpointID{
+ LocalPort: e.id.LocalPort,
+ LocalAddress: e.id.LocalAddress,
+ }
+ id, err = e.registerWithStack(e.regNICID, e.effectiveNetProtos, id)
+ if err != nil {
+ return err
+ }
+ e.state = stateBound
+ } else {
+ e.state = stateInitial
+ }
+
+ e.stack.UnregisterTransportEndpoint(e.regNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e)
+ e.id = id
+ e.route.Release()
+ e.route = stack.Route{}
+ e.dstPort = 0
+
+ return nil
+}
+
// Connect connects the endpoint to its peer. Specifying a NIC is optional.
func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
+ if addr.Addr == "" {
+ return e.disconnect()
+ }
if addr.Port == 0 {
// We don't support connecting to port zero.
return tcpip.ErrInvalidEndpointState
@@ -734,12 +770,16 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
defer r.Release()
id := stack.TransportEndpointID{
- LocalAddress: r.LocalAddress,
+ LocalAddress: e.id.LocalAddress,
LocalPort: localPort,
RemotePort: addr.Port,
RemoteAddress: r.RemoteAddress,
}
+ if e.state == stateInitial {
+ id.LocalAddress = r.LocalAddress
+ }
+
// Even if we're connected, this endpoint can still be used to send
// packets on a different network protocol, so we register both even if
// v6only is set to false and this is an ipv6 endpoint.
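
Together, disconnect() and the Connect changes here preserve an explicit bind across an AF_UNSPEC disconnect: an endpoint the application bound (to a NIC-resolved address, or to the wildcard address) is re-registered on its local port and returns to the bound state, while an endpoint that was only ephemerally bound during a previous Connect drops back to the initial state; Connect then keeps e.id.LocalAddress and only takes the route's local address when the endpoint is still in the initial state. A standalone sketch of that branch follows; the names and types are illustrative, not the endpoint's actual fields:

package udp

// stateAfterDisconnect models, in isolation, the branch disconnect()
// takes above. bindNICID and localAddress stand in for e.bindNICID and
// e.id.LocalAddress; the string results stand in for stateBound and
// stateInitial.
func stateAfterDisconnect(bindNICID int64, localAddress string) string {
	if bindNICID != 0 || localAddress == "" {
		// Explicitly bound: stay registered on the local port/address.
		return "stateBound"
	}
	// Ephemerally bound during Connect: fall back to the initial state.
	return "stateInitial"
}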
diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go
index 701bdd72b..18e786397 100644
--- a/pkg/tcpip/transport/udp/endpoint_state.go
+++ b/pkg/tcpip/transport/udp/endpoint_state.go
@@ -92,8 +92,6 @@ func (e *endpoint) afterLoad() {
if err != nil {
panic(*err)
}
-
- e.id.LocalAddress = e.route.LocalAddress
} else if len(e.id.LocalAddress) != 0 { // stateBound
if e.stack.CheckLocalAddress(e.regNICID, netProto, e.id.LocalAddress) == 0 {
panic(tcpip.ErrBadLocalAddress)
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 6d276f207..6f1eb9a41 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -22,40 +22,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/watchdog"
)
-// PlatformType tells which platform to use.
-type PlatformType int
-
-const (
- // PlatformPtrace runs the sandbox with the ptrace platform.
- PlatformPtrace PlatformType = iota
-
- // PlatformKVM runs the sandbox with the KVM platform.
- PlatformKVM
-)
-
-// MakePlatformType converts type from string.
-func MakePlatformType(s string) (PlatformType, error) {
- switch s {
- case "ptrace":
- return PlatformPtrace, nil
- case "kvm":
- return PlatformKVM, nil
- default:
- return 0, fmt.Errorf("invalid platform type %q", s)
- }
-}
-
-func (p PlatformType) String() string {
- switch p {
- case PlatformPtrace:
- return "ptrace"
- case PlatformKVM:
- return "kvm"
- default:
- return fmt.Sprintf("unknown(%d)", p)
- }
-}
-
// FileAccessType tells how the filesystem is accessed.
type FileAccessType int
@@ -187,7 +153,7 @@ type Config struct {
LogPackets bool
// Platform is the platform to run on.
- Platform PlatformType
+ Platform string
// Strace indicates that strace should be enabled.
Strace bool
@@ -247,7 +213,7 @@ func (c *Config) ToFlags() []string {
"--overlay=" + strconv.FormatBool(c.Overlay),
"--network=" + c.Network.String(),
"--log-packets=" + strconv.FormatBool(c.LogPackets),
- "--platform=" + c.Platform.String(),
+ "--platform=" + c.Platform,
"--strace=" + strconv.FormatBool(c.Strace),
"--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","),
"--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)),
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
index 59e1b46ec..e5de1f3d7 100644
--- a/runsc/boot/fds.go
+++ b/runsc/boot/fds.go
@@ -21,32 +21,23 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/host"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
- "gvisor.dev/gvisor/pkg/sentry/limits"
)
-// createFDMap creates an FD map that contains stdin, stdout, and stderr. If
-// console is true, then ioctl calls will be passed through to the host FD.
+// createFDTable creates an FD table that contains stdin, stdout, and stderr.
+// If console is true, then ioctl calls will be passed through to the host FD.
-// Upon success, createFDMap dups then closes stdioFDs.
+// Upon success, createFDTable dups then closes stdioFDs.
-func createFDMap(ctx context.Context, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) {
+func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) {
if len(stdioFDs) != 3 {
return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
}
k := kernel.KernelFromContext(ctx)
- fdm := k.NewFDMap()
- defer fdm.DecRef()
+ fdTable := k.NewFDTable()
+ defer fdTable.DecRef()
mounter := fs.FileOwnerFromContext(ctx)
- // Maps sandbox FD to host FD.
- fdMap := map[int]int{
- 0: stdioFDs[0],
- 1: stdioFDs[1],
- 2: stdioFDs[2],
- }
-
var ttyFile *fs.File
- for appFD, hostFD := range fdMap {
+ for appFD, hostFD := range stdioFDs {
var appFile *fs.File
if console && appFD < 3 {
@@ -80,11 +71,11 @@ func createFDMap(ctx context.Context, l *limits.LimitSet, console bool, stdioFDs
}
// Add the file to the FD map.
- if err := fdm.NewFDAt(kdefs.FD(appFD), appFile, kernel.FDFlags{}, l); err != nil {
+ if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
return nil, err
}
}
- fdm.IncRef()
- return fdm, nil
+ fdTable.IncRef()
+ return fdTable, nil
}
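
With the intermediate map gone, the stdioFDs slice is ranged over directly, so the slice index doubles as the application FD (0 = stdin, 1 = stdout, 2 = stderr) and fdTable.NewFDAt installs each host-backed file at exactly that descriptor. A trivial standalone sketch of that index-as-FD pairing (the FD values are illustrative):

package main

import "fmt"

func main() {
	// Illustrative host FDs; in runsc they arrive via the container spec.
	stdioFDs := []int{10, 11, 12}

	// Ranging over the slice yields (application FD, host FD) pairs
	// directly, which is why the explicit sandbox-FD-to-host-FD map
	// was redundant.
	for appFD, hostFD := range stdioFDs {
		fmt.Printf("app fd %d -> host fd %d\n", appFD, hostFD)
	}
}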
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index e4ccb40d9..0ee5b8bbd 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -437,29 +437,6 @@ func hostInetFilters() seccomp.SyscallRules {
}
}
-// ptraceFilters returns syscalls made exclusively by the ptrace platform.
-func ptraceFilters() seccomp.SyscallRules {
- return seccomp.SyscallRules{
- unix.SYS_GETCPU: {},
- unix.SYS_SCHED_SETAFFINITY: {},
- syscall.SYS_PTRACE: {},
- syscall.SYS_TGKILL: {},
- syscall.SYS_WAIT4: {},
- }
-}
-
-// kvmFilters returns syscalls made exclusively by the KVM platform.
-func kvmFilters() seccomp.SyscallRules {
- return seccomp.SyscallRules{
- syscall.SYS_ARCH_PRCTL: {},
- syscall.SYS_IOCTL: {},
- syscall.SYS_MMAP: {},
- syscall.SYS_RT_SIGSUSPEND: {},
- syscall.SYS_RT_SIGTIMEDWAIT: {},
- 0xffffffffffffffff: {}, // KVM uses syscall -1 to transition to host.
- }
-}
-
func controlServerFilters(fd int) seccomp.SyscallRules {
return seccomp.SyscallRules{
syscall.SYS_ACCEPT: []seccomp.Rule{
diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go
index d5bee4453..9ff80276a 100644
--- a/runsc/boot/filter/extra_filters_race.go
+++ b/runsc/boot/filter/extra_filters_race.go
@@ -33,6 +33,7 @@ func instrumentationFilters() seccomp.SyscallRules {
syscall.SYS_MUNLOCK: {},
syscall.SYS_NANOSLEEP: {},
syscall.SYS_OPEN: {},
+ syscall.SYS_OPENAT: {},
syscall.SYS_SET_ROBUST_LIST: {},
// Used within glibc's malloc.
syscall.SYS_TIME: {},
diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go
index 468481f29..e80c171b3 100644
--- a/runsc/boot/filter/filter.go
+++ b/runsc/boot/filter/filter.go
@@ -18,13 +18,9 @@
package filter
import (
- "fmt"
-
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/seccomp"
"gvisor.dev/gvisor/pkg/sentry/platform"
- "gvisor.dev/gvisor/pkg/sentry/platform/kvm"
- "gvisor.dev/gvisor/pkg/sentry/platform/ptrace"
)
// Options are seccomp filter related options.
@@ -53,14 +49,7 @@ func Install(opt Options) error {
s.Merge(profileFilters())
}
- switch p := opt.Platform.(type) {
- case *ptrace.PTrace:
- s.Merge(ptraceFilters())
- case *kvm.KVM:
- s.Merge(kvmFilters())
- default:
- return fmt.Errorf("unknown platform type %T", p)
- }
+ s.Merge(opt.Platform.SyscallFilters())
return seccomp.Install(s)
}
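
Instead of filter switching on the concrete platform type, each platform now reports its own seccomp rules and Install merges them with s.Merge. A hedged sketch of the platform side of that contract; the SyscallFilters method is implied by the call above, but this platform type and its rule set are purely illustrative:

package exampleplatform

import (
	"syscall"

	"gvisor.dev/gvisor/pkg/seccomp"
)

// examplePlatform stands in for a real platform implementation.
type examplePlatform struct{}

// SyscallFilters returns the extra host syscalls this hypothetical
// platform needs allowed; runsc merges them into the base rule set in
// filter.Install above.
func (*examplePlatform) SyscallFilters() seccomp.SyscallRules {
	return seccomp.SyscallRules{
		syscall.SYS_IOCTL: {},
		syscall.SYS_MMAP:  {},
	}
}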
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index af52286a6..f9a6f2d3c 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -25,8 +25,10 @@ import (
// Include filesystem types that OCI spec might mount.
_ "gvisor.dev/gvisor/pkg/sentry/fs/dev"
+ "gvisor.dev/gvisor/pkg/sentry/fs/gofer"
_ "gvisor.dev/gvisor/pkg/sentry/fs/host"
_ "gvisor.dev/gvisor/pkg/sentry/fs/proc"
+ "gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
_ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
@@ -36,8 +38,6 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/sentry/fs/gofer"
- "gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/syserror"
@@ -85,19 +85,6 @@ func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string,
if err != nil {
return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
}
-
- // Replicate permissions and owner from lower to upper mount point.
- attr, err := lower.UnstableAttr(ctx)
- if err != nil {
- return nil, fmt.Errorf("reading attributes from lower mount point: %v", err)
- }
- if !upper.InodeOperations.SetPermissions(ctx, upper, attr.Perms) {
- return nil, fmt.Errorf("error setting permission to upper mount point")
- }
- if err := upper.InodeOperations.SetOwner(ctx, upper, attr.Owner); err != nil {
- return nil, fmt.Errorf("setting owner to upper mount point: %v", err)
- }
-
return fs.NewOverlayRoot(ctx, upper, lower, upperFlags)
}
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 41027416d..8e8c6105b 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -32,6 +32,7 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/memutil"
"gvisor.dev/gvisor/pkg/rand"
+ "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/fs/host"
@@ -41,8 +42,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/loader"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/platform"
- "gvisor.dev/gvisor/pkg/sentry/platform/kvm"
- "gvisor.dev/gvisor/pkg/sentry/platform/ptrace"
"gvisor.dev/gvisor/pkg/sentry/sighandling"
slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
"gvisor.dev/gvisor/pkg/sentry/time"
@@ -58,6 +57,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
"gvisor.dev/gvisor/runsc/boot/filter"
+ _ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
"gvisor.dev/gvisor/runsc/specutils"
// Include supported socket providers.
@@ -262,7 +262,7 @@ func New(args Args) (*Loader, error) {
// Adjust the total memory returned by the Sentry so that applications that
// use /proc/meminfo can make allocations based on this limit.
usage.MinimumTotalMemoryBytes = args.TotalMem
- log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(2^30))
+ log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30))
}
// Initiate the Kernel object, which is required by the Context passed
@@ -415,19 +415,12 @@ func (l *Loader) Destroy() {
}
func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) {
- switch conf.Platform {
- case PlatformPtrace:
- log.Infof("Platform: ptrace")
- return ptrace.New()
- case PlatformKVM:
- log.Infof("Platform: kvm")
- if deviceFile == nil {
- return nil, fmt.Errorf("kvm device file must be provided")
- }
- return kvm.New(deviceFile)
- default:
- return nil, fmt.Errorf("invalid platform %v", conf.Platform)
+ p, err := platform.Lookup(conf.Platform)
+ if err != nil {
+ panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err))
}
+ log.Infof("Platform: %s", conf.Platform)
+ return p.New(deviceFile)
}
func createMemoryFile() (*pgalloc.MemoryFile, error) {
@@ -516,13 +509,13 @@ func (l *Loader) run() error {
// Create the FD map, which will set stdin, stdout, and stderr. If console
// is true, then ioctl calls will be passed through to the host fd.
ctx := l.rootProcArgs.NewContext(l.k)
- fdm, err := createFDMap(ctx, l.rootProcArgs.Limits, l.console, l.stdioFDs)
+ fdTable, err := createFDTable(ctx, l.console, l.stdioFDs)
if err != nil {
return fmt.Errorf("importing fds: %v", err)
}
// CreateProcess takes a reference on FDMap if successful. We won't need
// ours either way.
- l.rootProcArgs.FDMap = fdm
+ l.rootProcArgs.FDTable = fdTable
// cid for root container can be empty. Only subcontainers need it to set
// the mount location.
@@ -561,13 +554,13 @@ func (l *Loader) run() error {
return fmt.Errorf("creating init process: %v", err)
}
- // CreateProcess takes a reference on FDMap if successful.
- l.rootProcArgs.FDMap.DecRef()
+ // CreateProcess takes a reference on FDTable if successful.
+ l.rootProcArgs.FDTable.DecRef()
}
ep.tg = l.k.GlobalInit()
if l.console {
- ttyFile := l.rootProcArgs.FDMap.GetFile(0)
+ ttyFile, _ := l.rootProcArgs.FDTable.Get(0)
defer ttyFile.DecRef()
ep.tty = ttyFile.FileOperations.(*host.TTYFileOperations)
@@ -647,13 +640,13 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
// Create the FD map, which will set stdin, stdout, and stderr.
ctx := procArgs.NewContext(l.k)
- fdm, err := createFDMap(ctx, procArgs.Limits, false, stdioFDs)
+ fdTable, err := createFDTable(ctx, false, stdioFDs)
if err != nil {
return fmt.Errorf("importing fds: %v", err)
}
- // CreateProcess takes a reference on FDMap if successful. We won't need ours
- // either way.
- procArgs.FDMap = fdm
+ // CreateProcess takes a reference on fdTable if successful. We won't
+ // need ours either way.
+ procArgs.FDTable = fdTable
// Can't take ownership away from os.File. dup them to get a new FDs.
var goferFDs []int
@@ -682,8 +675,8 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
}
l.k.StartProcess(tg)
- // CreateProcess takes a reference on FDMap if successful.
- procArgs.FDMap.DecRef()
+ // CreateProcess takes a reference on FDTable if successful.
+ procArgs.FDTable.DecRef()
l.processes[eid].tg = tg
return nil
@@ -1006,3 +999,8 @@ func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host
}
return ep.tg, ep.tty, nil
}
+
+func init() {
+ // TODO(gvisor.dev/issue/365): Make this configurable.
+ refs.SetLeakMode(refs.NoLeakChecking)
+}
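
createPlatform now goes through a registry instead of a hard-coded switch: the blank import of runsc/boot/platforms pulls in the kvm and ptrace packages so they can register themselves, and platform.Lookup returns a constructor whose OpenDevice and New are used here and in runsc/sandbox below. A hedged caller-side sketch combining those two steps; the wrapper function is illustrative, and only Lookup, OpenDevice, and New are taken from this patch:

package example

import (
	"fmt"
	"os"

	"gvisor.dev/gvisor/pkg/sentry/platform"
	_ "gvisor.dev/gvisor/runsc/boot/platforms" // register kvm and ptrace.
)

// newPlatform wraps the lookup-then-construct flow used by
// deviceFileForPlatform and createPlatform in this patch.
func newPlatform(name string) (platform.Platform, error) {
	ctor, err := platform.Lookup(name)
	if err != nil {
		return nil, fmt.Errorf("invalid platform %q: %v", name, err)
	}
	var deviceFile *os.File
	if deviceFile, err = ctor.OpenDevice(); err != nil {
		return nil, fmt.Errorf("opening device file for %q: %v", name, err)
	}
	return ctor.New(deviceFile)
}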
diff --git a/pkg/abi/linux/ashmem.go b/runsc/boot/platforms/platforms.go
index 2a722abe0..056b46ad5 100644
--- a/pkg/abi/linux/ashmem.go
+++ b/runsc/boot/platforms/platforms.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,18 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-package linux
+// Package platforms imports all available platform packages.
+package platforms
-// Constants used by ashmem in pin-related ioctls.
-const (
- AshmemNotPurged = 0
- AshmemWasPurged = 1
- AshmemIsUnpinned = 0
- AshmemIsPinned = 1
+import (
+ // Import platforms that runsc might use.
+ _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm"
+ _ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace"
)
-// AshmemPin structure is used for pin-related ioctls.
-type AshmemPin struct {
- Offset uint32
- Len uint32
-}
+const (
+ // Ptrace runs the sandbox with the ptrace platform.
+ Ptrace = "ptrace"
+
+ // KVM runs the sandbox with the KVM platform.
+ KVM = "kvm"
+)
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
index 272eb14d3..b40fded5b 100644
--- a/runsc/cmd/boot.go
+++ b/runsc/cmd/boot.go
@@ -26,6 +26,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/boot/platforms"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -172,7 +173,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
if caps == nil {
caps = &specs.LinuxCapabilities{}
}
- if conf.Platform == boot.PlatformPtrace {
+ if conf.Platform == platforms.Ptrace {
// Ptrace platform requires extra capabilities.
const c = "CAP_SYS_PTRACE"
caps.Bounding = append(caps.Bounding, c)
diff --git a/runsc/main.go b/runsc/main.go
index 135061cd3..bc83c57a2 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -30,6 +30,7 @@ import (
"github.com/google/subcommands"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/cmd"
"gvisor.dev/gvisor/runsc/specutils"
@@ -61,7 +62,7 @@ var (
straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs")
// Flags that control sandbox runtime behavior.
- platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
+ platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
gso = flag.Bool("gso", true, "enable generic segmenation offload")
fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
@@ -139,8 +140,8 @@ func main() {
}
cmd.ErrorLogger = errorLogger
- platformType, err := boot.MakePlatformType(*platform)
- if err != nil {
+ platformType := *platformName
+ if _, err := platform.Lookup(platformType); err != nil {
cmd.Fatalf("%v", err)
}
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 6bebf0737..4a11f617d 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -32,9 +32,10 @@ import (
"gvisor.dev/gvisor/pkg/control/server"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/control"
- "gvisor.dev/gvisor/pkg/sentry/platform/kvm"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/urpc"
"gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/boot/platforms"
"gvisor.dev/gvisor/runsc/cgroup"
"gvisor.dev/gvisor/runsc/console"
"gvisor.dev/gvisor/runsc/specutils"
@@ -491,7 +492,7 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
{Type: specs.UTSNamespace},
}
- if conf.Platform == boot.PlatformPtrace {
+ if conf.Platform == platforms.Ptrace {
// TODO(b/75837838): Also set a new PID namespace so that we limit
// access to other host processes.
log.Infof("Sandbox will be started in the current PID namespace")
@@ -1046,19 +1047,15 @@ func (s *Sandbox) waitForStopped() error {
// deviceFileForPlatform opens the device file for the given platform. If the
// platform does not need a device file, then nil is returned.
-func deviceFileForPlatform(p boot.PlatformType) (*os.File, error) {
- var (
- f *os.File
- err error
- )
- switch p {
- case boot.PlatformKVM:
- f, err = kvm.OpenDevice()
- default:
- return nil, nil
+func deviceFileForPlatform(name string) (*os.File, error) {
+ p, err := platform.Lookup(name)
+ if err != nil {
+ return nil, err
}
+
+ f, err := p.OpenDevice()
if err != nil {
return nil, fmt.Errorf("opening device file for platform %q: %v", p, err)
}
- return f, err
+ return f, nil
}