summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/syscalls/linux
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/syscalls/linux')
-rw-r--r--pkg/sentry/syscalls/linux/BUILD16
-rw-r--r--pkg/sentry/syscalls/linux/error.go74
-rw-r--r--pkg/sentry/syscalls/linux/linux64_amd64.go16
-rw-r--r--pkg/sentry/syscalls/linux/linux64_arm64.go24
-rw-r--r--pkg/sentry/syscalls/linux/sigset.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_aio.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_clone_amd64.go35
-rw-r--r--pkg/sentry/syscalls/linux/sys_clone_arm64.go35
-rw-r--r--pkg/sentry/syscalls/linux/sys_epoll.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_file.go6
-rw-r--r--pkg/sentry/syscalls/linux/sys_futex.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_getdents.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_mempolicy.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_mmap.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_mount.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_pipe.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_poll.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_random.go4
-rw-r--r--pkg/sentry/syscalls/linux/sys_read.go4
-rw-r--r--pkg/sentry/syscalls/linux/sys_rlimit.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_rseq.go48
-rw-r--r--pkg/sentry/syscalls/linux/sys_seccomp.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_sem.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_shm.go7
-rw-r--r--pkg/sentry/syscalls/linux/sys_signal.go7
-rw-r--r--pkg/sentry/syscalls/linux/sys_socket.go4
-rw-r--r--pkg/sentry/syscalls/linux/sys_splice.go35
-rw-r--r--pkg/sentry/syscalls/linux/sys_stat.go53
-rw-r--r--pkg/sentry/syscalls/linux/sys_stat_amd64.go75
-rw-r--r--pkg/sentry/syscalls/linux/sys_stat_arm64.go77
-rw-r--r--pkg/sentry/syscalls/linux/sys_thread.go15
-rw-r--r--pkg/sentry/syscalls/linux/sys_time.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_timer.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_utsname.go12
-rw-r--r--pkg/sentry/syscalls/linux/sys_write.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_xattr.go155
-rw-r--r--pkg/sentry/syscalls/linux/timespec.go2
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/BUILD24
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/linux64.go16
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go25
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go25
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/sys_read.go95
42 files changed, 730 insertions, 191 deletions
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 6766ba587..be16ee686 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
package(licenses = ["notice"])
@@ -13,6 +13,8 @@ go_library(
"sigset.go",
"sys_aio.go",
"sys_capability.go",
+ "sys_clone_amd64.go",
+ "sys_clone_arm64.go",
"sys_epoll.go",
"sys_eventfd.go",
"sys_file.go",
@@ -30,6 +32,7 @@ go_library(
"sys_random.go",
"sys_read.go",
"sys_rlimit.go",
+ "sys_rseq.go",
"sys_rusage.go",
"sys_sched.go",
"sys_seccomp.go",
@@ -39,6 +42,8 @@ go_library(
"sys_socket.go",
"sys_splice.go",
"sys_stat.go",
+ "sys_stat_amd64.go",
+ "sys_stat_arm64.go",
"sys_sync.go",
"sys_sysinfo.go",
"sys_syslog.go",
@@ -52,18 +57,18 @@ go_library(
"sys_xattr.go",
"timespec.go",
],
- importpath = "gvisor.dev/gvisor/pkg/sentry/syscalls/linux",
visibility = ["//:sandbox"],
deps = [
"//pkg/abi",
"//pkg/abi/linux",
"//pkg/binary",
"//pkg/bpf",
+ "//pkg/context",
"//pkg/log",
"//pkg/metric",
"//pkg/rand",
+ "//pkg/safemem",
"//pkg/sentry/arch",
- "//pkg/sentry/context",
"//pkg/sentry/fs",
"//pkg/sentry/fs/anon",
"//pkg/sentry/fs/lock",
@@ -83,15 +88,16 @@ go_library(
"//pkg/sentry/loader",
"//pkg/sentry/memmap",
"//pkg/sentry/mm",
- "//pkg/sentry/safemem",
"//pkg/sentry/socket",
"//pkg/sentry/socket/control",
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/syscalls",
"//pkg/sentry/usage",
- "//pkg/sentry/usermem",
+ "//pkg/sentry/vfs",
+ "//pkg/sync",
"//pkg/syserr",
"//pkg/syserror",
+ "//pkg/usermem",
"//pkg/waiter",
],
)
diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go
index 1d9018c96..64de56ac5 100644
--- a/pkg/sentry/syscalls/linux/error.go
+++ b/pkg/sentry/syscalls/linux/error.go
@@ -16,13 +16,14 @@ package linux
import (
"io"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -31,20 +32,58 @@ var (
partialResultOnce sync.Once
)
+// HandleIOErrorVFS2 handles special error cases for partial results. For some
+// errors, we may consume the error and return only the partial read/write.
+//
+// op and f are used only for panics.
+func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op string, f *vfs.FileDescription) error {
+ known, err := handleIOErrorImpl(t, partialResult, err, intr, op)
+ if err != nil {
+ return err
+ }
+ if !known {
+ // An unknown error is encountered with a partial read/write.
+ fs := f.Mount().Filesystem().VirtualFilesystem()
+ root := vfs.RootFromContext(t)
+ name, _ := fs.PathnameWithDeleted(t, root, f.VirtualDentry())
+ log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q", partialResult, err, err, op, name)
+ partialResultOnce.Do(partialResultMetric.Increment)
+ }
+ return nil
+}
+
// handleIOError handles special error cases for partial results. For some
// errors, we may consume the error and return only the partial read/write.
//
// op and f are used only for panics.
func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op string, f *fs.File) error {
+ known, err := handleIOErrorImpl(t, partialResult, err, intr, op)
+ if err != nil {
+ return err
+ }
+ if !known {
+ // An unknown error is encountered with a partial read/write.
+ name, _ := f.Dirent.FullName(nil /* ignore chroot */)
+ log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations)
+ partialResultOnce.Do(partialResultMetric.Increment)
+ }
+ return nil
+}
+
+// handleIOError handles special error cases for partial results. For some
+// errors, we may consume the error and return only the partial read/write.
+//
+// Returns false if error is unknown.
+func handleIOErrorImpl(t *kernel.Task, partialResult bool, err, intr error, op string) (bool, error) {
switch err {
case nil:
// Typical successful syscall.
- return nil
+ return true, nil
case io.EOF:
// EOF is always consumed. If this is a partial read/write
// (result != 0), the application will see that, otherwise
// they will see 0.
- return nil
+ return true, nil
case syserror.ErrExceedsFileSizeLimit:
// Ignore partialResult because this error only applies to
// normal files, and for those files we cannot accumulate
@@ -53,20 +92,20 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin
// Do not consume the error and return it as EFBIG.
// Simultaneously send a SIGXFSZ per setrlimit(2).
t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t))
- return syserror.EFBIG
+ return true, syserror.EFBIG
case syserror.ErrInterrupted:
// The syscall was interrupted. Return nil if it completed
// partially, otherwise return the error code that the syscall
// needs (to indicate to the kernel what it should do).
if partialResult {
- return nil
+ return true, nil
}
- return intr
+ return true, intr
}
if !partialResult {
// Typical syscall error.
- return err
+ return true, err
}
switch err {
@@ -75,14 +114,14 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin
// read/write. Like ErrWouldBlock, since we have a
// partial read/write, we consume the error and return
// the partial result.
- return nil
+ return true, nil
case syserror.EFAULT:
// EFAULT is only shown the user if nothing was
// read/written. If we read something (this case), they see
// a partial read/write. They will then presumably try again
// with an incremented buffer, which will EFAULT with
// result == 0.
- return nil
+ return true, nil
case syserror.EPIPE:
// Writes to a pipe or socket will return EPIPE if the other
// side is gone. The partial write is returned. EPIPE will be
@@ -90,32 +129,29 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin
//
// TODO(gvisor.dev/issue/161): In some cases SIGPIPE should
// also be sent to the application.
- return nil
+ return true, nil
case syserror.ENOSPC:
// Similar to EPIPE. Return what we wrote this time, and let
// ENOSPC be returned on the next call.
- return nil
+ return true, nil
case syserror.ECONNRESET:
// For TCP sendfile connections, we may have a reset. But we
// should just return n as the result.
- return nil
+ return true, nil
case syserror.ErrWouldBlock:
// Syscall would block, but completed a partial read/write.
// This case should only be returned by IssueIO for nonblocking
// files. Since we have a partial read/write, we consume
// ErrWouldBlock, returning the partial result.
- return nil
+ return true, nil
}
switch err.(type) {
case kernel.SyscallRestartErrno:
// Identical to the EINTR case.
- return nil
+ return true, nil
}
- // An unknown error is encountered with a partial read/write.
- name, _ := f.Dirent.FullName(nil /* ignore chroot */)
- log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations)
- partialResultOnce.Do(partialResultMetric.Increment)
- return nil
+ // Error is unknown and cannot be properly handled.
+ return false, nil
}
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 272ae9991..7435b50bf 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -20,8 +20,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/syscalls"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall
@@ -228,12 +228,12 @@ var AMD64 = &kernel.SyscallTable{
185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
186: syscalls.Supported("gettid", Gettid),
187: syscalls.Supported("readahead", Readahead),
- 188: syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil),
- 189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
- 190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
- 191: syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil),
- 192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
- 193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
+ 189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+ 190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
+ 191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
+ 192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+ 193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
@@ -377,7 +377,7 @@ var AMD64 = &kernel.SyscallTable{
331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
332: syscalls.Supported("statx", Statx),
333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
- 334: syscalls.ErrorWithEvent("rseq", syserror.ENOSYS, "", nil),
+ 334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil),
// Linux skips ahead to syscall 424 to sync numbers between arches.
424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index 3b584eed9..03a39fe65 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -20,8 +20,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/syscalls"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// ARM64 is a table of Linux arm64 syscall API with the corresponding syscall
@@ -41,12 +41,12 @@ var ARM64 = &kernel.SyscallTable{
2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
- 5: syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil),
- 6: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
- 7: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
- 8: syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil),
- 9: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
- 10: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 5: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
+ 6: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+ 7: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
+ 8: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
+ 9: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+ 10: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
11: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
12: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
13: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
@@ -61,6 +61,7 @@ var ARM64 = &kernel.SyscallTable{
22: syscalls.Supported("epoll_pwait", EpollPwait),
23: syscalls.Supported("dup", Dup),
24: syscalls.Supported("dup3", Dup3),
+ 25: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil),
26: syscalls.Supported("inotify_init1", InotifyInit1),
27: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
28: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
@@ -78,7 +79,9 @@ var ARM64 = &kernel.SyscallTable{
40: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil),
41: syscalls.Error("pivot_root", syserror.EPERM, "", nil),
42: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil),
+ 43: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil),
44: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil),
+ 45: syscalls.Supported("truncate", Truncate),
46: syscalls.Supported("ftruncate", Ftruncate),
47: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil),
48: syscalls.Supported("faccessat", Faccessat),
@@ -112,6 +115,7 @@ var ARM64 = &kernel.SyscallTable{
76: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
77: syscalls.Supported("tee", Tee),
78: syscalls.Supported("readlinkat", Readlinkat),
+ 79: syscalls.Supported("fstatat", Fstatat),
80: syscalls.Supported("fstat", Fstat),
81: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil),
82: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil),
@@ -254,6 +258,8 @@ var ARM64 = &kernel.SyscallTable{
219: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil),
220: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil),
221: syscalls.Supported("execve", Execve),
+ 222: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil),
+ 223: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil),
224: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil),
225: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil),
226: syscalls.Supported("mprotect", Mprotect),
@@ -299,6 +305,8 @@ var ARM64 = &kernel.SyscallTable{
282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267)
284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+
+ // Syscalls after 284 are "backports" from versions of Linux after 4.4.
285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
286: syscalls.Supported("preadv2", Preadv2),
287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
@@ -307,7 +315,7 @@ var ARM64 = &kernel.SyscallTable{
290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
291: syscalls.Supported("statx", Statx),
292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
- 293: syscalls.ErrorWithEvent("rseq", syserror.ENOSYS, "", nil),
+ 293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil),
// Linux skips ahead to syscall 424 to sync numbers between arches.
424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go
index 333013d8c..2ddb2b146 100644
--- a/pkg/sentry/syscalls/linux/sigset.go
+++ b/pkg/sentry/syscalls/linux/sigset.go
@@ -17,8 +17,8 @@ package linux
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// copyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go
index f56411bfe..b401978db 100644
--- a/pkg/sentry/syscalls/linux/sys_aio.go
+++ b/pkg/sentry/syscalls/linux/sys_aio.go
@@ -23,8 +23,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/eventfd"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/mm"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// I/O commands.
diff --git a/pkg/sentry/syscalls/linux/sys_clone_amd64.go b/pkg/sentry/syscalls/linux/sys_clone_amd64.go
new file mode 100644
index 000000000..dd43cf18d
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_clone_amd64.go
@@ -0,0 +1,35 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// Clone implements linux syscall clone(2).
+// sys_clone has so many flavors. We implement the default one in linux 3.11
+// x86_64:
+// sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, tls_val)
+func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ flags := int(args[0].Int())
+ stack := args[1].Pointer()
+ parentTID := args[2].Pointer()
+ childTID := args[3].Pointer()
+ tls := args[4].Pointer()
+ return clone(t, flags, stack, parentTID, childTID, tls)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_clone_arm64.go b/pkg/sentry/syscalls/linux/sys_clone_arm64.go
new file mode 100644
index 000000000..cf68a8949
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_clone_arm64.go
@@ -0,0 +1,35 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// Clone implements linux syscall clone(2).
+// sys_clone has so many flavors, and we implement the default one in linux 3.11
+// arm64(kernel/fork.c with CONFIG_CLONE_BACKWARDS defined in the config file):
+// sys_clone(clone_flags, newsp, parent_tidptr, tls_val, child_tidptr)
+func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ flags := int(args[0].Int())
+ stack := args[1].Pointer()
+ parentTID := args[2].Pointer()
+ tls := args[3].Pointer()
+ childTID := args[4].Pointer()
+ return clone(t, flags, stack, parentTID, childTID, tls)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
index 65b4a227b..5f11b496c 100644
--- a/pkg/sentry/syscalls/linux/sys_epoll.go
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -20,8 +20,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
"gvisor.dev/gvisor/pkg/sentry/syscalls"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 9bc2445a5..421845ebb 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -18,8 +18,8 @@ import (
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
@@ -28,8 +28,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/fasync"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// fileOpAt performs an operation on the second last component in the path.
@@ -767,7 +767,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Note that Remove provides a reference on the file that we may use to
// flush. It is still active until we drop the final reference below
// (and other reference-holding operations complete).
- file := t.FDTable().Remove(fd)
+ file, _ := t.FDTable().Remove(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go
index bde17a767..b68261f72 100644
--- a/pkg/sentry/syscalls/linux/sys_futex.go
+++ b/pkg/sentry/syscalls/linux/sys_futex.go
@@ -21,8 +21,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// futexWaitRestartBlock encapsulates the state required to restart futex(2)
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
index 912cbe4ff..f66f4ffde 100644
--- a/pkg/sentry/syscalls/linux/sys_getdents.go
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -23,8 +23,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// Getdents implements linux syscall getdents(2) for 64bit systems.
diff --git a/pkg/sentry/syscalls/linux/sys_mempolicy.go b/pkg/sentry/syscalls/linux/sys_mempolicy.go
index f5a519d8a..ac934dc6f 100644
--- a/pkg/sentry/syscalls/linux/sys_mempolicy.go
+++ b/pkg/sentry/syscalls/linux/sys_mempolicy.go
@@ -20,8 +20,8 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// We unconditionally report a single NUMA node. This also means that our
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index 58a05b5bb..9959f6e61 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -22,8 +22,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/mm"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// Brk implements linux syscall brk(2).
diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go
index 8c13e2d82..eb5ff48f5 100644
--- a/pkg/sentry/syscalls/linux/sys_mount.go
+++ b/pkg/sentry/syscalls/linux/sys_mount.go
@@ -19,8 +19,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// Mount implements Linux syscall mount(2).
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go
index 418d7fa5f..798344042 100644
--- a/pkg/sentry/syscalls/linux/sys_pipe.go
+++ b/pkg/sentry/syscalls/linux/sys_pipe.go
@@ -20,8 +20,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// pipe2 implements the actual system call with flags.
diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go
index 2b2df989a..4f8762d7d 100644
--- a/pkg/sentry/syscalls/linux/sys_poll.go
+++ b/pkg/sentry/syscalls/linux/sys_poll.go
@@ -23,8 +23,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/syscalls/linux/sys_random.go b/pkg/sentry/syscalls/linux/sys_random.go
index bc4c588bf..c0aa0fd60 100644
--- a/pkg/sentry/syscalls/linux/sys_random.go
+++ b/pkg/sentry/syscalls/linux/sys_random.go
@@ -19,11 +19,11 @@ import (
"math"
"gvisor.dev/gvisor/pkg/rand"
+ "gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/safemem"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
const (
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
index cd31e0649..227692f06 100644
--- a/pkg/sentry/syscalls/linux/sys_read.go
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -23,13 +23,13 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/socket"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
const (
- // EventMaskRead contains events that can be triggerd on reads.
+ // EventMaskRead contains events that can be triggered on reads.
EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
)
diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go
index 51e3f836b..e08c333d6 100644
--- a/pkg/sentry/syscalls/linux/sys_rlimit.go
+++ b/pkg/sentry/syscalls/linux/sys_rlimit.go
@@ -19,8 +19,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/limits"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// rlimit describes an implementation of 'struct rlimit', which may vary from
diff --git a/pkg/sentry/syscalls/linux/sys_rseq.go b/pkg/sentry/syscalls/linux/sys_rseq.go
new file mode 100644
index 000000000..90db10ea6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_rseq.go
@@ -0,0 +1,48 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// RSeq implements syscall rseq(2).
+func RSeq(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := args[1].Uint()
+ flags := args[2].Int()
+ signature := args[3].Uint()
+
+ if !t.RSeqAvailable() {
+ // Event for applications that want rseq on a configuration
+ // that doesn't support them.
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, nil, syserror.ENOSYS
+ }
+
+ switch flags {
+ case 0:
+ // Register.
+ return 0, nil, t.SetRSeq(addr, length, signature)
+ case linux.RSEQ_FLAG_UNREGISTER:
+ return 0, nil, t.ClearRSeq(addr, length, signature)
+ default:
+ // Unknown flag.
+ return 0, nil, syserror.EINVAL
+ }
+}
diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go
index 18510ead8..5b7a66f4d 100644
--- a/pkg/sentry/syscalls/linux/sys_seccomp.go
+++ b/pkg/sentry/syscalls/linux/sys_seccomp.go
@@ -19,8 +19,8 @@ import (
"gvisor.dev/gvisor/pkg/bpf"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// userSockFprog is equivalent to Linux's struct sock_fprog on amd64.
diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go
index cde3b54e7..5f54f2456 100644
--- a/pkg/sentry/syscalls/linux/sys_sem.go
+++ b/pkg/sentry/syscalls/linux/sys_sem.go
@@ -22,8 +22,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
const opsMax = 500 // SEMOPM
diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go
index d57ffb3a1..4a8bc24a2 100644
--- a/pkg/sentry/syscalls/linux/sys_shm.go
+++ b/pkg/sentry/syscalls/linux/sys_shm.go
@@ -39,10 +39,13 @@ func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, err
}
+ defer segment.DecRef()
return uintptr(segment.ID), nil, nil
}
// findSegment retrives a shm segment by the given id.
+//
+// findSegment returns a reference on Shm.
func findSegment(t *kernel.Task, id shm.ID) (*shm.Shm, error) {
r := t.IPCNamespace().ShmRegistry()
segment := r.FindByID(id)
@@ -63,6 +66,7 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if err != nil {
return 0, nil, syserror.EINVAL
}
+ defer segment.DecRef()
opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{
Execute: flag&linux.SHM_EXEC == linux.SHM_EXEC,
@@ -72,7 +76,6 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if err != nil {
return 0, nil, err
}
- defer segment.DecRef()
addr, err = t.MemoryManager().MMap(t, opts)
return uintptr(addr), nil, err
}
@@ -105,6 +108,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, syserror.EINVAL
}
+ defer segment.DecRef()
stat, err := segment.IPCStat(t)
if err == nil {
@@ -128,6 +132,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, syserror.EINVAL
}
+ defer segment.DecRef()
switch cmd {
case linux.IPC_SET:
diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go
index fb6efd5d8..7e1747a0c 100644
--- a/pkg/sentry/syscalls/linux/sys_signal.go
+++ b/pkg/sentry/syscalls/linux/sys_signal.go
@@ -23,8 +23,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/signalfd"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// "For a process to have permission to send a signal it must
@@ -245,6 +245,11 @@ func RtSigaction(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
sig := linux.Signal(args[0].Int())
newactarg := args[1].Pointer()
oldactarg := args[2].Pointer()
+ sigsetsize := args[3].SizeT()
+
+ if sigsetsize != linux.SignalSetSize {
+ return 0, nil, syserror.EINVAL
+ }
var newactptr *arch.SignalAct
if newactarg != 0 {
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 4b5aafcc0..2919228d0 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -26,9 +26,9 @@ import (
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/socket/control"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// minListenBacklog is the minimum reasonable backlog for listening sockets.
@@ -41,7 +41,7 @@ const maxListenBacklog = 1024
const maxAddrLen = 200
// maxOptLen is the maximum sockopt parameter length we're willing to accept.
-const maxOptLen = 1024
+const maxOptLen = 1024 * 8
// maxControlLen is the maximum length of the msghdr.msg_control buffer we're
// willing to accept. Note that this limit is smaller than Linux, which allows
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index dd3a5807f..fd642834b 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -25,6 +25,10 @@ import (
// doSplice implements a blocking splice operation.
func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) {
+ if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 {
+ return 0, syserror.EINVAL
+ }
+
var (
total int64
n int64
@@ -82,11 +86,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
offsetAddr := args[2].Pointer()
count := int64(args[3].SizeT())
- // Don't send a negative number of bytes.
- if count < 0 {
- return 0, nil, syserror.EINVAL
- }
-
// Get files.
inFile := t.GetFile(inFD)
if inFile == nil {
@@ -136,11 +135,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
return 0, nil, err
}
- // The offset must be valid.
- if offset < 0 {
- return 0, nil, syserror.EINVAL
- }
-
// Do the splice.
n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{
Length: count,
@@ -211,8 +205,10 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
opts := fs.SpliceOpts{
Length: count,
}
+ inFileAttr := inFile.Dirent.Inode.StableAttr
+ outFileAttr := outFile.Dirent.Inode.StableAttr
switch {
- case fs.IsPipe(inFile.Dirent.Inode.StableAttr) && !fs.IsPipe(outFile.Dirent.Inode.StableAttr):
+ case fs.IsPipe(inFileAttr) && !fs.IsPipe(outFileAttr):
if inOffset != 0 {
return 0, nil, syserror.ESPIPE
}
@@ -225,11 +221,12 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if _, err := t.CopyIn(outOffset, &offset); err != nil {
return 0, nil, err
}
+
// Use the destination offset.
opts.DstOffset = true
opts.DstStart = offset
}
- case !fs.IsPipe(inFile.Dirent.Inode.StableAttr) && fs.IsPipe(outFile.Dirent.Inode.StableAttr):
+ case !fs.IsPipe(inFileAttr) && fs.IsPipe(outFileAttr):
if outOffset != 0 {
return 0, nil, syserror.ESPIPE
}
@@ -242,17 +239,18 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if _, err := t.CopyIn(inOffset, &offset); err != nil {
return 0, nil, err
}
+
// Use the source offset.
opts.SrcOffset = true
opts.SrcStart = offset
}
- case fs.IsPipe(inFile.Dirent.Inode.StableAttr) && fs.IsPipe(outFile.Dirent.Inode.StableAttr):
+ case fs.IsPipe(inFileAttr) && fs.IsPipe(outFileAttr):
if inOffset != 0 || outOffset != 0 {
return 0, nil, syserror.ESPIPE
}
// We may not refer to the same pipe; otherwise it's a continuous loop.
- if inFile.Dirent.Inode.StableAttr.InodeID == outFile.Dirent.Inode.StableAttr.InodeID {
+ if inFileAttr.InodeID == outFileAttr.InodeID {
return 0, nil, syserror.EINVAL
}
default:
@@ -262,6 +260,15 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Splice data.
n, err := doSplice(t, outFile, inFile, opts, nonBlock)
+ // Special files can have additional requirements for granularity. For
+ // example, read from eventfd returns EINVAL if a size is less 8 bytes.
+ // Inotify is another example. read will return EINVAL is a buffer is
+ // too small to return the next event, but a size of an event isn't
+ // fixed, it is sizeof(struct inotify_event) + {NAME_LEN} + 1.
+ if n != 0 && err != nil && (fs.IsAnonymous(inFileAttr) || fs.IsAnonymous(outFileAttr)) {
+ err = nil
+ }
+
// See above; inFile is chosen arbitrarily here.
return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "splice", inFile)
}
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index 5556bc276..c841abccb 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -16,12 +16,11 @@ package linux
import (
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// Stat implements linux syscall stat(2).
@@ -125,56 +124,6 @@ func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error {
return copyOutStat(t, statAddr, f.Dirent.Inode.StableAttr, uattr)
}
-// copyOutStat copies the attributes (sattr, uattr) to the struct stat at
-// address dst in t's address space. It encodes the stat struct to bytes
-// manually, as stat() is a very common syscall for many applications, and
-// t.CopyObjectOut has noticeable performance impact due to its many slice
-// allocations and use of reflection.
-func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error {
- b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0]
-
- // Dev (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID))
- // Ino (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID))
- // Nlink (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uattr.Links)
- // Mode (uint32)
- b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode()))
- // UID (uint32)
- b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
- // GID (uint32)
- b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()))
- // Padding (uint32)
- b = binary.AppendUint32(b, usermem.ByteOrder, 0)
- // Rdev (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)))
- // Size (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size))
- // Blksize (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.BlockSize))
- // Blocks (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512))
-
- // ATime
- atime := uattr.AccessTime.Timespec()
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec))
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec))
-
- // MTime
- mtime := uattr.ModificationTime.Timespec()
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec))
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec))
-
- // CTime
- ctime := uattr.StatusChangeTime.Timespec()
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec))
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec))
-
- _, err := t.CopyOutBytes(dst, b)
- return err
-}
-
// Statx implements linux syscall statx(2).
func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
fd := args[0].Int()
diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
new file mode 100644
index 000000000..75a567bd4
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
@@ -0,0 +1,75 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//+build amd64
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// copyOutStat copies the attributes (sattr, uattr) to the struct stat at
+// address dst in t's address space. It encodes the stat struct to bytes
+// manually, as stat() is a very common syscall for many applications, and
+// t.CopyObjectOut has noticeable performance impact due to its many slice
+// allocations and use of reflection.
+func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error {
+ b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0]
+
+ // Dev (uint64)
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID))
+ // Ino (uint64)
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID))
+ // Nlink (uint64)
+ b = binary.AppendUint64(b, usermem.ByteOrder, uattr.Links)
+ // Mode (uint32)
+ b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode()))
+ // UID (uint32)
+ b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
+ // GID (uint32)
+ b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()))
+ // Padding (uint32)
+ b = binary.AppendUint32(b, usermem.ByteOrder, 0)
+ // Rdev (uint64)
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)))
+ // Size (uint64)
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size))
+ // Blksize (uint64)
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.BlockSize))
+ // Blocks (uint64)
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512))
+
+ // ATime
+ atime := uattr.AccessTime.Timespec()
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec))
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec))
+
+ // MTime
+ mtime := uattr.ModificationTime.Timespec()
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec))
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec))
+
+ // CTime
+ ctime := uattr.StatusChangeTime.Timespec()
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec))
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec))
+
+ _, err := t.CopyOutBytes(dst, b)
+ return err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
new file mode 100644
index 000000000..80c98d05c
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
@@ -0,0 +1,77 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//+build arm64
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// copyOutStat copies the attributes (sattr, uattr) to the struct stat at
+// address dst in t's address space. It encodes the stat struct to bytes
+// manually, as stat() is a very common syscall for many applications, and
+// t.CopyObjectOut has noticeable performance impact due to its many slice
+// allocations and use of reflection.
+func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error {
+ b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0]
+
+ // Dev (uint64)
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID))
+ // Ino (uint64)
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID))
+ // Mode (uint32)
+ b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode()))
+ // Nlink (uint32)
+ b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Links))
+ // UID (uint32)
+ b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
+ // GID (uint32)
+ b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()))
+ // Rdev (uint64)
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)))
+ // Padding (uint64)
+ b = binary.AppendUint64(b, usermem.ByteOrder, 0)
+ // Size (uint64)
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size))
+ // Blksize (uint32)
+ b = binary.AppendUint32(b, usermem.ByteOrder, uint32(sattr.BlockSize))
+ // Padding (uint32)
+ b = binary.AppendUint32(b, usermem.ByteOrder, 0)
+ // Blocks (uint64)
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512))
+
+ // ATime
+ atime := uattr.AccessTime.Timespec()
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec))
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec))
+
+ // MTime
+ mtime := uattr.ModificationTime.Timespec()
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec))
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec))
+
+ // CTime
+ ctime := uattr.StatusChangeTime.Timespec()
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec))
+ b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec))
+
+ _, err := t.CopyOutBytes(dst, b)
+ return err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 4115116ff..0c9e2255d 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -24,8 +24,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
"gvisor.dev/gvisor/pkg/sentry/loader"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
const (
@@ -220,19 +220,6 @@ func clone(t *kernel.Task, flags int, stack usermem.Addr, parentTID usermem.Addr
return uintptr(ntid), ctrl, err
}
-// Clone implements linux syscall clone(2).
-// sys_clone has so many flavors. We implement the default one in linux 3.11
-// x86_64:
-// sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, tls_val)
-func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- flags := int(args[0].Int())
- stack := args[1].Pointer()
- parentTID := args[2].Pointer()
- childTID := args[3].Pointer()
- tls := args[4].Pointer()
- return clone(t, flags, stack, parentTID, childTID, tls)
-}
-
// Fork implements Linux syscall fork(2).
func Fork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
// "A call to fork() is equivalent to a call to clone(2) specifying flags
diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go
index b887fa9d7..2d2aa0819 100644
--- a/pkg/sentry/syscalls/linux/sys_time.go
+++ b/pkg/sentry/syscalls/linux/sys_time.go
@@ -22,8 +22,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// The most significant 29 bits hold either a pid or a file descriptor.
diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go
index d4134207b..432351917 100644
--- a/pkg/sentry/syscalls/linux/sys_timer.go
+++ b/pkg/sentry/syscalls/linux/sys_timer.go
@@ -20,8 +20,8 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
const nsecPerSec = int64(time.Second)
diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go
index 748e8dd8d..a393e28c1 100644
--- a/pkg/sentry/syscalls/linux/sys_utsname.go
+++ b/pkg/sentry/syscalls/linux/sys_utsname.go
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// +build amd64
+// +build amd64 arm64
package linux
@@ -35,7 +35,15 @@ func Uname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
copy(u.Nodename[:], uts.HostName())
copy(u.Release[:], version.Release)
copy(u.Version[:], version.Version)
- copy(u.Machine[:], "x86_64") // build tag above.
+ // build tag above.
+ switch t.SyscallTable().Arch {
+ case arch.AMD64:
+ copy(u.Machine[:], "x86_64")
+ case arch.ARM64:
+ copy(u.Machine[:], "aarch64")
+ default:
+ copy(u.Machine[:], "unknown")
+ }
copy(u.Domainname[:], uts.DomainName())
// Copy out the result.
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
index ad4b67806..aba892939 100644
--- a/pkg/sentry/syscalls/linux/sys_write.go
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -23,8 +23,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/socket"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index 97d9a65ea..efb95555c 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -21,16 +21,50 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
-// Getxattr implements linux syscall getxattr(2).
-func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// GetXattr implements linux syscall getxattr(2).
+func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return getXattrFromPath(t, args, true)
+}
+
+// LGetXattr implements linux syscall lgetxattr(2).
+func LGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return getXattrFromPath(t, args, false)
+}
+
+// FGetXattr implements linux syscall fgetxattr(2).
+func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ nameAddr := args[1].Pointer()
+ valueAddr := args[2].Pointer()
+ size := uint64(args[3].SizeT())
+
+ // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+ f := t.GetFile(fd)
+ if f == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer f.DecRef()
+
+ n, value, err := getXattr(t, f.Dirent, nameAddr, size)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ if _, err := t.CopyOutBytes(valueAddr, []byte(value)); err != nil {
+ return 0, nil, err
+ }
+ return uintptr(n), nil, nil
+}
+
+func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
pathAddr := args[0].Pointer()
nameAddr := args[1].Pointer()
valueAddr := args[2].Pointer()
- size := args[3].SizeT()
+ size := uint64(args[3].SizeT())
path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
if err != nil {
@@ -38,21 +72,15 @@ func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
}
valueLen := 0
- err = fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
- value, err := getxattr(t, d, dirPath, nameAddr)
- if err != nil {
- return err
+ err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+ return syserror.ENOTDIR
}
- valueLen = len(value)
- if size == 0 {
- return nil
- }
- if size > linux.XATTR_SIZE_MAX {
- size = linux.XATTR_SIZE_MAX
- }
- if valueLen > int(size) {
- return syserror.ERANGE
+ n, value, err := getXattr(t, d, nameAddr, size)
+ valueLen = n
+ if err != nil {
+ return err
}
_, err = t.CopyOutBytes(valueAddr, []byte(value))
@@ -64,34 +92,79 @@ func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
return uintptr(valueLen), nil, nil
}
-// getxattr implements getxattr from the given *fs.Dirent.
-func getxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr usermem.Addr) (string, error) {
- if dirPath && !fs.IsDir(d.Inode.StableAttr) {
- return "", syserror.ENOTDIR
- }
-
+// getXattr implements getxattr(2) from the given *fs.Dirent.
+func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr, size uint64) (int, string, error) {
if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil {
- return "", err
+ return 0, "", err
}
name, err := copyInXattrName(t, nameAddr)
if err != nil {
- return "", err
+ return 0, "", err
}
+ // TODO(b/148380782): Support xattrs in namespaces other than "user".
if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
- return "", syserror.EOPNOTSUPP
+ return 0, "", syserror.EOPNOTSUPP
+ }
+
+ // If getxattr(2) is called with size 0, the size of the value will be
+ // returned successfully even if it is nonzero. In that case, we need to
+ // retrieve the entire attribute value so we can return the correct size.
+ requestedSize := size
+ if size == 0 || size > linux.XATTR_SIZE_MAX {
+ requestedSize = linux.XATTR_SIZE_MAX
+ }
+
+ value, err := d.Inode.GetXattr(t, name, requestedSize)
+ if err != nil {
+ return 0, "", err
}
+ n := len(value)
+ if uint64(n) > requestedSize {
+ return 0, "", syserror.ERANGE
+ }
+
+ // Don't copy out the attribute value if size is 0.
+ if size == 0 {
+ return n, "", nil
+ }
+ return n, value, nil
+}
+
+// SetXattr implements linux syscall setxattr(2).
+func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return setXattrFromPath(t, args, true)
+}
- return d.Inode.Getxattr(name)
+// LSetXattr implements linux syscall lsetxattr(2).
+func LSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return setXattrFromPath(t, args, false)
}
-// Setxattr implements linux syscall setxattr(2).
-func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// FSetXattr implements linux syscall fsetxattr(2).
+func FSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ nameAddr := args[1].Pointer()
+ valueAddr := args[2].Pointer()
+ size := uint64(args[3].SizeT())
+ flags := args[4].Uint()
+
+ // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+ f := t.GetFile(fd)
+ if f == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer f.DecRef()
+
+ return 0, nil, setXattr(t, f.Dirent, nameAddr, valueAddr, uint64(size), flags)
+}
+
+func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
pathAddr := args[0].Pointer()
nameAddr := args[1].Pointer()
valueAddr := args[2].Pointer()
- size := args[3].SizeT()
+ size := uint64(args[3].SizeT())
flags := args[4].Uint()
path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
@@ -99,19 +172,19 @@ func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
return 0, nil, err
}
- if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
- return 0, nil, syserror.EINVAL
- }
+ return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
- return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
- return setxattr(t, d, dirPath, nameAddr, valueAddr, size, flags)
+ return setXattr(t, d, nameAddr, valueAddr, uint64(size), flags)
})
}
-// setxattr implements setxattr from the given *fs.Dirent.
-func setxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr usermem.Addr, size uint, flags uint32) error {
- if dirPath && !fs.IsDir(d.Inode.StableAttr) {
- return syserror.ENOTDIR
+// setXattr implements setxattr(2) from the given *fs.Dirent.
+func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, size uint64, flags uint32) error {
+ if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+ return syserror.EINVAL
}
if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
@@ -127,7 +200,7 @@ func setxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr us
return syserror.E2BIG
}
buf := make([]byte, size)
- if _, err = t.CopyInBytes(valueAddr, buf); err != nil {
+ if _, err := t.CopyInBytes(valueAddr, buf); err != nil {
return err
}
value := string(buf)
@@ -136,7 +209,7 @@ func setxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr us
return syserror.EOPNOTSUPP
}
- return d.Inode.Setxattr(name, value)
+ return d.Inode.SetXattr(t, d, name, value, flags)
}
func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go
index 4ff8f9234..ddc3ee26e 100644
--- a/pkg/sentry/syscalls/linux/timespec.go
+++ b/pkg/sentry/syscalls/linux/timespec.go
@@ -19,8 +19,8 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// copyTimespecIn copies a Timespec from the untrusted app range to the kernel.
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
new file mode 100644
index 000000000..6b8a00b6e
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -0,0 +1,24 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "vfs2",
+ srcs = [
+ "linux64.go",
+ "linux64_override_amd64.go",
+ "linux64_override_arm64.go",
+ "sys_read.go",
+ ],
+ visibility = ["//:sandbox"],
+ deps = [
+ "//pkg/sentry/arch",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/syscalls",
+ "//pkg/sentry/syscalls/linux",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ "//pkg/usermem",
+ "//pkg/waiter",
+ ],
+)
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64.go b/pkg/sentry/syscalls/linux/vfs2/linux64.go
new file mode 100644
index 000000000..19ee36081
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64.go
@@ -0,0 +1,16 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package vfs2 provides syscall implementations that use VFS2.
+package vfs2
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
new file mode 100644
index 000000000..c134714ee
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/syscalls"
+)
+
+// Override syscall table to add syscalls implementations from this package.
+func Override(table map[uintptr]kernel.Syscall) {
+ table[0] = syscalls.Supported("read", Read)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go
new file mode 100644
index 000000000..6af5c400f
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/syscalls"
+)
+
+// Override syscall table to add syscalls implementations from this package.
+func Override(table map[uintptr]kernel.Syscall) {
+ table[63] = syscalls.Supported("read", Read)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_read.go b/pkg/sentry/syscalls/linux/vfs2/sys_read.go
new file mode 100644
index 000000000..7667524c7
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/sys_read.go
@@ -0,0 +1,95 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+ // EventMaskRead contains events that can be triggered on reads.
+ EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
+)
+
+// Read implements linux syscall read(2). Note that we try to get a buffer that
+// is exactly the size requested because some applications like qemu expect
+// they can do large reads all at once. Bug for bug. Same for other read
+// calls below.
+func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ size := args[2].SizeT()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the size is legitimate.
+ si := int(size)
+ if si < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get the destination of the read.
+ dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := read(t, file, dst, vfs.ReadOptions{})
+ t.IOUsage().AccountReadSyscall(n)
+ return uintptr(n), nil, linux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
+}
+
+func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ n, err := file.Read(t, dst, opts)
+ if err != syserror.ErrWouldBlock {
+ return n, err
+ }
+
+ // Register for notifications.
+ w, ch := waiter.NewChannelEntry(nil)
+ file.EventRegister(&w, EventMaskRead)
+
+ total := n
+ for {
+ // Shorten dst to reflect bytes previously read.
+ dst = dst.DropFirst(int(n))
+
+ // Issue the request and break out if it completes with anything other than
+ // "would block".
+ n, err := file.Read(t, dst, opts)
+ total += n
+ if err != syserror.ErrWouldBlock {
+ break
+ }
+ if err := t.Block(ch); err != nil {
+ break
+ }
+ }
+ file.EventUnregister(&w)
+
+ return total, err
+}