diff options
Diffstat (limited to 'pkg/sentry')
72 files changed, 243 insertions, 284 deletions
diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go index 18942e848..9a827e84f 100644 --- a/pkg/sentry/arch/arch_aarch64.go +++ b/pkg/sentry/arch/arch_aarch64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package arch diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index d6b4d2357..e7cb24102 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package arch diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go index 348f238fd..0d27a1f22 100644 --- a/pkg/sentry/arch/arch_arm64.go +++ b/pkg/sentry/arch/arch_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package arch diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index b2b94c304..6da13f26e 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 || 386 // +build amd64 386 package arch diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index 8d8bb8a8b..96e9a6949 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build amd64 || 386 // +build amd64 386 package arch diff --git a/pkg/sentry/arch/arch_x86_impl.go b/pkg/sentry/arch/arch_x86_impl.go index 5d7b99bd9..bb5ff7f7f 100644 --- a/pkg/sentry/arch/arch_x86_impl.go +++ b/pkg/sentry/arch/arch_x86_impl.go @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build (amd64 || 386) && go1.1 // +build amd64 386 +// +build go1.1 package arch diff --git a/pkg/sentry/arch/fpu/fpu_amd64.go b/pkg/sentry/arch/fpu/fpu_amd64.go index 2e61b4955..e422f67a1 100644 --- a/pkg/sentry/arch/fpu/fpu_amd64.go +++ b/pkg/sentry/arch/fpu/fpu_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 || i386 // +build amd64 i386 package fpu diff --git a/pkg/sentry/arch/fpu/fpu_arm64.go b/pkg/sentry/arch/fpu/fpu_arm64.go index 46634661f..49e641722 100644 --- a/pkg/sentry/arch/fpu/fpu_arm64.go +++ b/pkg/sentry/arch/fpu/fpu_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package fpu diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index 58e28dbba..dbd4336f9 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package arch diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go index 80df90076..ee22ec512 100644 --- a/pkg/sentry/arch/signal_arm64.go +++ b/pkg/sentry/arch/signal_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package arch diff --git a/pkg/sentry/arch/syscalls_amd64.go b/pkg/sentry/arch/syscalls_amd64.go index 3859f41ee..c021ba072 100644 --- a/pkg/sentry/arch/syscalls_amd64.go +++ b/pkg/sentry/arch/syscalls_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package arch diff --git a/pkg/sentry/arch/syscalls_arm64.go b/pkg/sentry/arch/syscalls_arm64.go index 95dfd1e90..7146c9e44 100644 --- a/pkg/sentry/arch/syscalls_arm64.go +++ b/pkg/sentry/arch/syscalls_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package arch diff --git a/pkg/sentry/control/logging.go b/pkg/sentry/control/logging.go index 8a500a515..7613dfcbc 100644 --- a/pkg/sentry/control/logging.go +++ b/pkg/sentry/control/logging.go @@ -50,20 +50,20 @@ type LoggingArgs struct { // enable strace at all. If this flag is false then a completely // pristine copy of the syscall table will be swapped in. This // approach is used to remain consistent with an empty strace - // whitelist meaning trace all system calls. + // allowlist meaning trace all system calls. EnableStrace bool - // Strace is the whitelist of syscalls to trace to log. If this - // and StraceEventWhitelist are empty trace all system calls. - StraceWhitelist []string + // Strace is the allowlist of syscalls to trace to log. If this + // and StraceEventAllowlist are empty trace all system calls. + StraceAllowlist []string // SetEventStrace is a flag used to indicate that event strace // related arguments were passed in. SetEventStrace bool - // StraceEventWhitelist is the whitelist of syscalls to trace + // StraceEventAllowlist is the allowlist of syscalls to trace // to event log. 
- StraceEventWhitelist []string + StraceEventAllowlist []string } // Logging provides functions related to logging. @@ -107,13 +107,13 @@ func (l *Logging) Change(args *LoggingArgs, code *int) error { func (l *Logging) configureStrace(args *LoggingArgs) error { if args.EnableStrace { - // Install the whitelist specified. - if len(args.StraceWhitelist) > 0 { - if err := strace.Enable(args.StraceWhitelist, strace.SinkTypeLog); err != nil { + // Install the allowlist specified. + if len(args.StraceAllowlist) > 0 { + if err := strace.Enable(args.StraceAllowlist, strace.SinkTypeLog); err != nil { return err } } else { - // For convenience, if strace is enabled but whitelist + // For convenience, if strace is enabled but allowlist // is empty, enable everything to log. strace.EnableAll(strace.SinkTypeLog) } @@ -125,8 +125,8 @@ func (l *Logging) configureStrace(args *LoggingArgs) error { } func (l *Logging) configureEventStrace(args *LoggingArgs) error { - if len(args.StraceEventWhitelist) > 0 { - if err := strace.Enable(args.StraceEventWhitelist, strace.SinkTypeEvent); err != nil { + if len(args.StraceEventAllowlist) > 0 { + if err := strace.Enable(args.StraceEventAllowlist, strace.SinkTypeEvent); err != nil { return err } } else { diff --git a/pkg/sentry/fs/host/util_amd64_unsafe.go b/pkg/sentry/fs/host/util_amd64_unsafe.go index 21782f1da..e90629f4e 100644 --- a/pkg/sentry/fs/host/util_amd64_unsafe.go +++ b/pkg/sentry/fs/host/util_amd64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package host diff --git a/pkg/sentry/fs/host/util_arm64_unsafe.go b/pkg/sentry/fs/host/util_arm64_unsafe.go index ed8f5242a..9fbb93726 100644 --- a/pkg/sentry/fs/host/util_arm64_unsafe.go +++ b/pkg/sentry/fs/host/util_arm64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package host diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 14a97b468..05b776c2e 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -39,26 +39,14 @@ import ( // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { // Snapshot current syncable dentries and special file FDs. - fs.renameMu.RLock() fs.syncMu.Lock() ds := make([]*dentry, 0, len(fs.syncableDentries)) for d := range fs.syncableDentries { - // It's safe to use IncRef here even though fs.syncableDentries doesn't - // hold references since we hold fs.renameMu. Note that we can't use - // TryIncRef since cached dentries at zero references should still be - // synced. - d.IncRef() ds = append(ds, d) } - fs.renameMu.RUnlock() sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs)) for sffd := range fs.specialFileFDs { - // As above, fs.specialFileFDs doesn't hold references. However, unlike - // dentries, an FD that has reached zero references can't be - // resurrected, so we can use TryIncRef. - if sffd.vfsfd.TryIncRef() { - sffds = append(sffds, sffd) - } + sffds = append(sffds, sffd) } fs.syncMu.Unlock() @@ -68,9 +56,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync syncable dentries. for _, d := range ds { - err := d.syncCachedFile(ctx, true /* forFilesystemSync */) - d.DecRef(ctx) - if err != nil { + if err := d.syncCachedFile(ctx, true /* forFilesystemSync */); err != nil { ctx.Infof("gofer.filesystem.Sync: dentry.syncCachedFile failed: %v", err) if retErr == nil { retErr = err @@ -81,9 +67,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync special files, which may be writable but do not use dentry shared // handles (so they won't be synced by the above). 
for _, sffd := range sffds { - err := sffd.sync(ctx, true /* forFilesystemSync */) - sffd.vfsfd.DecRef(ctx) - if err != nil { + if err := sffd.sync(ctx, true /* forFilesystemSync */); err != nil { ctx.Infof("gofer.filesystem.Sync: specialFileFD.sync failed: %v", err) if retErr == nil { retErr = err diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index bcf989765..ec8d58cc9 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -582,10 +582,10 @@ func (fs *filesystem) Release(ctx context.Context) { d.dataMu.Unlock() // Close host FDs if they exist. if d.readFD >= 0 { - unix.Close(int(d.readFD)) + _ = unix.Close(int(d.readFD)) } if d.writeFD >= 0 && d.readFD != d.writeFD { - unix.Close(int(d.writeFD)) + _ = unix.Close(int(d.writeFD)) } d.readFD = -1 d.writeFD = -1 @@ -1637,18 +1637,18 @@ func (d *dentry) destroyLocked(ctx context.Context) { d.dataMu.Unlock() // Clunk open fids and close open host FDs. if !d.readFile.isNil() { - d.readFile.close(ctx) + _ = d.readFile.close(ctx) } if !d.writeFile.isNil() && d.readFile != d.writeFile { - d.writeFile.close(ctx) + _ = d.writeFile.close(ctx) } d.readFile = p9file{} d.writeFile = p9file{} if d.readFD >= 0 { - unix.Close(int(d.readFD)) + _ = unix.Close(int(d.readFD)) } if d.writeFD >= 0 && d.readFD != d.writeFD { - unix.Close(int(d.writeFD)) + _ = unix.Close(int(d.writeFD)) } d.readFD = -1 d.writeFD = -1 diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 29afb67d9..4b59c1c3c 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -42,6 +42,11 @@ import ( type specialFileFD struct { fileDescription + // releaseMu synchronizes the closing of fd.handle with fd.sync(). It's safe + // to access fd.handle without locking for operations that require a ref to + // be held by the caller, e.g. vfs.FileDescriptionImpl implementations. 
+ releaseMu sync.RWMutex `state:"nosave"` + // handle is used for file I/O. handle is immutable. handle handle `state:"nosave"` @@ -117,7 +122,10 @@ func (fd *specialFileFD) Release(ctx context.Context) { if fd.haveQueue { fdnotifier.RemoveFD(fd.handle.fd) } + fd.releaseMu.Lock() fd.handle.close(ctx) + fd.releaseMu.Unlock() + fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) fs.syncMu.Lock() delete(fs.specialFileFDs, fd) @@ -373,6 +381,13 @@ func (fd *specialFileFD) Sync(ctx context.Context) error { } func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error { + // Locks to ensure it didn't race with fd.Release(). + fd.releaseMu.RLock() + defer fd.releaseMu.RUnlock() + + if !fd.handle.isOpen() { + return nil + } err := func() error { // If we have a host FD, fsyncing it is likely to be faster than an fsync // RPC. diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 79a54eef3..f2250c025 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -397,8 +397,8 @@ func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth } // Inherit the group and setgid bit as in fs/inode.c:inode_init_owner(). - if parentDir != nil && parentDir.inode.mode&linux.S_ISGID == linux.S_ISGID { - kgid = auth.KGID(parentDir.inode.gid) + if parentDir != nil && atomic.LoadUint32(&parentDir.inode.mode)&linux.S_ISGID == linux.S_ISGID { + kgid = auth.KGID(atomic.LoadUint32(&parentDir.inode.gid)) if mode&linux.S_IFDIR == linux.S_IFDIR { mode |= linux.S_ISGID } diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go index 070914a68..930016a3e 100644 --- a/pkg/sentry/fsimpl/verity/filesystem.go +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -851,11 +851,18 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf return nil, err } + tmpOpts := *opts + + // Open the lowerFD with O_PATH if a symlink is opened for verity. 
+ if tmpOpts.Flags&linux.O_NOFOLLOW != 0 && d.isSymlink() { + tmpOpts.Flags |= linux.O_PATH + } + // Open the file in the underlying file system. lowerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: d.lowerVD, Start: d.lowerVD, - }, opts) + }, &tmpOpts) // The file should exist, as we succeeded in finding its dentry. If it's // missing, it indicates an unexpected modification to the file system. @@ -893,7 +900,6 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf // be called if a verity FD is successfully created. defer merkleReader.DecRef(ctx) - lowerFlags := lowerFD.StatusFlags() lowerFDOpts := lowerFD.Options() var merkleWriter *vfs.FileDescription var parentMerkleWriter *vfs.FileDescription @@ -946,7 +952,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf isDir: d.isDir(), } - if err := fd.vfsfd.Init(fd, lowerFlags, rp.Mount(), &d.vfsd, &lowerFDOpts); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), &d.vfsd, &lowerFDOpts); err != nil { return nil, err } lowerFD.IncRef() diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go index 65465b814..af041bd50 100644 --- a/pkg/sentry/fsimpl/verity/verity_test.go +++ b/pkg/sentry/fsimpl/verity/verity_test.go @@ -899,7 +899,7 @@ func TestUnmodifiedSymlinkFileReadSucceeds(t *testing.T) { t.Fatalf("SymlinkAt: %v", err) } - fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_PATH|linux.O_NOFOLLOW, linux.ModeRegular) + fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_NOFOLLOW, linux.ModeRegular) if err != nil { t.Fatalf("openVerityAt symlink: %v", err) @@ -1034,7 +1034,7 @@ func TestDeletedSymlinkFileReadFails(t *testing.T) { t.Fatalf("SymlinkAt: %v", err) } - fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_PATH|linux.O_NOFOLLOW, linux.ModeRegular) + fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_NOFOLLOW, linux.ModeRegular) 
if err != nil { t.Fatalf("openVerityAt symlink: %v", err) @@ -1136,7 +1136,7 @@ func TestModifiedSymlinkFileReadFails(t *testing.T) { } // Open symlink file to get the fd for ioctl in new step. - fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_PATH|linux.O_NOFOLLOW, linux.ModeRegular) + fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_NOFOLLOW, linux.ModeRegular) if err != nil { t.Fatalf("OpenAt symlink: %v", err) } diff --git a/pkg/sentry/hostfd/hostfd_linux.go b/pkg/sentry/hostfd/hostfd_linux.go index e103e7296..0131da22d 100644 --- a/pkg/sentry/hostfd/hostfd_linux.go +++ b/pkg/sentry/hostfd/hostfd_linux.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package hostfd // MaxReadWriteIov is the maximum permitted size of a struct iovec array in a diff --git a/pkg/sentry/kernel/kernel_opts.go b/pkg/sentry/kernel/kernel_opts.go index 2e66ec587..5ffafb0d1 100644 --- a/pkg/sentry/kernel/kernel_opts.go +++ b/pkg/sentry/kernel/kernel_opts.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package kernel // SpecialOpts contains non-standard options for the kernel. 
diff --git a/pkg/sentry/kernel/msgqueue/BUILD b/pkg/sentry/kernel/msgqueue/BUILD index e4305fead..5ec11e1f6 100644 --- a/pkg/sentry/kernel/msgqueue/BUILD +++ b/pkg/sentry/kernel/msgqueue/BUILD @@ -18,8 +18,8 @@ go_template_instance( go_library( name = "msgqueue", srcs = [ - "msgqueue.go", "message_list.go", + "msgqueue.go", ], visibility = ["//pkg/sentry:internal"], deps = [ diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 21358ec92..079294f81 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -768,14 +768,14 @@ const ( // ptraceClone is called at the end of a clone or fork syscall to check if t // should enter PTRACE_EVENT_CLONE, PTRACE_EVENT_FORK, or PTRACE_EVENT_VFORK // stop. child is the new task. -func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions) bool { +func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, args *linux.CloneArgs) bool { if !t.hasTracer() { return false } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() event := false - if !opts.Untraced { + if args.Flags&linux.CLONE_UNTRACED == 0 { switch kind { case ptraceCloneKindClone: if t.ptraceOpts.TraceClone { @@ -810,7 +810,7 @@ func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions // clone(2)'s documentation of CLONE_UNTRACED and CLONE_PTRACE is // confusingly wrong; see kernel/fork.c:_do_fork() => copy_process() => // include/linux/ptrace.h:ptrace_init_task(). - if event || opts.InheritTracer { + if event || args.Flags&linux.CLONE_PTRACE != 0 { tracer := t.Tracer() if tracer != nil { child.ptraceTracer.Store(tracer) diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go index 5ae05b5c3..63422e155 100644 --- a/pkg/sentry/kernel/ptrace_amd64.go +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build amd64 // +build amd64 package kernel diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go index 46dd84cbc..27514d67b 100644 --- a/pkg/sentry/kernel/ptrace_arm64.go +++ b/pkg/sentry/kernel/ptrace_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kernel diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 7e1347aa6..da4b77ca2 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -26,140 +26,39 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// SharingOptions controls what resources are shared by a new task created by -// Task.Clone, or an existing task affected by Task.Unshare. -type SharingOptions struct { - // If NewAddressSpace is true, the task should have an independent virtual - // address space. - NewAddressSpace bool - - // If NewSignalHandlers is true, the task should use an independent set of - // signal handlers. - NewSignalHandlers bool - - // If NewThreadGroup is true, the task should be the leader of its own - // thread group. TerminationSignal is the signal that the thread group - // will send to its parent when it exits. If NewThreadGroup is false, - // TerminationSignal is ignored. - NewThreadGroup bool - TerminationSignal linux.Signal - - // If NewPIDNamespace is true: - // - // - In the context of Task.Clone, the new task should be the init task - // (TID 1) in a new PID namespace. - // - // - In the context of Task.Unshare, the task should create a new PID - // namespace, and all subsequent clones of the task should be members of - // the new PID namespace. - NewPIDNamespace bool - - // If NewUserNamespace is true, the task should have an independent user - // namespace. - NewUserNamespace bool - - // If NewNetworkNamespace is true, the task should have an independent - // network namespace. 
- NewNetworkNamespace bool - - // If NewFiles is true, the task should use an independent file descriptor - // table. - NewFiles bool - - // If NewFSContext is true, the task should have an independent FSContext. - NewFSContext bool - - // If NewUTSNamespace is true, the task should have an independent UTS - // namespace. - NewUTSNamespace bool - - // If NewIPCNamespace is true, the task should have an independent IPC - // namespace. - NewIPCNamespace bool -} - -// CloneOptions controls the behavior of Task.Clone. -type CloneOptions struct { - // SharingOptions defines the set of resources that the new task will share - // with its parent. - SharingOptions - - // Stack is the initial stack pointer of the new task. If Stack is 0, the - // new task will start with the same stack pointer as its parent. - Stack hostarch.Addr - - // If SetTLS is true, set the new task's TLS (thread-local storage) - // descriptor to TLS. If SetTLS is false, TLS is ignored. - SetTLS bool - TLS hostarch.Addr - - // If ChildClearTID is true, when the child exits, 0 is written to the - // address ChildTID in the child's memory, and if the write is successful a - // futex wake on the same address is performed. - // - // If ChildSetTID is true, the child's thread ID (in the child's PID - // namespace) is written to address ChildTID in the child's memory. (As in - // Linux, failed writes are silently ignored.) - ChildClearTID bool - ChildSetTID bool - ChildTID hostarch.Addr - - // If ParentSetTID is true, the child's thread ID (in the parent's PID - // namespace) is written to address ParentTID in the parent's memory. (As - // in Linux, failed writes are silently ignored.) - // - // Older versions of the clone(2) man page state that CLONE_PARENT_SETTID - // causes the child's thread ID to be written to ptid in both the parent - // and child's memory, but this is a documentation error fixed by - // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID"). 
- ParentSetTID bool - ParentTID hostarch.Addr - - // If Vfork is true, place the parent in vforkStop until the cloned task - // releases its TaskImage. - Vfork bool - - // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for - // this clone(), and do not ptrace-attach the caller's tracer to the new - // task. (PTRACE_EVENT_VFORK_DONE will still be reported if appropriate). - Untraced bool - - // If InheritTracer is true, ptrace-attach the caller's tracer to the new - // task, even if no PTRACE_EVENT_CLONE/FORK/VFORK event would be reported - // for it. If both Untraced and InheritTracer are true, no event will be - // reported, but tracer inheritance will still occur. - InheritTracer bool -} - // Clone implements the clone(2) syscall and returns the thread ID of the new // task in t's PID namespace. Clone may return both a non-zero thread ID and a // non-nil error. // // Preconditions: The caller must be running Task.doSyscallInvoke on the task // goroutine. -func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { +func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) { // Since signal actions may refer to application signal handlers by virtual // address, any set of signal handlers must refer to the same address // space. - if !opts.NewSignalHandlers && opts.NewAddressSpace { + if args.Flags&(linux.CLONE_SIGHAND|linux.CLONE_VM) == linux.CLONE_SIGHAND { return 0, nil, linuxerr.EINVAL } // In order for the behavior of thread-group-directed signals to be sane, // all tasks in a thread group must share signal handlers. - if !opts.NewThreadGroup && opts.NewSignalHandlers { + if args.Flags&(linux.CLONE_THREAD|linux.CLONE_SIGHAND) == linux.CLONE_THREAD { return 0, nil, linuxerr.EINVAL } // All tasks in a thread group must be in the same PID namespace. 
- if !opts.NewThreadGroup && (opts.NewPIDNamespace || t.childPIDNamespace != nil) { + if (args.Flags&linux.CLONE_THREAD != 0) && (args.Flags&linux.CLONE_NEWPID != 0 || t.childPIDNamespace != nil) { return 0, nil, linuxerr.EINVAL } // The two different ways of specifying a new PID namespace are // incompatible. - if opts.NewPIDNamespace && t.childPIDNamespace != nil { + if args.Flags&linux.CLONE_NEWPID != 0 && t.childPIDNamespace != nil { return 0, nil, linuxerr.EINVAL } // Thread groups and FS contexts cannot span user namespaces. - if opts.NewUserNamespace && (!opts.NewThreadGroup || !opts.NewFSContext) { + if args.Flags&linux.CLONE_NEWUSER != 0 && args.Flags&(linux.CLONE_THREAD|linux.CLONE_FS) != 0 { + return 0, nil, linuxerr.EINVAL + } + // args.ExitSignal must be a valid signal. + if args.ExitSignal != 0 && !linux.Signal(args.ExitSignal).IsValid() { return 0, nil, linuxerr.EINVAL } @@ -174,7 +73,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // user_namespaces(7) creds := t.Credentials() userns := creds.UserNamespace - if opts.NewUserNamespace { + if args.Flags&linux.CLONE_NEWUSER != 0 { var err error // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and // the caller is in a chroot environment (i.e., the caller's root @@ -189,21 +88,19 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { return 0, nil, err } } - if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) { + if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) { return 0, nil, linuxerr.EPERM } utsns := t.UTSNamespace() - if opts.NewUTSNamespace { + if args.Flags&linux.CLONE_NEWUTS != 0 { // Note that this must happen after NewUserNamespace so we get // the new userns if there is one. 
utsns = t.UTSNamespace().Clone(userns) } ipcns := t.IPCNamespace() - if opts.NewIPCNamespace { - // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC - // namespace" + if args.Flags&linux.CLONE_NEWIPC != 0 { ipcns = NewIPCNamespace(userns) } else { ipcns.IncRef() @@ -214,7 +111,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { defer cu.Clean() netns := t.NetworkNamespace() - if opts.NewNetworkNamespace { + if args.Flags&linux.CLONE_NEWNET != 0 { netns = inet.NewNamespace(netns) } @@ -227,7 +124,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { }) } - image, err := t.image.Fork(t, t.k, !opts.NewAddressSpace) + image, err := t.image.Fork(t, t.k, args.Flags&linux.CLONE_VM != 0) if err != nil { return 0, nil, err } @@ -236,17 +133,17 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { }) // clone() returns 0 in the child. image.Arch.SetReturn(0) - if opts.Stack != 0 { - image.Arch.SetStack(uintptr(opts.Stack)) + if args.Stack != 0 { + image.Arch.SetStack(uintptr(args.Stack)) } - if opts.SetTLS { - if !image.Arch.SetTLS(uintptr(opts.TLS)) { + if args.Flags&linux.CLONE_SETTLS != 0 { + if !image.Arch.SetTLS(uintptr(args.TLS)) { return 0, nil, linuxerr.EPERM } } var fsContext *FSContext - if opts.NewFSContext { + if args.Flags&linux.CLONE_FS == 0 { fsContext = t.fsContext.Fork() } else { fsContext = t.fsContext @@ -254,7 +151,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } var fdTable *FDTable - if opts.NewFiles { + if args.Flags&linux.CLONE_FILES == 0 { fdTable = t.fdTable.Fork(t) } else { fdTable = t.fdTable @@ -264,22 +161,22 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { pidns := t.tg.pidns if t.childPIDNamespace != nil { pidns = t.childPIDNamespace - } else if opts.NewPIDNamespace { + } else if args.Flags&linux.CLONE_NEWPID != 0 { pidns = pidns.NewChild(userns) } tg := t.tg rseqAddr 
:= hostarch.Addr(0) rseqSignature := uint32(0) - if opts.NewThreadGroup { + if args.Flags&linux.CLONE_THREAD == 0 { if tg.mounts != nil { tg.mounts.IncRef() } sh := t.tg.signalHandlers - if opts.NewSignalHandlers { + if args.Flags&linux.CLONE_SIGHAND == 0 { sh = sh.Fork() } - tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy()) + tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, linux.Signal(args.ExitSignal), tg.limits.GetCopy()) tg.oomScoreAdj = atomic.LoadInt32(&t.tg.oomScoreAdj) rseqAddr = t.rseqAddr rseqSignature = t.rseqSignature @@ -304,7 +201,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { RSeqSignature: rseqSignature, ContainerID: t.ContainerID(), } - if opts.NewThreadGroup { + if args.Flags&linux.CLONE_THREAD == 0 { cfg.Parent = t } else { cfg.InheritParent = t @@ -322,7 +219,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // // However kernel/fork.c:copy_process() adds a limitation to this: // "sigaltstack should be cleared when sharing the same VM". - if opts.NewAddressSpace || opts.Vfork { + if args.Flags&linux.CLONE_VM == 0 || args.Flags&linux.CLONE_VFORK != 0 { nt.SetSignalStack(t.SignalStack()) } @@ -347,35 +244,35 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...) 
nt.syscallFilters.Store(copiedFilters) } - if opts.Vfork { + if args.Flags&linux.CLONE_VFORK != 0 { nt.vforkParent = t } - if opts.ChildClearTID { - nt.SetClearTID(opts.ChildTID) + if args.Flags&linux.CLONE_CHILD_CLEARTID != 0 { + nt.SetClearTID(hostarch.Addr(args.ChildTID)) } - if opts.ChildSetTID { + if args.Flags&linux.CLONE_CHILD_SETTID != 0 { ctid := nt.ThreadID() - ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID) + ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(args.ChildTID)) } ntid := t.tg.pidns.IDOfTask(nt) - if opts.ParentSetTID { - ntid.CopyOut(t, opts.ParentTID) + if args.Flags&linux.CLONE_PARENT_SETTID != 0 { + ntid.CopyOut(t, hostarch.Addr(args.ParentTID)) } kind := ptraceCloneKindClone - if opts.Vfork { + if args.Flags&linux.CLONE_VFORK != 0 { kind = ptraceCloneKindVfork - } else if opts.TerminationSignal == linux.SIGCHLD { + } else if linux.Signal(args.ExitSignal) == linux.SIGCHLD { kind = ptraceCloneKindFork } - if t.ptraceClone(kind, nt, opts) { - if opts.Vfork { + if t.ptraceClone(kind, nt, args) { + if args.Flags&linux.CLONE_VFORK != 0 { return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil } return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil } - if opts.Vfork { + if args.Flags&linux.CLONE_VFORK != 0 { t.maybeBeginVforkStop(nt) return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil } @@ -446,27 +343,35 @@ func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState { } // Unshare changes the set of resources t shares with other tasks, as specified -// by opts. +// by flags. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) Unshare(opts *SharingOptions) error { - // In Linux unshare(2), NewThreadGroup implies NewSignalHandlers and - // NewSignalHandlers implies NewAddressSpace. 
All three flags are no-ops if - // t is the only task using its MM, which due to clone(2)'s rules imply - // that it is also the only task using its signal handlers / in its thread - // group, and cause EINVAL to be returned otherwise. +func (t *Task) Unshare(flags int32) error { + // "CLONE_THREAD, CLONE_SIGHAND, and CLONE_VM can be specified in flags if + // the caller is single threaded (i.e., it is not sharing its address space + // with another process or thread). In this case, these flags have no + // effect. (Note also that specifying CLONE_THREAD automatically implies + // CLONE_VM, and specifying CLONE_VM automatically implies CLONE_SIGHAND.) + // If the process is multithreaded, then the use of these flags results in + // an error." - unshare(2). This is incorrect (cf. + // kernel/fork.c:ksys_unshare()): + // + // - CLONE_THREAD does not imply CLONE_VM. + // + // - CLONE_SIGHAND implies CLONE_THREAD. + // + // - Only CLONE_VM requires that the caller is not sharing its address + // space with another thread. CLONE_SIGHAND requires that the caller is not + // sharing its signal handlers, and CLONE_THREAD requires that the caller + // is the only thread in its thread group. // // Since we don't count the number of tasks using each address space or set - // of signal handlers, we reject NewSignalHandlers and NewAddressSpace - // altogether, and interpret NewThreadGroup as requiring that t be the only - // member of its thread group. This seems to be logically coherent, in the - // sense that clone(2) allows a task to share signal handlers and address - // spaces with tasks in other thread groups. - if opts.NewAddressSpace || opts.NewSignalHandlers { + // of signal handlers, we reject CLONE_VM and CLONE_SIGHAND altogether. 
+ if flags&(linux.CLONE_VM|linux.CLONE_SIGHAND) != 0 { return linuxerr.EINVAL } creds := t.Credentials() - if opts.NewThreadGroup { + if flags&linux.CLONE_THREAD != 0 { t.tg.signalHandlers.mu.Lock() if t.tg.tasksCount != 1 { t.tg.signalHandlers.mu.Unlock() @@ -476,7 +381,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { // This isn't racy because we're the only living task, and therefore // the only task capable of creating new ones, in our thread group. } - if opts.NewUserNamespace { + if flags&linux.CLONE_NEWUSER != 0 { if t.IsChrooted() { return linuxerr.EPERM } @@ -492,7 +397,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { creds = t.Credentials() } haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN) - if opts.NewPIDNamespace { + if flags&linux.CLONE_NEWPID != 0 { if !haveCapSysAdmin { return linuxerr.EPERM } @@ -500,14 +405,14 @@ func (t *Task) Unshare(opts *SharingOptions) error { } t.mu.Lock() // Can't defer unlock: DecRefs must occur without holding t.mu. - if opts.NewNetworkNamespace { + if flags&linux.CLONE_NEWNET != 0 { if !haveCapSysAdmin { t.mu.Unlock() return linuxerr.EPERM } t.netns = inet.NewNamespace(t.netns) } - if opts.NewUTSNamespace { + if flags&linux.CLONE_NEWUTS != 0 { if !haveCapSysAdmin { t.mu.Unlock() return linuxerr.EPERM @@ -516,7 +421,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { // new user namespace is used if there is one. 
t.utsns = t.utsns.Clone(creds.UserNamespace) } - if opts.NewIPCNamespace { + if flags&linux.CLONE_NEWIPC != 0 { if !haveCapSysAdmin { t.mu.Unlock() return linuxerr.EPERM @@ -527,12 +432,12 @@ func (t *Task) Unshare(opts *SharingOptions) error { t.ipcns = NewIPCNamespace(creds.UserNamespace) } var oldFDTable *FDTable - if opts.NewFiles { + if flags&linux.CLONE_FILES != 0 { oldFDTable = t.fdTable t.fdTable = oldFDTable.Fork(t) } var oldFSContext *FSContext - if opts.NewFSContext { + if flags&linux.CLONE_FS != 0 { oldFSContext = t.fsContext t.fsContext = oldFSContext.Fork() } diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index 73ea73742..0567c8d32 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s index 953024600..c2a1dca11 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.s +++ b/pkg/sentry/platform/kvm/bluepill_amd64.s @@ -37,7 +37,15 @@ TEXT ·bluepill(SB),NOSPLIT,$0 begin: MOVQ vcpu+0(FP), AX LEAQ VCPU_CPU(AX), BX + + // The goroutine stack will be changed in guest which renders + // the frame pointer outdated and misleads perf tools. + // Disconnect the frame-chain with the zeroed frame pointer + // when it is saved in the frame in bluepillHandler(). 
+ MOVQ BP, CX + MOVQ $0, BP BYTE CLI; + MOVQ CX, BP check_vcpu: MOVQ ENTRY_CPU_SELF(GS), CX CMPQ BX, CX diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go index 198bafdea..4ba1d6f9c 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go index 9e5c52923..acb0cb05f 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go index f105fdbd0..ee7dba828 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 06fcf1d2e..f63ab6aba 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build go1.12 -// +build !go1.18 +//go:build go1.12 && !go1.18 +// +build go1.12,!go1.18 // Check go:linkname function signatures when updating Go version. 
diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go index b9ed4a706..a5189d9e2 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64.go +++ b/pkg/sentry/platform/kvm/kvm_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm diff --git a/pkg/sentry/platform/kvm/kvm_amd64_test.go b/pkg/sentry/platform/kvm/kvm_amd64_test.go index a002ae00c..c3fbbdc75 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_test.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go index 0c43d72f4..7fdb6ac64 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go index b73340f0e..159808433 100644 --- a/pkg/sentry/platform/kvm/kvm_arm64.go +++ b/pkg/sentry/platform/kvm/kvm_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/kvm_arm64_test.go b/pkg/sentry/platform/kvm/kvm_arm64_test.go index 0e3d84d95..b53e354da 100644 --- a/pkg/sentry/platform/kvm/kvm_arm64_test.go +++ b/pkg/sentry/platform/kvm/kvm_arm64_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go index f07a9f34d..54d579a2b 100644 --- a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index b28a2c4e8..a96634381 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 83bcc7406..de798bb2c 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index edaccf9bc..7937a8481 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index f6aa519b1..1a4a9ce7d 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index 49e1c7136..35660e827 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build go1.12 -// +build !go1.18 +//go:build go1.12 && !go1.18 +// +build go1.12,!go1.18 // Check go:linkname function signatures when updating Go version. diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go index 7c19b6a8f..98c52b2f5 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package testutil diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go index c5235ca9d..6d0ba8252 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package testutil diff --git a/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go index 4f7fe993a..07eda0ef3 100644 --- a/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package ptrace diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index 32a70f4d5..13a55b784 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package ptrace diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go index e4257e3bf..8181db659 100644 --- a/pkg/sentry/platform/ptrace/subprocess_arm64.go +++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package ptrace diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 7e8a31802..129ca52e2 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build linux // +build linux package ptrace diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go index 0835e1d1c..f1e84059d 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux && (amd64 || arm64) // +build linux // +build amd64 arm64 diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go index 38b7b1a5e..ffd4665f4 100644 --- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build go1.12 -// +build !go1.18 +//go:build go1.12 && !go1.18 +// +build go1.12,!go1.18 // Check go:linkname function signatures when updating Go version. diff --git a/pkg/sentry/socket/hostinet/sockopt_impl.go b/pkg/sentry/socket/hostinet/sockopt_impl.go index 8a783712e..2397e04e7 100644 --- a/pkg/sentry/socket/hostinet/sockopt_impl.go +++ b/pkg/sentry/socket/hostinet/sockopt_impl.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build go1.1 +// +build go1.1 + package hostinet import ( diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index ea736e446..9b844b0c0 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -49,6 +49,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" @@ -273,6 +274,7 @@ var Metrics = tcpip.Stats{ Timeouts: mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."), ChecksumErrors: mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."), FailedPortReservations: mustCreateMetric("/netstack/tcp/failed_port_reservations", "Number of time TCP failed to reserve a port."), + SegmentsAckedWithDSACK: mustCreateMetric("/netstack/tcp/segments_acked_with_dsack", "Number of segments for which DSACK was received."), }, UDP: tcpip.UDPStats{ PacketsReceived: mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."), @@ -1682,12 +1684,12 @@ func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int return nil } -func clampBufSize(newSz, min, max int64) int64 { +func clampBufSize(newSz, min, max int64, ignoreMax bool) int64 { // packetOverheadFactor is used to multiply the value provided by the user on // a setsockopt(2) for setting the send/receive buffer sizes sockets. 
const packetOverheadFactor = 2 - if newSz > max { + if !ignoreMax && newSz > max { newSz = max } @@ -1712,7 +1714,7 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam v := hostarch.ByteOrder.Uint32(optVal) min, max := ep.SocketOptions().SendBufferLimits() - clamped := clampBufSize(int64(v), min, max) + clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */) ep.SocketOptions().SetSendBufferSize(clamped, true /* notify */) return nil @@ -1723,7 +1725,22 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam v := hostarch.ByteOrder.Uint32(optVal) min, max := ep.SocketOptions().ReceiveBufferLimits() - clamped := clampBufSize(int64(v), min, max) + clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */) + ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */) + return nil + + case linux.SO_RCVBUFFORCE: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + if creds := auth.CredentialsFromContext(t); !creds.HasCapability(linux.CAP_NET_ADMIN) { + return syserr.ErrNotPermitted + } + + v := hostarch.ByteOrder.Uint32(optVal) + min, max := ep.SocketOptions().ReceiveBufferLimits() + clamped := clampBufSize(int64(v), min, max, true /* ignoreMax */) ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */) return nil diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index f5da3c509..658e90bb9 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -509,7 +509,6 @@ func SetSockOptEmitUnimplementedEvent(t *kernel.Task, name int) { linux.SO_ATTACH_REUSEPORT_EBPF, linux.SO_CNX_ADVICE, linux.SO_DETACH_FILTER, - linux.SO_RCVBUFFORCE, linux.SO_SNDBUFFORCE: t.Kernel().EmitUnimplementedEvent(t) diff --git a/pkg/sentry/state/state_metadata.go b/pkg/sentry/state/state_metadata.go index cefd20b9b..c42297c80 100644 --- a/pkg/sentry/state/state_metadata.go +++ b/pkg/sentry/state/state_metadata.go @@ -12,6 +12,9 @@ // See the 
License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package state import ( diff --git a/pkg/sentry/strace/linux64_amd64.go b/pkg/sentry/strace/linux64_amd64.go index 6ce1bb592..317c3c31c 100644 --- a/pkg/sentry/strace/linux64_amd64.go +++ b/pkg/sentry/strace/linux64_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package strace diff --git a/pkg/sentry/strace/linux64_arm64.go b/pkg/sentry/strace/linux64_arm64.go index ce5594301..65f27c810 100644 --- a/pkg/sentry/strace/linux64_arm64.go +++ b/pkg/sentry/strace/linux64_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package strace diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 3ea9c31dd..757ff2a40 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -819,10 +819,10 @@ func convertToSyscallFlag(sinks SinkType) uint32 { return ret } -// Enable enables the syscalls in whitelist in all syscall tables. +// Enable enables the syscalls in allowlist in all syscall tables. // // Preconditions: Initialize has been called. -func Enable(whitelist []string, sinks SinkType) error { +func Enable(allowlist []string, sinks SinkType) error { flags := convertToSyscallFlag(sinks) for _, table := range kernel.SyscallTables() { // Is this known? @@ -832,7 +832,7 @@ func Enable(whitelist []string, sinks SinkType) error { } // Convert to a set of system calls numbers. 
- wl, err := sys.ConvertToSysnoMap(whitelist) + wl, err := sys.ConvertToSysnoMap(allowlist) if err != nil { return err } diff --git a/pkg/sentry/syscalls/linux/sys_clone_amd64.go b/pkg/sentry/syscalls/linux/sys_clone_amd64.go index dd43cf18d..2b2dbd9f9 100644 --- a/pkg/sentry/syscalls/linux/sys_clone_amd64.go +++ b/pkg/sentry/syscalls/linux/sys_clone_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package linux diff --git a/pkg/sentry/syscalls/linux/sys_clone_arm64.go b/pkg/sentry/syscalls/linux/sys_clone_arm64.go index cf68a8949..877c86e6a 100644 --- a/pkg/sentry/syscalls/linux/sys_clone_arm64.go +++ b/pkg/sentry/syscalls/linux/sys_clone_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package linux diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go index 0a04a6113..e38066ea8 100644 --- a/pkg/sentry/syscalls/linux/sys_stat_amd64.go +++ b/pkg/sentry/syscalls/linux/sys_stat_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package linux diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go index 5a3b1bfad..b2ea390c5 100644 --- a/pkg/sentry/syscalls/linux/sys_stat_arm64.go +++ b/pkg/sentry/syscalls/linux/sys_stat_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package linux diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 46145955e..981cdd985 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -31,11 +31,6 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -const ( - // exitSignalMask is the signal mask to be sent at exit. Same as CSIGNAL in linux. - exitSignalMask = 0xff -) - var ( // ExecMaxTotalSize is the maximum length of all argv and envv entries. // @@ -201,33 +196,16 @@ func ExitGroup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // clone is used by Clone, Fork, and VFork. func clone(t *kernel.Task, flags int, stack hostarch.Addr, parentTID hostarch.Addr, childTID hostarch.Addr, tls hostarch.Addr) (uintptr, *kernel.SyscallControl, error) { - opts := kernel.CloneOptions{ - SharingOptions: kernel.SharingOptions{ - NewAddressSpace: flags&linux.CLONE_VM == 0, - NewSignalHandlers: flags&linux.CLONE_SIGHAND == 0, - NewThreadGroup: flags&linux.CLONE_THREAD == 0, - TerminationSignal: linux.Signal(flags & exitSignalMask), - NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, - NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, - NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, - NewFiles: flags&linux.CLONE_FILES == 0, - NewFSContext: flags&linux.CLONE_FS == 0, - NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, - NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, - }, - Stack: stack, - SetTLS: flags&linux.CLONE_SETTLS == linux.CLONE_SETTLS, - TLS: tls, - ChildClearTID: flags&linux.CLONE_CHILD_CLEARTID == linux.CLONE_CHILD_CLEARTID, - ChildSetTID: flags&linux.CLONE_CHILD_SETTID == linux.CLONE_CHILD_SETTID, - ChildTID: childTID, - ParentSetTID: flags&linux.CLONE_PARENT_SETTID == linux.CLONE_PARENT_SETTID, - ParentTID: parentTID, - Vfork: flags&linux.CLONE_VFORK == linux.CLONE_VFORK, - Untraced: 
flags&linux.CLONE_UNTRACED == linux.CLONE_UNTRACED, - InheritTracer: flags&linux.CLONE_PTRACE == linux.CLONE_PTRACE, - } - ntid, ctrl, err := t.Clone(&opts) + args := linux.CloneArgs{ + Flags: uint64(uint32(flags) &^ linux.CSIGNAL), + Pidfd: uint64(parentTID), + ChildTID: uint64(childTID), + ParentTID: uint64(parentTID), + ExitSignal: uint64(flags & linux.CSIGNAL), + Stack: uint64(stack), + TLS: uint64(tls), + } + ntid, ctrl, err := t.Clone(&args) return uintptr(ntid), ctrl, err } @@ -460,29 +438,16 @@ func SetTidAddress(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel // Unshare implements linux syscall unshare(2). func Unshare(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() - opts := kernel.SharingOptions{ - NewAddressSpace: flags&linux.CLONE_VM == linux.CLONE_VM, - NewSignalHandlers: flags&linux.CLONE_SIGHAND == linux.CLONE_SIGHAND, - NewThreadGroup: flags&linux.CLONE_THREAD == linux.CLONE_THREAD, - NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, - NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, - NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, - NewFiles: flags&linux.CLONE_FILES == linux.CLONE_FILES, - NewFSContext: flags&linux.CLONE_FS == linux.CLONE_FS, - NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, - NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, - } // "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2) - if opts.NewPIDNamespace { - opts.NewThreadGroup = true + if flags&linux.CLONE_NEWPID != 0 { + flags |= linux.CLONE_THREAD } // "... specifying CLONE_NEWUSER automatically implies CLONE_THREAD. Since // Linux 3.9, CLONE_NEWUSER also automatically implies CLONE_FS." 
- if opts.NewUserNamespace { - opts.NewThreadGroup = true - opts.NewFSContext = true + if flags&linux.CLONE_NEWUSER != 0 { + flags |= linux.CLONE_THREAD | linux.CLONE_FS } - return 0, nil, t.Unshare(&opts) + return 0, nil, t.Unshare(flags) } // SchedYield implements linux syscall sched_yield(2). diff --git a/pkg/sentry/syscalls/linux/sys_tls_amd64.go b/pkg/sentry/syscalls/linux/sys_tls_amd64.go index c5899a359..8c6cd7511 100644 --- a/pkg/sentry/syscalls/linux/sys_tls_amd64.go +++ b/pkg/sentry/syscalls/linux/sys_tls_amd64.go @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -//+build amd64 +//go:build amd64 +// +build amd64 package linux diff --git a/pkg/sentry/syscalls/linux/sys_tls_arm64.go b/pkg/sentry/syscalls/linux/sys_tls_arm64.go index fb08a356e..ff4ac4d6d 100644 --- a/pkg/sentry/syscalls/linux/sys_tls_arm64.go +++ b/pkg/sentry/syscalls/linux/sys_tls_arm64.go @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -//+build arm64 +//go:build arm64 +// +build arm64 package linux diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go index 2da538fc6..122921b52 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package vfs2 diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go index 88b9c7627..d32031481 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package vfs2 diff --git a/pkg/sentry/time/sampler_amd64.go b/pkg/sentry/time/sampler_amd64.go index 9f1b4b2fb..5fa1832b4 100644 --- a/pkg/sentry/time/sampler_amd64.go +++ b/pkg/sentry/time/sampler_amd64.go @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -//+build amd64 +//go:build amd64 +// +build amd64 package time diff --git a/pkg/sentry/time/sampler_arm64.go b/pkg/sentry/time/sampler_arm64.go index 4c8d33ae4..3560e66ae 100644 --- a/pkg/sentry/time/sampler_arm64.go +++ b/pkg/sentry/time/sampler_arm64.go @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -//+build arm64 +//go:build arm64 +// +build arm64 package time diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index aeca262e3..eb3c60610 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -427,9 +427,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential if opts.Flags&linux.O_DIRECTORY != 0 { rp.mustBeDir = true } - // Ignore O_PATH for verity, as verity performs extra operations on the fd for verification. - // The underlying filesystem that verity wraps opens the fd with O_PATH. - if opts.Flags&linux.O_PATH != 0 && rp.mount.fs.FilesystemType().Name() != "verity" { + if opts.Flags&linux.O_PATH != 0 { vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{}) if err != nil { return nil, err |