diff options
Diffstat (limited to 'pkg')
-rw-r--r-- | pkg/fdchannel/BUILD | 17 | ||||
-rw-r--r-- | pkg/fdchannel/fdchannel_test.go | 131 | ||||
-rw-r--r-- | pkg/fdchannel/fdchannel_unsafe.go | 146 | ||||
-rw-r--r-- | pkg/log/log.go | 13 | ||||
-rw-r--r-- | pkg/sentry/control/BUILD | 3 | ||||
-rw-r--r-- | pkg/sentry/control/logging.go | 136 | ||||
-rw-r--r-- | pkg/sentry/fs/file.go | 14 | ||||
-rw-r--r-- | pkg/sentry/fs/inode.go | 19 | ||||
-rw-r--r-- | pkg/sentry/fs/inode_overlay.go | 6 | ||||
-rw-r--r-- | pkg/sentry/fs/proc/BUILD | 1 | ||||
-rw-r--r-- | pkg/sentry/fs/proc/net.go | 169 | ||||
-rw-r--r-- | pkg/sentry/fs/splice.go | 2 | ||||
-rw-r--r-- | pkg/sentry/fs/tmpfs/fs.go | 3 | ||||
-rw-r--r-- | pkg/sentry/kernel/auth/BUILD | 12 | ||||
-rw-r--r-- | pkg/sentry/kernel/task.go | 11 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_clone.go | 9 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_identity.go | 220 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_start.go | 2 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/subprocess.go | 9 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/linux64.go | 2 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/sys_file.go | 106 |
21 files changed, 867 insertions, 164 deletions
diff --git a/pkg/fdchannel/BUILD b/pkg/fdchannel/BUILD new file mode 100644 index 000000000..e54e7371c --- /dev/null +++ b/pkg/fdchannel/BUILD @@ -0,0 +1,17 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "fdchannel", + srcs = ["fdchannel_unsafe.go"], + importpath = "gvisor.dev/gvisor/pkg/fdchannel", + visibility = ["//visibility:public"], +) + +go_test( + name = "fdchannel_test", + size = "small", + srcs = ["fdchannel_test.go"], + embed = [":fdchannel"], +) diff --git a/pkg/fdchannel/fdchannel_test.go b/pkg/fdchannel/fdchannel_test.go new file mode 100644 index 000000000..5d01dc636 --- /dev/null +++ b/pkg/fdchannel/fdchannel_test.go @@ -0,0 +1,131 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fdchannel + +import ( + "io/ioutil" + "os" + "sync" + "syscall" + "testing" + "time" +) + +func TestSendRecvFD(t *testing.T) { + sendFile, err := ioutil.TempFile("", "fdchannel_test_") + if err != nil { + t.Fatalf("failed to create temporary file: %v", err) + } + defer sendFile.Close() + + chanFDs, err := NewConnectedSockets() + if err != nil { + t.Fatalf("failed to create fdchannel sockets: %v", err) + } + sendEP := NewEndpoint(chanFDs[0]) + defer sendEP.Destroy() + recvEP := NewEndpoint(chanFDs[1]) + defer recvEP.Destroy() + + recvFD, err := recvEP.RecvFDNonblock() + if err != syscall.EAGAIN && err != syscall.EWOULDBLOCK { + t.Errorf("RecvFDNonblock before SendFD: got (%d, %v), wanted (<unspecified>, EAGAIN or EWOULDBLOCK", recvFD, err) + } + + if err := sendEP.SendFD(int(sendFile.Fd())); err != nil { + t.Fatalf("SendFD failed: %v", err) + } + recvFD, err = recvEP.RecvFD() + if err != nil { + t.Fatalf("RecvFD failed: %v", err) + } + recvFile := os.NewFile(uintptr(recvFD), "received file") + defer recvFile.Close() + + sendInfo, err := sendFile.Stat() + if err != nil { + t.Fatalf("failed to stat sent file: %v", err) + } + sendInfoSys := sendInfo.Sys() + sendStat, ok := sendInfoSys.(*syscall.Stat_t) + if !ok { + t.Fatalf("sent file's FileInfo is backed by unknown type %T", sendInfoSys) + } + + recvInfo, err := recvFile.Stat() + if err != nil { + t.Fatalf("failed to stat received file: %v", err) + } + recvInfoSys := recvInfo.Sys() + recvStat, ok := recvInfoSys.(*syscall.Stat_t) + if !ok { + t.Fatalf("received file's FileInfo is backed by unknown type %T", recvInfoSys) + } + + if sendStat.Dev != recvStat.Dev || sendStat.Ino != recvStat.Ino { + t.Errorf("sent file (dev=%d, ino=%d) does not match received file (dev=%d, ino=%d)", sendStat.Dev, sendStat.Ino, recvStat.Dev, recvStat.Ino) + } +} + +func TestShutdownThenRecvFD(t *testing.T) { + sendFile, err := ioutil.TempFile("", "fdchannel_test_") + if err != nil { + t.Fatalf("failed to create temporary file: %v", err) + } + defer sendFile.Close() + + chanFDs, err := NewConnectedSockets() + if err != nil { + t.Fatalf("failed to create fdchannel sockets: %v", err) + } + sendEP := NewEndpoint(chanFDs[0]) + defer sendEP.Destroy() + recvEP := NewEndpoint(chanFDs[1]) + defer recvEP.Destroy() + + recvEP.Shutdown() + if _, err := recvEP.RecvFD(); err == nil { + t.Error("RecvFD succeeded unexpectedly") + } +} + +func TestRecvFDThenShutdown(t *testing.T) { + sendFile, err := ioutil.TempFile("", "fdchannel_test_") + if err != nil { + t.Fatalf("failed to create temporary file: %v", err) + } + defer sendFile.Close() + + chanFDs, err := NewConnectedSockets() + if err != nil { + t.Fatalf("failed to create fdchannel sockets: %v", err) + } + sendEP := NewEndpoint(chanFDs[0]) + defer sendEP.Destroy() + recvEP := NewEndpoint(chanFDs[1]) + defer recvEP.Destroy() + + var receiverWG sync.WaitGroup + receiverWG.Add(1) + go func() { + defer receiverWG.Done() + if _, err := recvEP.RecvFD(); err == nil { + t.Error("RecvFD succeeded unexpectedly") + } + }() + defer receiverWG.Wait() + time.Sleep(time.Second) // to ensure recvEP.RecvFD() has blocked + recvEP.Shutdown() +} diff --git a/pkg/fdchannel/fdchannel_unsafe.go b/pkg/fdchannel/fdchannel_unsafe.go new file mode 100644 index 000000000..367235be5 --- /dev/null +++ b/pkg/fdchannel/fdchannel_unsafe.go @@ -0,0 +1,146 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris + +// Package fdchannel implements passing file descriptors between processes over +// Unix domain sockets. +package fdchannel + +import ( + "fmt" + "reflect" + "sync/atomic" + "syscall" + "unsafe" +) + +// int32 is the real type of a file descriptor. +const sizeofInt32 = int(unsafe.Sizeof(int32(0))) + +// NewConnectedSockets returns a pair of file descriptors, owned by the caller, +// representing connected sockets that may be passed to separate calls to +// NewEndpoint to create connected Endpoints. +func NewConnectedSockets() ([2]int, error) { + return syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC, 0) +} + +// Endpoint sends file descriptors to, and receives them from, another +// connected Endpoint. +// +// Endpoint is not copyable or movable by value. +type Endpoint struct { + sockfd int32 // accessed using atomic memory operations + msghdr syscall.Msghdr + cmsg *syscall.Cmsghdr // followed by sizeofInt32 bytes of data +} + +// Init must be called on zero-value Endpoints before first use. sockfd must be +// a blocking AF_UNIX SOCK_SEQPACKET socket. +func (ep *Endpoint) Init(sockfd int) { + // "Datagram sockets in various domains (e.g., the UNIX and Internet + // domains) permit zero-length datagrams." - recv(2). Experimentally, + // sendmsg+recvmsg for a zero-length datagram is slightly faster than + // sendmsg+recvmsg for a single byte over a stream socket. + cmsgSlice := make([]byte, syscall.CmsgSpace(sizeofInt32)) + cmsgReflect := (*reflect.SliceHeader)((unsafe.Pointer)(&cmsgSlice)) + ep.sockfd = int32(sockfd) + ep.msghdr.Control = (*byte)((unsafe.Pointer)(cmsgReflect.Data)) + ep.cmsg = (*syscall.Cmsghdr)((unsafe.Pointer)(cmsgReflect.Data)) + // ep.msghdr.Controllen and ep.cmsg.* are mutated by recvmsg(2), so they're + // set before calling sendmsg/recvmsg. +} + +// NewEndpoint is a convenience function that returns an initialized Endpoint +// allocated on the heap. +func NewEndpoint(sockfd int) *Endpoint { + ep := &Endpoint{} + ep.Init(sockfd) + return ep +} + +// Destroy releases resources owned by ep. No other Endpoint methods may be +// called after Destroy. +func (ep *Endpoint) Destroy() { + // These need not use sync/atomic since there must not be any concurrent + // calls to Endpoint methods. + if ep.sockfd >= 0 { + syscall.Close(int(ep.sockfd)) + ep.sockfd = -1 + } +} + +// Shutdown causes concurrent and future calls to ep.SendFD(), ep.RecvFD(), and +// ep.RecvFDNonblock(), as well as the same calls in the connected Endpoint, to +// unblock and return errors. It does not wait for concurrent calls to return. +// +// Shutdown is the only Endpoint method that may be called concurrently with +// other methods. +func (ep *Endpoint) Shutdown() { + if sockfd := int(atomic.SwapInt32(&ep.sockfd, -1)); sockfd >= 0 { + syscall.Shutdown(sockfd, syscall.SHUT_RDWR) + syscall.Close(sockfd) + } +} + +// SendFD sends the open file description represented by the given file +// descriptor to the connected Endpoint. +func (ep *Endpoint) SendFD(fd int) error { + cmsgLen := syscall.CmsgLen(sizeofInt32) + ep.cmsg.Level = syscall.SOL_SOCKET + ep.cmsg.Type = syscall.SCM_RIGHTS + ep.cmsg.SetLen(cmsgLen) + *ep.cmsgData() = int32(fd) + ep.msghdr.SetControllen(cmsgLen) + _, _, e := syscall.Syscall(syscall.SYS_SENDMSG, uintptr(atomic.LoadInt32(&ep.sockfd)), uintptr((unsafe.Pointer)(&ep.msghdr)), 0) + if e != 0 { + return e + } + return nil +} + +// RecvFD receives an open file description from the connected Endpoint and +// returns a file descriptor representing it, owned by the caller. +func (ep *Endpoint) RecvFD() (int, error) { + return ep.recvFD(0) +} + +// RecvFDNonblock receives an open file description from the connected Endpoint +// and returns a file descriptor representing it, owned by the caller. If there +// are no pending receivable open file descriptions, RecvFDNonblock returns +// (<unspecified>, EAGAIN or EWOULDBLOCK). +func (ep *Endpoint) RecvFDNonblock() (int, error) { + return ep.recvFD(syscall.MSG_DONTWAIT) +} + +func (ep *Endpoint) recvFD(flags uintptr) (int, error) { + cmsgLen := syscall.CmsgLen(sizeofInt32) + ep.msghdr.SetControllen(cmsgLen) + _, _, e := syscall.Syscall(syscall.SYS_RECVMSG, uintptr(atomic.LoadInt32(&ep.sockfd)), uintptr((unsafe.Pointer)(&ep.msghdr)), flags|syscall.MSG_TRUNC) + if e != 0 { + return -1, e + } + if int(ep.msghdr.Controllen) != cmsgLen { + return -1, fmt.Errorf("received control message has incorrect length: got %d, wanted %d", ep.msghdr.Controllen, cmsgLen) + } + if ep.cmsg.Level != syscall.SOL_SOCKET || ep.cmsg.Type != syscall.SCM_RIGHTS { + return -1, fmt.Errorf("received control message has incorrect (level, type): got (%v, %v), wanted (%v, %v)", ep.cmsg.Level, ep.cmsg.Type, syscall.SOL_SOCKET, syscall.SCM_RIGHTS) + } + return int(*ep.cmsgData()), nil +} + +func (ep *Endpoint) cmsgData() *int32 { + // syscall.CmsgLen(0) == syscall.cmsgAlignOf(syscall.SizeofCmsghdr) + return (*int32)((unsafe.Pointer)(uintptr((unsafe.Pointer)(ep.cmsg)) + uintptr(syscall.CmsgLen(0)))) +} diff --git a/pkg/log/log.go b/pkg/log/log.go index 0765a1963..ab9ad01ef 100644 --- a/pkg/log/log.go +++ b/pkg/log/log.go @@ -50,6 +50,19 @@ const ( Debug ) +func (l Level) String() string { + switch l { + case Warning: + return "Warning" + case Info: + return "Info" + case Debug: + return "Debug" + default: + return fmt.Sprintf("Invalid level: %d", l) + } +} + // Emitter is the final destination for logs. type Emitter interface { // Emit emits the given log statement. This allows for control over the diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index 15a1fe8a9..5dccb8e3c 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -6,6 +6,7 @@ go_library( name = "control", srcs = [ "control.go", + "logging.go", "pprof.go", "proc.go", "state.go", @@ -26,8 +27,10 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/state", + "//pkg/sentry/strace", "//pkg/sentry/usage", "//pkg/sentry/watchdog", + "//pkg/tcpip/link/sniffer", "//pkg/urpc", ], ) diff --git a/pkg/sentry/control/logging.go b/pkg/sentry/control/logging.go new file mode 100644 index 000000000..811f24324 --- /dev/null +++ b/pkg/sentry/control/logging.go @@ -0,0 +1,136 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package control + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/strace" + "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" +) + +// LoggingArgs are the arguments to use for changing the logging +// level and strace list. +type LoggingArgs struct { + // SetLevel is a flag used to indicate that we should update + // the logging level. We should be able to change the strace + // list without affecting the logging level and vice versa. + SetLevel bool + + // Level is the log level that will be set if SetLevel is true. + Level log.Level + + // SetLogPackets indicates that we should update the log packets flag. + SetLogPackets bool + + // LogPackets is the actual value to set for LogPackets. + // SetLogPackets must be enabled to indicate that we're changing + // the value. + LogPackets bool + + // SetStrace is a flag used to indicate that strace related + // arguments were passed in. + SetStrace bool + + // EnableStrace is a flag from the CLI that specifies whether to + // enable strace at all. If this flag is false then a completely + // pristine copy of the syscall table will be swapped in. This + // approach is used to remain consistent with an empty strace + // whitelist meaning trace all system calls. + EnableStrace bool + + // Strace is the whitelist of syscalls to trace to log. If this + // and StraceEventWhitelist are empty trace all system calls. + StraceWhitelist []string + + // SetEventStrace is a flag used to indicate that event strace + // related arguments were passed in. + SetEventStrace bool + + // StraceEventWhitelist is the whitelist of syscalls to trace + // to event log. + StraceEventWhitelist []string +} + +// Logging provides functions related to logging. +type Logging struct{} + +// Change will change the log level and strace arguments. Although +// this functions signature requires an error it never acctually +// return san error. It's required by the URPC interface. +// Additionally, it may look odd that this is the only method +// attached to an empty struct but this is also part of how +// URPC dispatches. +func (l *Logging) Change(args *LoggingArgs, code *int) error { + if args.SetLevel { + // Logging uses an atomic for the level so this is thread safe. + log.SetLevel(args.Level) + } + + if args.SetLogPackets { + if args.LogPackets { + atomic.StoreUint32(&sniffer.LogPackets, 1) + } else { + atomic.StoreUint32(&sniffer.LogPackets, 0) + } + log.Infof("LogPackets set to: %v", atomic.LoadUint32(&sniffer.LogPackets)) + } + + if args.SetStrace { + if err := l.configureStrace(args); err != nil { + return fmt.Errorf("error configuring strace: %v", err) + } + } + + if args.SetEventStrace { + if err := l.configureEventStrace(args); err != nil { + return fmt.Errorf("error configuring event strace: %v", err) + } + } + + return nil +} + +func (l *Logging) configureStrace(args *LoggingArgs) error { + if args.EnableStrace { + // Install the whitelist specified. + if len(args.StraceWhitelist) > 0 { + if err := strace.Enable(args.StraceWhitelist, strace.SinkTypeLog); err != nil { + return err + } + } else { + // For convenience, if strace is enabled but whitelist + // is empty, enable everything to log. + strace.EnableAll(strace.SinkTypeLog) + } + } else { + // Uninstall all strace functions. + strace.Disable(strace.SinkTypeLog) + } + return nil +} + +func (l *Logging) configureEventStrace(args *LoggingArgs) error { + if len(args.StraceEventWhitelist) > 0 { + if err := strace.Enable(args.StraceEventWhitelist, strace.SinkTypeEvent); err != nil { + return err + } + } else { + strace.Disable(strace.SinkTypeEvent) + } + return nil +} diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 55ffe6c0c..8e1f5674d 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -310,9 +310,11 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error return 0, syserror.ErrInterrupted } + unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append) // Handle append mode. if f.Flags().Append { if err := f.offsetForAppend(ctx, &f.offset); err != nil { + unlockAppendMu() f.mu.Unlock() return 0, err } @@ -322,6 +324,7 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error limit, ok := f.checkLimit(ctx, f.offset) switch { case ok && limit == 0: + unlockAppendMu() f.mu.Unlock() return 0, syserror.ErrExceedsFileSizeLimit case ok: @@ -333,6 +336,7 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error if n >= 0 && !f.flags.NonSeekable { atomic.StoreInt64(&f.offset, f.offset+n) } + unlockAppendMu() f.mu.Unlock() return n, err } @@ -348,13 +352,11 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64 // However, on Linux, if a file is opened with O_APPEND, pwrite() // appends data to the end of the file, regardless of the value of // offset." + unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append) + defer unlockAppendMu() + if f.Flags().Append { - if !f.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted - } - defer f.mu.Unlock() if err := f.offsetForAppend(ctx, &offset); err != nil { - f.mu.Unlock() return 0, err } } @@ -373,7 +375,7 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64 // offsetForAppend sets the given offset to the end of the file. // -// Precondition: the underlying file mutex should be held. +// Precondition: the file.Dirent.Inode.appendMu mutex should be held for writing. func (f *File) offsetForAppend(ctx context.Context, offset *int64) error { uattr, err := f.Dirent.Inode.UnstableAttr(ctx) if err != nil { diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index a889586aa..e4aae1135 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -15,6 +15,8 @@ package fs import ( + "sync" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" @@ -55,6 +57,12 @@ type Inode struct { // overlay is the overlay entry for this Inode. overlay *overlayEntry + + // appendMu is used to synchronize write operations into files which + // have been opened with O_APPEND. Operations which change a file size + // have to take this lock for read. Write operations to files with + // O_APPEND have to take this lock for write. + appendMu sync.RWMutex `state:"nosave"` } // LockCtx is an Inode's lock context and contains different personalities of locks; both @@ -337,6 +345,8 @@ func (i *Inode) Truncate(ctx context.Context, d *Dirent, size int64) error { if i.overlay != nil { return overlayTruncate(ctx, i.overlay, d, size) } + i.appendMu.RLock() + defer i.appendMu.RUnlock() return i.InodeOperations.Truncate(ctx, i, size) } @@ -438,3 +448,12 @@ func (i *Inode) CheckCapability(ctx context.Context, cp linux.Capability) bool { } return creds.HasCapability(cp) } + +func (i *Inode) lockAppendMu(appendMode bool) func() { + if appendMode { + i.appendMu.Lock() + return i.appendMu.Unlock + } + i.appendMu.RLock() + return i.appendMu.RUnlock +} diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 57b8b14e3..920d86042 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -537,12 +537,6 @@ func overlayCheck(ctx context.Context, o *overlayEntry, p PermMask) error { if o.upper != nil { err = o.upper.check(ctx, p) } else { - if p.Write { - // Since writes will be redirected to the upper filesystem, the lower - // filesystem need not be writable, but must be readable for copy-up. - p.Write = false - p.Read = true - } err = o.lower.check(ctx, p) } o.copyMu.RUnlock() diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index b70c583f3..da41a10ab 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -31,6 +31,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/binary", "//pkg/log", "//pkg/sentry/context", "//pkg/sentry/fs", diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index caa1a5c4d..37694620c 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -20,6 +20,7 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -55,9 +56,8 @@ func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSo "psched": newStaticProcInode(ctx, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))), "ptype": newStaticProcInode(ctx, msrc, []byte("Type Device Function")), "route": newStaticProcInode(ctx, msrc, []byte("Iface Destination Gateway Flags RefCnt Use Metric Mask MTU Window IRTT")), - "tcp": newStaticProcInode(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode")), - - "udp": newStaticProcInode(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops")), + "tcp": seqfile.NewSeqFileInode(ctx, &netTCP{k: k}, msrc), + "udp": newStaticProcInode(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops")), "unix": seqfile.NewSeqFileInode(ctx, &netUnix{k: k}, msrc), } @@ -210,10 +210,6 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s } var buf bytes.Buffer - // Header - fmt.Fprintf(&buf, "Num RefCount Protocol Flags Type St Inode Path\n") - - // Entries for _, se := range n.k.ListSockets() { s := se.Sock.Get() if s == nil { @@ -222,6 +218,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s } sfile := s.(*fs.File) if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX { + s.DecRef() // Not a unix socket. continue } @@ -281,12 +278,160 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s } fmt.Fprintf(&buf, "\n") - sfile.DecRef() + s.DecRef() + } + + data := []seqfile.SeqData{ + { + Buf: []byte("Num RefCount Protocol Flags Type St Inode Path\n"), + Handle: n, + }, + { + Buf: buf.Bytes(), + Handle: n, + }, + } + return data, 0 +} + +// netTCP implements seqfile.SeqSource for /proc/net/tcp. +// +// +stateify savable +type netTCP struct { + k *kernel.Kernel +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (*netTCP) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. +func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + t := kernel.TaskFromContext(ctx) + + if h != nil { + return nil, 0 + } + + var buf bytes.Buffer + for _, se := range n.k.ListSockets() { + s := se.Sock.Get() + if s == nil { + log.Debugf("Couldn't resolve weakref %+v in socket table, racing with destruction?", se.Sock) + continue + } + sfile := s.(*fs.File) + sops, ok := sfile.FileOperations.(socket.Socket) + if !ok { + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) + } + if family, stype, _ := sops.Type(); !(family == linux.AF_INET && stype == linux.SOCK_STREAM) { + s.DecRef() + // Not tcp4 sockets. + continue + } + + // Linux's documentation for the fields below can be found at + // https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt. + // For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock(). + // Note that the header doesn't contain labels for all the fields. + + // Field: sl; entry number. + fmt.Fprintf(&buf, "%4d: ", se.ID) + + portBuf := make([]byte, 2) + + // Field: local_adddress. + var localAddr linux.SockAddrInet + if local, _, err := sops.GetSockName(t); err == nil { + localAddr = local.(linux.SockAddrInet) + } + binary.LittleEndian.PutUint16(portBuf, localAddr.Port) + fmt.Fprintf(&buf, "%08X:%04X ", + binary.LittleEndian.Uint32(localAddr.Addr[:]), + portBuf) + + // Field: rem_address. + var remoteAddr linux.SockAddrInet + if remote, _, err := sops.GetPeerName(t); err == nil { + remoteAddr = remote.(linux.SockAddrInet) + } + binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port) + fmt.Fprintf(&buf, "%08X:%04X ", + binary.LittleEndian.Uint32(remoteAddr.Addr[:]), + portBuf) + + // Field: state; socket state. + fmt.Fprintf(&buf, "%02X ", sops.State()) + + // Field: tx_queue, rx_queue; number of packets in the transmit and + // receive queue. Unimplemented. + fmt.Fprintf(&buf, "%08X:%08X ", 0, 0) + + // Field: tr, tm->when; timer active state and number of jiffies + // until timer expires. Unimplemented. + fmt.Fprintf(&buf, "%02X:%08X ", 0, 0) + + // Field: retrnsmt; number of unrecovered RTO timeouts. + // Unimplemented. + fmt.Fprintf(&buf, "%08X ", 0) + + // Field: uid. + uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) + fmt.Fprintf(&buf, "%5d ", 0) + } else { + fmt.Fprintf(&buf, "%5d ", uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow())) + } + + // Field: timeout; number of unanswered 0-window probes. + // Unimplemented. + fmt.Fprintf(&buf, "%8d ", 0) + + // Field: inode. + fmt.Fprintf(&buf, "%8d ", sfile.InodeID()) + + // Field: refcount. Don't count the ref we obtain while deferencing + // the weakref to this socket. + fmt.Fprintf(&buf, "%d ", sfile.ReadRefs()-1) + + // Field: Socket struct address. Redacted due to the same reason as + // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. + fmt.Fprintf(&buf, "%#016p ", (*socket.Socket)(nil)) + + // Field: retransmit timeout. Unimplemented. + fmt.Fprintf(&buf, "%d ", 0) + + // Field: predicted tick of soft clock (delayed ACK control data). + // Unimplemented. + fmt.Fprintf(&buf, "%d ", 0) + + // Field: (ack.quick<<1)|ack.pingpong, Unimplemented. + fmt.Fprintf(&buf, "%d ", 0) + + // Field: sending congestion window, Unimplemented. + fmt.Fprintf(&buf, "%d ", 0) + + // Field: Slow start size threshold, -1 if threshold >= 0xFFFF. + // Unimplemented, report as large threshold. + fmt.Fprintf(&buf, "%d", -1) + + fmt.Fprintf(&buf, "\n") + + s.DecRef() } - data := []seqfile.SeqData{{ - Buf: buf.Bytes(), - Handle: (*netUnix)(nil), - }} + data := []seqfile.SeqData{ + { + Buf: []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n"), + Handle: n, + }, + { + Buf: buf.Bytes(), + Handle: n, + }, + } return data, 0 } diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go index 978dc679b..eed1c2854 100644 --- a/pkg/sentry/fs/splice.go +++ b/pkg/sentry/fs/splice.go @@ -88,6 +88,8 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64, // Check append-only mode and the limit. if !dstPipe { + unlock := dst.Dirent.Inode.lockAppendMu(dst.Flags().Append) + defer unlock() if dst.Flags().Append { if opts.DstOffset { // We need to acquire the lock. diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index a5fcdf969..881dd89b0 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -133,6 +133,9 @@ func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSou } // Construct a mount which will follow the cache options provided. + // + // TODO(gvisor.dev/issue/179): There should be no reason to disable + // caching once bind mounts are properly supported. var msrc *fs.MountSource switch options[cacheKey] { case "", cacheAll: diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 37cb8c8b9..42779baa9 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -4,6 +4,17 @@ load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library") go_template_instance( + name = "atomicptr_credentials", + out = "atomicptr_credentials.go", + package = "auth", + suffix = "Credentials", + template = "//third_party/gvsync:generic_atomicptr", + types = { + "Value": "Credentials", + }, +) + +go_template_instance( name = "id_map_range", out = "id_map_range.go", package = "auth", @@ -34,6 +45,7 @@ go_template_instance( go_library( name = "auth", srcs = [ + "atomicptr_credentials.go", "auth.go", "capability_set.go", "context.go", diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index c297c5973..2e3a39d3b 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -386,10 +386,11 @@ type Task struct { // creds is the task's credentials. // - // creds is protected by mu, however the value itself is immutable and can - // only be changed by a copy. After reading the pointer, access will - // proceed outside the scope of mu. creds is owned by the task goroutine. - creds *auth.Credentials + // creds.Load() may be called without synchronization. creds.Store() is + // serialized by mu. creds is owned by the task goroutine. All + // auth.Credentials objects that creds may point to, or have pointed to + // in the past, must be treated as immutable. + creds auth.AtomicPtrCredentials // utsns is the task's UTS namespace. // @@ -597,7 +598,7 @@ func (t *Task) Value(key interface{}) interface{} { case CtxTask: return t case auth.CtxCredentials: - return t.creds + return t.Credentials() case context.CtxThreadGroupID: return int32(t.ThreadGroup().ID()) case fs.CtxRoot: diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 0e621f0d1..b5cc3860d 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -425,6 +425,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { if opts.NewAddressSpace || opts.NewSignalHandlers { return syserror.EINVAL } + creds := t.Credentials() if opts.NewThreadGroup { t.tg.signalHandlers.mu.Lock() if t.tg.tasksCount != 1 { @@ -439,8 +440,6 @@ func (t *Task) Unshare(opts *SharingOptions) error { if t.IsChrooted() { return syserror.EPERM } - // This temporary is needed because Go. - creds := t.Credentials() newUserNS, err := creds.NewChildUserNamespace() if err != nil { return err @@ -449,6 +448,8 @@ func (t *Task) Unshare(opts *SharingOptions) error { if err != nil { return err } + // Need to reload creds, becaue t.SetUserNamespace() changed task credentials. + creds = t.Credentials() } haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN) if opts.NewPIDNamespace { @@ -473,7 +474,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { } // Note that this must happen after NewUserNamespace, so the // new user namespace is used if there is one. - t.utsns = t.utsns.Clone(t.creds.UserNamespace) + t.utsns = t.utsns.Clone(creds.UserNamespace) } if opts.NewIPCNamespace { if !haveCapSysAdmin { @@ -482,7 +483,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { } // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" - t.ipcns = NewIPCNamespace(t.creds.UserNamespace) + t.ipcns = NewIPCNamespace(creds.UserNamespace) } var oldfds *FDMap if opts.NewFiles { diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index 39c138925..78ff14b20 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -25,30 +25,22 @@ import ( // // This value must be considered immutable. func (t *Task) Credentials() *auth.Credentials { - t.mu.Lock() - defer t.mu.Unlock() - return t.creds + return t.creds.Load() } // UserNamespace returns the user namespace associated with the task. func (t *Task) UserNamespace() *auth.UserNamespace { - t.mu.Lock() - defer t.mu.Unlock() - return t.creds.UserNamespace + return t.Credentials().UserNamespace } // HasCapabilityIn checks if the task has capability cp in user namespace ns. func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool { - t.mu.Lock() - defer t.mu.Unlock() - return t.creds.HasCapabilityIn(cp, ns) + return t.Credentials().HasCapabilityIn(cp, ns) } // HasCapability checks if the task has capability cp in its user namespace. func (t *Task) HasCapability(cp linux.Capability) bool { - t.mu.Lock() - defer t.mu.Unlock() - return t.creds.HasCapability(cp) + return t.Credentials().HasCapability(cp) } // SetUID implements the semantics of setuid(2). @@ -57,9 +49,12 @@ func (t *Task) SetUID(uid auth.UID) error { if !uid.Ok() { return syserror.EINVAL } + t.mu.Lock() defer t.mu.Unlock() - kuid := t.creds.UserNamespace.MapToKUID(uid) + + creds := t.Credentials() + kuid := creds.UserNamespace.MapToKUID(uid) if !kuid.Ok() { return syserror.EINVAL } @@ -67,17 +62,17 @@ func (t *Task) SetUID(uid auth.UID) error { // effective UID of the caller is root (more precisely: if the caller has // the CAP_SETUID capability), the real UID and saved set-user-ID are also // set." - setuid(2) - if t.creds.HasCapability(linux.CAP_SETUID) { + if creds.HasCapability(linux.CAP_SETUID) { t.setKUIDsUncheckedLocked(kuid, kuid, kuid) return nil } // "EPERM: The user is not privileged (Linux: does not have the CAP_SETUID // capability) and uid does not match the real UID or saved set-user-ID of // the calling process." - if kuid != t.creds.RealKUID && kuid != t.creds.SavedKUID { + if kuid != creds.RealKUID && kuid != creds.SavedKUID { return syserror.EPERM } - t.setKUIDsUncheckedLocked(t.creds.RealKUID, kuid, t.creds.SavedKUID) + t.setKUIDsUncheckedLocked(creds.RealKUID, kuid, creds.SavedKUID) return nil } @@ -87,37 +82,38 @@ func (t *Task) SetREUID(r, e auth.UID) error { defer t.mu.Unlock() // "Supplying a value of -1 for either the real or effective user ID forces // the system to leave that ID unchanged." - setreuid(2) - newR := t.creds.RealKUID + creds := t.Credentials() + newR := creds.RealKUID if r.Ok() { - newR = t.creds.UserNamespace.MapToKUID(r) + newR = creds.UserNamespace.MapToKUID(r) if !newR.Ok() { return syserror.EINVAL } } - newE := t.creds.EffectiveKUID + newE := creds.EffectiveKUID if e.Ok() { - newE = t.creds.UserNamespace.MapToKUID(e) + newE = creds.UserNamespace.MapToKUID(e) if !newE.Ok() { return syserror.EINVAL } } - if !t.creds.HasCapability(linux.CAP_SETUID) { + if !creds.HasCapability(linux.CAP_SETUID) { // "Unprivileged processes may only set the effective user ID to the // real user ID, the effective user ID, or the saved set-user-ID." - if newE != t.creds.RealKUID && newE != t.creds.EffectiveKUID && newE != t.creds.SavedKUID { + if newE != creds.RealKUID && newE != creds.EffectiveKUID && newE != creds.SavedKUID { return syserror.EPERM } // "Unprivileged users may only set the real user ID to the real user // ID or the effective user ID." - if newR != t.creds.RealKUID && newR != t.creds.EffectiveKUID { + if newR != creds.RealKUID && newR != creds.EffectiveKUID { return syserror.EPERM } } // "If the real user ID is set (i.e., ruid is not -1) or the effective user // ID is set to a value not equal to the previous real user ID, the saved // set-user-ID will be set to the new effective user ID." - newS := t.creds.SavedKUID - if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKUID) { + newS := creds.SavedKUID + if r.Ok() || (e.Ok() && newE != creds.EffectiveKUID) { newS = newE } t.setKUIDsUncheckedLocked(newR, newE, newS) @@ -136,23 +132,24 @@ func (t *Task) SetRESUID(r, e, s auth.UID) error { // arguments equals -1, the corresponding value is not changed." - // setresuid(2) var err error - newR := t.creds.RealKUID + creds := t.Credentials() + newR := creds.RealKUID if r.Ok() { - newR, err = t.creds.UseUID(r) + newR, err = creds.UseUID(r) if err != nil { return err } } - newE := t.creds.EffectiveKUID + newE := creds.EffectiveKUID if e.Ok() { - newE, err = t.creds.UseUID(e) + newE, err = creds.UseUID(e) if err != nil { return err } } - newS := t.creds.SavedKUID + newS := creds.SavedKUID if s.Ok() { - newS, err = t.creds.UseUID(s) + newS, err = creds.UseUID(s) if err != nil { return err } @@ -163,10 +160,10 @@ func (t *Task) SetRESUID(r, e, s auth.UID) error { // Preconditions: t.mu must be locked. func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { - root := t.creds.UserNamespace.MapToKUID(auth.RootUID) - oldR, oldE, oldS := t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID - t.creds = t.creds.Fork() // See doc for creds. - t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID = newR, newE, newS + creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. + root := creds.UserNamespace.MapToKUID(auth.RootUID) + oldR, oldE, oldS := creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID + creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID = newR, newE, newS // "1. If one or more of the real, effective or saved set user IDs was // previously 0, and as a result of the UID changes all of these IDs have a @@ -184,9 +181,9 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { // being cleared." (A thread's effective capability set is always // cleared when such a credential change is made, // regardless of the setting of the "keep capabilities" flag.) - if !t.creds.KeepCaps { - t.creds.PermittedCaps = 0 - t.creds.EffectiveCaps = 0 + if !creds.KeepCaps { + creds.PermittedCaps = 0 + creds.EffectiveCaps = 0 } } // """ @@ -197,9 +194,9 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { // permitted set is copied to the effective set. // """ if oldE == root && newE != root { - t.creds.EffectiveCaps = 0 + creds.EffectiveCaps = 0 } else if oldE != root && newE == root { - t.creds.EffectiveCaps = t.creds.PermittedCaps + creds.EffectiveCaps = creds.PermittedCaps } // "4. If the filesystem user ID is changed from 0 to nonzero (see // setfsuid(2)), then the following capabilities are cleared from the @@ -220,6 +217,7 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { // Not documented, but compare Linux's kernel/cred.c:commit_creds(). t.parentDeathSignal = 0 } + t.creds.Store(creds) } // SetGID implements the semantics of setgid(2). @@ -227,20 +225,23 @@ func (t *Task) SetGID(gid auth.GID) error { if !gid.Ok() { return syserror.EINVAL } + t.mu.Lock() defer t.mu.Unlock() - kgid := t.creds.UserNamespace.MapToKGID(gid) + + creds := t.Credentials() + kgid := creds.UserNamespace.MapToKGID(gid) if !kgid.Ok() { return syserror.EINVAL } - if t.creds.HasCapability(linux.CAP_SETGID) { + if creds.HasCapability(linux.CAP_SETGID) { t.setKGIDsUncheckedLocked(kgid, kgid, kgid) return nil } - if kgid != t.creds.RealKGID && kgid != t.creds.SavedKGID { + if kgid != creds.RealKGID && kgid != creds.SavedKGID { return syserror.EPERM } - t.setKGIDsUncheckedLocked(t.creds.RealKGID, kgid, t.creds.SavedKGID) + t.setKGIDsUncheckedLocked(creds.RealKGID, kgid, creds.SavedKGID) return nil } @@ -248,30 +249,32 @@ func (t *Task) SetGID(gid auth.GID) error { func (t *Task) SetREGID(r, e auth.GID) error { t.mu.Lock() defer t.mu.Unlock() - newR := t.creds.RealKGID + + creds := t.Credentials() + newR := creds.RealKGID if r.Ok() { - newR = t.creds.UserNamespace.MapToKGID(r) + newR = creds.UserNamespace.MapToKGID(r) if !newR.Ok() { return syserror.EINVAL } } - newE := t.creds.EffectiveKGID + newE := creds.EffectiveKGID if e.Ok() { - newE = t.creds.UserNamespace.MapToKGID(e) + newE = creds.UserNamespace.MapToKGID(e) if !newE.Ok() { return syserror.EINVAL } } - if !t.creds.HasCapability(linux.CAP_SETGID) { - if newE != t.creds.RealKGID && newE != t.creds.EffectiveKGID && newE != t.creds.SavedKGID { + if !creds.HasCapability(linux.CAP_SETGID) { + if newE != creds.RealKGID && newE != creds.EffectiveKGID && newE != creds.SavedKGID { return syserror.EPERM } - if newR != t.creds.RealKGID && newR != t.creds.EffectiveKGID { + if newR != creds.RealKGID && newR != creds.EffectiveKGID { return syserror.EPERM } } - newS := t.creds.SavedKGID - if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKGID) { + newS := creds.SavedKGID + if r.Ok() || (e.Ok() && newE != creds.EffectiveKGID) { newS = newE } t.setKGIDsUncheckedLocked(newR, newE, newS) @@ -280,26 +283,29 @@ func (t *Task) SetREGID(r, e auth.GID) error { // SetRESGID implements the semantics of the setresgid(2) syscall. func (t *Task) SetRESGID(r, e, s auth.GID) error { + var err error + t.mu.Lock() defer t.mu.Unlock() - var err error - newR := t.creds.RealKGID + + creds := t.Credentials() + newR := creds.RealKGID if r.Ok() { - newR, err = t.creds.UseGID(r) + newR, err = creds.UseGID(r) if err != nil { return err } } - newE := t.creds.EffectiveKGID + newE := creds.EffectiveKGID if e.Ok() { - newE, err = t.creds.UseGID(e) + newE, err = creds.UseGID(e) if err != nil { return err } } - newS := t.creds.SavedKGID + newS := creds.SavedKGID if s.Ok() { - newS, err = t.creds.UseGID(s) + newS, err = creds.UseGID(s) if err != nil { return err } @@ -309,9 +315,9 @@ func (t *Task) SetRESGID(r, e, s auth.GID) error { } func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { - oldE := t.creds.EffectiveKGID - t.creds = t.creds.Fork() // See doc for creds. - t.creds.RealKGID, t.creds.EffectiveKGID, t.creds.SavedKGID = newR, newE, newS + creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. + oldE := creds.EffectiveKGID + creds.RealKGID, creds.EffectiveKGID, creds.SavedKGID = newR, newE, newS if oldE != newE { // "[dumpability] is reset to the current value contained in @@ -327,6 +333,7 @@ func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { // kernel/cred.c:commit_creds(). t.parentDeathSignal = 0 } + t.creds.Store(creds) } // SetExtraGIDs attempts to change t's supplemental groups. All IDs are @@ -334,19 +341,21 @@ func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { func (t *Task) SetExtraGIDs(gids []auth.GID) error { t.mu.Lock() defer t.mu.Unlock() - if !t.creds.HasCapability(linux.CAP_SETGID) { + creds := t.Credentials() + if !creds.HasCapability(linux.CAP_SETGID) { return syserror.EPERM } kgids := make([]auth.KGID, len(gids)) for i, gid := range gids { - kgid := t.creds.UserNamespace.MapToKGID(gid) + kgid := creds.UserNamespace.MapToKGID(gid) if !kgid.Ok() { return syserror.EINVAL } kgids[i] = kgid } - t.creds = t.creds.Fork() // See doc for creds. - t.creds.ExtraKGIDs = kgids + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.ExtraKGIDs = kgids + t.creds.Store(creds) return nil } @@ -360,27 +369,29 @@ func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.Capabili if effective & ^permitted != 0 { return syserror.EPERM } + creds := t.Credentials() // "It is also a limiting superset for the capabilities that may be added // to the inheritable set by a thread that does not have the CAP_SETPCAP // capability in its effective set." - if !t.creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(t.creds.InheritableCaps|t.creds.PermittedCaps) != 0) { + if !creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(creds.InheritableCaps|creds.PermittedCaps) != 0) { return syserror.EPERM } // "If a thread drops a capability from its permitted set, it can never // reacquire that capability (unless it execve(2)s ..." - if permitted & ^t.creds.PermittedCaps != 0 { + if permitted & ^creds.PermittedCaps != 0 { return syserror.EPERM } // "... if a capability is not in the bounding set, then a thread can't add // this capability to its inheritable set, even if it was in its permitted // capabilities ..." - if inheritable & ^(t.creds.InheritableCaps|t.creds.BoundingCaps) != 0 { + if inheritable & ^(creds.InheritableCaps|creds.BoundingCaps) != 0 { return syserror.EPERM } - t.creds = t.creds.Fork() // See doc for creds. - t.creds.PermittedCaps = permitted - t.creds.InheritableCaps = inheritable - t.creds.EffectiveCaps = effective + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.PermittedCaps = permitted + creds.InheritableCaps = inheritable + creds.EffectiveCaps = effective + t.creds.Store(creds) return nil } @@ -389,11 +400,13 @@ func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.Capabili func (t *Task) DropBoundingCapability(cp linux.Capability) error { t.mu.Lock() defer t.mu.Unlock() - if !t.creds.HasCapability(linux.CAP_SETPCAP) { + creds := t.Credentials() + if !creds.HasCapability(linux.CAP_SETPCAP) { return syserror.EPERM } - t.creds = t.creds.Fork() // See doc for creds. - t.creds.BoundingCaps &^= auth.CapabilitySetOf(cp) + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.BoundingCaps &^= auth.CapabilitySetOf(cp) + t.creds.Store(creds) return nil } @@ -402,31 +415,33 @@ func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { t.mu.Lock() defer t.mu.Unlock() + creds := t.Credentials() // "A process reassociating itself with a user namespace must have the // CAP_SYS_ADMIN capability in the target user namespace." - setns(2) // // If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN // in ns (by rule 3 in auth.Credentials.HasCapability). - if !t.creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) { + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) { return syserror.EPERM } - t.creds = t.creds.Fork() // See doc for creds. - t.creds.UserNamespace = ns + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.UserNamespace = ns // "The child process created by clone(2) with the CLONE_NEWUSER flag // starts out with a complete set of capabilities in the new user // namespace. Likewise, a process that creates a new user namespace using // unshare(2) or joins an existing user namespace using setns(2) gains a // full set of capabilities in that namespace." - t.creds.PermittedCaps = auth.AllCapabilities - t.creds.InheritableCaps = 0 - t.creds.EffectiveCaps = auth.AllCapabilities - t.creds.BoundingCaps = auth.AllCapabilities + creds.PermittedCaps = auth.AllCapabilities + creds.InheritableCaps = 0 + creds.EffectiveCaps = auth.AllCapabilities + creds.BoundingCaps = auth.AllCapabilities // "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER // flag sets the "securebits" flags (see capabilities(7)) to their default // values (all flags disabled) in the child (for clone(2)) or caller (for // unshare(2), or setns(2)." - user_namespaces(7) - t.creds.KeepCaps = false + creds.KeepCaps = false + t.creds.Store(creds) return nil } @@ -435,8 +450,9 @@ func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { func (t *Task) SetKeepCaps(k bool) { t.mu.Lock() defer t.mu.Unlock() - t.creds = t.creds.Fork() // See doc for creds. - t.creds.KeepCaps = k + creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. + creds.KeepCaps = k + t.creds.Store(creds) } // updateCredsForExec updates t.creds to reflect an execve(). @@ -512,15 +528,16 @@ func (t *Task) updateCredsForExecLocked() { // the effective user ID. var newPermitted auth.CapabilitySet // since F(inheritable) == F(permitted) == 0 fileEffective := false - root := t.creds.UserNamespace.MapToKUID(auth.RootUID) - if t.creds.EffectiveKUID == root || t.creds.RealKUID == root { - newPermitted = t.creds.InheritableCaps | t.creds.BoundingCaps - if t.creds.EffectiveKUID == root { + creds := t.Credentials() + root := creds.UserNamespace.MapToKUID(auth.RootUID) + if creds.EffectiveKUID == root || creds.RealKUID == root { + newPermitted = creds.InheritableCaps | creds.BoundingCaps + if creds.EffectiveKUID == root { fileEffective = true } } - t.creds = t.creds.Fork() // See doc for creds. + creds = creds.Fork() // The credentials object is immutable. See doc for creds. // Now we enter poorly-documented, somewhat confusing territory. (The // accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds @@ -562,27 +579,28 @@ func (t *Task) updateCredsForExecLocked() { // But since no_new_privs is always set (A3 is always true), this becomes // much simpler. If B1 and B2 are false, C2 is a no-op. If B3 is false, C1 // is a no-op. So we can just do C1 and C2 unconditionally. - if t.creds.EffectiveKUID != t.creds.RealKUID || t.creds.EffectiveKGID != t.creds.RealKGID { - t.creds.EffectiveKUID = t.creds.RealKUID - t.creds.EffectiveKGID = t.creds.RealKGID + if creds.EffectiveKUID != creds.RealKUID || creds.EffectiveKGID != creds.RealKGID { + creds.EffectiveKUID = creds.RealKUID + creds.EffectiveKGID = creds.RealKGID t.parentDeathSignal = 0 } // (Saved set-user-ID is always set to the new effective user ID, and saved // set-group-ID is always set to the new effective group ID, regardless of // the above.) - t.creds.SavedKUID = t.creds.RealKUID - t.creds.SavedKGID = t.creds.RealKGID - t.creds.PermittedCaps &= newPermitted + creds.SavedKUID = creds.RealKUID + creds.SavedKGID = creds.RealKGID + creds.PermittedCaps &= newPermitted if fileEffective { - t.creds.EffectiveCaps = t.creds.PermittedCaps + creds.EffectiveCaps = creds.PermittedCaps } else { - t.creds.EffectiveCaps = 0 + creds.EffectiveCaps = 0 } // prctl(2): The "keep capabilities" value will be reset to 0 on subsequent // calls to execve(2). - t.creds.KeepCaps = false + creds.KeepCaps = false // "The bounding set is inherited at fork(2) from the thread's parent, and // is preserved across an execve(2)". So we're done. + t.creds.Store(creds) } diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 9458f5c2a..72caae537 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -119,7 +119,6 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { ptraceTracees: make(map[*Task]struct{}), allowedCPUMask: cfg.AllowedCPUMask.Copy(), ioUsage: &usage.IO{}, - creds: cfg.Credentials, niceness: cfg.Niceness, netns: cfg.NetworkNamespaced, utsns: cfg.UTSNamespace, @@ -129,6 +128,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { futexWaiter: futex.NewWaiter(), containerID: cfg.ContainerID, } + t.creds.Store(cfg.Credentials) t.endStopCond.L = &t.tg.signalHandlers.mu t.ptraceTracer.Store((*Task)(nil)) // We don't construct t.blockingTimer until Task.run(); see that function diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index dca8e4c0e..f15b3415a 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -370,13 +370,16 @@ func (t *thread) destroy() { // init initializes trace options. func (t *thread) init() { - // Set our TRACESYSGOOD option to differeniate real SIGTRAP. + // Set our TRACESYSGOOD option to differeniate real SIGTRAP. We also + // set PTRACE_O_EXITKILL to ensure that the unexpected exit of the + // sentry will immediately kill the associated stubs. + const PTRACE_O_EXITKILL = 0x100000 _, _, errno := syscall.RawSyscall6( syscall.SYS_PTRACE, syscall.PTRACE_SETOPTIONS, uintptr(t.tid), 0, - syscall.PTRACE_O_TRACESYSGOOD, + syscall.PTRACE_O_TRACESYSGOOD|syscall.PTRACE_O_TRACEEXIT|PTRACE_O_EXITKILL, 0, 0) if errno != 0 { panic(fmt.Sprintf("ptrace set options failed: %v", errno)) @@ -419,7 +422,7 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) { // between syscall-enter-stop and syscall-exit-stop; it happens *after* // syscall-exit-stop.)" - ptrace(2), "Syscall-stops" if sig := t.wait(stopped); sig != (syscallEvent | syscall.SIGTRAP) { - panic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig)) + t.dumpAndPanic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig)) } // Grab registers. diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 2a41e8176..7f18b1ac8 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -379,7 +379,7 @@ var AMD64 = &kernel.SyscallTable{ 326: syscalls.ErrorWithEvent("copy_file_range", syscall.ENOSYS, "", nil), 327: syscalls.Undocumented("preadv2", Preadv2), 328: syscalls.Undocumented("pwritev2", Pwritev2), - 397: syscalls.Undocumented("statx", Statx), + 332: syscalls.Supported("statx", Statx), }, Emulate: map[usermem.Addr]uintptr{ diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index d9ed02c99..04962726a 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -304,44 +304,100 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod return 0, syserror.ENOENT } - err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error { - if !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR - } + fileFlags := linuxToFlags(flags) + // Linux always adds the O_LARGEFILE flag when running in 64-bit mode. + fileFlags.LargeFile = true + + err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, parent *fs.Dirent, name string, remainingTraversals uint) error { + // Resolve the name to see if it exists, and follow any + // symlinks along the way. We must do the symlink resolution + // manually because if the symlink target does not exist, we + // must create the target (and not the symlink itself). + var ( + found *fs.Dirent + err error + ) + for { + if !fs.IsDir(parent.Inode.StableAttr) { + return syserror.ENOTDIR + } - fileFlags := linuxToFlags(flags) - // Linux always adds the O_LARGEFILE flag when running in 64-bit mode. - fileFlags.LargeFile = true + // Start by looking up the dirent at 'name'. + found, err = t.MountNamespace().FindLink(t, root, parent, name, &remainingTraversals) + if err != nil { + break + } + + // We found something (possibly a symlink). If the + // O_EXCL flag was passed, then we can immediately + // return EEXIST. + if flags&linux.O_EXCL != 0 { + return syserror.EEXIST + } + + // If we have a non-symlink, then we can proceed. + if !fs.IsSymlink(found.Inode.StableAttr) { + break + } + + // If O_NOFOLLOW was passed, then don't try to resolve + // anything. + if flags&linux.O_NOFOLLOW != 0 { + return syserror.ELOOP + } + + // Try to resolve the symlink directly to a Dirent. + resolved, err := found.Inode.Getlink(t) + if err == nil || err != fs.ErrResolveViaReadlink { + // No more resolution necessary. + found.DecRef() + found = resolved + break + } + + // Resolve the symlink to a path via Readlink. + path, err := found.Inode.Readlink(t) + if err != nil { + break + } + remainingTraversals-- + + // Get the new parent from the target path. + newParentPath, newName := fs.SplitLast(path) + newParent, err := t.MountNamespace().FindInode(t, root, parent, newParentPath, &remainingTraversals) + if err != nil { + break + } + + // Repeat the process with the parent and name of the + // symlink target. + parent.DecRef() + parent = newParent + name = newName + } - // Does this file exist already? - targetDirent, err := t.MountNamespace().FindInode(t, root, d, name, &remainingTraversals) var newFile *fs.File switch err { case nil: // The file existed. - defer targetDirent.DecRef() - - // Check if we wanted to create. - if flags&linux.O_EXCL != 0 { - return syserror.EEXIST - } + defer found.DecRef() // Like sys_open, check for a few things about the // filesystem before trying to get a reference to the // fs.File. The same constraints on Check apply. - if err := targetDirent.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil { + if err := found.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil { return err } // Should we truncate the file? if flags&linux.O_TRUNC != 0 { - if err := targetDirent.Inode.Truncate(t, targetDirent, 0); err != nil { + if err := found.Inode.Truncate(t, found, 0); err != nil { return err } } // Create a new fs.File. - newFile, err = targetDirent.Inode.GetFile(t, targetDirent, fileFlags) + newFile, err = found.Inode.GetFile(t, found, fileFlags) if err != nil { return syserror.ConvertIntr(err, kernel.ERESTARTSYS) } @@ -350,19 +406,19 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod // File does not exist. Proceed with creation. // Do we have write permissions on the parent? - if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + if err := parent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { return err } // Attempt a creation. perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask())) - newFile, err = d.Create(t, root, name, fileFlags, perms) + newFile, err = parent.Create(t, root, name, fileFlags, perms) if err != nil { // No luck, bail. return err } defer newFile.DecRef() - targetDirent = newFile.Dirent + found = newFile.Dirent default: return err } @@ -378,10 +434,10 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod fd = uintptr(newFD) // Queue the open inotify event. The creation event is - // automatically queued when the dirent is targetDirent. The - // open events are implemented at the syscall layer so we need - // to manually queue one here. - targetDirent.InotifyEvent(linux.IN_OPEN, 0) + // automatically queued when the dirent is found. The open + // events are implemented at the syscall layer so we need to + // manually queue one here. + found.InotifyEvent(linux.IN_OPEN, 0) return nil }) |