diff options
40 files changed, 1468 insertions, 198 deletions
diff --git a/pkg/fdchannel/BUILD b/pkg/fdchannel/BUILD new file mode 100644 index 000000000..e54e7371c --- /dev/null +++ b/pkg/fdchannel/BUILD @@ -0,0 +1,17 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "fdchannel", + srcs = ["fdchannel_unsafe.go"], + importpath = "gvisor.dev/gvisor/pkg/fdchannel", + visibility = ["//visibility:public"], +) + +go_test( + name = "fdchannel_test", + size = "small", + srcs = ["fdchannel_test.go"], + embed = [":fdchannel"], +) diff --git a/pkg/fdchannel/fdchannel_test.go b/pkg/fdchannel/fdchannel_test.go new file mode 100644 index 000000000..5d01dc636 --- /dev/null +++ b/pkg/fdchannel/fdchannel_test.go @@ -0,0 +1,131 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fdchannel + +import ( + "io/ioutil" + "os" + "sync" + "syscall" + "testing" + "time" +) + +func TestSendRecvFD(t *testing.T) { + sendFile, err := ioutil.TempFile("", "fdchannel_test_") + if err != nil { + t.Fatalf("failed to create temporary file: %v", err) + } + defer sendFile.Close() + + chanFDs, err := NewConnectedSockets() + if err != nil { + t.Fatalf("failed to create fdchannel sockets: %v", err) + } + sendEP := NewEndpoint(chanFDs[0]) + defer sendEP.Destroy() + recvEP := NewEndpoint(chanFDs[1]) + defer recvEP.Destroy() + + recvFD, err := recvEP.RecvFDNonblock() + if err != syscall.EAGAIN && err != syscall.EWOULDBLOCK { + t.Errorf("RecvFDNonblock before SendFD: got (%d, %v), wanted (<unspecified>, EAGAIN or EWOULDBLOCK", recvFD, err) + } + + if err := sendEP.SendFD(int(sendFile.Fd())); err != nil { + t.Fatalf("SendFD failed: %v", err) + } + recvFD, err = recvEP.RecvFD() + if err != nil { + t.Fatalf("RecvFD failed: %v", err) + } + recvFile := os.NewFile(uintptr(recvFD), "received file") + defer recvFile.Close() + + sendInfo, err := sendFile.Stat() + if err != nil { + t.Fatalf("failed to stat sent file: %v", err) + } + sendInfoSys := sendInfo.Sys() + sendStat, ok := sendInfoSys.(*syscall.Stat_t) + if !ok { + t.Fatalf("sent file's FileInfo is backed by unknown type %T", sendInfoSys) + } + + recvInfo, err := recvFile.Stat() + if err != nil { + t.Fatalf("failed to stat received file: %v", err) + } + recvInfoSys := recvInfo.Sys() + recvStat, ok := recvInfoSys.(*syscall.Stat_t) + if !ok { + t.Fatalf("received file's FileInfo is backed by unknown type %T", recvInfoSys) + } + + if sendStat.Dev != recvStat.Dev || sendStat.Ino != recvStat.Ino { + t.Errorf("sent file (dev=%d, ino=%d) does not match received file (dev=%d, ino=%d)", sendStat.Dev, sendStat.Ino, recvStat.Dev, recvStat.Ino) + } +} + +func TestShutdownThenRecvFD(t *testing.T) { + sendFile, err := ioutil.TempFile("", "fdchannel_test_") + if err != nil { + t.Fatalf("failed to create temporary file: 
%v", err) + } + defer sendFile.Close() + + chanFDs, err := NewConnectedSockets() + if err != nil { + t.Fatalf("failed to create fdchannel sockets: %v", err) + } + sendEP := NewEndpoint(chanFDs[0]) + defer sendEP.Destroy() + recvEP := NewEndpoint(chanFDs[1]) + defer recvEP.Destroy() + + recvEP.Shutdown() + if _, err := recvEP.RecvFD(); err == nil { + t.Error("RecvFD succeeded unexpectedly") + } +} + +func TestRecvFDThenShutdown(t *testing.T) { + sendFile, err := ioutil.TempFile("", "fdchannel_test_") + if err != nil { + t.Fatalf("failed to create temporary file: %v", err) + } + defer sendFile.Close() + + chanFDs, err := NewConnectedSockets() + if err != nil { + t.Fatalf("failed to create fdchannel sockets: %v", err) + } + sendEP := NewEndpoint(chanFDs[0]) + defer sendEP.Destroy() + recvEP := NewEndpoint(chanFDs[1]) + defer recvEP.Destroy() + + var receiverWG sync.WaitGroup + receiverWG.Add(1) + go func() { + defer receiverWG.Done() + if _, err := recvEP.RecvFD(); err == nil { + t.Error("RecvFD succeeded unexpectedly") + } + }() + defer receiverWG.Wait() + time.Sleep(time.Second) // to ensure recvEP.RecvFD() has blocked + recvEP.Shutdown() +} diff --git a/pkg/fdchannel/fdchannel_unsafe.go b/pkg/fdchannel/fdchannel_unsafe.go new file mode 100644 index 000000000..367235be5 --- /dev/null +++ b/pkg/fdchannel/fdchannel_unsafe.go @@ -0,0 +1,146 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris + +// Package fdchannel implements passing file descriptors between processes over +// Unix domain sockets. +package fdchannel + +import ( + "fmt" + "reflect" + "sync/atomic" + "syscall" + "unsafe" +) + +// int32 is the real type of a file descriptor. +const sizeofInt32 = int(unsafe.Sizeof(int32(0))) + +// NewConnectedSockets returns a pair of file descriptors, owned by the caller, +// representing connected sockets that may be passed to separate calls to +// NewEndpoint to create connected Endpoints. +func NewConnectedSockets() ([2]int, error) { + return syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC, 0) +} + +// Endpoint sends file descriptors to, and receives them from, another +// connected Endpoint. +// +// Endpoint is not copyable or movable by value. +type Endpoint struct { + sockfd int32 // accessed using atomic memory operations + msghdr syscall.Msghdr + cmsg *syscall.Cmsghdr // followed by sizeofInt32 bytes of data +} + +// Init must be called on zero-value Endpoints before first use. sockfd must be +// a blocking AF_UNIX SOCK_SEQPACKET socket. +func (ep *Endpoint) Init(sockfd int) { + // "Datagram sockets in various domains (e.g., the UNIX and Internet + // domains) permit zero-length datagrams." - recv(2). Experimentally, + // sendmsg+recvmsg for a zero-length datagram is slightly faster than + // sendmsg+recvmsg for a single byte over a stream socket. + cmsgSlice := make([]byte, syscall.CmsgSpace(sizeofInt32)) + cmsgReflect := (*reflect.SliceHeader)((unsafe.Pointer)(&cmsgSlice)) + ep.sockfd = int32(sockfd) + ep.msghdr.Control = (*byte)((unsafe.Pointer)(cmsgReflect.Data)) + ep.cmsg = (*syscall.Cmsghdr)((unsafe.Pointer)(cmsgReflect.Data)) + // ep.msghdr.Controllen and ep.cmsg.* are mutated by recvmsg(2), so they're + // set before calling sendmsg/recvmsg. 
+} + +// NewEndpoint is a convenience function that returns an initialized Endpoint +// allocated on the heap. +func NewEndpoint(sockfd int) *Endpoint { + ep := &Endpoint{} + ep.Init(sockfd) + return ep +} + +// Destroy releases resources owned by ep. No other Endpoint methods may be +// called after Destroy. +func (ep *Endpoint) Destroy() { + // These need not use sync/atomic since there must not be any concurrent + // calls to Endpoint methods. + if ep.sockfd >= 0 { + syscall.Close(int(ep.sockfd)) + ep.sockfd = -1 + } +} + +// Shutdown causes concurrent and future calls to ep.SendFD(), ep.RecvFD(), and +// ep.RecvFDNonblock(), as well as the same calls in the connected Endpoint, to +// unblock and return errors. It does not wait for concurrent calls to return. +// +// Shutdown is the only Endpoint method that may be called concurrently with +// other methods. +func (ep *Endpoint) Shutdown() { + if sockfd := int(atomic.SwapInt32(&ep.sockfd, -1)); sockfd >= 0 { + syscall.Shutdown(sockfd, syscall.SHUT_RDWR) + syscall.Close(sockfd) + } +} + +// SendFD sends the open file description represented by the given file +// descriptor to the connected Endpoint. +func (ep *Endpoint) SendFD(fd int) error { + cmsgLen := syscall.CmsgLen(sizeofInt32) + ep.cmsg.Level = syscall.SOL_SOCKET + ep.cmsg.Type = syscall.SCM_RIGHTS + ep.cmsg.SetLen(cmsgLen) + *ep.cmsgData() = int32(fd) + ep.msghdr.SetControllen(cmsgLen) + _, _, e := syscall.Syscall(syscall.SYS_SENDMSG, uintptr(atomic.LoadInt32(&ep.sockfd)), uintptr((unsafe.Pointer)(&ep.msghdr)), 0) + if e != 0 { + return e + } + return nil +} + +// RecvFD receives an open file description from the connected Endpoint and +// returns a file descriptor representing it, owned by the caller. +func (ep *Endpoint) RecvFD() (int, error) { + return ep.recvFD(0) +} + +// RecvFDNonblock receives an open file description from the connected Endpoint +// and returns a file descriptor representing it, owned by the caller. 
If there +// are no pending receivable open file descriptions, RecvFDNonblock returns +// (<unspecified>, EAGAIN or EWOULDBLOCK). +func (ep *Endpoint) RecvFDNonblock() (int, error) { + return ep.recvFD(syscall.MSG_DONTWAIT) +} + +func (ep *Endpoint) recvFD(flags uintptr) (int, error) { + cmsgLen := syscall.CmsgLen(sizeofInt32) + ep.msghdr.SetControllen(cmsgLen) + _, _, e := syscall.Syscall(syscall.SYS_RECVMSG, uintptr(atomic.LoadInt32(&ep.sockfd)), uintptr((unsafe.Pointer)(&ep.msghdr)), flags|syscall.MSG_TRUNC) + if e != 0 { + return -1, e + } + if int(ep.msghdr.Controllen) != cmsgLen { + return -1, fmt.Errorf("received control message has incorrect length: got %d, wanted %d", ep.msghdr.Controllen, cmsgLen) + } + if ep.cmsg.Level != syscall.SOL_SOCKET || ep.cmsg.Type != syscall.SCM_RIGHTS { + return -1, fmt.Errorf("received control message has incorrect (level, type): got (%v, %v), wanted (%v, %v)", ep.cmsg.Level, ep.cmsg.Type, syscall.SOL_SOCKET, syscall.SCM_RIGHTS) + } + return int(*ep.cmsgData()), nil +} + +func (ep *Endpoint) cmsgData() *int32 { + // syscall.CmsgLen(0) == syscall.cmsgAlignOf(syscall.SizeofCmsghdr) + return (*int32)((unsafe.Pointer)(uintptr((unsafe.Pointer)(ep.cmsg)) + uintptr(syscall.CmsgLen(0)))) +} diff --git a/pkg/log/log.go b/pkg/log/log.go index 0765a1963..ab9ad01ef 100644 --- a/pkg/log/log.go +++ b/pkg/log/log.go @@ -50,6 +50,19 @@ const ( Debug ) +func (l Level) String() string { + switch l { + case Warning: + return "Warning" + case Info: + return "Info" + case Debug: + return "Debug" + default: + return fmt.Sprintf("Invalid level: %d", l) + } +} + // Emitter is the final destination for logs. type Emitter interface { // Emit emits the given log statement. 
This allows for control over the diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index 15a1fe8a9..5dccb8e3c 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -6,6 +6,7 @@ go_library( name = "control", srcs = [ "control.go", + "logging.go", "pprof.go", "proc.go", "state.go", @@ -26,8 +27,10 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/state", + "//pkg/sentry/strace", "//pkg/sentry/usage", "//pkg/sentry/watchdog", + "//pkg/tcpip/link/sniffer", "//pkg/urpc", ], ) diff --git a/pkg/sentry/control/logging.go b/pkg/sentry/control/logging.go new file mode 100644 index 000000000..811f24324 --- /dev/null +++ b/pkg/sentry/control/logging.go @@ -0,0 +1,136 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package control + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/strace" + "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" +) + +// LoggingArgs are the arguments to use for changing the logging +// level and strace list. +type LoggingArgs struct { + // SetLevel is a flag used to indicate that we should update + // the logging level. We should be able to change the strace + // list without affecting the logging level and vice versa. + SetLevel bool + + // Level is the log level that will be set if SetLevel is true. + Level log.Level + + // SetLogPackets indicates that we should update the log packets flag. 
+ SetLogPackets bool + + // LogPackets is the actual value to set for LogPackets. + // SetLogPackets must be enabled to indicate that we're changing + // the value. + LogPackets bool + + // SetStrace is a flag used to indicate that strace related + // arguments were passed in. + SetStrace bool + + // EnableStrace is a flag from the CLI that specifies whether to + // enable strace at all. If this flag is false then a completely + // pristine copy of the syscall table will be swapped in. This + // approach is used to remain consistent with an empty strace + // whitelist meaning trace all system calls. + EnableStrace bool + + // StraceWhitelist is the whitelist of syscalls to trace to log. If this + // and StraceEventWhitelist are empty trace all system calls. + StraceWhitelist []string + + // SetEventStrace is a flag used to indicate that event strace + // related arguments were passed in. + SetEventStrace bool + + // StraceEventWhitelist is the whitelist of syscalls to trace + // to event log. + StraceEventWhitelist []string +} + +// Logging provides functions related to logging. +type Logging struct{} + +// Change will change the log level and strace arguments. Although +// this function's signature requires an error it never actually +// returns an error. It's required by the URPC interface. +// Additionally, it may look odd that this is the only method +// attached to an empty struct but this is also part of how +// URPC dispatches. +func (l *Logging) Change(args *LoggingArgs, code *int) error { + if args.SetLevel { + // Logging uses an atomic for the level so this is thread safe.
+ log.SetLevel(args.Level) + } + + if args.SetLogPackets { + if args.LogPackets { + atomic.StoreUint32(&sniffer.LogPackets, 1) + } else { + atomic.StoreUint32(&sniffer.LogPackets, 0) + } + log.Infof("LogPackets set to: %v", atomic.LoadUint32(&sniffer.LogPackets)) + } + + if args.SetStrace { + if err := l.configureStrace(args); err != nil { + return fmt.Errorf("error configuring strace: %v", err) + } + } + + if args.SetEventStrace { + if err := l.configureEventStrace(args); err != nil { + return fmt.Errorf("error configuring event strace: %v", err) + } + } + + return nil +} + +func (l *Logging) configureStrace(args *LoggingArgs) error { + if args.EnableStrace { + // Install the whitelist specified. + if len(args.StraceWhitelist) > 0 { + if err := strace.Enable(args.StraceWhitelist, strace.SinkTypeLog); err != nil { + return err + } + } else { + // For convenience, if strace is enabled but whitelist + // is empty, enable everything to log. + strace.EnableAll(strace.SinkTypeLog) + } + } else { + // Uninstall all strace functions. + strace.Disable(strace.SinkTypeLog) + } + return nil +} + +func (l *Logging) configureEventStrace(args *LoggingArgs) error { + if len(args.StraceEventWhitelist) > 0 { + if err := strace.Enable(args.StraceEventWhitelist, strace.SinkTypeEvent); err != nil { + return err + } + } else { + strace.Disable(strace.SinkTypeEvent) + } + return nil +} diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 55ffe6c0c..8e1f5674d 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -310,9 +310,11 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error return 0, syserror.ErrInterrupted } + unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append) // Handle append mode. 
if f.Flags().Append { if err := f.offsetForAppend(ctx, &f.offset); err != nil { + unlockAppendMu() f.mu.Unlock() return 0, err } @@ -322,6 +324,7 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error limit, ok := f.checkLimit(ctx, f.offset) switch { case ok && limit == 0: + unlockAppendMu() f.mu.Unlock() return 0, syserror.ErrExceedsFileSizeLimit case ok: @@ -333,6 +336,7 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error if n >= 0 && !f.flags.NonSeekable { atomic.StoreInt64(&f.offset, f.offset+n) } + unlockAppendMu() f.mu.Unlock() return n, err } @@ -348,13 +352,11 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64 // However, on Linux, if a file is opened with O_APPEND, pwrite() // appends data to the end of the file, regardless of the value of // offset." + unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append) + defer unlockAppendMu() + if f.Flags().Append { - if !f.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted - } - defer f.mu.Unlock() if err := f.offsetForAppend(ctx, &offset); err != nil { - f.mu.Unlock() return 0, err } } @@ -373,7 +375,7 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64 // offsetForAppend sets the given offset to the end of the file. // -// Precondition: the underlying file mutex should be held. +// Precondition: the file.Dirent.Inode.appendMu mutex should be held for writing. func (f *File) offsetForAppend(ctx context.Context, offset *int64) error { uattr, err := f.Dirent.Inode.UnstableAttr(ctx) if err != nil { diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index a889586aa..e4aae1135 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -15,6 +15,8 @@ package fs import ( + "sync" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" @@ -55,6 +57,12 @@ type Inode struct { // overlay is the overlay entry for this Inode. 
overlay *overlayEntry + + // appendMu is used to synchronize write operations into files which + // have been opened with O_APPEND. Operations which change a file size + // have to take this lock for read. Write operations to files with + // O_APPEND have to take this lock for write. + appendMu sync.RWMutex `state:"nosave"` } // LockCtx is an Inode's lock context and contains different personalities of locks; both @@ -337,6 +345,8 @@ func (i *Inode) Truncate(ctx context.Context, d *Dirent, size int64) error { if i.overlay != nil { return overlayTruncate(ctx, i.overlay, d, size) } + i.appendMu.RLock() + defer i.appendMu.RUnlock() return i.InodeOperations.Truncate(ctx, i, size) } @@ -438,3 +448,12 @@ func (i *Inode) CheckCapability(ctx context.Context, cp linux.Capability) bool { } return creds.HasCapability(cp) } + +func (i *Inode) lockAppendMu(appendMode bool) func() { + if appendMode { + i.appendMu.Lock() + return i.appendMu.Unlock + } + i.appendMu.RLock() + return i.appendMu.RUnlock +} diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 57b8b14e3..920d86042 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -537,12 +537,6 @@ func overlayCheck(ctx context.Context, o *overlayEntry, p PermMask) error { if o.upper != nil { err = o.upper.check(ctx, p) } else { - if p.Write { - // Since writes will be redirected to the upper filesystem, the lower - // filesystem need not be writable, but must be readable for copy-up. 
- p.Write = false - p.Read = true - } err = o.lower.check(ctx, p) } o.copyMu.RUnlock() diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index b70c583f3..da41a10ab 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -31,6 +31,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/binary", "//pkg/log", "//pkg/sentry/context", "//pkg/sentry/fs", diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index caa1a5c4d..37694620c 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -20,6 +20,7 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -55,9 +56,8 @@ func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSo "psched": newStaticProcInode(ctx, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))), "ptype": newStaticProcInode(ctx, msrc, []byte("Type Device Function")), "route": newStaticProcInode(ctx, msrc, []byte("Iface Destination Gateway Flags RefCnt Use Metric Mask MTU Window IRTT")), - "tcp": newStaticProcInode(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode")), - - "udp": newStaticProcInode(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops")), + "tcp": seqfile.NewSeqFileInode(ctx, &netTCP{k: k}, msrc), + "udp": newStaticProcInode(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops")), "unix": seqfile.NewSeqFileInode(ctx, &netUnix{k: k}, msrc), } @@ -210,10 +210,6 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s } var buf bytes.Buffer - // Header - fmt.Fprintf(&buf, 
"Num RefCount Protocol Flags Type St Inode Path\n") - - // Entries for _, se := range n.k.ListSockets() { s := se.Sock.Get() if s == nil { @@ -222,6 +218,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s } sfile := s.(*fs.File) if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX { + s.DecRef() // Not a unix socket. continue } @@ -281,12 +278,160 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s } fmt.Fprintf(&buf, "\n") - sfile.DecRef() + s.DecRef() + } + + data := []seqfile.SeqData{ + { + Buf: []byte("Num RefCount Protocol Flags Type St Inode Path\n"), + Handle: n, + }, + { + Buf: buf.Bytes(), + Handle: n, + }, + } + return data, 0 +} + +// netTCP implements seqfile.SeqSource for /proc/net/tcp. +// +// +stateify savable +type netTCP struct { + k *kernel.Kernel +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (*netTCP) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. +func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + t := kernel.TaskFromContext(ctx) + + if h != nil { + return nil, 0 + } + + var buf bytes.Buffer + for _, se := range n.k.ListSockets() { + s := se.Sock.Get() + if s == nil { + log.Debugf("Couldn't resolve weakref %+v in socket table, racing with destruction?", se.Sock) + continue + } + sfile := s.(*fs.File) + sops, ok := sfile.FileOperations.(socket.Socket) + if !ok { + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) + } + if family, stype, _ := sops.Type(); !(family == linux.AF_INET && stype == linux.SOCK_STREAM) { + s.DecRef() + // Not tcp4 sockets. + continue + } + + // Linux's documentation for the fields below can be found at + // https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt. + // For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock(). 
+ // Note that the header doesn't contain labels for all the fields. + + // Field: sl; entry number. + fmt.Fprintf(&buf, "%4d: ", se.ID) + + portBuf := make([]byte, 2) + + // Field: local_address. + var localAddr linux.SockAddrInet + if local, _, err := sops.GetSockName(t); err == nil { + localAddr = local.(linux.SockAddrInet) + } + binary.LittleEndian.PutUint16(portBuf, localAddr.Port) + fmt.Fprintf(&buf, "%08X:%04X ", + binary.LittleEndian.Uint32(localAddr.Addr[:]), + portBuf) + + // Field: rem_address. + var remoteAddr linux.SockAddrInet + if remote, _, err := sops.GetPeerName(t); err == nil { + remoteAddr = remote.(linux.SockAddrInet) + } + binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port) + fmt.Fprintf(&buf, "%08X:%04X ", + binary.LittleEndian.Uint32(remoteAddr.Addr[:]), + portBuf) + + // Field: state; socket state. + fmt.Fprintf(&buf, "%02X ", sops.State()) + + // Field: tx_queue, rx_queue; number of packets in the transmit and + // receive queue. Unimplemented. + fmt.Fprintf(&buf, "%08X:%08X ", 0, 0) + + // Field: tr, tm->when; timer active state and number of jiffies + // until timer expires. Unimplemented. + fmt.Fprintf(&buf, "%02X:%08X ", 0, 0) + + // Field: retrnsmt; number of unrecovered RTO timeouts. + // Unimplemented. + fmt.Fprintf(&buf, "%08X ", 0) + + // Field: uid. + uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) + fmt.Fprintf(&buf, "%5d ", 0) + } else { + fmt.Fprintf(&buf, "%5d ", uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow())) + } + + // Field: timeout; number of unanswered 0-window probes. + // Unimplemented. + fmt.Fprintf(&buf, "%8d ", 0) + + // Field: inode. + fmt.Fprintf(&buf, "%8d ", sfile.InodeID()) + + // Field: refcount. Don't count the ref we obtain while dereferencing + // the weakref to this socket. + fmt.Fprintf(&buf, "%d ", sfile.ReadRefs()-1) + + // Field: Socket struct address.
Redacted due to the same reason as + // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. + fmt.Fprintf(&buf, "%#016p ", (*socket.Socket)(nil)) + + // Field: retransmit timeout. Unimplemented. + fmt.Fprintf(&buf, "%d ", 0) + + // Field: predicted tick of soft clock (delayed ACK control data). + // Unimplemented. + fmt.Fprintf(&buf, "%d ", 0) + + // Field: (ack.quick<<1)|ack.pingpong, Unimplemented. + fmt.Fprintf(&buf, "%d ", 0) + + // Field: sending congestion window, Unimplemented. + fmt.Fprintf(&buf, "%d ", 0) + + // Field: Slow start size threshold, -1 if threshold >= 0xFFFF. + // Unimplemented, report as large threshold. + fmt.Fprintf(&buf, "%d", -1) + + fmt.Fprintf(&buf, "\n") + + s.DecRef() } - data := []seqfile.SeqData{{ - Buf: buf.Bytes(), - Handle: (*netUnix)(nil), - }} + data := []seqfile.SeqData{ + { + Buf: []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n"), + Handle: n, + }, + { + Buf: buf.Bytes(), + Handle: n, + }, + } return data, 0 } diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go index 978dc679b..eed1c2854 100644 --- a/pkg/sentry/fs/splice.go +++ b/pkg/sentry/fs/splice.go @@ -88,6 +88,8 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64, // Check append-only mode and the limit. if !dstPipe { + unlock := dst.Dirent.Inode.lockAppendMu(dst.Flags().Append) + defer unlock() if dst.Flags().Append { if opts.DstOffset { // We need to acquire the lock. diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index a5fcdf969..881dd89b0 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -133,6 +133,9 @@ func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSou } // Construct a mount which will follow the cache options provided. + // + // TODO(gvisor.dev/issue/179): There should be no reason to disable + // caching once bind mounts are properly supported. 
var msrc *fs.MountSource switch options[cacheKey] { case "", cacheAll: diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 37cb8c8b9..42779baa9 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -4,6 +4,17 @@ load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library") go_template_instance( + name = "atomicptr_credentials", + out = "atomicptr_credentials.go", + package = "auth", + suffix = "Credentials", + template = "//third_party/gvsync:generic_atomicptr", + types = { + "Value": "Credentials", + }, +) + +go_template_instance( name = "id_map_range", out = "id_map_range.go", package = "auth", @@ -34,6 +45,7 @@ go_template_instance( go_library( name = "auth", srcs = [ + "atomicptr_credentials.go", "auth.go", "capability_set.go", "context.go", diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index c297c5973..2e3a39d3b 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -386,10 +386,11 @@ type Task struct { // creds is the task's credentials. // - // creds is protected by mu, however the value itself is immutable and can - // only be changed by a copy. After reading the pointer, access will - // proceed outside the scope of mu. creds is owned by the task goroutine. - creds *auth.Credentials + // creds.Load() may be called without synchronization. creds.Store() is + // serialized by mu. creds is owned by the task goroutine. All + // auth.Credentials objects that creds may point to, or have pointed to + // in the past, must be treated as immutable. + creds auth.AtomicPtrCredentials // utsns is the task's UTS namespace. 
// @@ -597,7 +598,7 @@ func (t *Task) Value(key interface{}) interface{} { case CtxTask: return t case auth.CtxCredentials: - return t.creds + return t.Credentials() case context.CtxThreadGroupID: return int32(t.ThreadGroup().ID()) case fs.CtxRoot: diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 0e621f0d1..b5cc3860d 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -425,6 +425,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { if opts.NewAddressSpace || opts.NewSignalHandlers { return syserror.EINVAL } + creds := t.Credentials() if opts.NewThreadGroup { t.tg.signalHandlers.mu.Lock() if t.tg.tasksCount != 1 { @@ -439,8 +440,6 @@ func (t *Task) Unshare(opts *SharingOptions) error { if t.IsChrooted() { return syserror.EPERM } - // This temporary is needed because Go. - creds := t.Credentials() newUserNS, err := creds.NewChildUserNamespace() if err != nil { return err @@ -449,6 +448,8 @@ func (t *Task) Unshare(opts *SharingOptions) error { if err != nil { return err } + // Need to reload creds, because t.SetUserNamespace() changed task credentials. + creds = t.Credentials() } haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN) if opts.NewPIDNamespace { @@ -473,7 +474,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { } // Note that this must happen after NewUserNamespace, so the // new user namespace is used if there is one.
- t.utsns = t.utsns.Clone(t.creds.UserNamespace) + t.utsns = t.utsns.Clone(creds.UserNamespace) } if opts.NewIPCNamespace { if !haveCapSysAdmin { @@ -482,7 +483,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { } // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" - t.ipcns = NewIPCNamespace(t.creds.UserNamespace) + t.ipcns = NewIPCNamespace(creds.UserNamespace) } var oldfds *FDMap if opts.NewFiles { diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index 39c138925..78ff14b20 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -25,30 +25,22 @@ import ( // // This value must be considered immutable. func (t *Task) Credentials() *auth.Credentials { - t.mu.Lock() - defer t.mu.Unlock() - return t.creds + return t.creds.Load() } // UserNamespace returns the user namespace associated with the task. func (t *Task) UserNamespace() *auth.UserNamespace { - t.mu.Lock() - defer t.mu.Unlock() - return t.creds.UserNamespace + return t.Credentials().UserNamespace } // HasCapabilityIn checks if the task has capability cp in user namespace ns. func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool { - t.mu.Lock() - defer t.mu.Unlock() - return t.creds.HasCapabilityIn(cp, ns) + return t.Credentials().HasCapabilityIn(cp, ns) } // HasCapability checks if the task has capability cp in its user namespace. func (t *Task) HasCapability(cp linux.Capability) bool { - t.mu.Lock() - defer t.mu.Unlock() - return t.creds.HasCapability(cp) + return t.Credentials().HasCapability(cp) } // SetUID implements the semantics of setuid(2). 
@@ -57,9 +49,12 @@ func (t *Task) SetUID(uid auth.UID) error { if !uid.Ok() { return syserror.EINVAL } + t.mu.Lock() defer t.mu.Unlock() - kuid := t.creds.UserNamespace.MapToKUID(uid) + + creds := t.Credentials() + kuid := creds.UserNamespace.MapToKUID(uid) if !kuid.Ok() { return syserror.EINVAL } @@ -67,17 +62,17 @@ func (t *Task) SetUID(uid auth.UID) error { // effective UID of the caller is root (more precisely: if the caller has // the CAP_SETUID capability), the real UID and saved set-user-ID are also // set." - setuid(2) - if t.creds.HasCapability(linux.CAP_SETUID) { + if creds.HasCapability(linux.CAP_SETUID) { t.setKUIDsUncheckedLocked(kuid, kuid, kuid) return nil } // "EPERM: The user is not privileged (Linux: does not have the CAP_SETUID // capability) and uid does not match the real UID or saved set-user-ID of // the calling process." - if kuid != t.creds.RealKUID && kuid != t.creds.SavedKUID { + if kuid != creds.RealKUID && kuid != creds.SavedKUID { return syserror.EPERM } - t.setKUIDsUncheckedLocked(t.creds.RealKUID, kuid, t.creds.SavedKUID) + t.setKUIDsUncheckedLocked(creds.RealKUID, kuid, creds.SavedKUID) return nil } @@ -87,37 +82,38 @@ func (t *Task) SetREUID(r, e auth.UID) error { defer t.mu.Unlock() // "Supplying a value of -1 for either the real or effective user ID forces // the system to leave that ID unchanged." 
- setreuid(2) - newR := t.creds.RealKUID + creds := t.Credentials() + newR := creds.RealKUID if r.Ok() { - newR = t.creds.UserNamespace.MapToKUID(r) + newR = creds.UserNamespace.MapToKUID(r) if !newR.Ok() { return syserror.EINVAL } } - newE := t.creds.EffectiveKUID + newE := creds.EffectiveKUID if e.Ok() { - newE = t.creds.UserNamespace.MapToKUID(e) + newE = creds.UserNamespace.MapToKUID(e) if !newE.Ok() { return syserror.EINVAL } } - if !t.creds.HasCapability(linux.CAP_SETUID) { + if !creds.HasCapability(linux.CAP_SETUID) { // "Unprivileged processes may only set the effective user ID to the // real user ID, the effective user ID, or the saved set-user-ID." - if newE != t.creds.RealKUID && newE != t.creds.EffectiveKUID && newE != t.creds.SavedKUID { + if newE != creds.RealKUID && newE != creds.EffectiveKUID && newE != creds.SavedKUID { return syserror.EPERM } // "Unprivileged users may only set the real user ID to the real user // ID or the effective user ID." - if newR != t.creds.RealKUID && newR != t.creds.EffectiveKUID { + if newR != creds.RealKUID && newR != creds.EffectiveKUID { return syserror.EPERM } } // "If the real user ID is set (i.e., ruid is not -1) or the effective user // ID is set to a value not equal to the previous real user ID, the saved // set-user-ID will be set to the new effective user ID." - newS := t.creds.SavedKUID - if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKUID) { + newS := creds.SavedKUID + if r.Ok() || (e.Ok() && newE != creds.EffectiveKUID) { newS = newE } t.setKUIDsUncheckedLocked(newR, newE, newS) @@ -136,23 +132,24 @@ func (t *Task) SetRESUID(r, e, s auth.UID) error { // arguments equals -1, the corresponding value is not changed." 
- // setresuid(2) var err error - newR := t.creds.RealKUID + creds := t.Credentials() + newR := creds.RealKUID if r.Ok() { - newR, err = t.creds.UseUID(r) + newR, err = creds.UseUID(r) if err != nil { return err } } - newE := t.creds.EffectiveKUID + newE := creds.EffectiveKUID if e.Ok() { - newE, err = t.creds.UseUID(e) + newE, err = creds.UseUID(e) if err != nil { return err } } - newS := t.creds.SavedKUID + newS := creds.SavedKUID if s.Ok() { - newS, err = t.creds.UseUID(s) + newS, err = creds.UseUID(s) if err != nil { return err } @@ -163,10 +160,10 @@ func (t *Task) SetRESUID(r, e, s auth.UID) error { // Preconditions: t.mu must be locked. func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { - root := t.creds.UserNamespace.MapToKUID(auth.RootUID) - oldR, oldE, oldS := t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID - t.creds = t.creds.Fork() // See doc for creds. - t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID = newR, newE, newS + creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. + root := creds.UserNamespace.MapToKUID(auth.RootUID) + oldR, oldE, oldS := creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID + creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID = newR, newE, newS // "1. If one or more of the real, effective or saved set user IDs was // previously 0, and as a result of the UID changes all of these IDs have a @@ -184,9 +181,9 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { // being cleared." (A thread's effective capability set is always // cleared when such a credential change is made, // regardless of the setting of the "keep capabilities" flag.) 
- if !t.creds.KeepCaps { - t.creds.PermittedCaps = 0 - t.creds.EffectiveCaps = 0 + if !creds.KeepCaps { + creds.PermittedCaps = 0 + creds.EffectiveCaps = 0 } } // """ @@ -197,9 +194,9 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { // permitted set is copied to the effective set. // """ if oldE == root && newE != root { - t.creds.EffectiveCaps = 0 + creds.EffectiveCaps = 0 } else if oldE != root && newE == root { - t.creds.EffectiveCaps = t.creds.PermittedCaps + creds.EffectiveCaps = creds.PermittedCaps } // "4. If the filesystem user ID is changed from 0 to nonzero (see // setfsuid(2)), then the following capabilities are cleared from the @@ -220,6 +217,7 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { // Not documented, but compare Linux's kernel/cred.c:commit_creds(). t.parentDeathSignal = 0 } + t.creds.Store(creds) } // SetGID implements the semantics of setgid(2). @@ -227,20 +225,23 @@ func (t *Task) SetGID(gid auth.GID) error { if !gid.Ok() { return syserror.EINVAL } + t.mu.Lock() defer t.mu.Unlock() - kgid := t.creds.UserNamespace.MapToKGID(gid) + + creds := t.Credentials() + kgid := creds.UserNamespace.MapToKGID(gid) if !kgid.Ok() { return syserror.EINVAL } - if t.creds.HasCapability(linux.CAP_SETGID) { + if creds.HasCapability(linux.CAP_SETGID) { t.setKGIDsUncheckedLocked(kgid, kgid, kgid) return nil } - if kgid != t.creds.RealKGID && kgid != t.creds.SavedKGID { + if kgid != creds.RealKGID && kgid != creds.SavedKGID { return syserror.EPERM } - t.setKGIDsUncheckedLocked(t.creds.RealKGID, kgid, t.creds.SavedKGID) + t.setKGIDsUncheckedLocked(creds.RealKGID, kgid, creds.SavedKGID) return nil } @@ -248,30 +249,32 @@ func (t *Task) SetGID(gid auth.GID) error { func (t *Task) SetREGID(r, e auth.GID) error { t.mu.Lock() defer t.mu.Unlock() - newR := t.creds.RealKGID + + creds := t.Credentials() + newR := creds.RealKGID if r.Ok() { - newR = t.creds.UserNamespace.MapToKGID(r) + newR = creds.UserNamespace.MapToKGID(r) 
if !newR.Ok() { return syserror.EINVAL } } - newE := t.creds.EffectiveKGID + newE := creds.EffectiveKGID if e.Ok() { - newE = t.creds.UserNamespace.MapToKGID(e) + newE = creds.UserNamespace.MapToKGID(e) if !newE.Ok() { return syserror.EINVAL } } - if !t.creds.HasCapability(linux.CAP_SETGID) { - if newE != t.creds.RealKGID && newE != t.creds.EffectiveKGID && newE != t.creds.SavedKGID { + if !creds.HasCapability(linux.CAP_SETGID) { + if newE != creds.RealKGID && newE != creds.EffectiveKGID && newE != creds.SavedKGID { return syserror.EPERM } - if newR != t.creds.RealKGID && newR != t.creds.EffectiveKGID { + if newR != creds.RealKGID && newR != creds.EffectiveKGID { return syserror.EPERM } } - newS := t.creds.SavedKGID - if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKGID) { + newS := creds.SavedKGID + if r.Ok() || (e.Ok() && newE != creds.EffectiveKGID) { newS = newE } t.setKGIDsUncheckedLocked(newR, newE, newS) @@ -280,26 +283,29 @@ func (t *Task) SetREGID(r, e auth.GID) error { // SetRESGID implements the semantics of the setresgid(2) syscall. func (t *Task) SetRESGID(r, e, s auth.GID) error { + var err error + t.mu.Lock() defer t.mu.Unlock() - var err error - newR := t.creds.RealKGID + + creds := t.Credentials() + newR := creds.RealKGID if r.Ok() { - newR, err = t.creds.UseGID(r) + newR, err = creds.UseGID(r) if err != nil { return err } } - newE := t.creds.EffectiveKGID + newE := creds.EffectiveKGID if e.Ok() { - newE, err = t.creds.UseGID(e) + newE, err = creds.UseGID(e) if err != nil { return err } } - newS := t.creds.SavedKGID + newS := creds.SavedKGID if s.Ok() { - newS, err = t.creds.UseGID(s) + newS, err = creds.UseGID(s) if err != nil { return err } @@ -309,9 +315,9 @@ func (t *Task) SetRESGID(r, e, s auth.GID) error { } func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { - oldE := t.creds.EffectiveKGID - t.creds = t.creds.Fork() // See doc for creds. 
- t.creds.RealKGID, t.creds.EffectiveKGID, t.creds.SavedKGID = newR, newE, newS + creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. + oldE := creds.EffectiveKGID + creds.RealKGID, creds.EffectiveKGID, creds.SavedKGID = newR, newE, newS if oldE != newE { // "[dumpability] is reset to the current value contained in @@ -327,6 +333,7 @@ func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { // kernel/cred.c:commit_creds(). t.parentDeathSignal = 0 } + t.creds.Store(creds) } // SetExtraGIDs attempts to change t's supplemental groups. All IDs are @@ -334,19 +341,21 @@ func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { func (t *Task) SetExtraGIDs(gids []auth.GID) error { t.mu.Lock() defer t.mu.Unlock() - if !t.creds.HasCapability(linux.CAP_SETGID) { + creds := t.Credentials() + if !creds.HasCapability(linux.CAP_SETGID) { return syserror.EPERM } kgids := make([]auth.KGID, len(gids)) for i, gid := range gids { - kgid := t.creds.UserNamespace.MapToKGID(gid) + kgid := creds.UserNamespace.MapToKGID(gid) if !kgid.Ok() { return syserror.EINVAL } kgids[i] = kgid } - t.creds = t.creds.Fork() // See doc for creds. - t.creds.ExtraKGIDs = kgids + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.ExtraKGIDs = kgids + t.creds.Store(creds) return nil } @@ -360,27 +369,29 @@ func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.Capabili if effective & ^permitted != 0 { return syserror.EPERM } + creds := t.Credentials() // "It is also a limiting superset for the capabilities that may be added // to the inheritable set by a thread that does not have the CAP_SETPCAP // capability in its effective set." 
- if !t.creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(t.creds.InheritableCaps|t.creds.PermittedCaps) != 0) { + if !creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(creds.InheritableCaps|creds.PermittedCaps) != 0) { return syserror.EPERM } // "If a thread drops a capability from its permitted set, it can never // reacquire that capability (unless it execve(2)s ..." - if permitted & ^t.creds.PermittedCaps != 0 { + if permitted & ^creds.PermittedCaps != 0 { return syserror.EPERM } // "... if a capability is not in the bounding set, then a thread can't add // this capability to its inheritable set, even if it was in its permitted // capabilities ..." - if inheritable & ^(t.creds.InheritableCaps|t.creds.BoundingCaps) != 0 { + if inheritable & ^(creds.InheritableCaps|creds.BoundingCaps) != 0 { return syserror.EPERM } - t.creds = t.creds.Fork() // See doc for creds. - t.creds.PermittedCaps = permitted - t.creds.InheritableCaps = inheritable - t.creds.EffectiveCaps = effective + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.PermittedCaps = permitted + creds.InheritableCaps = inheritable + creds.EffectiveCaps = effective + t.creds.Store(creds) return nil } @@ -389,11 +400,13 @@ func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.Capabili func (t *Task) DropBoundingCapability(cp linux.Capability) error { t.mu.Lock() defer t.mu.Unlock() - if !t.creds.HasCapability(linux.CAP_SETPCAP) { + creds := t.Credentials() + if !creds.HasCapability(linux.CAP_SETPCAP) { return syserror.EPERM } - t.creds = t.creds.Fork() // See doc for creds. - t.creds.BoundingCaps &^= auth.CapabilitySetOf(cp) + creds = creds.Fork() // The credentials object is immutable. See doc for creds. 
+ creds.BoundingCaps &^= auth.CapabilitySetOf(cp) + t.creds.Store(creds) return nil } @@ -402,31 +415,33 @@ func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { t.mu.Lock() defer t.mu.Unlock() + creds := t.Credentials() // "A process reassociating itself with a user namespace must have the // CAP_SYS_ADMIN capability in the target user namespace." - setns(2) // // If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN // in ns (by rule 3 in auth.Credentials.HasCapability). - if !t.creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) { + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) { return syserror.EPERM } - t.creds = t.creds.Fork() // See doc for creds. - t.creds.UserNamespace = ns + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.UserNamespace = ns // "The child process created by clone(2) with the CLONE_NEWUSER flag // starts out with a complete set of capabilities in the new user // namespace. Likewise, a process that creates a new user namespace using // unshare(2) or joins an existing user namespace using setns(2) gains a // full set of capabilities in that namespace." - t.creds.PermittedCaps = auth.AllCapabilities - t.creds.InheritableCaps = 0 - t.creds.EffectiveCaps = auth.AllCapabilities - t.creds.BoundingCaps = auth.AllCapabilities + creds.PermittedCaps = auth.AllCapabilities + creds.InheritableCaps = 0 + creds.EffectiveCaps = auth.AllCapabilities + creds.BoundingCaps = auth.AllCapabilities // "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER // flag sets the "securebits" flags (see capabilities(7)) to their default // values (all flags disabled) in the child (for clone(2)) or caller (for // unshare(2), or setns(2)." 
- user_namespaces(7) - t.creds.KeepCaps = false + creds.KeepCaps = false + t.creds.Store(creds) return nil } @@ -435,8 +450,9 @@ func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { func (t *Task) SetKeepCaps(k bool) { t.mu.Lock() defer t.mu.Unlock() - t.creds = t.creds.Fork() // See doc for creds. - t.creds.KeepCaps = k + creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. + creds.KeepCaps = k + t.creds.Store(creds) } // updateCredsForExec updates t.creds to reflect an execve(). @@ -512,15 +528,16 @@ func (t *Task) updateCredsForExecLocked() { // the effective user ID. var newPermitted auth.CapabilitySet // since F(inheritable) == F(permitted) == 0 fileEffective := false - root := t.creds.UserNamespace.MapToKUID(auth.RootUID) - if t.creds.EffectiveKUID == root || t.creds.RealKUID == root { - newPermitted = t.creds.InheritableCaps | t.creds.BoundingCaps - if t.creds.EffectiveKUID == root { + creds := t.Credentials() + root := creds.UserNamespace.MapToKUID(auth.RootUID) + if creds.EffectiveKUID == root || creds.RealKUID == root { + newPermitted = creds.InheritableCaps | creds.BoundingCaps + if creds.EffectiveKUID == root { fileEffective = true } } - t.creds = t.creds.Fork() // See doc for creds. + creds = creds.Fork() // The credentials object is immutable. See doc for creds. // Now we enter poorly-documented, somewhat confusing territory. (The // accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds @@ -562,27 +579,28 @@ func (t *Task) updateCredsForExecLocked() { // But since no_new_privs is always set (A3 is always true), this becomes // much simpler. If B1 and B2 are false, C2 is a no-op. If B3 is false, C1 // is a no-op. So we can just do C1 and C2 unconditionally. 
- if t.creds.EffectiveKUID != t.creds.RealKUID || t.creds.EffectiveKGID != t.creds.RealKGID { - t.creds.EffectiveKUID = t.creds.RealKUID - t.creds.EffectiveKGID = t.creds.RealKGID + if creds.EffectiveKUID != creds.RealKUID || creds.EffectiveKGID != creds.RealKGID { + creds.EffectiveKUID = creds.RealKUID + creds.EffectiveKGID = creds.RealKGID t.parentDeathSignal = 0 } // (Saved set-user-ID is always set to the new effective user ID, and saved // set-group-ID is always set to the new effective group ID, regardless of // the above.) - t.creds.SavedKUID = t.creds.RealKUID - t.creds.SavedKGID = t.creds.RealKGID - t.creds.PermittedCaps &= newPermitted + creds.SavedKUID = creds.RealKUID + creds.SavedKGID = creds.RealKGID + creds.PermittedCaps &= newPermitted if fileEffective { - t.creds.EffectiveCaps = t.creds.PermittedCaps + creds.EffectiveCaps = creds.PermittedCaps } else { - t.creds.EffectiveCaps = 0 + creds.EffectiveCaps = 0 } // prctl(2): The "keep capabilities" value will be reset to 0 on subsequent // calls to execve(2). - t.creds.KeepCaps = false + creds.KeepCaps = false // "The bounding set is inherited at fork(2) from the thread's parent, and // is preserved across an execve(2)". So we're done. 
+ t.creds.Store(creds) } diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 9458f5c2a..72caae537 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -119,7 +119,6 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { ptraceTracees: make(map[*Task]struct{}), allowedCPUMask: cfg.AllowedCPUMask.Copy(), ioUsage: &usage.IO{}, - creds: cfg.Credentials, niceness: cfg.Niceness, netns: cfg.NetworkNamespaced, utsns: cfg.UTSNamespace, @@ -129,6 +128,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { futexWaiter: futex.NewWaiter(), containerID: cfg.ContainerID, } + t.creds.Store(cfg.Credentials) t.endStopCond.L = &t.tg.signalHandlers.mu t.ptraceTracer.Store((*Task)(nil)) // We don't construct t.blockingTimer until Task.run(); see that function diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index dca8e4c0e..f15b3415a 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -370,13 +370,16 @@ func (t *thread) destroy() { // init initializes trace options. func (t *thread) init() { - // Set our TRACESYSGOOD option to differeniate real SIGTRAP. + // Set our TRACESYSGOOD option to differentiate real SIGTRAP. We also + // set PTRACE_O_EXITKILL to ensure that the unexpected exit of the + // sentry will immediately kill the associated stubs. 
+ const PTRACE_O_EXITKILL = 0x100000 _, _, errno := syscall.RawSyscall6( syscall.SYS_PTRACE, syscall.PTRACE_SETOPTIONS, uintptr(t.tid), 0, - syscall.PTRACE_O_TRACESYSGOOD, + syscall.PTRACE_O_TRACESYSGOOD|syscall.PTRACE_O_TRACEEXIT|PTRACE_O_EXITKILL, 0, 0) if errno != 0 { panic(fmt.Sprintf("ptrace set options failed: %v", errno)) @@ -419,7 +422,7 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) { // between syscall-enter-stop and syscall-exit-stop; it happens *after* // syscall-exit-stop.)" - ptrace(2), "Syscall-stops" if sig := t.wait(stopped); sig != (syscallEvent | syscall.SIGTRAP) { - panic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig)) + t.dumpAndPanic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig)) } // Grab registers. diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 2a41e8176..7f18b1ac8 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -379,7 +379,7 @@ var AMD64 = &kernel.SyscallTable{ 326: syscalls.ErrorWithEvent("copy_file_range", syscall.ENOSYS, "", nil), 327: syscalls.Undocumented("preadv2", Preadv2), 328: syscalls.Undocumented("pwritev2", Pwritev2), - 397: syscalls.Undocumented("statx", Statx), + 332: syscalls.Supported("statx", Statx), }, Emulate: map[usermem.Addr]uintptr{ diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index d9ed02c99..04962726a 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -304,44 +304,100 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod return 0, syserror.ENOENT } - err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error { - if !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR - } + fileFlags := linuxToFlags(flags) + // Linux always adds the O_LARGEFILE flag when running in 64-bit mode. 
+ fileFlags.LargeFile = true + + err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, parent *fs.Dirent, name string, remainingTraversals uint) error { + // Resolve the name to see if it exists, and follow any + // symlinks along the way. We must do the symlink resolution + // manually because if the symlink target does not exist, we + // must create the target (and not the symlink itself). + var ( + found *fs.Dirent + err error + ) + for { + if !fs.IsDir(parent.Inode.StableAttr) { + return syserror.ENOTDIR + } - fileFlags := linuxToFlags(flags) - // Linux always adds the O_LARGEFILE flag when running in 64-bit mode. - fileFlags.LargeFile = true + // Start by looking up the dirent at 'name'. + found, err = t.MountNamespace().FindLink(t, root, parent, name, &remainingTraversals) + if err != nil { + break + } + + // We found something (possibly a symlink). If the + // O_EXCL flag was passed, then we can immediately + // return EEXIST. + if flags&linux.O_EXCL != 0 { + return syserror.EEXIST + } + + // If we have a non-symlink, then we can proceed. + if !fs.IsSymlink(found.Inode.StableAttr) { + break + } + + // If O_NOFOLLOW was passed, then don't try to resolve + // anything. + if flags&linux.O_NOFOLLOW != 0 { + return syserror.ELOOP + } + + // Try to resolve the symlink directly to a Dirent. + resolved, err := found.Inode.Getlink(t) + if err == nil || err != fs.ErrResolveViaReadlink { + // No more resolution necessary. + found.DecRef() + found = resolved + break + } + + // Resolve the symlink to a path via Readlink. + path, err := found.Inode.Readlink(t) + if err != nil { + break + } + remainingTraversals-- + + // Get the new parent from the target path. + newParentPath, newName := fs.SplitLast(path) + newParent, err := t.MountNamespace().FindInode(t, root, parent, newParentPath, &remainingTraversals) + if err != nil { + break + } + + // Repeat the process with the parent and name of the + // symlink target. 
+ parent.DecRef() + parent = newParent + name = newName + } - // Does this file exist already? - targetDirent, err := t.MountNamespace().FindInode(t, root, d, name, &remainingTraversals) var newFile *fs.File switch err { case nil: // The file existed. - defer targetDirent.DecRef() - - // Check if we wanted to create. - if flags&linux.O_EXCL != 0 { - return syserror.EEXIST - } + defer found.DecRef() // Like sys_open, check for a few things about the // filesystem before trying to get a reference to the // fs.File. The same constraints on Check apply. - if err := targetDirent.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil { + if err := found.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil { return err } // Should we truncate the file? if flags&linux.O_TRUNC != 0 { - if err := targetDirent.Inode.Truncate(t, targetDirent, 0); err != nil { + if err := found.Inode.Truncate(t, found, 0); err != nil { return err } } // Create a new fs.File. - newFile, err = targetDirent.Inode.GetFile(t, targetDirent, fileFlags) + newFile, err = found.Inode.GetFile(t, found, fileFlags) if err != nil { return syserror.ConvertIntr(err, kernel.ERESTARTSYS) } @@ -350,19 +406,19 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod // File does not exist. Proceed with creation. // Do we have write permissions on the parent? - if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + if err := parent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { return err } // Attempt a creation. perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask())) - newFile, err = d.Create(t, root, name, fileFlags, perms) + newFile, err = parent.Create(t, root, name, fileFlags, perms) if err != nil { // No luck, bail. 
return err } defer newFile.DecRef() - targetDirent = newFile.Dirent + found = newFile.Dirent default: return err } @@ -378,10 +434,10 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod fd = uintptr(newFD) // Queue the open inotify event. The creation event is - // automatically queued when the dirent is targetDirent. The - // open events are implemented at the syscall layer so we need - // to manually queue one here. - targetDirent.InotifyEvent(linux.IN_OPEN, 0) + // automatically queued when the dirent is found. The open + // events are implemented at the syscall layer so we need to + // manually queue one here. + found.InotifyEvent(linux.IN_OPEN, 0) return nil }) diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 7f41a9c53..d79aaff60 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -96,8 +96,10 @@ const ( // SandboxStacks collects sandbox stacks for debugging. SandboxStacks = "debug.Stacks" +) - // Profiling related commands (see pprof.go for more details). +// Profiling related commands (see pprof.go for more details). +const ( StartCPUProfile = "Profile.StartCPUProfile" StopCPUProfile = "Profile.StopCPUProfile" HeapProfile = "Profile.HeapProfile" @@ -105,6 +107,11 @@ const ( StopTrace = "Profile.StopTrace" ) +// Logging related commands (see logging.go for more details). +const ( + ChangeLogging = "Logging.Change" +) + // ControlSocketAddr generates an abstract unix socket name for the given ID. 
func ControlSocketAddr(id string) string { return fmt.Sprintf("\x00runsc-sandbox.%s", id) @@ -143,6 +150,7 @@ func newController(fd int, l *Loader) (*controller, error) { } srv.Register(&debug{}) + srv.Register(&control.Logging{}) if l.conf.ProfileEnable { srv.Register(&control.Profile{}) } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 67a286212..5c2220d83 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -85,6 +85,19 @@ func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, if err != nil { return nil, fmt.Errorf("creating tmpfs overlay: %v", err) } + + // Replicate permissions and owner from lower to upper mount point. + attr, err := lower.UnstableAttr(ctx) + if err != nil { + return nil, fmt.Errorf("reading attributes from lower mount point: %v", err) + } + if !upper.InodeOperations.SetPermissions(ctx, upper, attr.Perms) { + return nil, fmt.Errorf("error setting permission to upper mount point") + } + if err := upper.InodeOperations.SetOwner(ctx, upper, attr.Owner); err != nil { + return nil, fmt.Errorf("setting owner to upper mount point: %v", err) + } + return fs.NewOverlayRoot(ctx, upper, lower, upperFlags) } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 4af45bfcc..eca592e5b 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -37,6 +37,9 @@ import ( func init() { log.SetLevel(log.Debug) rand.Seed(time.Now().UnixNano()) + if err := fsgofer.OpenProcSelfFD(); err != nil { + panic(err) + } } func testConfig() *Config { diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index 30a69acf0..7313e473f 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -17,12 +17,15 @@ package cmd import ( "context" "os" + "strconv" + "strings" "syscall" "time" "flag" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/container" ) @@ -36,6 +39,9 @@ type Debug struct { profileCPU 
string profileDelay int trace string + strace string + logLevel string + logPackets string } // Name implements subcommands.Command. @@ -62,6 +68,9 @@ func (d *Debug) SetFlags(f *flag.FlagSet) { f.IntVar(&d.profileDelay, "profile-delay", 5, "amount of time to wait before stoping CPU profile") f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.") f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox") + f.StringVar(&d.strace, "strace", "", `A comma separated list of syscalls to trace. "all" enables all traces, "off" disables all`) + f.StringVar(&d.logLevel, "log-level", "", "The log level to set: warning (0), info (1), or debug (2).") + f.StringVar(&d.logPackets, "log-packets", "", "A boolean value to enable or disable packet logging: true or false.") } // Execute implements subcommands.Command.Execute. @@ -78,7 +87,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) var err error c, err = container.Load(conf.RootDir, f.Arg(0)) if err != nil { - Fatalf("loading container %q: %v", f.Arg(0), err) + return Errorf("loading container %q: %v", f.Arg(0), err) } } else { if f.NArg() != 0 { @@ -88,12 +97,12 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Go over all sandboxes and find the one that matches PID. 
ids, err := container.List(conf.RootDir) if err != nil { - Fatalf("listing containers: %v", err) + return Errorf("listing containers: %v", err) } for _, id := range ids { candidate, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("loading container %q: %v", id, err) + return Errorf("loading container %q: %v", id, err) } if candidate.SandboxPid() == d.pid { c = candidate @@ -101,38 +110,38 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } } if c == nil { - Fatalf("container with PID %d not found", d.pid) + return Errorf("container with PID %d not found", d.pid) } } if c.Sandbox == nil || !c.Sandbox.IsRunning() { - Fatalf("container sandbox is not running") + return Errorf("container sandbox is not running") } log.Infof("Found sandbox %q, PID: %d", c.Sandbox.ID, c.Sandbox.Pid) if d.signal > 0 { log.Infof("Sending signal %d to process: %d", d.signal, c.Sandbox.Pid) if err := syscall.Kill(c.Sandbox.Pid, syscall.Signal(d.signal)); err != nil { - Fatalf("failed to send signal %d to processs %d", d.signal, c.Sandbox.Pid) + return Errorf("failed to send signal %d to processs %d", d.signal, c.Sandbox.Pid) } } if d.stacks { log.Infof("Retrieving sandbox stacks") stacks, err := c.Sandbox.Stacks() if err != nil { - Fatalf("retrieving stacks: %v", err) + return Errorf("retrieving stacks: %v", err) } log.Infof(" *** Stack dump ***\n%s", stacks) } if d.profileHeap != "" { f, err := os.Create(d.profileHeap) if err != nil { - Fatalf(err.Error()) + return Errorf(err.Error()) } defer f.Close() if err := c.Sandbox.HeapProfile(f); err != nil { - Fatalf(err.Error()) + return Errorf(err.Error()) } log.Infof("Heap profile written to %q", d.profileHeap) } @@ -142,7 +151,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) delay = true f, err := os.Create(d.profileCPU) if err != nil { - Fatalf(err.Error()) + return Errorf(err.Error()) } defer func() { f.Close() @@ -152,7 +161,7 @@ func (d *Debug) Execute(_ 
context.Context, f *flag.FlagSet, args ...interface{}) log.Infof("CPU profile written to %q", d.profileCPU) }() if err := c.Sandbox.StartCPUProfile(f); err != nil { - Fatalf(err.Error()) + return Errorf(err.Error()) } log.Infof("CPU profile started for %d sec, writing to %q", d.profileDelay, d.profileCPU) } @@ -160,7 +169,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) delay = true f, err := os.Create(d.trace) if err != nil { - Fatalf(err.Error()) + return Errorf(err.Error()) } defer func() { f.Close() @@ -170,15 +179,71 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) log.Infof("Trace written to %q", d.trace) }() if err := c.Sandbox.StartTrace(f); err != nil { - Fatalf(err.Error()) + return Errorf(err.Error()) } log.Infof("Tracing started for %d sec, writing to %q", d.profileDelay, d.trace) } + if d.strace != "" || len(d.logLevel) != 0 || len(d.logPackets) != 0 { + args := control.LoggingArgs{} + switch strings.ToLower(d.strace) { + case "": + // strace not set, nothing to do here. 
+ + case "off": + log.Infof("Disabling strace") + args.SetStrace = true + + case "all": + log.Infof("Enabling all straces") + args.SetStrace = true + args.EnableStrace = true + + default: + log.Infof("Enabling strace for syscalls: %s", d.strace) + args.SetStrace = true + args.EnableStrace = true + args.StraceWhitelist = strings.Split(d.strace, ",") + } + + if len(d.logLevel) != 0 { + args.SetLevel = true + switch strings.ToLower(d.logLevel) { + case "warning", "0": + args.Level = log.Warning + case "info", "1": + args.Level = log.Info + case "debug", "2": + args.Level = log.Debug + default: + return Errorf("invalid log level %q", d.logLevel) + } + log.Infof("Setting log level %v", args.Level) + } + + if len(d.logPackets) != 0 { + args.SetLogPackets = true + lp, err := strconv.ParseBool(d.logPackets) + if err != nil { + return Errorf("invalid value for log_packets %q", d.logPackets) + } + args.LogPackets = lp + if args.LogPackets { + log.Infof("Enabling packet logging") + } else { + log.Infof("Disabling packet logging") + } + } + + if err := c.Sandbox.ChangeLogging(args); err != nil { + return Errorf(err.Error()) + } + log.Infof("Logging options changed") + } + if delay { time.Sleep(time.Duration(d.profileDelay) * time.Second) - } return subcommands.ExitSuccess diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 7adc23a77..e817eff77 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -235,7 +235,11 @@ func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.Exi cmd.SysProcAttr = &syscall.SysProcAttr{ Setsid: true, Setctty: true, - Ctty: int(tty.Fd()), + // The Ctty FD must be the FD in the child process's FD + // table. Since we set cmd.Stdin/Stdout/Stderr to the + // tty FD, we can use any of 0, 1, or 2 here. + // See https://github.com/golang/go/issues/29458. 
+ Ctty: 0, } } diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 52609a57a..9faabf494 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -152,6 +152,10 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // modes exactly as sent by the sandbox, which will have applied its own umask. syscall.Umask(0) + if err := fsgofer.OpenProcSelfFD(); err != nil { + Fatalf("failed to open /proc/self/fd: %v", err) + } + if err := syscall.Chroot(root); err != nil { Fatalf("failed to chroot to %q: %v", root, err) } diff --git a/runsc/console/BUILD b/runsc/console/BUILD index 2d71cd371..e623c1a0f 100644 --- a/runsc/console/BUILD +++ b/runsc/console/BUILD @@ -4,7 +4,9 @@ package(licenses = ["notice"]) go_library( name = "console", - srcs = ["console.go"], + srcs = [ + "console.go", + ], importpath = "gvisor.dev/gvisor/runsc/console", visibility = [ "//runsc:__subpackages__", diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 8f50af780..f970ce88d 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -28,6 +28,7 @@ import ( "path" "path/filepath" "runtime" + "strconv" "sync" "syscall" @@ -223,6 +224,28 @@ type localFile struct { lastDirentOffset uint64 } +var procSelfFD *fd.FD + +// OpenProcSelfFD opens the /proc/self/fd directory, which will be used to +// reopen file descriptors. 
+func OpenProcSelfFD() error { + d, err := syscall.Open("/proc/self/fd", syscall.O_RDONLY|syscall.O_DIRECTORY, 0) + if err != nil { + return fmt.Errorf("error opening /proc/self/fd: %v", err) + } + procSelfFD = fd.New(d) + return nil +} + +func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) { + d, err := syscall.Openat(int(procSelfFD.FD()), strconv.Itoa(f.FD()), mode&^syscall.O_NOFOLLOW, 0) + if err != nil { + return nil, err + } + + return fd.New(d), nil +} + func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, error) { path := path.Join(parent.hostPath, name) f, err := openAnyFile(path, func(mode int) (*fd.FD, error) { @@ -348,7 +371,7 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { // name_to_handle_at and open_by_handle_at aren't supported by overlay2. log.Debugf("Open reopening file, mode: %v, %q", mode, l.hostPath) var err error - newFile, err = fd.Open(l.hostPath, openFlags|mode.OSFlags(), 0) + newFile, err = reopenProcFd(l.file, openFlags|mode.OSFlags()) if err != nil { return nil, p9.QID{}, 0, extractErrno(err) } @@ -477,7 +500,7 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { // Duplicate current file if 'names' is empty. 
if len(names) == 0 { newFile, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) { - return fd.Open(l.hostPath, openFlags|mode, 0) + return reopenProcFd(l.file, openFlags|mode) }) if err != nil { return nil, nil, extractErrno(err) @@ -635,7 +658,7 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { f := l.file if l.ft == regular && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { var err error - f, err = fd.Open(l.hostPath, openFlags|syscall.O_WRONLY, 0) + f, err = reopenProcFd(l.file, openFlags|os.O_WRONLY) if err != nil { return extractErrno(err) } diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index 68267df1b..0a162bb8a 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -31,6 +31,10 @@ func init() { allConfs = append(allConfs, rwConfs...) allConfs = append(allConfs, roConfs...) + + if err := OpenProcSelfFD(); err != nil { + panic(err) + } } func assertPanic(t *testing.T, f func()) { diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 3bd0291c0..6bebf0737 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -437,10 +437,10 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF defer tty.Close() // Set the TTY as a controlling TTY on the sandbox process. - // Note that the Ctty field must be the FD of the TTY in the - // *new* process, not this process. Since we are about to - // assign the TTY to nextFD, we can use that value here. cmd.SysProcAttr.Setctty = true + // The Ctty FD must be the FD in the child process's FD table, + // which will be nextFD in this case. + // See https://github.com/golang/go/issues/29458. cmd.SysProcAttr.Ctty = nextFD // Pass the tty as all stdio fds to sandbox. @@ -960,7 +960,7 @@ func (s *Sandbox) StartTrace(f *os.File) error { return nil } -// StopTrace stops a previously started trace.. +// StopTrace stops a previously started trace. 
func (s *Sandbox) StopTrace() error { log.Debugf("Trace stop %q", s.ID) conn, err := s.sandboxConnect() @@ -975,6 +975,21 @@ func (s *Sandbox) StopTrace() error { return nil } +// ChangeLogging changes logging options. +func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error { + log.Debugf("Change logging start %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + if err := conn.Call(boot.ChangeLogging, &args, nil); err != nil { + return fmt.Errorf("changing sandbox %q logging: %v", s.ID, err) + } + return nil +} + // DestroyContainer destroys the given container. If it is the root container, // then the entire sandbox is destroyed. func (s *Sandbox) DestroyContainer(cid string) error { diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 731e2aa85..b06e46c03 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -240,7 +240,7 @@ syscall_test( syscall_test(test = "//test/syscalls/linux:munmap_test") syscall_test( - add_overlay = False, # TODO(gvisor.dev/issue/316): enable when fixed. 
+ add_overlay = True, test = "//test/syscalls/linux:open_create_test", ) diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 0618fea58..8a24d8c0b 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1177,6 +1177,7 @@ cc_binary( "//test/util:temp_path", "//test/util:test_main", "//test/util:test_util", + "//test/util:thread_util", "@com_google_absl//absl/strings", "@com_google_googletest//:gtest", ], @@ -2940,6 +2941,8 @@ cc_binary( testonly = 1, srcs = ["tcp_socket.cc"], linkstatic = 1, + # FIXME(b/135470853) + tags = ["flaky"], deps = [ ":socket_test_util", "//test/util:file_descriptor", @@ -3340,3 +3343,18 @@ cc_binary( "@com_google_googletest//:gtest", ], ) + +cc_binary( + name = "proc_net_tcp_test", + testonly = 1, + srcs = ["proc_net_tcp.cc"], + linkstatic = 1, + deps = [ + ":ip_socket_test_util", + "//test/util:file_descriptor", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc index 42646bb02..e0525f386 100644 --- a/test/syscalls/linux/open.cc +++ b/test/syscalls/linux/open.cc @@ -28,6 +28,7 @@ #include "test/util/fs_util.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" +#include "test/util/thread_util.h" namespace gvisor { namespace testing { @@ -214,6 +215,42 @@ TEST_F(OpenTest, AppendOnly) { SyscallSucceedsWithValue(kBufSize * 3)); } +TEST_F(OpenTest, AppendConcurrentWrite) { + constexpr int kThreadCount = 5; + constexpr int kBytesPerThread = 10000; + std::unique_ptr<ScopedThread> threads[kThreadCount]; + + // In case of the uncached policy, we expect that a file system can be changed + // externally, so we create a new inode each time when we open a file and we + // can't guarantee that writes to files with O_APPEND will work correctly. 
+ SKIP_IF(getenv("GVISOR_GOFER_UNCACHED")); + + EXPECT_THAT(truncate(test_file_name_.c_str(), 0), SyscallSucceeds()); + + std::string filename = test_file_name_; + DisableSave ds; // Too many syscalls. + // Start kThreadCount threads which will write concurrently into the same + // file. + for (int i = 0; i < kThreadCount; i++) { + threads[i] = absl::make_unique<ScopedThread>([filename]() { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(filename, O_RDWR | O_APPEND)); + + for (int j = 0; j < kBytesPerThread; j++) { + EXPECT_THAT(WriteFd(fd.get(), &j, 1), SyscallSucceedsWithValue(1)); + } + }); + } + for (int i = 0; i < kThreadCount; i++) { + threads[i]->Join(); + } + + // Check that the size of the file is correct. + struct stat st; + EXPECT_THAT(stat(test_file_name_.c_str(), &st), SyscallSucceeds()); + EXPECT_EQ(st.st_size, kThreadCount * kBytesPerThread); +} + TEST_F(OpenTest, Truncate) { { // First write some data to the new file and close it. diff --git a/test/syscalls/linux/proc_net_tcp.cc b/test/syscalls/linux/proc_net_tcp.cc new file mode 100644 index 000000000..578b20680 --- /dev/null +++ b/test/syscalls/linux/proc_net_tcp.cc @@ -0,0 +1,281 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_join.h" +#include "absl/strings/str_split.h" +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { +namespace { + +using absl::StrCat; +using absl::StrSplit; + +constexpr char kProcNetTCPHeader[] = + " sl local_address rem_address st tx_queue rx_queue tr tm->when " + "retrnsmt uid timeout inode " + " "; + +// Possible values of the "st" field in a /proc/net/tcp entry. Source: Linux +// kernel, include/net/tcp_states.h. +enum { + TCP_ESTABLISHED = 1, + TCP_SYN_SENT, + TCP_SYN_RECV, + TCP_FIN_WAIT1, + TCP_FIN_WAIT2, + TCP_TIME_WAIT, + TCP_CLOSE, + TCP_CLOSE_WAIT, + TCP_LAST_ACK, + TCP_LISTEN, + TCP_CLOSING, + TCP_NEW_SYN_RECV, + + TCP_MAX_STATES +}; + +// TCPEntry represents a single entry from /proc/net/tcp. +struct TCPEntry { + uint32_t local_addr; + uint16_t local_port; + + uint32_t remote_addr; + uint16_t remote_port; + + uint64_t state; + uint64_t uid; + uint64_t inode; +}; + +uint32_t IP(const struct sockaddr* addr) { + auto* in_addr = reinterpret_cast<const struct sockaddr_in*>(addr); + return in_addr->sin_addr.s_addr; +} + +uint16_t Port(const struct sockaddr* addr) { + auto* in_addr = reinterpret_cast<const struct sockaddr_in*>(addr); + return ntohs(in_addr->sin_port); +} + +// Finds the first entry in 'entries' for which 'predicate' returns true. +// Returns true on match, and sets 'match' to point to the matching entry. 
+bool FindBy(std::vector<TCPEntry> entries, TCPEntry* match, + std::function<bool(const TCPEntry&)> predicate) { + for (int i = 0; i < entries.size(); ++i) { + if (predicate(entries[i])) { + *match = entries[i]; + return true; + } + } + return false; +} + +bool FindByLocalAddr(std::vector<TCPEntry> entries, TCPEntry* match, + const struct sockaddr* addr) { + uint32_t host = IP(addr); + uint16_t port = Port(addr); + return FindBy(entries, match, [host, port](const TCPEntry& e) { + return (e.local_addr == host && e.local_port == port); + }); +} + +bool FindByRemoteAddr(std::vector<TCPEntry> entries, TCPEntry* match, + const struct sockaddr* addr) { + uint32_t host = IP(addr); + uint16_t port = Port(addr); + return FindBy(entries, match, [host, port](const TCPEntry& e) { + return (e.remote_addr == host && e.remote_port == port); + }); +} + +// Returns a parsed representation of /proc/net/tcp entries. +PosixErrorOr<std::vector<TCPEntry>> ProcNetTCPEntries() { + std::string content; + RETURN_IF_ERRNO(GetContents("/proc/net/tcp", &content)); + + bool found_header = false; + std::vector<TCPEntry> entries; + std::vector<std::string> lines = StrSplit(content, '\n'); + std::cerr << "<contents of /proc/net/tcp>" << std::endl; + for (std::string line : lines) { + std::cerr << line << std::endl; + + if (!found_header) { + EXPECT_EQ(line, kProcNetTCPHeader); + found_header = true; + continue; + } + if (line.empty()) { + continue; + } + + // Parse a single entry from /proc/net/tcp. 
+ // + // Example entries: + // + // clang-format off + // + // sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode + // 0: 00000000:006F 00000000:0000 0A 00000000:00000000 00:00000000 00000000 0 0 1968 1 0000000000000000 100 0 0 10 0 + // 1: 0100007F:7533 00000000:0000 0A 00000000:00000000 00:00000000 00000000 120 0 10684 1 0000000000000000 100 0 0 10 0 + // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 + // + // clang-format on + + TCPEntry entry; + std::vector<std::string> fields = + StrSplit(line, absl::ByAnyChar(": "), absl::SkipEmpty()); + + ASSIGN_OR_RETURN_ERRNO(entry.local_addr, AtoiBase(fields[1], 16)); + ASSIGN_OR_RETURN_ERRNO(entry.local_port, AtoiBase(fields[2], 16)); + + ASSIGN_OR_RETURN_ERRNO(entry.remote_addr, AtoiBase(fields[3], 16)); + ASSIGN_OR_RETURN_ERRNO(entry.remote_port, AtoiBase(fields[4], 16)); + + ASSIGN_OR_RETURN_ERRNO(entry.state, AtoiBase(fields[5], 16)); + ASSIGN_OR_RETURN_ERRNO(entry.uid, Atoi<uint64_t>(fields[11])); + ASSIGN_OR_RETURN_ERRNO(entry.inode, Atoi<uint64_t>(fields[13])); + + entries.push_back(entry); + } + std::cerr << "<end of /proc/net/tcp>" << std::endl; + + return entries; +} + +TEST(ProcNetTCP, Exists) { + const std::string content = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/tcp")); + const std::string header_line = StrCat(kProcNetTCPHeader, "\n"); + if (IsRunningOnGvisor()) { + // Should be just the header since we don't have any tcp sockets yet. + EXPECT_EQ(content, header_line); + } else { + // On a general linux machine, we could have arbitrary sockets on the system, + // so just check the header.
+ EXPECT_THAT(content, ::testing::StartsWith(header_line)); + } +} + +TEST(ProcNetTCP, EntryUID) { + auto sockets = + ASSERT_NO_ERRNO_AND_VALUE(IPv4TCPAcceptBindSocketPair(0).Create()); + std::vector<TCPEntry> entries = + ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCPEntries()); + TCPEntry e; + EXPECT_TRUE(FindByLocalAddr(entries, &e, sockets->first_addr())); + EXPECT_EQ(e.uid, geteuid()); + EXPECT_TRUE(FindByRemoteAddr(entries, &e, sockets->first_addr())); + EXPECT_EQ(e.uid, geteuid()); +} + +TEST(ProcNetTCP, BindAcceptConnect) { + auto sockets = + ASSERT_NO_ERRNO_AND_VALUE(IPv4TCPAcceptBindSocketPair(0).Create()); + std::vector<TCPEntry> entries = + ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCPEntries()); + // We can only make assertions about the total number of entries if we control + // the entire "machine". + if (IsRunningOnGvisor()) { + EXPECT_EQ(entries.size(), 2); + } + + TCPEntry e; + EXPECT_TRUE(FindByLocalAddr(entries, &e, sockets->first_addr())); + EXPECT_TRUE(FindByRemoteAddr(entries, &e, sockets->first_addr())); +} + +TEST(ProcNetTCP, InodeReasonable) { + auto sockets = + ASSERT_NO_ERRNO_AND_VALUE(IPv4TCPAcceptBindSocketPair(0).Create()); + std::vector<TCPEntry> entries = + ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCPEntries()); + + TCPEntry accepted_entry; + ASSERT_TRUE(FindByLocalAddr(entries, &accepted_entry, sockets->first_addr())); + EXPECT_NE(accepted_entry.inode, 0); + + TCPEntry client_entry; + ASSERT_TRUE(FindByRemoteAddr(entries, &client_entry, sockets->first_addr())); + EXPECT_NE(client_entry.inode, 0); + EXPECT_NE(accepted_entry.inode, client_entry.inode); +} + +TEST(ProcNetTCP, State) { + std::unique_ptr<FileDescriptor> server = + ASSERT_NO_ERRNO_AND_VALUE(IPv4TCPUnboundSocket(0).Create()); + + auto test_addr = V4Loopback(); + ASSERT_THAT( + bind(server->get(), reinterpret_cast<struct sockaddr*>(&test_addr.addr), + test_addr.addr_len), + SyscallSucceeds()); + + struct sockaddr addr; + socklen_t addrlen = sizeof(struct sockaddr); + 
ASSERT_THAT(getsockname(server->get(), &addr, &addrlen), SyscallSucceeds()); + ASSERT_EQ(addrlen, sizeof(struct sockaddr)); + + ASSERT_THAT(listen(server->get(), 10), SyscallSucceeds()); + std::vector<TCPEntry> entries = + ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCPEntries()); + TCPEntry listen_entry; + ASSERT_TRUE(FindByLocalAddr(entries, &listen_entry, &addr)); + EXPECT_EQ(listen_entry.state, TCP_LISTEN); + + std::unique_ptr<FileDescriptor> client = + ASSERT_NO_ERRNO_AND_VALUE(IPv4TCPUnboundSocket(0).Create()); + ASSERT_THAT(connect(client->get(), &addr, addrlen), SyscallSucceeds()); + entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCPEntries()); + ASSERT_TRUE(FindByLocalAddr(entries, &listen_entry, &addr)); + EXPECT_EQ(listen_entry.state, TCP_LISTEN); + TCPEntry client_entry; + ASSERT_TRUE(FindByRemoteAddr(entries, &client_entry, &addr)); + EXPECT_EQ(client_entry.state, TCP_ESTABLISHED); + + FileDescriptor accepted = + ASSERT_NO_ERRNO_AND_VALUE(Accept(server->get(), nullptr, nullptr)); + + const uint32_t accepted_local_host = IP(&addr); + const uint16_t accepted_local_port = Port(&addr); + + entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCPEntries()); + TCPEntry accepted_entry; + ASSERT_TRUE(FindBy(entries, &accepted_entry, + [client_entry, accepted_local_host, + accepted_local_port](const TCPEntry& e) { + return e.local_addr == accepted_local_host && + e.local_port == accepted_local_port && + e.remote_addr == client_entry.local_addr && + e.remote_port == client_entry.local_port; + })); + EXPECT_EQ(accepted_entry.state, TCP_ESTABLISHED); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/proc_net_unix.cc b/test/syscalls/linux/proc_net_unix.cc index 82d325c17..74acbe92c 100644 --- a/test/syscalls/linux/proc_net_unix.cc +++ b/test/syscalls/linux/proc_net_unix.cc @@ -162,7 +162,7 @@ PosixErrorOr<std::vector<UnixEntry>> ProcNetUnixEntries() { // Finds the first entry in 'entries' for which 'predicate' returns true. 
// Returns true on match, and sets 'match' to point to the matching entry. bool FindBy(std::vector<UnixEntry> entries, UnixEntry* match, - std::function<bool(UnixEntry)> predicate) { + std::function<bool(const UnixEntry&)> predicate) { for (int i = 0; i < entries.size(); ++i) { if (predicate(entries[i])) { *match = entries[i]; @@ -174,7 +174,8 @@ bool FindBy(std::vector<UnixEntry> entries, UnixEntry* match, bool FindByPath(std::vector<UnixEntry> entries, UnixEntry* match, const std::string& path) { - return FindBy(entries, match, [path](UnixEntry e) { return e.path == path; }); + return FindBy(entries, match, + [path](const UnixEntry& e) { return e.path == path; }); } TEST(ProcNetUnix, Exists) { diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc index 0e914215d..510f7bee5 100644 --- a/test/syscalls/linux/stat.cc +++ b/test/syscalls/linux/stat.cc @@ -558,7 +558,7 @@ TEST(SimpleStatTest, AnonDeviceAllocatesUniqueInodesAcrossSaveRestore) { #ifndef SYS_statx #if defined(__x86_64__) -#define SYS_statx 397 +#define SYS_statx 332 #else #error "Unknown architecture" #endif @@ -607,7 +607,8 @@ int statx(int dirfd, const char *pathname, int flags, unsigned int mask, } TEST_F(StatTest, StatxAbsPath) { - SKIP_IF(statx(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 && + errno == ENOSYS); struct kernel_statx stx; EXPECT_THAT(statx(-1, test_file_name_.c_str(), 0, STATX_ALL, &stx), @@ -616,7 +617,8 @@ TEST_F(StatTest, StatxAbsPath) { } TEST_F(StatTest, StatxRelPathDirFD) { - SKIP_IF(statx(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 && + errno == ENOSYS); struct kernel_statx stx; auto const dirfd = @@ -629,7 +631,8 @@ TEST_F(StatTest, StatxRelPathDirFD) { } TEST_F(StatTest, StatxRelPathCwd) { - SKIP_IF(statx(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 && + errno 
== ENOSYS); ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds()); auto filename = std::string(Basename(test_file_name_)); @@ -640,7 +643,8 @@ TEST_F(StatTest, StatxRelPathCwd) { } TEST_F(StatTest, StatxEmptyPath) { - SKIP_IF(statx(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 && + errno == ENOSYS); const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY)); struct kernel_statx stx; diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc index 494072a9b..dce8de9ec 100644 --- a/test/syscalls/linux/symlink.cc +++ b/test/syscalls/linux/symlink.cc @@ -272,6 +272,77 @@ TEST(SymlinkTest, ChmodSymlink) { EXPECT_EQ(FilePermission(newpath), 0777); } +class ParamSymlinkTest : public ::testing::TestWithParam<std::string> {}; + +// Test that creating an existing symlink with creat will create the target. +TEST_P(ParamSymlinkTest, CreatLinkCreatesTarget) { + const std::string target = GetParam(); + const std::string linkpath = NewTempAbsPath(); + + ASSERT_THAT(symlink(target.c_str(), linkpath.c_str()), SyscallSucceeds()); + + int fd; + EXPECT_THAT(fd = creat(linkpath.c_str(), 0666), SyscallSucceeds()); + ASSERT_THAT(close(fd), SyscallSucceeds()); + + ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds()); + struct stat st; + EXPECT_THAT(stat(target.c_str(), &st), SyscallSucceeds()); + + ASSERT_THAT(unlink(linkpath.c_str()), SyscallSucceeds()); + ASSERT_THAT(unlink(target.c_str()), SyscallSucceeds()); +} + +// Test that opening an existing symlink with O_CREAT will create the target. 
+TEST_P(ParamSymlinkTest, OpenLinkCreatesTarget) { + const std::string target = GetParam(); + const std::string linkpath = NewTempAbsPath(); + + ASSERT_THAT(symlink(target.c_str(), linkpath.c_str()), SyscallSucceeds()); + + int fd; + EXPECT_THAT(fd = open(linkpath.c_str(), O_CREAT, 0666), SyscallSucceeds()); + ASSERT_THAT(close(fd), SyscallSucceeds()); + + ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds()); + struct stat st; + EXPECT_THAT(stat(target.c_str(), &st), SyscallSucceeds()); + + ASSERT_THAT(unlink(linkpath.c_str()), SyscallSucceeds()); + ASSERT_THAT(unlink(target.c_str()), SyscallSucceeds()); +} + +// Test that opening an existing symlink with O_CREAT|O_EXCL will fail with +// EEXIST. +TEST_P(ParamSymlinkTest, OpenLinkExclFails) { + const std::string target = GetParam(); + const std::string linkpath = NewTempAbsPath(); + + ASSERT_THAT(symlink(target.c_str(), linkpath.c_str()), SyscallSucceeds()); + + EXPECT_THAT(open(linkpath.c_str(), O_CREAT | O_EXCL, 0666), + SyscallFailsWithErrno(EEXIST)); + + ASSERT_THAT(unlink(linkpath.c_str()), SyscallSucceeds()); +} + +// Test that opening an existing symlink with O_CREAT|O_NOFOLLOW will fail with +// ELOOP. 
+TEST_P(ParamSymlinkTest, OpenLinkNoFollowFails) { + const std::string target = GetParam(); + const std::string linkpath = NewTempAbsPath(); + + ASSERT_THAT(symlink(target.c_str(), linkpath.c_str()), SyscallSucceeds()); + + EXPECT_THAT(open(linkpath.c_str(), O_CREAT | O_NOFOLLOW, 0666), + SyscallFailsWithErrno(ELOOP)); + + ASSERT_THAT(unlink(linkpath.c_str()), SyscallSucceeds()); +} + +INSTANTIATE_TEST_SUITE_P(AbsAndRelTarget, ParamSymlinkTest, + ::testing::Values(NewTempAbsPath(), NewTempRelPath())); + } // namespace } // namespace testing diff --git a/third_party/gvsync/atomicptr_unsafe.go b/third_party/gvsync/atomicptr_unsafe.go index 53a943282..525c4beed 100644 --- a/third_party/gvsync/atomicptr_unsafe.go +++ b/third_party/gvsync/atomicptr_unsafe.go @@ -21,8 +21,18 @@ type Value struct{} // Note that copying AtomicPtr by value performs a non-atomic read of the // stored pointer, which is unsafe if Store() can be called concurrently; in // this case, do `dst.Store(src.Load())` instead. +// +// +stateify savable type AtomicPtr struct { - ptr unsafe.Pointer + ptr unsafe.Pointer `state:".(*Value)"` +} + +func (p *AtomicPtr) savePtr() *Value { + return p.Load() +} + +func (p *AtomicPtr) loadPtr(v *Value) { + p.Store(v) } // Load returns the value set by the most recent Store. It returns nil if there diff --git a/tools/go_generics/generics.go b/tools/go_generics/generics.go index 4e5cc53a2..22c714c13 100644 --- a/tools/go_generics/generics.go +++ b/tools/go_generics/generics.go @@ -222,7 +222,11 @@ func main() { // Modify the state tag appropriately. if m := stateTagRegexp.FindStringSubmatch(ident.Name); m != nil { if t := identifierRegexp.FindStringSubmatch(m[2]); t != nil { - ident.Name = m[1] + `state:".(` + t[1] + *prefix + t[2] + *suffix + t[3] + `)"` + m[3] + typeName := *prefix + t[2] + *suffix + if n, ok := types[t[2]]; ok { + typeName = n + } + ident.Name = m[1] + `state:".(` + t[1] + typeName + t[3] + `)"` + m[3] } } } |