diff options
Diffstat (limited to 'pkg/sentry/fsimpl')
74 files changed, 5680 insertions, 1421 deletions
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD new file mode 100644 index 000000000..585764223 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/BUILD @@ -0,0 +1,43 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +licenses(["notice"]) + +go_library( + name = "devpts", + srcs = [ + "devpts.go", + "line_discipline.go", + "master.go", + "queue.go", + "slave.go", + "terminal.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/safemem", + "//pkg/sentry/arch", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/unimpl", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) + +go_test( + name = "devpts_test", + size = "small", + srcs = ["devpts_test.go"], + library = ":devpts", + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/contexttest", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go new file mode 100644 index 000000000..181d765d3 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -0,0 +1,209 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package devpts provides a filesystem implementation that behaves like +// devpts. +package devpts + +import ( + "fmt" + "math" + "sort" + "strconv" + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Name is the filesystem name. +const Name = "devpts" + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +var _ vfs.FilesystemType = (*FilesystemType)(nil) + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + // No data allowed. + if opts.Data != "" { + return nil, nil, syserror.EINVAL + } + + fs, root := fstype.newFilesystem(vfsObj, creds) + return fs.VFSFilesystem(), root.VFSDentry(), nil +} + +// newFilesystem creates a new devpts filesystem with root directory and ptmx +// master inode. It returns the filesystem and root Dentry. +func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*kernfs.Filesystem, *kernfs.Dentry) { + fs := &kernfs.Filesystem{} + fs.Init(vfsObj, fstype) + + // Construct the root directory. This is always inode id 1. + root := &rootInode{ + slaves: make(map[uint32]*slaveInode), + } + root.InodeAttrs.Init(creds, 1, linux.ModeDirectory|0555) + root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + root.dentry.Init(root) + + // Construct the pts master inode and dentry. Linux always uses inode + // id 2 for ptmx. See fs/devpts/inode.c:mknod_ptmx. + master := &masterInode{ + root: root, + } + master.InodeAttrs.Init(creds, 2, linux.ModeCharacterDevice|0666) + master.dentry.Init(master) + + // Add the master as a child of the root. + links := root.OrderedChildren.Populate(&root.dentry, map[string]*kernfs.Dentry{ + "ptmx": &master.dentry, + }) + root.IncLinks(links) + + return fs, &root.dentry +} + +// rootInode is the root directory inode for the devpts mounts. +type rootInode struct { + kernfs.AlwaysValid + kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink + kernfs.OrderedChildren + + // Keep a reference to this inode's dentry. + dentry kernfs.Dentry + + // master is the master pty inode. Immutable. + master *masterInode + + // root is the root directory inode for this filesystem. Immutable. + root *rootInode + + // mu protects the fields below. + mu sync.Mutex + + // slaves maps pty ids to slave inodes. + slaves map[uint32]*slaveInode + + // nextIdx is the next pty index to use. Must be accessed atomically. + // + // TODO(b/29356795): reuse indices when ptys are closed. + nextIdx uint32 +} + +var _ kernfs.Inode = (*rootInode)(nil) + +// allocateTerminal creates a new Terminal and installs a pts node for it. +func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error) { + i.mu.Lock() + defer i.mu.Unlock() + if i.nextIdx == math.MaxUint32 { + return nil, syserror.ENOMEM + } + idx := i.nextIdx + i.nextIdx++ + + // Sanity check that slave with idx does not exist. + if _, ok := i.slaves[idx]; ok { + panic(fmt.Sprintf("pty index collision; index %d already exists", idx)) + } + + // Create the new terminal and slave. + t := newTerminal(idx) + slave := &slaveInode{ + root: i, + t: t, + } + // Linux always uses pty index + 3 as the inode id. See + // fs/devpts/inode.c:devpts_pty_new(). + slave.InodeAttrs.Init(creds, uint64(idx+3), linux.ModeCharacterDevice|0600) + slave.dentry.Init(slave) + i.slaves[idx] = slave + + return t, nil +} + +// masterClose is called when the master end of t is closed. +func (i *rootInode) masterClose(t *Terminal) { + i.mu.Lock() + defer i.mu.Unlock() + + // Sanity check that slave with idx exists. + if _, ok := i.slaves[t.n]; !ok { + panic(fmt.Sprintf("pty with index %d does not exist", t.n)) + } + delete(i.slaves, t.n) +} + +// Open implements kernfs.Inode.Open. +func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + if err != nil { + return nil, err + } + return fd.VFSFileDescription(), nil +} + +// Lookup implements kernfs.Inode.Lookup. +func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + idx, err := strconv.ParseUint(name, 10, 32) + if err != nil { + return nil, syserror.ENOENT + } + i.mu.Lock() + defer i.mu.Unlock() + if si, ok := i.slaves[uint32(idx)]; ok { + si.dentry.IncRef() + return si.dentry.VFSDentry(), nil + + } + return nil, syserror.ENOENT +} + +// IterDirents implements kernfs.Inode.IterDirents. +func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + i.mu.Lock() + defer i.mu.Unlock() + ids := make([]int, 0, len(i.slaves)) + for id := range i.slaves { + ids = append(ids, int(id)) + } + sort.Ints(ids) + for _, id := range ids[relOffset:] { + dirent := vfs.Dirent{ + Name: strconv.FormatUint(uint64(id), 10), + Type: linux.DT_CHR, + Ino: i.slaves[uint32(id)].InodeAttrs.Ino(), + NextOff: offset + 1, + } + if err := cb.Handle(dirent); err != nil { + return offset, err + } + offset++ + } + return offset, nil +} diff --git a/pkg/sentry/fsimpl/devpts/devpts_test.go b/pkg/sentry/fsimpl/devpts/devpts_test.go new file mode 100644 index 000000000..b7c149047 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/devpts_test.go @@ -0,0 +1,56 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/usermem" +) + +func TestSimpleMasterToSlave(t *testing.T) { + ld := newLineDiscipline(linux.DefaultSlaveTermios) + ctx := contexttest.Context(t) + inBytes := []byte("hello, tty\n") + src := usermem.BytesIOSequence(inBytes) + outBytes := make([]byte, 32) + dst := usermem.BytesIOSequence(outBytes) + + // Write to the input queue. + nw, err := ld.inputQueueWrite(ctx, src) + if err != nil { + t.Fatalf("error writing to input queue: %v", err) + } + if nw != int64(len(inBytes)) { + t.Fatalf("wrote wrong length: got %d, want %d", nw, len(inBytes)) + } + + // Read from the input queue. + nr, err := ld.inputQueueRead(ctx, dst) + if err != nil { + t.Fatalf("error reading from input queue: %v", err) + } + if nr != int64(len(inBytes)) { + t.Fatalf("read wrong length: got %d, want %d", nr, len(inBytes)) + } + + outStr := string(outBytes[:nr]) + inStr := string(inBytes) + if outStr != inStr { + t.Fatalf("written and read strings do not match: got %q, want %q", outStr, inStr) + } +} diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go new file mode 100644 index 000000000..e201801d6 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/line_discipline.go @@ -0,0 +1,449 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "bytes" + "unicode/utf8" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// LINT.IfChange + +const ( + // canonMaxBytes is the number of bytes that fit into a single line of + // terminal input in canonical mode. This corresponds to N_TTY_BUF_SIZE + // in include/linux/tty.h. + canonMaxBytes = 4096 + + // nonCanonMaxBytes is the maximum number of bytes that can be read at + // a time in noncanonical mode. + nonCanonMaxBytes = canonMaxBytes - 1 + + spacesPerTab = 8 +) + +// lineDiscipline dictates how input and output are handled between the +// pseudoterminal (pty) master and slave. It can be configured to alter I/O, +// modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man +// pages are good resources for how to affect the line discipline: +// +// * termios(3) +// * tty_ioctl(4) +// +// This file corresponds most closely to drivers/tty/n_tty.c. +// +// lineDiscipline has a simple structure but supports a multitude of options +// (see the above man pages). It consists of two queues of bytes: one from the +// terminal master to slave (the input queue) and one from slave to master (the +// output queue). When bytes are written to one end of the pty, the line +// discipline reads the bytes, modifies them or takes special action if +// required, and enqueues them to be read by the other end of the pty: +// +// input from terminal +-------------+ input to process (e.g. bash) +// +------------------------>| input queue |---------------------------+ +// | (inputQueueWrite) +-------------+ (inputQueueRead) | +// | | +// | v +// masterFD slaveFD +// ^ | +// | | +// | output to terminal +--------------+ output from process | +// +------------------------| output queue |<--------------------------+ +// (outputQueueRead) +--------------+ (outputQueueWrite) +// +// Lock order: +// termiosMu +// inQueue.mu +// outQueue.mu +// +// +stateify savable +type lineDiscipline struct { + // sizeMu protects size. + sizeMu sync.Mutex `state:"nosave"` + + // size is the terminal size (width and height). + size linux.WindowSize + + // inQueue is the input queue of the terminal. + inQueue queue + + // outQueue is the output queue of the terminal. + outQueue queue + + // termiosMu protects termios. + termiosMu sync.RWMutex `state:"nosave"` + + // termios is the terminal configuration used by the lineDiscipline. + termios linux.KernelTermios + + // column is the location in a row of the cursor. This is important for + // handling certain special characters like backspace. + column int + + // masterWaiter is used to wait on the master end of the TTY. + masterWaiter waiter.Queue `state:"zerovalue"` + + // slaveWaiter is used to wait on the slave end of the TTY. + slaveWaiter waiter.Queue `state:"zerovalue"` +} + +func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline { + ld := lineDiscipline{termios: termios} + ld.inQueue.transformer = &inputQueueTransformer{} + ld.outQueue.transformer = &outputQueueTransformer{} + return &ld +} + +// getTermios gets the linux.Termios for the tty. +func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() + // We must copy a Termios struct, not KernelTermios. + t := l.termios.ToTermios() + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err +} + +// setTermios sets a linux.Termios for the tty. +func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + l.termiosMu.Lock() + defer l.termiosMu.Unlock() + oldCanonEnabled := l.termios.LEnabled(linux.ICANON) + // We must copy a Termios struct, not KernelTermios. + var t linux.Termios + _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{ + AddressSpaceActive: true, + }) + l.termios.FromTermios(t) + + // If canonical mode is turned off, move bytes from inQueue's wait + // buffer to its read buffer. Anything already in the read buffer is + // now readable. + if oldCanonEnabled && !l.termios.LEnabled(linux.ICANON) { + l.inQueue.mu.Lock() + l.inQueue.pushWaitBufLocked(l) + l.inQueue.readable = true + l.inQueue.mu.Unlock() + l.slaveWaiter.Notify(waiter.EventIn) + } + + return 0, err +} + +func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + l.sizeMu.Lock() + defer l.sizeMu.Unlock() + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return err +} + +func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + l.sizeMu.Lock() + defer l.sizeMu.Unlock() + _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return err +} + +func (l *lineDiscipline) masterReadiness() waiter.EventMask { + // We don't have to lock a termios because the default master termios + // is immutable. + return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios) +} + +func (l *lineDiscipline) slaveReadiness() waiter.EventMask { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() + return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios) +} + +func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + return l.inQueue.readableSize(ctx, io, args) +} + +func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() + n, pushed, err := l.inQueue.read(ctx, dst, l) + if err != nil { + return 0, err + } + if n > 0 { + l.masterWaiter.Notify(waiter.EventOut) + if pushed { + l.slaveWaiter.Notify(waiter.EventIn) + } + return n, nil + } + return 0, syserror.ErrWouldBlock +} + +func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() + n, err := l.inQueue.write(ctx, src, l) + if err != nil { + return 0, err + } + if n > 0 { + l.slaveWaiter.Notify(waiter.EventIn) + return n, nil + } + return 0, syserror.ErrWouldBlock +} + +func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + return l.outQueue.readableSize(ctx, io, args) +} + +func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() + n, pushed, err := l.outQueue.read(ctx, dst, l) + if err != nil { + return 0, err + } + if n > 0 { + l.slaveWaiter.Notify(waiter.EventOut) + if pushed { + l.masterWaiter.Notify(waiter.EventIn) + } + return n, nil + } + return 0, syserror.ErrWouldBlock +} + +func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() + n, err := l.outQueue.write(ctx, src, l) + if err != nil { + return 0, err + } + if n > 0 { + l.masterWaiter.Notify(waiter.EventIn) + return n, nil + } + return 0, syserror.ErrWouldBlock +} + +// transformer is a helper interface to make it easier to stateify queue. +type transformer interface { + // transform functions require queue's mutex to be held. + transform(*lineDiscipline, *queue, []byte) int +} + +// outputQueueTransformer implements transformer. It performs line discipline +// transformations on the output queue. +// +// +stateify savable +type outputQueueTransformer struct{} + +// transform does output processing for one end of the pty. See +// drivers/tty/n_tty.c:do_output_char for an analogous kernel function. +// +// Preconditions: +// * l.termiosMu must be held for reading. +// * q.mu must be held. +func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int { + // transformOutput is effectively always in noncanonical mode, as the + // master termios never has ICANON set. + + if !l.termios.OEnabled(linux.OPOST) { + q.readBuf = append(q.readBuf, buf...) + if len(q.readBuf) > 0 { + q.readable = true + } + return len(buf) + } + + var ret int + for len(buf) > 0 { + size := l.peek(buf) + cBytes := append([]byte{}, buf[:size]...) + ret += size + buf = buf[size:] + // We're guaranteed that cBytes has at least one element. + switch cBytes[0] { + case '\n': + if l.termios.OEnabled(linux.ONLRET) { + l.column = 0 + } + if l.termios.OEnabled(linux.ONLCR) { + q.readBuf = append(q.readBuf, '\r', '\n') + continue + } + case '\r': + if l.termios.OEnabled(linux.ONOCR) && l.column == 0 { + continue + } + if l.termios.OEnabled(linux.OCRNL) { + cBytes[0] = '\n' + if l.termios.OEnabled(linux.ONLRET) { + l.column = 0 + } + break + } + l.column = 0 + case '\t': + spaces := spacesPerTab - l.column%spacesPerTab + if l.termios.OutputFlags&linux.TABDLY == linux.XTABS { + l.column += spaces + q.readBuf = append(q.readBuf, bytes.Repeat([]byte{' '}, spacesPerTab)...) + continue + } + l.column += spaces + case '\b': + if l.column > 0 { + l.column-- + } + default: + l.column++ + } + q.readBuf = append(q.readBuf, cBytes...) + } + if len(q.readBuf) > 0 { + q.readable = true + } + return ret +} + +// inputQueueTransformer implements transformer. It performs line discipline +// transformations on the input queue. +// +// +stateify savable +type inputQueueTransformer struct{} + +// transform does input processing for one end of the pty. Characters read are +// transformed according to flags set in the termios struct. See +// drivers/tty/n_tty.c:n_tty_receive_char_special for an analogous kernel +// function. +// +// Preconditions: +// * l.termiosMu must be held for reading. +// * q.mu must be held. +func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int { + // If there's a line waiting to be read in canonical mode, don't write + // anything else to the read buffer. + if l.termios.LEnabled(linux.ICANON) && q.readable { + return 0 + } + + maxBytes := nonCanonMaxBytes + if l.termios.LEnabled(linux.ICANON) { + maxBytes = canonMaxBytes + } + + var ret int + for len(buf) > 0 && len(q.readBuf) < canonMaxBytes { + size := l.peek(buf) + cBytes := append([]byte{}, buf[:size]...) + // We're guaranteed that cBytes has at least one element. + switch cBytes[0] { + case '\r': + if l.termios.IEnabled(linux.IGNCR) { + buf = buf[size:] + ret += size + continue + } + if l.termios.IEnabled(linux.ICRNL) { + cBytes[0] = '\n' + } + case '\n': + if l.termios.IEnabled(linux.INLCR) { + cBytes[0] = '\r' + } + } + + // In canonical mode, we discard non-terminating characters + // after the first 4095. + if l.shouldDiscard(q, cBytes) { + buf = buf[size:] + ret += size + continue + } + + // Stop if the buffer would be overfilled. + if len(q.readBuf)+size > maxBytes { + break + } + buf = buf[size:] + ret += size + + // If we get EOF, make the buffer available for reading. + if l.termios.LEnabled(linux.ICANON) && l.termios.IsEOF(cBytes[0]) { + q.readable = true + break + } + + q.readBuf = append(q.readBuf, cBytes...) + + // Anything written to the readBuf will have to be echoed. + if l.termios.LEnabled(linux.ECHO) { + l.outQueue.writeBytes(cBytes, l) + l.masterWaiter.Notify(waiter.EventIn) + } + + // If we finish a line, make it available for reading. + if l.termios.LEnabled(linux.ICANON) && l.termios.IsTerminating(cBytes) { + q.readable = true + break + } + } + + // In noncanonical mode, everything is readable. + if !l.termios.LEnabled(linux.ICANON) && len(q.readBuf) > 0 { + q.readable = true + } + + return ret +} + +// shouldDiscard returns whether c should be discarded. In canonical mode, if +// too many bytes are enqueued, we keep reading input and discarding it until +// we find a terminating character. Signal/echo processing still occurs. +// +// Precondition: +// * l.termiosMu must be held for reading. +// * q.mu must be held. +func (l *lineDiscipline) shouldDiscard(q *queue, cBytes []byte) bool { + return l.termios.LEnabled(linux.ICANON) && len(q.readBuf)+len(cBytes) >= canonMaxBytes && !l.termios.IsTerminating(cBytes) +} + +// peek returns the size in bytes of the next character to process. As long as +// b isn't empty, peek returns a value of at least 1. +func (l *lineDiscipline) peek(b []byte) int { + size := 1 + // If UTF-8 support is enabled, runes might be multiple bytes. + if l.termios.IEnabled(linux.IUTF8) { + _, size = utf8.DecodeRune(b) + } + return size +} + +// LINT.ThenChange(../../fs/tty/line_discipline.go) diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go new file mode 100644 index 000000000..04a292927 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -0,0 +1,226 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/unimpl" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// LINT.IfChange + +// masterInode is the inode for the master end of the Terminal. +type masterInode struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + + // Keep a reference to this inode's dentry. + dentry kernfs.Dentry + + // root is the devpts root inode. + root *rootInode +} + +var _ kernfs.Inode = (*masterInode)(nil) + +// Open implements kernfs.Inode.Open. +func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + t, err := mi.root.allocateTerminal(rp.Credentials()) + if err != nil { + return nil, err + } + + mi.IncRef() + fd := &masterFileDescription{ + inode: mi, + t: t, + } + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { + mi.DecRef() + return nil, err + } + return &fd.vfsfd, nil +} + +// Stat implements kernfs.Inode.Stat. +func (mi *masterInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + statx, err := mi.InodeAttrs.Stat(vfsfs, opts) + if err != nil { + return linux.Statx{}, err + } + statx.Blksize = 1024 + statx.RdevMajor = linux.TTYAUX_MAJOR + statx.RdevMinor = linux.PTMX_MINOR + return statx, nil +} + +// SetStat implements kernfs.Inode.SetStat +func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + if opts.Stat.Mask&linux.STATX_SIZE != 0 { + return syserror.EINVAL + } + return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) +} + +type masterFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + + inode *masterInode + t *Terminal +} + +var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil) + +// Release implements vfs.FileDescriptionImpl.Release. +func (mfd *masterFileDescription) Release() { + mfd.inode.root.masterClose(mfd.t) + mfd.inode.DecRef() +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (mfd *masterFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + mfd.t.ld.masterWaiter.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (mfd *masterFileDescription) EventUnregister(e *waiter.Entry) { + mfd.t.ld.masterWaiter.EventUnregister(e) +} + +// Readiness implements waiter.Waitable.Readiness. +func (mfd *masterFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + return mfd.t.ld.masterReadiness() +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (mfd *masterFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + return mfd.t.ld.outputQueueRead(ctx, dst) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (mfd *masterFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { + return mfd.t.ld.inputQueueWrite(ctx, src) +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch cmd := args[1].Uint(); cmd { + case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ + // Get the number of bytes in the output queue read buffer. + return 0, mfd.t.ld.outputQueueReadSize(ctx, io, args) + case linux.TCGETS: + // N.B. TCGETS on the master actually returns the configuration + // of the slave end. + return mfd.t.ld.getTermios(ctx, io, args) + case linux.TCSETS: + // N.B. TCSETS on the master actually affects the configuration + // of the slave end. + return mfd.t.ld.setTermios(ctx, io, args) + case linux.TCSETSW: + // TODO(b/29356795): This should drain the output queue first. + return mfd.t.ld.setTermios(ctx, io, args) + case linux.TIOCGPTN: + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mfd.t.n), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + case linux.TIOCSPTLCK: + // TODO(b/29356795): Implement pty locking. For now just pretend we do. + return 0, nil + case linux.TIOCGWINSZ: + return 0, mfd.t.ld.windowSize(ctx, io, args) + case linux.TIOCSWINSZ: + return 0, mfd.t.ld.setWindowSize(ctx, io, args) + case linux.TIOCSCTTY: + // Make the given terminal the controlling terminal of the + // calling process. + return 0, mfd.t.setControllingTTY(ctx, io, args, true /* isMaster */) + case linux.TIOCNOTTY: + // Release this process's controlling terminal. + return 0, mfd.t.releaseControllingTTY(ctx, io, args, true /* isMaster */) + case linux.TIOCGPGRP: + // Get the foreground process group. + return mfd.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */) + case linux.TIOCSPGRP: + // Set the foreground process group. + return mfd.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */) + default: + maybeEmitUnimplementedEvent(ctx, cmd) + return 0, syserror.ENOTTY + } +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (mfd *masterFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + creds := auth.CredentialsFromContext(ctx) + fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem() + return mfd.inode.SetStat(ctx, fs, creds, opts) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem() + return mfd.inode.Stat(fs, opts) +} + +// maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid. +func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) { + switch cmd { + case linux.TCGETS, + linux.TCSETS, + linux.TCSETSW, + linux.TCSETSF, + linux.TIOCGWINSZ, + linux.TIOCSWINSZ, + linux.TIOCSETD, + linux.TIOCSBRK, + linux.TIOCCBRK, + linux.TCSBRK, + linux.TCSBRKP, + linux.TIOCSTI, + linux.TIOCCONS, + linux.FIONBIO, + linux.TIOCEXCL, + linux.TIOCNXCL, + linux.TIOCGEXCL, + linux.TIOCGSID, + linux.TIOCGETD, + linux.TIOCVHANGUP, + linux.TIOCGDEV, + linux.TIOCMGET, + linux.TIOCMSET, + linux.TIOCMBIC, + linux.TIOCMBIS, + linux.TIOCGICOUNT, + linux.TCFLSH, + linux.TIOCSSERIAL, + linux.TIOCGPTPEER: + + unimpl.EmitUnimplementedEvent(ctx) + } +} + +// LINT.ThenChange(../../fs/tty/master.go) diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go new file mode 100644 index 000000000..29a6be858 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/queue.go @@ -0,0 +1,240 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// LINT.IfChange + +// waitBufMaxBytes is the maximum size of a wait buffer. It is based on +// TTYB_DEFAULT_MEM_LIMIT. +const waitBufMaxBytes = 131072 + +// queue represents one of the input or output queues between a pty master and +// slave. Bytes written to a queue are added to the read buffer until it is +// full, at which point they are written to the wait buffer. Bytes are +// processed (i.e. undergo termios transformations) as they are added to the +// read buffer. The read buffer is readable when its length is nonzero and +// readable is true. +// +// +stateify savable +type queue struct { + // mu protects everything in queue. + mu sync.Mutex `state:"nosave"` + + // readBuf is buffer of data ready to be read when readable is true. + // This data has been processed. + readBuf []byte + + // waitBuf contains data that can't fit into readBuf. It is put here + // until it can be loaded into the read buffer. waitBuf contains data + // that hasn't been processed. + waitBuf [][]byte + waitBufLen uint64 + + // readable indicates whether the read buffer can be read from. In + // canonical mode, there can be an unterminated line in the read buffer, + // so readable must be checked. + readable bool + + // transform is the the queue's function for transforming bytes + // entering the queue. For example, transform might convert all '\r's + // entering the queue to '\n's. + transformer +} + +// readReadiness returns whether q is ready to be read from. +func (q *queue) readReadiness(t *linux.KernelTermios) waiter.EventMask { + q.mu.Lock() + defer q.mu.Unlock() + if len(q.readBuf) > 0 && q.readable { + return waiter.EventIn + } + return waiter.EventMask(0) +} + +// writeReadiness returns whether q is ready to be written to. +func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask { + q.mu.Lock() + defer q.mu.Unlock() + if q.waitBufLen < waitBufMaxBytes { + return waiter.EventOut + } + return waiter.EventMask(0) +} + +// readableSize writes the number of readable bytes to userspace. +func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + q.mu.Lock() + defer q.mu.Unlock() + var size int32 + if q.readable { + size = int32(len(q.readBuf)) + } + + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return err + +} + +// read reads from q to userspace. It returns the number of bytes read as well +// as whether the read caused more readable data to become available (whether +// data was pushed from the wait buffer to the read buffer). +// +// Preconditions: +// * l.termiosMu must be held for reading. +func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) { + q.mu.Lock() + defer q.mu.Unlock() + + if !q.readable { + return 0, false, syserror.ErrWouldBlock + } + + if dst.NumBytes() > canonMaxBytes { + dst = dst.TakeFirst(canonMaxBytes) + } + + n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dst safemem.BlockSeq) (uint64, error) { + src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(q.readBuf)) + n, err := safemem.CopySeq(dst, src) + if err != nil { + return 0, err + } + q.readBuf = q.readBuf[n:] + + // If we read everything, this queue is no longer readable. + if len(q.readBuf) == 0 { + q.readable = false + } + + return n, nil + })) + if err != nil { + return 0, false, err + } + + // Move data from the queue's wait buffer to its read buffer. + nPushed := q.pushWaitBufLocked(l) + + return int64(n), nPushed > 0, nil +} + +// write writes to q from userspace. +// +// Preconditions: +// * l.termiosMu must be held for reading. +func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) { + q.mu.Lock() + defer q.mu.Unlock() + + // Copy data into the wait buffer. + n, err := src.CopyInTo(ctx, safemem.WriterFunc(func(src safemem.BlockSeq) (uint64, error) { + copyLen := src.NumBytes() + room := waitBufMaxBytes - q.waitBufLen + // If out of room, return EAGAIN. + if room == 0 && copyLen > 0 { + return 0, syserror.ErrWouldBlock + } + // Cap the size of the wait buffer. + if copyLen > room { + copyLen = room + src = src.TakeFirst64(room) + } + buf := make([]byte, copyLen) + + // Copy the data into the wait buffer. + dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)) + n, err := safemem.CopySeq(dst, src) + if err != nil { + return 0, err + } + q.waitBufAppend(buf) + + return n, nil + })) + if err != nil { + return 0, err + } + + // Push data from the wait to the read buffer. + q.pushWaitBufLocked(l) + + return n, nil +} + +// writeBytes writes to q from b. +// +// Preconditions: +// * l.termiosMu must be held for reading. +func (q *queue) writeBytes(b []byte, l *lineDiscipline) { + q.mu.Lock() + defer q.mu.Unlock() + + // Write to the wait buffer. + q.waitBufAppend(b) + q.pushWaitBufLocked(l) +} + +// pushWaitBufLocked fills the queue's read buffer with data from the wait +// buffer. +// +// Preconditions: +// * l.termiosMu must be held for reading. +// * q.mu must be locked. +func (q *queue) pushWaitBufLocked(l *lineDiscipline) int { + if q.waitBufLen == 0 { + return 0 + } + + // Move data from the wait to the read buffer. + var total int + var i int + for i = 0; i < len(q.waitBuf); i++ { + n := q.transform(l, q, q.waitBuf[i]) + total += n + if n != len(q.waitBuf[i]) { + // The read buffer filled up without consuming the + // entire buffer. + q.waitBuf[i] = q.waitBuf[i][n:] + break + } + } + + // Update wait buffer based on consumed data. + q.waitBuf = q.waitBuf[i:] + q.waitBufLen -= uint64(total) + + return total +} + +// Precondition: q.mu must be locked. +func (q *queue) waitBufAppend(b []byte) { + q.waitBuf = append(q.waitBuf, b) + q.waitBufLen += uint64(len(b)) +} + +// LINT.ThenChange(../../fs/tty/queue.go) diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go new file mode 100644 index 000000000..0a98dc896 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/slave.go @@ -0,0 +1,186 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// LINT.IfChange + +// slaveInode is the inode for the slave end of the Terminal. +type slaveInode struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + + // Keep a reference to this inode's dentry. + dentry kernfs.Dentry + + // root is the devpts root inode. + root *rootInode + + // t is the connected Terminal. + t *Terminal +} + +var _ kernfs.Inode = (*slaveInode)(nil) + +// Open implements kernfs.Inode.Open. +func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + si.IncRef() + fd := &slaveFileDescription{ + inode: si, + } + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { + si.DecRef() + return nil, err + } + return &fd.vfsfd, nil + +} + +// Valid implements kernfs.Inode.Valid. +func (si *slaveInode) Valid(context.Context) bool { + // Return valid if the slave still exists. + si.root.mu.Lock() + defer si.root.mu.Unlock() + _, ok := si.root.slaves[si.t.n] + return ok +} + +// Stat implements kernfs.Inode.Stat. +func (si *slaveInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + statx, err := si.InodeAttrs.Stat(vfsfs, opts) + if err != nil { + return linux.Statx{}, err + } + statx.Blksize = 1024 + statx.RdevMajor = linux.UNIX98_PTY_SLAVE_MAJOR + statx.RdevMinor = si.t.n + return statx, nil +} + +// SetStat implements kernfs.Inode.SetStat +func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + if opts.Stat.Mask&linux.STATX_SIZE != 0 { + return syserror.EINVAL + } + return si.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) +} + +type slaveFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + + inode *slaveInode +} + +var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil) + +// Release implements fs.FileOperations.Release. +func (sfd *slaveFileDescription) Release() { + sfd.inode.DecRef() +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (sfd *slaveFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + sfd.inode.t.ld.slaveWaiter.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (sfd *slaveFileDescription) EventUnregister(e *waiter.Entry) { + sfd.inode.t.ld.slaveWaiter.EventUnregister(e) +} + +// Readiness implements waiter.Waitable.Readiness. +func (sfd *slaveFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + return sfd.inode.t.ld.slaveReadiness() +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (sfd *slaveFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + return sfd.inode.t.ld.inputQueueRead(ctx, dst) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { + return sfd.inode.t.ld.outputQueueWrite(ctx, src) +} + +// Ioctl implements vfs.FileDescripionImpl.Ioctl. +func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch cmd := args[1].Uint(); cmd { + case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ + // Get the number of bytes in the input queue read buffer. + return 0, sfd.inode.t.ld.inputQueueReadSize(ctx, io, args) + case linux.TCGETS: + return sfd.inode.t.ld.getTermios(ctx, io, args) + case linux.TCSETS: + return sfd.inode.t.ld.setTermios(ctx, io, args) + case linux.TCSETSW: + // TODO(b/29356795): This should drain the output queue first. + return sfd.inode.t.ld.setTermios(ctx, io, args) + case linux.TIOCGPTN: + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sfd.inode.t.n), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + case linux.TIOCGWINSZ: + return 0, sfd.inode.t.ld.windowSize(ctx, io, args) + case linux.TIOCSWINSZ: + return 0, sfd.inode.t.ld.setWindowSize(ctx, io, args) + case linux.TIOCSCTTY: + // Make the given terminal the controlling terminal of the + // calling process. + return 0, sfd.inode.t.setControllingTTY(ctx, io, args, false /* isMaster */) + case linux.TIOCNOTTY: + // Release this process's controlling terminal. + return 0, sfd.inode.t.releaseControllingTTY(ctx, io, args, false /* isMaster */) + case linux.TIOCGPGRP: + // Get the foreground process group. + return sfd.inode.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */) + case linux.TIOCSPGRP: + // Set the foreground process group. + return sfd.inode.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */) + default: + maybeEmitUnimplementedEvent(ctx, cmd) + return 0, syserror.ENOTTY + } +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + creds := auth.CredentialsFromContext(ctx) + fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem() + return sfd.inode.SetStat(ctx, fs, creds, opts) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem() + return sfd.inode.Stat(fs, opts) +} + +// LINT.ThenChange(../../fs/tty/slave.go) diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go new file mode 100644 index 000000000..b44e673d8 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/terminal.go @@ -0,0 +1,124 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/usermem" +) + +// LINT.IfChanges + +// Terminal is a pseudoterminal. +// +// +stateify savable +type Terminal struct { + // n is the terminal index. It is immutable. + n uint32 + + // ld is the line discipline of the terminal. It is immutable. + ld *lineDiscipline + + // masterKTTY contains the controlling process of the master end of + // this terminal. This field is immutable. + masterKTTY *kernel.TTY + + // slaveKTTY contains the controlling process of the slave end of this + // terminal. This field is immutable. + slaveKTTY *kernel.TTY +} + +func newTerminal(n uint32) *Terminal { + termios := linux.DefaultSlaveTermios + t := Terminal{ + n: n, + ld: newLineDiscipline(termios), + masterKTTY: &kernel.TTY{Index: n}, + slaveKTTY: &kernel.TTY{Index: n}, + } + return &t +} + +// setControllingTTY makes tm the controlling terminal of the calling thread +// group. +func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { + task := kernel.TaskFromContext(ctx) + if task == nil { + panic("setControllingTTY must be called from a task context") + } + + return task.ThreadGroup().SetControllingTTY(tm.tty(isMaster), args[2].Int()) +} + +// releaseControllingTTY removes tm as the controlling terminal of the calling +// thread group. +func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { + task := kernel.TaskFromContext(ctx) + if task == nil { + panic("releaseControllingTTY must be called from a task context") + } + + return task.ThreadGroup().ReleaseControllingTTY(tm.tty(isMaster)) +} + +// foregroundProcessGroup gets the process group ID of tm's foreground process. +func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { + task := kernel.TaskFromContext(ctx) + if task == nil { + panic("foregroundProcessGroup must be called from a task context") + } + + ret, err := task.ThreadGroup().ForegroundProcessGroup(tm.tty(isMaster)) + if err != nil { + return 0, err + } + + // Write it out to *arg. + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err +} + +// foregroundProcessGroup sets tm's foreground process. +func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { + task := kernel.TaskFromContext(ctx) + if task == nil { + panic("setForegroundProcessGroup must be called from a task context") + } + + // Read in the process group ID. + var pgid int32 + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + + ret, err := task.ThreadGroup().SetForegroundProcessGroup(tm.tty(isMaster), kernel.ProcessGroupID(pgid)) + return uintptr(ret), err +} + +func (tm *Terminal) tty(isMaster bool) *kernel.TTY { + if isMaster { + return tm.masterKTTY + } + return tm.slaveKTTY +} + +// LINT.ThenChange(../../fs/tty/terminal.go) diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go index abd4f24e7..142ee53b0 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -42,6 +42,11 @@ type FilesystemType struct { root *vfs.Dentry } +// Name implements vfs.FilesystemType.Name. +func (*FilesystemType) Name() string { + return Name +} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fst *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { fst.initOnce.Do(func() { @@ -158,16 +163,25 @@ func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind v func (a *Accessor) UserspaceInit(ctx context.Context) error { actx := a.wrapContext(ctx) - // systemd: src/shared/dev-setup.c:dev_setup() + // Initialize symlinks. for _, symlink := range []struct { source string target string }{ - // /proc/kcore is not implemented. + // systemd: src/shared/dev-setup.c:dev_setup() {source: "fd", target: "/proc/self/fd"}, {source: "stdin", target: "/proc/self/fd/0"}, {source: "stdout", target: "/proc/self/fd/1"}, {source: "stderr", target: "/proc/self/fd/2"}, + // /proc/kcore is not implemented. + + // Linux implements /dev/ptmx as a device node, but advises + // container implementations to create /dev/ptmx as a symlink + // to pts/ptmx (Documentation/filesystems/devpts.txt). Systemd + // follows this advice (src/nspawn/nspawn.c:setup_pts()), while + // LXC tries to create a bind mount and falls back to a symlink + // (src/lxc/conf.c:lxc_setup_devpts()). + {source: "ptmx", target: "pts/ptmx"}, } { if err := a.vfsObj.SymlinkAt(actx, a.creds, a.pathOperationAt(symlink.source), symlink.target); err != nil { return fmt.Errorf("failed to create symlink %q => %q: %v", symlink.source, symlink.target, err) diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index 6f78f478f..ff861d0fe 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -15,6 +15,17 @@ go_template_instance( }, ) +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "ext", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "dentry", + }, +) + go_library( name = "ext", srcs = [ @@ -26,6 +37,7 @@ go_library( "extent_file.go", "file_description.go", "filesystem.go", + "fstree.go", "inode.go", "regular_file.go", "symlink.go", @@ -45,6 +57,7 @@ go_library( "//pkg/sentry/fsimpl/ext/disklayout", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/syscalls/linux", "//pkg/sentry/vfs", "//pkg/sync", @@ -80,8 +93,8 @@ go_test( "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/syserror", + "//pkg/test/testutil", "//pkg/usermem", - "//runsc/testutil", "@com_github_google_go-cmp//cmp:go_default_library", "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", ], diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go index a080cb189..bfbd7c3d4 100644 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -22,6 +22,10 @@ import ( type dentry struct { vfsd vfs.Dentry + // Protected by filesystem.mu. + parent *dentry + name string + // inode is the inode represented by this dentry. Multiple Dentries may // share a single non-directory Inode (with hard links). inode is // immutable. diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index bd6ede995..12b875c8f 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" - "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -31,6 +30,10 @@ import ( type directory struct { inode inode + // childCache maps filenames to dentries for children for which dentries + // have been instantiated. childCache is protected by filesystem.mu. + childCache map[string]*dentry + // mu serializes the changes to childList. // Lock Order (outermost locks must be taken first): // directory.mu @@ -50,9 +53,13 @@ type directory struct { childMap map[string]*dirent } -// newDirectroy is the directory constructor. -func newDirectroy(inode inode, newDirent bool) (*directory, error) { - file := &directory{inode: inode, childMap: make(map[string]*dirent)} +// newDirectory is the directory constructor. +func newDirectory(inode inode, newDirent bool) (*directory, error) { + file := &directory{ + inode: inode, + childCache: make(map[string]*dentry), + childMap: make(map[string]*dirent), + } file.inode.impl = file // Initialize childList by reading dirents from the underlying file. @@ -299,9 +306,3 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in fd.off = offset return offset, nil } - -// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. -func (fd *directoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - // mmap(2) specifies that EACCESS should be returned for non-regular file fds. - return syserror.EACCES -} diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go index 373d23b74..7176af6d1 100644 --- a/pkg/sentry/fsimpl/ext/ext.go +++ b/pkg/sentry/fsimpl/ext/ext.go @@ -30,6 +30,9 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// Name is the name of this filesystem. +const Name = "ext" + // FilesystemType implements vfs.FilesystemType. type FilesystemType struct{} @@ -91,8 +94,13 @@ func isCompatible(sb disklayout.SuperBlock) bool { return true } +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. -func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { // TODO(b/134676337): Ensure that the user is mounting readonly. If not, // EACCESS should be returned according to mount(2). Filesystem independent // flags (like readonly) are currently not available in pkg/sentry/vfs. @@ -103,7 +111,7 @@ func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile } fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)} - fs.vfsfs.Init(vfsObj, &fs) + fs.vfsfs.Init(vfsObj, &fsType, &fs) fs.sb, err = readSuperBlock(dev) if err != nil { return nil, nil, err diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 29bb73765..64e9a579f 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -32,9 +32,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/pkg/usermem" - - "gvisor.dev/gvisor/runsc/testutil" ) const ( diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index e05429d41..2c22a04af 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -22,6 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -87,14 +89,33 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo } for { - nextVFSD, err := rp.ResolveComponent(vfsd) - if err != nil { - return nil, nil, err + name := rp.Component() + if name == "." { + rp.Advance() + return vfsd, inode, nil } - if nextVFSD == nil { - // Since the Dentry tree is not the sole source of truth for extfs, if it's - // not in the Dentry tree, it might need to be pulled from disk. - childDirent, ok := inode.impl.(*directory).childMap[rp.Component()] + d := vfsd.Impl().(*dentry) + if name == ".." { + isRoot, err := rp.CheckRoot(vfsd) + if err != nil { + return nil, nil, err + } + if isRoot || d.parent == nil { + rp.Advance() + return vfsd, inode, nil + } + if err := rp.CheckMount(&d.parent.vfsd); err != nil { + return nil, nil, err + } + rp.Advance() + return &d.parent.vfsd, d.parent.inode, nil + } + + dir := inode.impl.(*directory) + child, ok := dir.childCache[name] + if !ok { + // We may need to instantiate a new dentry for this child. + childDirent, ok := dir.childMap[name] if !ok { // The underlying inode does not exist on disk. return nil, nil, syserror.ENOENT @@ -113,21 +134,22 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo } // incRef because this is being added to the dentry tree. childInode.incRef() - child := newDentry(childInode) - vfsd.InsertChild(&child.vfsd, rp.Component()) - - // Continue as usual now that nextVFSD is not nil. - nextVFSD = &child.vfsd + child = newDentry(childInode) + child.parent = d + child.name = name + dir.childCache[name] = child + } + if err := rp.CheckMount(&child.vfsd); err != nil { + return nil, nil, err } - nextInode := nextVFSD.Impl().(*dentry).inode - if nextInode.isSymlink() && rp.ShouldFollowSymlink() { - if err := rp.HandleSymlink(inode.impl.(*symlink).target); err != nil { + if child.inode.isSymlink() && rp.ShouldFollowSymlink() { + if err := rp.HandleSymlink(child.inode.impl.(*symlink).target); err != nil { return nil, nil, err } continue } rp.Advance() - return nextVFSD, nextInode, nil + return &child.vfsd, child.inode, nil } } @@ -255,6 +277,15 @@ func (fs *filesystem) statTo(stat *linux.Statfs) { // TODO(b/134676337): Set Statfs.Flags and Statfs.FSID. } +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + _, inode, err := fs.walk(rp, false) + if err != nil { + return err + } + return inode.checkPermissions(rp.Credentials(), ats) +} + // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { vfsd, inode, err := fs.walk(rp, false) @@ -453,8 +484,19 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return syserror.EROFS } +// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { + _, _, err := fs.walk(rp, false) + if err != nil { + return nil, err + } + + // TODO(b/134676337): Support sockets. + return nil, syserror.ECONNREFUSED +} + // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { _, _, err := fs.walk(rp, false) if err != nil { return nil, err @@ -463,7 +505,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([ } // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { _, _, err := fs.walk(rp, false) if err != nil { return "", err @@ -493,5 +535,5 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() defer fs.mu.RUnlock() - return vfs.GenericPrependPath(vfsroot, vd, b) + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) } diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 6962083f5..a98512350 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -136,7 +136,7 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { } return &f.inode, nil case linux.ModeDirectory: - f, err := newDirectroy(inode, fs.sb.IncompatibleFeatures().DirentFileType) + f, err := newDirectory(inode, fs.sb.IncompatibleFeatures().DirentFileType) if err != nil { return nil, err } @@ -186,7 +186,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt } func (in *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { - return vfs.GenericCheckPermissions(creds, ats, in.isDir(), uint16(in.diskInode.Mode()), in.diskInode.UID(), in.diskInode.GID()) + return vfs.GenericCheckPermissions(creds, ats, in.diskInode.Mode(), in.diskInode.UID(), in.diskInode.GID()) } // statTo writes the statx fields to the output parameter. diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 4ba76a1e8..b9c4beee4 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) @@ -15,15 +15,26 @@ go_template_instance( }, ) +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "gofer", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "dentry", + }, +) + go_library( name = "gofer", srcs = [ "dentry_list.go", "directory.go", "filesystem.go", + "fstree.go", "gofer.go", "handle.go", - "handle_unsafe.go", "p9file.go", "pagemath.go", "regular_file.go", @@ -41,11 +52,13 @@ go_library( "//pkg/p9", "//pkg/safemem", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/hostfd", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/syserror", @@ -53,3 +66,13 @@ go_library( "//pkg/usermem", ], ) + +go_test( + name = "gofer_test", + srcs = ["gofer_test.go"], + library = ":gofer", + deps = [ + "//pkg/p9", + "//pkg/sentry/contexttest", + ], +) diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 5dbfc6250..c67766ab2 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -21,21 +21,66 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) func (d *dentry) isDir() bool { return d.fileType() == linux.S_IFDIR } -// Preconditions: d.dirMu must be locked. d.isDir(). fs.opts.interop != -// InteropModeShared. -func (d *dentry) cacheNegativeChildLocked(name string) { - if d.negativeChildren == nil { - d.negativeChildren = make(map[string]struct{}) +// Preconditions: filesystem.renameMu must be locked. d.dirMu must be locked. +// d.isDir(). child must be a newly-created dentry that has never had a parent. +func (d *dentry) cacheNewChildLocked(child *dentry, name string) { + d.IncRef() // reference held by child on its parent + child.parent = d + child.name = name + if d.children == nil { + d.children = make(map[string]*dentry) } - d.negativeChildren[name] = struct{}{} + d.children[name] = child +} + +// Preconditions: d.dirMu must be locked. d.isDir(). +func (d *dentry) cacheNegativeLookupLocked(name string) { + // Don't cache negative lookups if InteropModeShared is in effect (since + // this makes remote lookup unavoidable), or if d.isSynthetic() (in which + // case the only files in the directory are those for which a dentry exists + // in d.children). Instead, just delete any previously-cached dentry. + if d.fs.opts.interop == InteropModeShared || d.isSynthetic() { + delete(d.children, name) + return + } + if d.children == nil { + d.children = make(map[string]*dentry) + } + d.children[name] = nil +} + +// createSyntheticDirectory creates a synthetic directory with the given name +// in d. +// +// Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain +// a child with the given name. +func (d *dentry) createSyntheticDirectoryLocked(name string, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) { + d2 := &dentry{ + refs: 1, // held by d + fs: d.fs, + mode: uint32(mode) | linux.S_IFDIR, + uid: uint32(kuid), + gid: uint32(kgid), + blockSize: usermem.PageSize, // arbitrary + handle: handle{ + fd: -1, + }, + } + d2.pf.dentry = d2 + d2.vfsd.Init(d2) + + d.cacheNewChildLocked(d2, name) + d.syntheticChildren++ } type directoryFD struct { @@ -56,14 +101,19 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fd.mu.Lock() defer fd.mu.Unlock() + d := fd.dentry() if fd.dirents == nil { - ds, err := fd.dentry().getDirents(ctx) + ds, err := d.getDirents(ctx) if err != nil { return err } fd.dirents = ds } + if d.cachedMetadataAuthoritative() { + d.touchAtime(fd.vfsfd.Mount()) + } + for fd.off < int64(len(fd.dirents)) { if err := cb.Handle(fd.dirents[fd.off]); err != nil { return err @@ -75,23 +125,21 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba // Preconditions: d.isDir(). There exists at least one directoryFD representing d. func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { - // 9P2000.L's readdir does not specify behavior in the presence of - // concurrent mutation of an iterated directory, so implementations may - // duplicate or omit entries in this case, which violates POSIX semantics. - // Thus we read all directory entries while holding d.dirMu to exclude - // directory mutations. (Note that it is impossible for the client to - // exclude concurrent mutation from other remote filesystem users. Since - // there is no way to detect if the server has incorrectly omitted - // directory entries, we simply assume that the server is well-behaved - // under InteropModeShared.) This is inconsistent with Linux (which appears - // to assume that directory fids have the correct semantics, and translates - // struct file_operations::readdir calls directly to readdir RPCs), but is - // consistent with VFS1. - // - // NOTE(b/135560623): In particular, some gofer implementations may not - // retain state between calls to Readdir, so may not provide a coherent - // directory stream across in the presence of mutation. + // NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the + // presence of concurrent mutation of an iterated directory, so + // implementations may duplicate or omit entries in this case, which + // violates POSIX semantics. Thus we read all directory entries while + // holding d.dirMu to exclude directory mutations. (Note that it is + // impossible for the client to exclude concurrent mutation from other + // remote filesystem users. Since there is no way to detect if the server + // has incorrectly omitted directory entries, we simply assume that the + // server is well-behaved under InteropModeShared.) This is inconsistent + // with Linux (which appears to assume that directory fids have the correct + // semantics, and translates struct file_operations::readdir calls directly + // to readdir RPCs), but is consistent with VFS1. + // filesystem.renameMu is needed for d.parent, and must be locked before + // dentry.dirMu. d.fs.renameMu.RLock() defer d.fs.renameMu.RUnlock() d.dirMu.Lock() @@ -102,7 +150,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { // It's not clear if 9P2000.L's readdir is expected to return "." and "..", // so we generate them here. - parent := d.vfsd.ParentOrSelf().Impl().(*dentry) + parent := genericParentOrSelf(d) dirents := []vfs.Dirent{ { Name: ".", @@ -117,50 +165,81 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { NextOff: 2, }, } - off := uint64(0) - const count = 64 * 1024 // for consistency with the vfs1 client - d.handleMu.RLock() - defer d.handleMu.RUnlock() - if !d.handleReadable { - // This should not be possible because a readable handle should have - // been opened when the calling directoryFD was opened. - panic("gofer.dentry.getDirents called without a readable handle") - } - for { - p9ds, err := d.handle.file.readdir(ctx, off, count) - if err != nil { - return nil, err + var realChildren map[string]struct{} + if !d.isSynthetic() { + if d.syntheticChildren != 0 && d.fs.opts.interop == InteropModeShared { + // Record the set of children d actually has so that we don't emit + // duplicate entries for synthetic children. + realChildren = make(map[string]struct{}) + } + off := uint64(0) + const count = 64 * 1024 // for consistency with the vfs1 client + d.handleMu.RLock() + if !d.handleReadable { + // This should not be possible because a readable handle should + // have been opened when the calling directoryFD was opened. + d.handleMu.RUnlock() + panic("gofer.dentry.getDirents called without a readable handle") } - if len(p9ds) == 0 { - // Cache dirents for future directoryFDs if permitted. - if d.fs.opts.interop != InteropModeShared { - d.dirents = dirents + for { + p9ds, err := d.handle.file.readdir(ctx, off, count) + if err != nil { + d.handleMu.RUnlock() + return nil, err } - return dirents, nil + if len(p9ds) == 0 { + d.handleMu.RUnlock() + break + } + for _, p9d := range p9ds { + if p9d.Name == "." || p9d.Name == ".." { + continue + } + dirent := vfs.Dirent{ + Name: p9d.Name, + Ino: p9d.QID.Path, + NextOff: int64(len(dirents) + 1), + } + // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or + // DMSOCKET. + switch p9d.Type { + case p9.TypeSymlink: + dirent.Type = linux.DT_LNK + case p9.TypeDir: + dirent.Type = linux.DT_DIR + default: + dirent.Type = linux.DT_REG + } + dirents = append(dirents, dirent) + if realChildren != nil { + realChildren[p9d.Name] = struct{}{} + } + } + off = p9ds[len(p9ds)-1].Offset } - for _, p9d := range p9ds { - if p9d.Name == "." || p9d.Name == ".." { + } + // Emit entries for synthetic children. + if d.syntheticChildren != 0 { + for _, child := range d.children { + if child == nil || !child.isSynthetic() { continue } - dirent := vfs.Dirent{ - Name: p9d.Name, - Ino: p9d.QID.Path, - NextOff: int64(len(dirents) + 1), - } - // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or - // DMSOCKET. - switch p9d.Type { - case p9.TypeSymlink: - dirent.Type = linux.DT_LNK - case p9.TypeDir: - dirent.Type = linux.DT_DIR - default: - dirent.Type = linux.DT_REG + if _, ok := realChildren[child.name]; ok { + continue } - dirents = append(dirents, dirent) + dirents = append(dirents, vfs.Dirent{ + Name: child.name, + Type: uint8(atomic.LoadUint32(&child.mode) >> 12), + Ino: child.ino, + NextOff: int64(len(dirents) + 1), + }) } - off = p9ds[len(p9ds)-1].Offset } + // Cache dirents for future directoryFDs if permitted. + if d.cachedMetadataAuthoritative() { + d.dirents = dirents + } + return dirents, nil } // Seek implements vfs.FileDescriptionImpl.Seek. diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 5cfb0dc4c..98ccb42fd 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -21,20 +21,24 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { - // Snapshot current dentries and special files. + // Snapshot current syncable dentries and special files. fs.syncMu.Lock() - ds := make([]*dentry, 0, len(fs.dentries)) - for d := range fs.dentries { + ds := make([]*dentry, 0, len(fs.syncableDentries)) + for d := range fs.syncableDentries { + d.IncRef() ds = append(ds, d) } sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs)) for sffd := range fs.specialFileFDs { + sffd.vfsfd.IncRef() sffds = append(sffds, sffd) } fs.syncMu.Unlock() @@ -45,9 +49,6 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync regular files. for _, d := range ds { - if !d.TryIncRef() { - continue - } err := d.syncSharedHandle(ctx) d.DecRef() if err != nil && retErr == nil { @@ -58,9 +59,6 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync special files, which may be writable but do not use dentry shared // handles (so they won't be synced by the above). for _, sffd := range sffds { - if !sffd.vfsfd.TryIncRef() { - continue - } err := sffd.Sync(ctx) sffd.vfsfd.DecRef() if err != nil && retErr == nil { @@ -112,13 +110,15 @@ func putDentrySlice(ds *[]*dentry) { // to *ds. // // Preconditions: fs.renameMu must be locked. d.dirMu must be locked. -// !rp.Done(). If fs.opts.interop == InteropModeShared, then d's cached -// metadata must be up to date. +// !rp.Done(). If !d.cachedMetadataAuthoritative(), then d's cached metadata +// must be up to date. +// +// Postconditions: The returned dentry's cached metadata is up to date. func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { if !d.isDir() { return nil, syserror.ENOTDIR } - if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } afterSymlink: @@ -128,39 +128,42 @@ afterSymlink: return d, nil } if name == ".." { - parentVFSD, err := rp.ResolveParent(&d.vfsd) - if err != nil { + if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return d, nil } - parent := parentVFSD.Impl().(*dentry) - if fs.opts.interop == InteropModeShared { - // We must assume that parentVFSD is correct, because if d has been - // moved elsewhere in the remote filesystem so that its parent has - // changed, we have no way of determining its new parent's location - // in the filesystem. Get updated metadata for parentVFSD. - _, attrMask, attr, err := parent.file.getAttr(ctx, dentryAttrMask()) + // We must assume that d.parent is correct, because if d has been moved + // elsewhere in the remote filesystem so that its parent has changed, + // we have no way of determining its new parent's location in the + // filesystem. + // + // Call rp.CheckMount() before updating d.parent's metadata, since if + // we traverse to another mount then d.parent's metadata is irrelevant. + if err := rp.CheckMount(&d.parent.vfsd); err != nil { + return nil, err + } + if d != d.parent && !d.cachedMetadataAuthoritative() { + _, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask()) if err != nil { return nil, err } - parent.updateFromP9Attrs(attrMask, &attr) + d.parent.updateFromP9Attrs(attrMask, &attr) } rp.Advance() - return parent, nil - } - childVFSD, err := rp.ResolveChild(&d.vfsd, name) - if err != nil { - return nil, err + return d.parent, nil } - // FIXME(jamieliu): Linux performs revalidation before mount lookup - // (fs/namei.c:lookup_fast() => __d_lookup_rcu(), d_revalidate(), - // __follow_mount_rcu()). - child, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, childVFSD, ds) + child, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), d, name, ds) if err != nil { return nil, err } if child == nil { return nil, syserror.ENOENT } + if err := rp.CheckMount(&child.vfsd); err != nil { + return nil, err + } if child.isSymlink() && rp.ShouldFollowSymlink() { target, err := child.readlink(ctx, rp.Mount()) if err != nil { @@ -175,38 +178,37 @@ afterSymlink: return child, nil } -// revalidateChildLocked must be called after a call to parent.vfsd.Child(name) -// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be -// nil) to verify that the returned child (or lack thereof) is correct. If no file -// exists at name, revalidateChildLocked returns (nil, nil). +// getChildLocked returns a dentry representing the child of parent with the +// given name. If no such child exists, getChildLocked returns (nil, nil). // // Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. // parent.isDir(). name is not "." or "..". // -// Postconditions: If revalidateChildLocked returns a non-nil dentry, its -// cached metadata is up to date. -func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, childVFSD *vfs.Dentry, ds **[]*dentry) (*dentry, error) { - if childVFSD != nil && fs.opts.interop != InteropModeShared { - // We have a cached dentry that is assumed to be correct. - return childVFSD.Impl().(*dentry), nil - } - // We either don't have a cached dentry or need to verify that it's still - // correct, either of which requires a remote lookup. Check if this name is - // valid before performing the lookup. +// Postconditions: If getChildLocked returns a non-nil dentry, its cached +// metadata is up to date. +func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { if len(name) > maxFilenameLen { return nil, syserror.ENAMETOOLONG } - // Check if we've already cached this lookup with a negative result. - if _, ok := parent.negativeChildren[name]; ok { - return nil, nil + child, ok := parent.children[name] + if (ok && fs.opts.interop != InteropModeShared) || parent.isSynthetic() { + // Whether child is nil or not, it is cached information that is + // assumed to be correct. + return child, nil } - // Perform the remote lookup. + // We either don't have cached information or need to verify that it's + // still correct, either of which requires a remote lookup. Check if this + // name is valid before performing the lookup. + return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds) +} + +// Preconditions: As for getChildLocked. !parent.isSynthetic(). +func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) { qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) if err != nil && err != syserror.ENOENT { return nil, err } - if childVFSD != nil { - child := childVFSD.Impl().(*dentry) + if child != nil { if !file.isNil() && qid.Path == child.ino { // The file at this path hasn't changed. Just update cached // metadata. @@ -214,29 +216,44 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir child.updateFromP9Attrs(attrMask, &attr) return child, nil } - // The file at this path has changed or no longer exists. Remove - // the stale dentry from the tree, and re-evaluate its caching - // status (i.e. if it has 0 references, drop it). - vfsObj.ForceDeleteDentry(childVFSD) + if file.isNil() && child.isSynthetic() { + // We have a synthetic file, and no remote file has arisen to + // replace it. + return child, nil + } + // The file at this path has changed or no longer exists. Mark the + // dentry invalidated, and re-evaluate its caching status (i.e. if it + // has 0 references, drop it). Wait to update parent.children until we + // know what to replace the existing dentry with (i.e. one of the + // returns below), to avoid a redundant map access. + vfsObj.InvalidateDentry(&child.vfsd) + if child.isSynthetic() { + // Normally we don't mark invalidated dentries as deleted since + // they may still exist (but at a different path), and also for + // consistency with Linux. However, synthetic files are guaranteed + // to become unreachable if their dentries are invalidated, so + // treat their invalidation as deletion. + child.setDeleted() + parent.syntheticChildren-- + child.decRefLocked() + parent.dirents = nil + } *ds = appendDentry(*ds, child) - childVFSD = nil } if file.isNil() { // No file exists at this path now. Cache the negative lookup if // allowed. - if fs.opts.interop != InteropModeShared { - parent.cacheNegativeChildLocked(name) - } + parent.cacheNegativeLookupLocked(name) return nil, nil } // Create a new dentry representing the file. - child, err := fs.newDentry(ctx, file, qid, attrMask, &attr) + child, err = fs.newDentry(ctx, file, qid, attrMask, &attr) if err != nil { file.close(ctx) + delete(parent.children, name) return nil, err } - parent.IncRef() // reference held by child on its parent - parent.vfsd.InsertChild(&child.vfsd, name) + parent.cacheNewChildLocked(child, name) // For now, child has 0 references, so our caller should call // child.checkCachingLocked(). *ds = appendDentry(*ds, child) @@ -248,8 +265,9 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // rp.Start().Impl().(*dentry)). It does not check that the returned directory // is searchable by the provider of rp. // -// Preconditions: fs.renameMu must be locked. !rp.Done(). If fs.opts.interop == -// InteropModeShared, then d's cached metadata must be up to date. +// Preconditions: fs.renameMu must be locked. !rp.Done(). If +// !d.cachedMetadataAuthoritative(), then d's cached metadata must be up to +// date. func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { for !rp.Final() { d.dirMu.Lock() @@ -271,7 +289,7 @@ func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving // Preconditions: fs.renameMu must be locked. func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { d := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !d.cachedMetadataAuthoritative() { // Get updated metadata for rp.Start() as required by fs.stepLocked(). if err := d.updateFromGetattr(ctx); err != nil { return nil, err @@ -293,16 +311,17 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, } // doCreateAt checks that creating a file at rp is permitted, then invokes -// create to do so. +// createInRemoteDir (if the parent directory is a real remote directory) or +// createInSyntheticDir (if the parent directory is synthetic) to do so. // // Preconditions: !rp.Done(). For the final path component in rp, // !rp.ShouldFollowSymlink(). -func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error { +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by // fs.walkParentDirLocked(). if err := start.updateFromGetattr(ctx); err != nil { @@ -313,12 +332,9 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err != nil { return err } - if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } - if parent.isDeleted() { - return syserror.ENOENT - } name := rp.Component() if name == "." || name == ".." { return syserror.EEXIST @@ -329,6 +345,9 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if !dir && rp.MustBeDir() { return syserror.ENOENT } + if parent.isDeleted() { + return syserror.ENOENT + } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { return err @@ -336,6 +355,20 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir defer mnt.EndWrite() parent.dirMu.Lock() defer parent.dirMu.Unlock() + if parent.isSynthetic() { + if child := parent.children[name]; child != nil { + return syserror.EEXIST + } + if createInSyntheticDir == nil { + return syserror.EPERM + } + if err := createInSyntheticDir(parent, name); err != nil { + return err + } + parent.touchCMtime() + parent.dirents = nil + return nil + } if fs.opts.interop == InteropModeShared { // The existence of a dentry at name would be inconclusive because the // file it represents may have been deleted from the remote filesystem, @@ -344,18 +377,21 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir // will fail with EEXIST like we would have. If the RPC succeeds, and a // stale dentry exists, the dentry will fail revalidation next time // it's used. - return create(parent, name) + return createInRemoteDir(parent, name) } - if parent.vfsd.Child(name) != nil { + if child := parent.children[name]; child != nil { return syserror.EEXIST } // No cached dentry exists; however, there might still be an existing file // at name. As above, we attempt the file creation RPC anyway. - if err := create(parent, name); err != nil { + if err := createInRemoteDir(parent, name); err != nil { return err } - parent.touchCMtime(ctx) - delete(parent.negativeChildren, name) + if child, ok := parent.children[name]; ok && child == nil { + // Delete the now-stale negative dentry. + delete(parent.children, name) + } + parent.touchCMtime() parent.dirents = nil return nil } @@ -366,7 +402,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by // fs.walkParentDirLocked(). if err := start.updateFromGetattr(ctx); err != nil { @@ -377,7 +413,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b if err != nil { return err } - if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } if err := rp.Mount().CheckBeginWrite(); err != nil { @@ -403,64 +439,107 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b defer mntns.DecRef() parent.dirMu.Lock() defer parent.dirMu.Unlock() - childVFSD := parent.vfsd.Child(name) - var child *dentry + child, ok := parent.children[name] + if ok && child == nil { + return syserror.ENOENT + } // We only need a dentry representing the file at name if it can be a mount - // point. If childVFSD is nil, then it can't be a mount point. If childVFSD - // is non-nil but stale, the actual file can't be a mount point either; we + // point. If child is nil, then it can't be a mount point. If child is + // non-nil but stale, the actual file can't be a mount point either; we // detect this case by just speculatively calling PrepareDeleteDentry and // only revalidating the dentry if that fails (indicating that the existing // dentry is a mount point). - if childVFSD != nil { - child = childVFSD.Impl().(*dentry) - if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { - child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, childVFSD, &ds) + if child != nil { + child.dirMu.Lock() + defer child.dirMu.Unlock() + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { + if parent.cachedMetadataAuthoritative() { + return err + } + child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds) if err != nil { return err } if child != nil { - childVFSD = &child.vfsd - if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } - } else { - childVFSD = nil } } - } else if _, ok := parent.negativeChildren[name]; ok { - return syserror.ENOENT } flags := uint32(0) + // If a dentry exists, use it for best-effort checks on its deletability. if dir { - if child != nil && !child.isDir() { - return syserror.ENOTDIR + if child != nil { + // child must be an empty directory. + if child.syntheticChildren != 0 { + // This is definitely not an empty directory, irrespective of + // fs.opts.interop. + vfsObj.AbortDeleteDentry(&child.vfsd) + return syserror.ENOTEMPTY + } + // If InteropModeShared is in effect and the first call to + // PrepareDeleteDentry above succeeded, then child wasn't + // revalidated (so we can't expect its file type to be correct) and + // individually revalidating its children (to confirm that they + // still exist) would be a waste of time. + if child.cachedMetadataAuthoritative() { + if !child.isDir() { + vfsObj.AbortDeleteDentry(&child.vfsd) + return syserror.ENOTDIR + } + for _, grandchild := range child.children { + if grandchild != nil { + vfsObj.AbortDeleteDentry(&child.vfsd) + return syserror.ENOTEMPTY + } + } + } } flags = linux.AT_REMOVEDIR } else { + // child must be a non-directory file. if child != nil && child.isDir() { + vfsObj.AbortDeleteDentry(&child.vfsd) return syserror.EISDIR } if rp.MustBeDir() { + if child != nil { + vfsObj.AbortDeleteDentry(&child.vfsd) + } return syserror.ENOTDIR } } - err = parent.file.unlinkAt(ctx, name, flags) - if err != nil { - if childVFSD != nil { - vfsObj.AbortDeleteDentry(childVFSD) + if parent.isSynthetic() { + if child == nil { + return syserror.ENOENT + } + } else { + err = parent.file.unlinkAt(ctx, name, flags) + if err != nil { + if child != nil { + vfsObj.AbortDeleteDentry(&child.vfsd) + } + return err } - return err - } - if fs.opts.interop != InteropModeShared { - parent.touchCMtime(ctx) - parent.cacheNegativeChildLocked(name) - parent.dirents = nil } if child != nil { + vfsObj.CommitDeleteDentry(&child.vfsd) child.setDeleted() - vfsObj.CommitDeleteDentry(childVFSD) + if child.isSynthetic() { + parent.syntheticChildren-- + child.decRefLocked() + } ds = appendDentry(ds, child) } + parent.cacheNegativeLookupLocked(name) + if parent.cachedMetadataAuthoritative() { + parent.dirents = nil + parent.touchCMtime() + if dir { + parent.decLinks() + } + } return nil } @@ -499,6 +578,18 @@ func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) { putDentrySlice(*ds) } +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.checkPermissions(creds, ats) +} + // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { var ds *[]*dentry @@ -512,7 +603,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op if !d.isDir() { return nil, syserror.ENOTDIR } - if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } @@ -526,7 +617,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by // fs.walkParentDirLocked(). if err := start.updateFromGetattr(ctx); err != nil { @@ -549,15 +640,32 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. } // 9P2000.L supports hard links, but we don't. return syserror.EPERM - }) + }, nil) } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + creds := rp.Credentials() return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error { - creds := rp.Credentials() - _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) - return err + if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil { + if !opts.ForSyntheticMountpoint || err == syserror.EEXIST { + return err + } + ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err) + parent.createSyntheticDirectoryLocked(name, opts.Mode, creds.EffectiveKUID, creds.EffectiveKGID) + } + if fs.opts.interop != InteropModeShared { + parent.incLinks() + } + return nil + }, func(parent *dentry, name string) error { + if !opts.ForSyntheticMountpoint { + // Can't create non-synthetic files in synthetic directories. + return syserror.EPERM + } + parent.createSyntheticDirectoryLocked(name, opts.Mode, creds.EffectiveKUID, creds.EffectiveKGID) + parent.incLinks() + return nil }) } @@ -567,7 +675,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v creds := rp.Credentials() _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) return err - }) + }, nil) } // OpenAt implements vfs.FilesystemImpl.OpenAt. @@ -587,7 +695,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf defer fs.renameMuRUnlockAndCheckCaching(&ds) start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by fs.stepLocked(). if err := start.updateFromGetattr(ctx); err != nil { return nil, err @@ -603,13 +711,17 @@ afterTrailingSymlink: return nil, err } // Check for search permission in the parent directory. - if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Determine whether or not we need to create a file. parent.dirMu.Lock() child, err := fs.stepLocked(ctx, rp, parent, &ds) if err == syserror.ENOENT && mayCreate { + if parent.isSynthetic() { + parent.dirMu.Unlock() + return nil, syserror.EPERM + } fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts) parent.dirMu.Unlock() return fd, err @@ -640,7 +752,7 @@ afterTrailingSymlink: // Preconditions: fs.renameMu must be locked. func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) - if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil { + if err := d.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } mnt := rp.Mount() @@ -669,8 +781,10 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf if opts.Flags&linux.O_DIRECT != 0 { return nil, syserror.EINVAL } - if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { - return nil, err + if !d.isSynthetic() { + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { + return nil, err + } } fd := &directoryFD{} if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { @@ -700,8 +814,9 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf } // Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked. +// !d.isSynthetic(). func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { - if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if d.isDeleted() { @@ -777,17 +892,14 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving // eligible for caching yet, so we don't need to append to a dentry slice.) child.refs = 1 // Insert the dentry into the tree. - d.IncRef() // reference held by child on its parent d - d.vfsd.InsertChild(&child.vfsd, name) - if d.fs.opts.interop != InteropModeShared { - d.touchCMtime(ctx) - delete(d.negativeChildren, name) + d.cacheNewChildLocked(child, name) + if d.cachedMetadataAuthoritative() { + d.touchCMtime() d.dirents = nil } // Finally, construct a file description representing the created file. var childVFSFD *vfs.FileDescription - mnt.IncRef() if useRegularFileFD { fd := ®ularFileFD{} if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ @@ -858,12 +970,12 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa defer mnt.EndWrite() oldParent := oldParentVD.Dentry().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !oldParent.cachedMetadataAuthoritative() { if err := oldParent.updateFromGetattr(ctx); err != nil { return err } } - if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } vfsObj := rp.VirtualFilesystem() @@ -871,7 +983,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // directory, we need to check for write permission on it. oldParent.dirMu.Lock() defer oldParent.dirMu.Unlock() - renamed, err := fs.revalidateChildLocked(ctx, vfsObj, oldParent, oldName, oldParent.vfsd.Child(oldName), &ds) + renamed, err := fs.getChildLocked(ctx, vfsObj, oldParent, oldName, &ds) if err != nil { return err } @@ -879,11 +991,11 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return syserror.ENOENT } if renamed.isDir() { - if renamed == newParent || renamed.vfsd.IsAncestorOf(&newParent.vfsd) { + if renamed == newParent || genericIsAncestorDentry(renamed, newParent) { return syserror.EINVAL } if oldParent != newParent { - if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return err } } @@ -894,7 +1006,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } if oldParent != newParent { - if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } newParent.dirMu.Lock() @@ -903,32 +1015,21 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if newParent.isDeleted() { return syserror.ENOENT } - replacedVFSD := newParent.vfsd.Child(newName) - var replaced *dentry - // This is similar to unlinkAt, except: - // - // - We revalidate the replaced dentry unconditionally for simplicity. - // - // - If rp.MustBeDir(), then we need a dentry representing the replaced - // file regardless to confirm that it's a directory. - if replacedVFSD != nil || rp.MustBeDir() { - replaced, err = fs.revalidateChildLocked(ctx, vfsObj, newParent, newName, replacedVFSD, &ds) - if err != nil { - return err - } - if replaced != nil { - if replaced.isDir() { - if !renamed.isDir() { - return syserror.EISDIR - } - } else { - if rp.MustBeDir() || renamed.isDir() { - return syserror.ENOTDIR - } + replaced, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds) + if err != nil { + return err + } + var replacedVFSD *vfs.Dentry + if replaced != nil { + replacedVFSD = &replaced.vfsd + if replaced.isDir() { + if !renamed.isDir() { + return syserror.EISDIR } - replacedVFSD = &replaced.vfsd } else { - replacedVFSD = nil + if rp.MustBeDir() || renamed.isDir() { + return syserror.ENOTDIR + } } } @@ -940,17 +1041,73 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { return err } - if err := renamed.file.rename(ctx, newParent.file, newName); err != nil { - vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) - return err + + // Update the remote filesystem. + if !renamed.isSynthetic() { + if err := renamed.file.rename(ctx, newParent.file, newName); err != nil { + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + } else if replaced != nil && !replaced.isSynthetic() { + // We are replacing an existing real file with a synthetic one, so we + // need to unlink the former. + flags := uint32(0) + if replaced.isDir() { + flags = linux.AT_REMOVEDIR + } + if err := newParent.file.unlinkAt(ctx, newName, flags); err != nil { + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + } + + // Update the dentry tree. + vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) + if replaced != nil { + replaced.setDeleted() + if replaced.isSynthetic() { + newParent.syntheticChildren-- + replaced.decRefLocked() + } + ds = appendDentry(ds, replaced) + } + oldParent.cacheNegativeLookupLocked(oldName) + // We don't use newParent.cacheNewChildLocked() since we don't want to mess + // with reference counts and queue oldParent for checkCachingLocked if the + // parent isn't actually changing. + if oldParent != newParent { + ds = appendDentry(ds, oldParent) + newParent.IncRef() + if renamed.isSynthetic() { + oldParent.syntheticChildren-- + newParent.syntheticChildren++ + } } - if fs.opts.interop != InteropModeShared { - oldParent.cacheNegativeChildLocked(oldName) + renamed.parent = newParent + renamed.name = newName + if newParent.children == nil { + newParent.children = make(map[string]*dentry) + } + newParent.children[newName] = renamed + + // Update metadata. + if renamed.cachedMetadataAuthoritative() { + renamed.touchCtime() + } + if oldParent.cachedMetadataAuthoritative() { oldParent.dirents = nil - delete(newParent.negativeChildren, newName) + oldParent.touchCMtime() + if renamed.isDir() { + oldParent.decLinks() + } + } + if newParent.cachedMetadataAuthoritative() { newParent.dirents = nil + newParent.touchCMtime() + if renamed.isDir() { + newParent.incLinks() + } } - vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD) return nil } @@ -997,6 +1154,10 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu if err != nil { return linux.Statfs{}, err } + // If d is synthetic, invoke statfs on the first ancestor of d that isn't. + for d.isSynthetic() { + d = d.parent + } fsstat, err := d.file.statFS(ctx) if err != nil { return linux.Statfs{}, err @@ -1026,7 +1187,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ creds := rp.Credentials() _, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) return err - }) + }, nil) } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. @@ -1034,8 +1195,21 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return fs.unlinkAt(ctx, rp, false /* dir */) } +// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + _, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + // TODO(gvisor.dev/issue/1476): Implement BoundEndpointAt. + return nil, syserror.ECONNREFUSED +} + // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) @@ -1043,11 +1217,11 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([ if err != nil { return nil, err } - return d.listxattr(ctx) + return d.listxattr(ctx, rp.Credentials(), size) } // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) @@ -1055,7 +1229,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, nam if err != nil { return "", err } - return d.getxattr(ctx, name) + return d.getxattr(ctx, rp.Credentials(), &opts) } // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. @@ -1067,7 +1241,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt if err != nil { return err } - return d.setxattr(ctx, &opts) + return d.setxattr(ctx, rp.Credentials(), &opts) } // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. @@ -1079,12 +1253,12 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, if err != nil { return err } - return d.removexattr(ctx, name) + return d.removexattr(ctx, rp.Credentials(), name) } // PrependPath implements vfs.FilesystemImpl.PrependPath. func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.renameMu.RLock() defer fs.renameMu.RUnlock() - return vfs.GenericPrependPath(vfsroot, vd, b) + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index c4a8f0b38..8b4e91d17 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -27,13 +27,15 @@ // dentry.handleMu // dentry.dataMu // -// Locking dentry.dirMu in multiple dentries requires holding -// filesystem.renameMu for writing. +// Locking dentry.dirMu in multiple dentries requires that either ancestor +// dentries are locked before descendant dentries, or that filesystem.renameMu +// is locked for writing. package gofer import ( "fmt" "strconv" + "strings" "sync" "sync/atomic" "syscall" @@ -44,6 +46,7 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -72,6 +75,9 @@ type filesystem struct { // client is the client used by this filesystem. client is immutable. client *p9.Client + // clock is a realtime clock used to set timestamps in file operations. + clock ktime.Clock + // uid and gid are the effective KUID and KGID of the filesystem's creator, // and are used as the owner and group for files that don't specify one. // uid and gid are immutable. @@ -97,11 +103,12 @@ type filesystem struct { cachedDentries dentryList cachedDentriesLen uint64 - // dentries contains all dentries in this filesystem. specialFileFDs - // contains all open specialFileFDs. These fields are protected by syncMu. - syncMu sync.Mutex - dentries map[*dentry]struct{} - specialFileFDs map[*specialFileFD]struct{} + // syncableDentries contains all dentries in this filesystem for which + // !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs. + // These fields are protected by syncMu. + syncMu sync.Mutex + syncableDentries map[*dentry]struct{} + specialFileFDs map[*specialFileFD]struct{} } type filesystemOptions struct { @@ -182,7 +189,8 @@ const ( // InteropModeShared is appropriate when there are users of the remote // filesystem that may mutate its state other than the client. // - // - The client must verify cached filesystem state before using it. + // - The client must verify ("revalidate") cached filesystem state before + // using it. // // - Client changes to filesystem state must be sent to the remote // filesystem synchronously. @@ -199,6 +207,11 @@ const ( InteropModeShared ) +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { mfp := pgalloc.MemoryFileProviderFromContext(ctx) @@ -366,15 +379,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Construct the filesystem object. fs := &filesystem{ - mfp: mfp, - opts: fsopts, - uid: creds.EffectiveKUID, - gid: creds.EffectiveKGID, - client: client, - dentries: make(map[*dentry]struct{}), - specialFileFDs: make(map[*specialFileFD]struct{}), + mfp: mfp, + opts: fsopts, + uid: creds.EffectiveKUID, + gid: creds.EffectiveKGID, + client: client, + clock: ktime.RealtimeClockFromContext(ctx), + syncableDentries: make(map[*dentry]struct{}), + specialFileFDs: make(map[*specialFileFD]struct{}), } - fs.vfsfs.Init(vfsObj, fs) + fs.vfsfs.Init(vfsObj, &fstype, fs) // Construct the root dentry. root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr) @@ -398,7 +412,7 @@ func (fs *filesystem) Release() { mf := fs.mfp.MemoryFile() fs.syncMu.Lock() - for d := range fs.dentries { + for d := range fs.syncableDentries { d.handleMu.Lock() d.dataMu.Lock() if d.handleWritable { @@ -433,16 +447,35 @@ type dentry struct { vfsd vfs.Dentry // refs is the reference count. Each dentry holds a reference on its - // parent, even if disowned. refs is accessed using atomic memory - // operations. + // parent, even if disowned. An additional reference is held on all + // synthetic dentries until they are unlinked or invalidated. When refs + // reaches 0, the dentry may be added to the cache or destroyed. If refs == + // -1, the dentry has already been destroyed. refs is accessed using atomic + // memory operations. refs int64 // fs is the owning filesystem. fs is immutable. fs *filesystem + // parent is this dentry's parent directory. Each dentry holds a reference + // on its parent. If this dentry is a filesystem root, parent is nil. + // parent is protected by filesystem.renameMu. + parent *dentry + + // name is the name of this dentry in its parent. If this dentry is a + // filesystem root, name is the empty string. name is protected by + // filesystem.renameMu. + name string + // We don't support hard links, so each dentry maps 1:1 to an inode. // file is the unopened p9.File that backs this dentry. file is immutable. + // + // If file.isNil(), this dentry represents a synthetic file, i.e. a file + // that does not exist on the remote filesystem. As of this writing, this + // is only possible for a directory created with + // MkdirOptions.ForSyntheticMountpoint == true. + // TODO(gvisor.dev/issue/1476): Support synthetic sockets (and pipes). file p9file // If deleted is non-zero, the file represented by this dentry has been @@ -457,15 +490,26 @@ type dentry struct { dirMu sync.Mutex - // If this dentry represents a directory, and InteropModeShared is not in - // effect, negativeChildren is a set of child names in this directory that - // are known not to exist. negativeChildren is protected by dirMu. - negativeChildren map[string]struct{} - - // If this dentry represents a directory, InteropModeShared is not in - // effect, and dirents is not nil, it is a cache of all entries in the - // directory, in the order they were returned by the server. dirents is - // protected by dirMu. + // If this dentry represents a directory, children contains: + // + // - Mappings of child filenames to dentries representing those children. + // + // - Mappings of child filenames that are known not to exist to nil + // dentries (only if InteropModeShared is not in effect and the directory + // is not synthetic). + // + // children is protected by dirMu. + children map[string]*dentry + + // If this dentry represents a directory, syntheticChildren is the number + // of child dentries for which dentry.isSynthetic() == true. + // syntheticChildren is protected by dirMu. + syntheticChildren int + + // If this dentry represents a directory, + // dentry.cachedMetadataAuthoritative() == true, and dirents is not nil, it + // is a cache of all entries in the directory, in the order they were + // returned by the server. dirents is protected by dirMu. dirents []vfs.Dirent // Cached metadata; protected by metadataMu and accessed using atomic @@ -485,6 +529,11 @@ type dentry struct { // locked to mutate it). size uint64 + // nlink counts the number of hard links to this dentry. It's updated and + // accessed using atomic operations. It's not protected by metadataMu like the + // other metadata fields. + nlink uint32 + mapsMu sync.Mutex // If this dentry represents a regular file, mappings tracks mappings of @@ -557,6 +606,8 @@ func dentryAttrMask() p9.AttrMask { // initially has no references, but is not cached; it is the caller's // responsibility to set the dentry's reference count and/or call // dentry.checkCachingLocked() as appropriate. +// +// Preconditions: !file.isNil(). func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) { if !mask.Mode { ctx.Warningf("can't create gofer.dentry without file type") @@ -580,10 +631,10 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma }, } d.pf.dentry = d - if mask.UID { + if mask.UID && attr.UID != auth.NoID { d.uid = uint32(attr.UID) } - if mask.GID { + if mask.GID && attr.GID != auth.NoID { d.gid = uint32(attr.GID) } if mask.Size { @@ -604,14 +655,25 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma if mask.BTime { d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds) } + if mask.NLink { + d.nlink = uint32(attr.NLink) + } d.vfsd.Init(d) fs.syncMu.Lock() - fs.dentries[d] = struct{}{} + fs.syncableDentries[d] = struct{}{} fs.syncMu.Unlock() return d, nil } +func (d *dentry) isSynthetic() bool { + return d.file.isNil() +} + +func (d *dentry) cachedMetadataAuthoritative() bool { + return d.fs.opts.interop != InteropModeShared || d.isSynthetic() +} + // updateFromP9Attrs is called to update d's metadata after an update from the // remote filesystem. func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { @@ -645,6 +707,9 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { if mask.BTime { atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)) } + if mask.NLink { + atomic.StoreUint32(&d.nlink, uint32(attr.NLink)) + } if mask.Size { d.dataMu.Lock() atomic.StoreUint64(&d.size, attr.Size) @@ -653,6 +718,7 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { d.metadataMu.Unlock() } +// Preconditions: !d.isSynthetic() func (d *dentry) updateFromGetattr(ctx context.Context) error { // Use d.handle.file, which represents a 9P fid that has been opened, in // preference to d.file, which represents a 9P fid that has not. This may @@ -687,10 +753,7 @@ func (d *dentry) fileType() uint32 { func (d *dentry) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME stat.Blksize = atomic.LoadUint32(&d.blockSize) - stat.Nlink = 1 - if d.isDir() { - stat.Nlink = 2 - } + stat.Nlink = atomic.LoadUint32(&d.nlink) stat.UID = atomic.LoadUint32(&d.uid) stat.GID = atomic.LoadUint32(&d.gid) stat.Mode = uint16(atomic.LoadUint32(&d.mode)) @@ -703,7 +766,7 @@ func (d *dentry) statTo(stat *linux.Statx) { stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime)) stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime)) stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime)) - // TODO(jamieliu): device number + // TODO(gvisor.dev/issue/1198): device number } func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error { @@ -713,7 +776,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { return err } if err := mnt.CheckBeginWrite(); err != nil { @@ -722,7 +786,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin defer mnt.EndWrite() setLocalAtime := false setLocalMtime := false - if d.fs.opts.interop != InteropModeShared { + if d.cachedMetadataAuthoritative() { // Timestamp updates will be handled locally. setLocalAtime = stat.Mask&linux.STATX_ATIME != 0 setLocalMtime = stat.Mask&linux.STATX_MTIME != 0 @@ -735,40 +799,39 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin } d.metadataMu.Lock() defer d.metadataMu.Unlock() - if stat.Mask != 0 { - if err := d.file.setAttr(ctx, p9.SetAttrMask{ - Permissions: stat.Mask&linux.STATX_MODE != 0, - UID: stat.Mask&linux.STATX_UID != 0, - GID: stat.Mask&linux.STATX_GID != 0, - Size: stat.Mask&linux.STATX_SIZE != 0, - ATime: stat.Mask&linux.STATX_ATIME != 0, - MTime: stat.Mask&linux.STATX_MTIME != 0, - ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW, - MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW, - }, p9.SetAttr{ - Permissions: p9.FileMode(stat.Mode), - UID: p9.UID(stat.UID), - GID: p9.GID(stat.GID), - Size: stat.Size, - ATimeSeconds: uint64(stat.Atime.Sec), - ATimeNanoSeconds: uint64(stat.Atime.Nsec), - MTimeSeconds: uint64(stat.Mtime.Sec), - MTimeNanoSeconds: uint64(stat.Mtime.Nsec), - }); err != nil { - return err + if !d.isSynthetic() { + if stat.Mask != 0 { + if err := d.file.setAttr(ctx, p9.SetAttrMask{ + Permissions: stat.Mask&linux.STATX_MODE != 0, + UID: stat.Mask&linux.STATX_UID != 0, + GID: stat.Mask&linux.STATX_GID != 0, + Size: stat.Mask&linux.STATX_SIZE != 0, + ATime: stat.Mask&linux.STATX_ATIME != 0, + MTime: stat.Mask&linux.STATX_MTIME != 0, + ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW, + MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW, + }, p9.SetAttr{ + Permissions: p9.FileMode(stat.Mode), + UID: p9.UID(stat.UID), + GID: p9.GID(stat.GID), + Size: stat.Size, + ATimeSeconds: uint64(stat.Atime.Sec), + ATimeNanoSeconds: uint64(stat.Atime.Nsec), + MTimeSeconds: uint64(stat.Mtime.Sec), + MTimeNanoSeconds: uint64(stat.Mtime.Nsec), + }); err != nil { + return err + } + } + if d.fs.opts.interop == InteropModeShared { + // There's no point to updating d's metadata in this case since + // it'll be overwritten by revalidation before the next time it's + // used anyway. (InteropModeShared inhibits client caching of + // regular file data, so there's no cache to truncate either.) + return nil } } - if d.fs.opts.interop == InteropModeShared { - // There's no point to updating d's metadata in this case since it'll - // be overwritten by revalidation before the next time it's used - // anyway. (InteropModeShared inhibits client caching of regular file - // data, so there's no cache to truncate either.) - return nil - } - now, haveNow := nowFromContext(ctx) - if !haveNow { - ctx.Warningf("gofer.dentry.setStat: current time not available") - } + now := d.fs.clock.Now().Nanoseconds() if stat.Mask&linux.STATX_MODE != 0 { atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode)) } @@ -780,25 +843,19 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin } if setLocalAtime { if stat.Atime.Nsec == linux.UTIME_NOW { - if haveNow { - atomic.StoreInt64(&d.atime, now) - } + atomic.StoreInt64(&d.atime, now) } else { atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime)) } } if setLocalMtime { if stat.Mtime.Nsec == linux.UTIME_NOW { - if haveNow { - atomic.StoreInt64(&d.mtime, now) - } + atomic.StoreInt64(&d.mtime, now) } else { atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime)) } } - if haveNow { - atomic.StoreInt64(&d.ctime, now) - } + atomic.StoreInt64(&d.ctime, now) if stat.Mask&linux.STATX_SIZE != 0 { d.dataMu.Lock() oldSize := d.size @@ -835,8 +892,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin return nil } -func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { - return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&d.mode))&0777, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) +func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) } // IncRef implements vfs.DentryImpl.IncRef. @@ -850,7 +907,7 @@ func (d *dentry) IncRef() { func (d *dentry) TryIncRef() bool { for { refs := atomic.LoadInt64(&d.refs) - if refs == 0 { + if refs <= 0 { return false } if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) { @@ -870,16 +927,32 @@ func (d *dentry) DecRef() { } } +// decRefLocked decrements d's reference count without calling +// d.checkCachingLocked, even if d's reference count reaches 0; callers are +// responsible for ensuring that d.checkCachingLocked will be called later. +func (d *dentry) decRefLocked() { + if refs := atomic.AddInt64(&d.refs, -1); refs < 0 { + panic("gofer.dentry.decRefLocked() called without holding a reference") + } +} + // checkCachingLocked should be called after d's reference count becomes 0 or it // becomes disowned. // +// It may be called on a destroyed dentry. For example, +// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times +// for the same dentry when the dentry is visited more than once in the same +// operation. One of the calls may destroy the dentry, so subsequent calls will +// do nothing. +// // Preconditions: d.fs.renameMu must be locked for writing. func (d *dentry) checkCachingLocked() { // Dentries with a non-zero reference count must be retained. (The only way // to obtain a reference on a dentry with zero references is via path // resolution, which requires renameMu, so if d.refs is zero then it will // remain zero while we hold renameMu for writing.) - if atomic.LoadInt64(&d.refs) != 0 { + refs := atomic.LoadInt64(&d.refs) + if refs > 0 { if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- @@ -887,9 +960,13 @@ func (d *dentry) checkCachingLocked() { } return } - // Non-child dentries with zero references are no longer reachable by path - // resolution and should be dropped immediately. - if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() { + if refs == -1 { + // Dentry has already been destroyed. + return + } + // Deleted and invalidated dentries with zero references are no longer + // reachable by path resolution and should be dropped immediately. + if d.vfsd.IsDead() { if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- @@ -914,34 +991,45 @@ func (d *dentry) checkCachingLocked() { d.fs.cachedDentries.Remove(victim) d.fs.cachedDentriesLen-- victim.cached = false - // victim.refs may have become non-zero from an earlier path - // resolution since it was inserted into fs.cachedDentries; see - // dentry.incRefLocked(). Either way, we brought - // fs.cachedDentriesLen back down to fs.opts.maxCachedDentries, so - // we don't loop. + // victim.refs may have become non-zero from an earlier path resolution + // since it was inserted into fs.cachedDentries. if atomic.LoadInt64(&victim.refs) == 0 { - if victimParentVFSD := victim.vfsd.Parent(); victimParentVFSD != nil { - victimParent := victimParentVFSD.Impl().(*dentry) - victimParent.dirMu.Lock() - if !victim.vfsd.IsDisowned() { - // victim can't be a mount point (in any mount - // namespace), since VFS holds references on mount - // points. - d.fs.vfsfs.VirtualFilesystem().ForceDeleteDentry(&victim.vfsd) + if victim.parent != nil { + victim.parent.dirMu.Lock() + if !victim.vfsd.IsDead() { + // Note that victim can't be a mount point (in any mount + // namespace), since VFS holds references on mount points. + d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(&victim.vfsd) + delete(victim.parent.children, victim.name) // We're only deleting the dentry, not the file it // represents, so we don't need to update // victimParent.dirents etc. } - victimParent.dirMu.Unlock() + victim.parent.dirMu.Unlock() } victim.destroyLocked() } + // Whether or not victim was destroyed, we brought fs.cachedDentriesLen + // back down to fs.opts.maxCachedDentries, so we don't loop. } } +// destroyLocked destroys the dentry. It may flushes dirty pages from cache, +// close p9 file and remove reference on parent dentry. +// // Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is // not a child dentry. func (d *dentry) destroyLocked() { + switch atomic.LoadInt64(&d.refs) { + case 0: + // Mark the dentry destroyed. + atomic.StoreInt64(&d.refs, -1) + case -1: + panic("dentry.destroyLocked() called on already destroyed dentry") + default: + panic("dentry.destroyLocked() called with references on the dentry") + } + ctx := context.Background() d.handleMu.Lock() if !d.handle.file.isNil() { @@ -961,17 +1049,19 @@ func (d *dentry) destroyLocked() { d.handle.close(ctx) } d.handleMu.Unlock() - d.file.close(ctx) - // Remove d from the set of all dentries. - d.fs.syncMu.Lock() - delete(d.fs.dentries, d) - d.fs.syncMu.Unlock() - // Drop the reference held by d on its parent. - if parentVFSD := d.vfsd.Parent(); parentVFSD != nil { - parent := parentVFSD.Impl().(*dentry) - // This is parent.DecRef() without recursive locking of d.fs.renameMu. - if refs := atomic.AddInt64(&parent.refs, -1); refs == 0 { - parent.checkCachingLocked() + if !d.file.isNil() { + d.file.close(ctx) + d.file = p9file{} + // Remove d from the set of syncable dentries. + d.fs.syncMu.Lock() + delete(d.fs.syncableDentries, d) + d.fs.syncMu.Unlock() + } + // Drop the reference held by d on its parent without recursively locking + // d.fs.renameMu. + if d.parent != nil { + if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 { + d.parent.checkCachingLocked() } else if refs < 0 { panic("gofer.dentry.DecRef() called without holding a reference") } @@ -986,24 +1076,65 @@ func (d *dentry) setDeleted() { atomic.StoreUint32(&d.deleted, 1) } -func (d *dentry) listxattr(ctx context.Context) ([]string, error) { - return nil, syserror.ENOTSUP +// We only support xattrs prefixed with "user." (see b/148380782). Currently, +// there is no need to expose any other xattrs through a gofer. +func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) { + if d.file.isNil() { + return nil, nil + } + xattrMap, err := d.file.listXattr(ctx, size) + if err != nil { + return nil, err + } + xattrs := make([]string, 0, len(xattrMap)) + for x := range xattrMap { + if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) { + xattrs = append(xattrs, x) + } + } + return xattrs, nil } -func (d *dentry) getxattr(ctx context.Context, name string) (string, error) { - // TODO(jamieliu): add vfs.GetxattrOptions.Size - return d.file.getXattr(ctx, name, linux.XATTR_SIZE_MAX) +func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { + if d.file.isNil() { + return "", syserror.ENODATA + } + if err := d.checkPermissions(creds, vfs.MayRead); err != nil { + return "", err + } + if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { + return "", syserror.EOPNOTSUPP + } + return d.file.getXattr(ctx, opts.Name, opts.Size) } -func (d *dentry) setxattr(ctx context.Context, opts *vfs.SetxattrOptions) error { +func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error { + if d.file.isNil() { + return syserror.EPERM + } + if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags) } -func (d *dentry) removexattr(ctx context.Context, name string) error { - return syserror.ENOTSUP +func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error { + if d.file.isNil() { + return syserror.EPERM + } + if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + return d.file.removeXattr(ctx, name) } -// Preconditions: d.isRegularFile() || d.isDirectory(). +// Preconditions: !d.file.isNil(). d.isRegularFile() || d.isDirectory(). func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { // O_TRUNC unconditionally requires us to obtain a new handle (opened with // O_TRUNC). @@ -1051,7 +1182,7 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // description, but this doesn't matter since they refer to the // same file (unless d.fs.opts.overlayfsStaleRead is true, // which we handle separately). - if err := syscall.Dup3(int(h.fd), int(d.handle.fd), 0); err != nil { + if err := syscall.Dup3(int(h.fd), int(d.handle.fd), syscall.O_CLOEXEC); err != nil { d.handleMu.Unlock() ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err) h.close(ctx) @@ -1094,6 +1225,26 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool return nil } +// incLinks increments link count. +// +// Preconditions: d.nlink != 0 && d.nlink < math.MaxUint32. +func (d *dentry) incLinks() { + v := atomic.AddUint32(&d.nlink, 1) + if v < 2 { + panic(fmt.Sprintf("dentry.nlink is invalid (was 0 or overflowed): %d", v)) + } +} + +// decLinks decrements link count. +// +// Preconditions: d.nlink > 1. +func (d *dentry) decLinks() { + v := atomic.AddUint32(&d.nlink, ^uint32(0)) + if v == 0 { + panic(fmt.Sprintf("dentry.nlink must be greater than 0: %d", v)) + } +} + // fileDescription is embedded by gofer implementations of // vfs.FileDescriptionImpl. type fileDescription struct { @@ -1112,7 +1263,8 @@ func (fd *fileDescription) dentry() *dentry { // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { d := fd.dentry() - if d.fs.opts.interop == InteropModeShared && opts.Mask&(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE|linux.STATX_BLOCKS|linux.STATX_BTIME) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { + const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME) + if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { // TODO(jamieliu): Use specialFileFD.handle.file for the getattr if // available? if err := d.updateFromGetattr(ctx); err != nil { @@ -1130,21 +1282,21 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) } // Listxattr implements vfs.FileDescriptionImpl.Listxattr. -func (fd *fileDescription) Listxattr(ctx context.Context) ([]string, error) { - return fd.dentry().listxattr(ctx) +func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { + return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size) } // Getxattr implements vfs.FileDescriptionImpl.Getxattr. -func (fd *fileDescription) Getxattr(ctx context.Context, name string) (string, error) { - return fd.dentry().getxattr(ctx, name) +func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) { + return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts) } // Setxattr implements vfs.FileDescriptionImpl.Setxattr. func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { - return fd.dentry().setxattr(ctx, &opts) + return fd.dentry().setxattr(ctx, auth.CredentialsFromContext(ctx), &opts) } // Removexattr implements vfs.FileDescriptionImpl.Removexattr. func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { - return fd.dentry().removexattr(ctx, name) + return fd.dentry().removexattr(ctx, auth.CredentialsFromContext(ctx), name) } diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go new file mode 100644 index 000000000..adff39490 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -0,0 +1,63 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync/atomic" + "testing" + + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/contexttest" +) + +func TestDestroyIdempotent(t *testing.T) { + fs := filesystem{ + syncableDentries: make(map[*dentry]struct{}), + opts: filesystemOptions{ + // Test relies on no dentry being held in the cache. + maxCachedDentries: 0, + }, + } + + ctx := contexttest.Context(t) + attr := &p9.Attr{ + Mode: p9.ModeRegular, + } + mask := p9.AttrMask{ + Mode: true, + Size: true, + } + parent, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr) + if err != nil { + t.Fatalf("fs.newDentry(): %v", err) + } + + child, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr) + if err != nil { + t.Fatalf("fs.newDentry(): %v", err) + } + parent.cacheNewChildLocked(child, "child") + + child.checkCachingLocked() + if got := atomic.LoadInt64(&child.refs); got != -1 { + t.Fatalf("child.refs=%d, want: -1", got) + } + // Parent will also be destroyed when child reference is removed. + if got := atomic.LoadInt64(&parent.refs); got != -1 { + t.Fatalf("parent.refs=%d, want: -1", got) + } + child.checkCachingLocked() + child.checkCachingLocked() +} diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go index cfe66f797..724a3f1f7 100644 --- a/pkg/sentry/fsimpl/gofer/handle.go +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/hostfd" ) // handle represents a remote "open file descriptor", consisting of an opened @@ -77,7 +78,7 @@ func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offs } if h.fd >= 0 { ctx.UninterruptibleSleepStart(false) - n, err := hostPreadv(h.fd, dsts, int64(offset)) + n, err := hostfd.Preadv2(h.fd, dsts, int64(offset), 0 /* flags */) ctx.UninterruptibleSleepFinish(false) return n, err } @@ -103,7 +104,7 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o } if h.fd >= 0 { ctx.UninterruptibleSleepStart(false) - n, err := hostPwritev(h.fd, srcs, int64(offset)) + n, err := hostfd.Pwritev2(h.fd, srcs, int64(offset), 0 /* flags */) ctx.UninterruptibleSleepFinish(false) return n, err } diff --git a/pkg/sentry/fsimpl/gofer/handle_unsafe.go b/pkg/sentry/fsimpl/gofer/handle_unsafe.go deleted file mode 100644 index 19560ab26..000000000 --- a/pkg/sentry/fsimpl/gofer/handle_unsafe.go +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gofer - -import ( - "syscall" - "unsafe" - - "gvisor.dev/gvisor/pkg/safemem" -) - -// Preconditions: !dsts.IsEmpty(). -func hostPreadv(fd int32, dsts safemem.BlockSeq, off int64) (uint64, error) { - // No buffering is necessary regardless of safecopy; host syscalls will - // return EFAULT if appropriate, instead of raising SIGBUS. - if dsts.NumBlocks() == 1 { - // Use pread() instead of preadv() to avoid iovec allocation and - // copying. - dst := dsts.Head() - n, _, e := syscall.Syscall6(syscall.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil - } - iovs := safemem.IovecsFromBlockSeq(dsts) - n, _, e := syscall.Syscall6(syscall.SYS_PREADV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil -} - -// Preconditions: !srcs.IsEmpty(). -func hostPwritev(fd int32, srcs safemem.BlockSeq, off int64) (uint64, error) { - // No buffering is necessary regardless of safecopy; host syscalls will - // return EFAULT if appropriate, instead of raising SIGBUS. - if srcs.NumBlocks() == 1 { - // Use pwrite() instead of pwritev() to avoid iovec allocation and - // copying. - src := srcs.Head() - n, _, e := syscall.Syscall6(syscall.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil - } - iovs := safemem.IovecsFromBlockSeq(srcs) - n, _, e := syscall.Syscall6(syscall.SYS_PWRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil -} diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go index 755ac2985..87f0b877f 100644 --- a/pkg/sentry/fsimpl/gofer/p9file.go +++ b/pkg/sentry/fsimpl/gofer/p9file.go @@ -85,6 +85,13 @@ func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAt return err } +func (f p9file) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) { + ctx.UninterruptibleSleepStart(false) + xattrs, err := f.file.ListXattr(size) + ctx.UninterruptibleSleepFinish(false) + return xattrs, err +} + func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) { ctx.UninterruptibleSleepStart(false) val, err := f.file.GetXattr(name, size) @@ -99,6 +106,13 @@ func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32) return err } +func (f p9file) removeXattr(ctx context.Context, name string) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.RemoveXattr(name) + ctx.UninterruptibleSleepFinish(false) + return err +} + func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error { ctx.UninterruptibleSleepStart(false) err := f.file.Allocate(mode, offset, length) diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index e95209661..857f7c74e 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -104,7 +104,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs putDentryReadWriter(rw) if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). - d.touchAtime(ctx, fd.vfsfd.Mount()) + d.touchAtime(fd.vfsfd.Mount()) } return n, err } @@ -126,6 +126,11 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off if opts.Flags != 0 { return 0, syserror.EOPNOTSUPP } + limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) + if err != nil { + return 0, err + } + src = src.TakeFirst64(limit) d := fd.dentry() d.metadataMu.Lock() @@ -134,10 +139,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // Compare Linux's mm/filemap.c:__generic_file_write_iter() => // file_update_time(). This is d.touchCMtime(), but without locking // d.metadataMu (recursively). - if now, ok := nowFromContext(ctx); ok { - atomic.StoreInt64(&d.mtime, now) - atomic.StoreInt64(&d.ctime, now) - } + d.touchCMtimeLocked() } if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { // Write dirty cached pages that will be touched by the write back to diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 08c691c47..507e0e276 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -76,7 +76,7 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs // hold here since specialFileFD doesn't client-cache data. Just buffer the // read instead. if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { - d.touchAtime(ctx, fd.vfsfd.Mount()) + d.touchAtime(fd.vfsfd.Mount()) } buf := make([]byte, dst.NumBytes()) n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) @@ -107,9 +107,17 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, syserror.EOPNOTSUPP } + if fd.dentry().fileType() == linux.S_IFREG { + limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) + if err != nil { + return 0, err + } + src = src.TakeFirst64(limit) + } + // Do a buffered write. See rationale in PRead. if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { - d.touchCMtime(ctx) + d.touchCMtime() } buf := make([]byte, src.NumBytes()) // Don't do partial writes if we get a partial read from src. diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go index adf43be60..2ec819f86 100644 --- a/pkg/sentry/fsimpl/gofer/symlink.go +++ b/pkg/sentry/fsimpl/gofer/symlink.go @@ -27,7 +27,7 @@ func (d *dentry) isSymlink() bool { // Precondition: d.isSymlink(). func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { if d.fs.opts.interop != InteropModeShared { - d.touchAtime(ctx, mnt) + d.touchAtime(mnt) d.dataMu.Lock() if d.haveTarget { target := d.target diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go index 7598ec6a8..2608e7e1d 100644 --- a/pkg/sentry/fsimpl/gofer/time.go +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -18,8 +18,6 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -38,23 +36,12 @@ func statxTimestampFromDentry(ns int64) linux.StatxTimestamp { } } -func nowFromContext(ctx context.Context) (int64, bool) { - if clock := ktime.RealtimeClockFromContext(ctx); clock != nil { - return clock.Now().Nanoseconds(), true - } - return 0, false -} - // Preconditions: fs.interop != InteropModeShared. -func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) { +func (d *dentry) touchAtime(mnt *vfs.Mount) { if err := mnt.CheckBeginWrite(); err != nil { return } - now, ok := nowFromContext(ctx) - if !ok { - mnt.EndWrite() - return - } + now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() atomic.StoreInt64(&d.atime, now) d.metadataMu.Unlock() @@ -63,13 +50,25 @@ func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) { // Preconditions: fs.interop != InteropModeShared. The caller has successfully // called vfs.Mount.CheckBeginWrite(). -func (d *dentry) touchCMtime(ctx context.Context) { - now, ok := nowFromContext(ctx) - if !ok { - return - } +func (d *dentry) touchCtime() { + now := d.fs.clock.Now().Nanoseconds() + d.metadataMu.Lock() + atomic.StoreInt64(&d.ctime, now) + d.metadataMu.Unlock() +} + +// Preconditions: fs.interop != InteropModeShared. The caller has successfully +// called vfs.Mount.CheckBeginWrite(). +func (d *dentry) touchCMtime() { + now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() atomic.StoreInt64(&d.mtime, now) atomic.StoreInt64(&d.ctime, now) d.metadataMu.Unlock() } + +func (d *dentry) touchCMtimeLocked() { + now := d.fs.clock.Now().Nanoseconds() + atomic.StoreInt64(&d.mtime, now) + atomic.StoreInt64(&d.ctime, now) +} diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 731f192b3..2dcb03a73 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -5,19 +5,28 @@ licenses(["notice"]) go_library( name = "host", srcs = [ - "default_file.go", + "control.go", "host.go", + "ioctl_unsafe.go", + "tty.go", "util.go", + "util_unsafe.go", ], + visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", "//pkg/log", "//pkg/refs", - "//pkg/safemem", + "//pkg/sentry/arch", "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/hostfd", + "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", + "//pkg/sentry/socket/control", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/unimpl", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/fsimpl/host/control.go b/pkg/sentry/fsimpl/host/control.go new file mode 100644 index 000000000..b9082a20f --- /dev/null +++ b/pkg/sentry/fsimpl/host/control.go @@ -0,0 +1,96 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/control" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +type scmRights struct { + fds []int +} + +func newSCMRights(fds []int) control.SCMRightsVFS2 { + return &scmRights{fds} +} + +// Files implements control.SCMRights.Files. +func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFilesVFS2, bool) { + n := max + var trunc bool + if l := len(c.fds); n > l { + n = l + } else if n < l { + trunc = true + } + + rf := control.RightsFilesVFS2(fdsToFiles(ctx, c.fds[:n])) + + // Only consume converted FDs (fdsToFiles may convert fewer than n FDs). + c.fds = c.fds[len(rf):] + return rf, trunc +} + +// Clone implements transport.RightsControlMessage.Clone. +func (c *scmRights) Clone() transport.RightsControlMessage { + // Host rights never need to be cloned. + return nil +} + +// Release implements transport.RightsControlMessage.Release. +func (c *scmRights) Release() { + for _, fd := range c.fds { + syscall.Close(fd) + } + c.fds = nil +} + +// If an error is encountered, only files created before the error will be +// returned. This is what Linux does. +func fdsToFiles(ctx context.Context, fds []int) []*vfs.FileDescription { + files := make([]*vfs.FileDescription, 0, len(fds)) + for _, fd := range fds { + // Get flags. We do it here because they may be modified + // by subsequent functions. + fileFlags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0) + if errno != 0 { + ctx.Warningf("Error retrieving host FD flags: %v", error(errno)) + break + } + + // Create the file backed by hostFD. + file, err := ImportFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fd, false /* isTTY */) + if err != nil { + ctx.Warningf("Error creating file from host FD: %v", err) + break + } + + if err := file.SetStatusFlags(ctx, auth.CredentialsFromContext(ctx), uint32(fileFlags&linux.O_NONBLOCK)); err != nil { + ctx.Warningf("Error setting flags on host FD file: %v", err) + break + } + + files = append(files, file) + } + return files +} diff --git a/pkg/sentry/fsimpl/host/default_file.go b/pkg/sentry/fsimpl/host/default_file.go deleted file mode 100644 index 172cdb161..000000000 --- a/pkg/sentry/fsimpl/host/default_file.go +++ /dev/null @@ -1,233 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package host - -import ( - "math" - "syscall" - - "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -// defaultFileFD implements FileDescriptionImpl for non-socket, non-TTY files. -type defaultFileFD struct { - fileDescription - - // canMap specifies whether we allow the file to be memory mapped. - canMap bool - - // mu protects the fields below. - mu sync.Mutex - - // offset specifies the current file offset. - offset int64 -} - -// TODO(gvisor.dev/issue/1672): Implement Waitable interface. - -// PRead implements FileDescriptionImpl. -func (f *defaultFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if f.inode.isStream { - return 0, syserror.ESPIPE - } - - return readFromHostFD(ctx, f.inode.hostFD, dst, offset, int(opts.Flags)) -} - -// Read implements FileDescriptionImpl. -func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if f.inode.isStream { - // These files can't be memory mapped, assert this. - if f.canMap { - panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") - } - - f.mu.Lock() - n, err := readFromHostFD(ctx, f.inode.hostFD, dst, -1, int(opts.Flags)) - f.mu.Unlock() - if isBlockError(err) { - // If we got any data at all, return it as a "completed" partial read - // rather than retrying until complete. - if n != 0 { - err = nil - } else { - err = syserror.ErrWouldBlock - } - } - return n, err - } - // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. - f.mu.Lock() - n, err := readFromHostFD(ctx, f.inode.hostFD, dst, f.offset, int(opts.Flags)) - f.offset += n - f.mu.Unlock() - return n, err -} - -func readFromHostFD(ctx context.Context, fd int, dst usermem.IOSequence, offset int64, flags int) (int64, error) { - if flags&^(linux.RWF_VALID) != 0 { - return 0, syserror.EOPNOTSUPP - } - - reader := safemem.FromVecReaderFunc{ - func(srcs [][]byte) (int64, error) { - n, err := unix.Preadv2(fd, srcs, offset, flags) - return int64(n), err - }, - } - n, err := dst.CopyOutFrom(ctx, reader) - return int64(n), err -} - -// PWrite implements FileDescriptionImpl. -func (f *defaultFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if f.inode.isStream { - return 0, syserror.ESPIPE - } - - return writeToHostFD(ctx, f.inode.hostFD, src, offset, int(opts.Flags)) -} - -// Write implements FileDescriptionImpl. -func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if f.inode.isStream { - // These files can't be memory mapped, assert this. - if f.canMap { - panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") - } - - f.mu.Lock() - n, err := writeToHostFD(ctx, f.inode.hostFD, src, -1, int(opts.Flags)) - f.mu.Unlock() - if isBlockError(err) { - err = syserror.ErrWouldBlock - } - return n, err - } - // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. - // TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file. - f.mu.Lock() - n, err := writeToHostFD(ctx, f.inode.hostFD, src, f.offset, int(opts.Flags)) - f.offset += n - f.mu.Unlock() - return n, err -} - -func writeToHostFD(ctx context.Context, fd int, src usermem.IOSequence, offset int64, flags int) (int64, error) { - if flags&^(linux.RWF_VALID) != 0 { - return 0, syserror.EOPNOTSUPP - } - - writer := safemem.FromVecWriterFunc{ - func(srcs [][]byte) (int64, error) { - n, err := unix.Pwritev2(fd, srcs, offset, flags) - return int64(n), err - }, - } - n, err := src.CopyInTo(ctx, writer) - return int64(n), err -} - -// Seek implements FileDescriptionImpl. -// -// Note that we do not support seeking on directories, since we do not even -// allow directory fds to be imported at all. -func (f *defaultFileFD) Seek(_ context.Context, offset int64, whence int32) (int64, error) { - // TODO(b/34716638): Some char devices do support seeking, e.g. /dev/null. - if f.inode.isStream { - return 0, syserror.ESPIPE - } - - f.mu.Lock() - defer f.mu.Unlock() - - switch whence { - case linux.SEEK_SET: - if offset < 0 { - return f.offset, syserror.EINVAL - } - f.offset = offset - - case linux.SEEK_CUR: - // Check for overflow. Note that underflow cannot occur, since f.offset >= 0. - if offset > math.MaxInt64-f.offset { - return f.offset, syserror.EOVERFLOW - } - if f.offset+offset < 0 { - return f.offset, syserror.EINVAL - } - f.offset += offset - - case linux.SEEK_END: - var s syscall.Stat_t - if err := syscall.Fstat(f.inode.hostFD, &s); err != nil { - return f.offset, err - } - size := s.Size - - // Check for overflow. Note that underflow cannot occur, since size >= 0. - if offset > math.MaxInt64-size { - return f.offset, syserror.EOVERFLOW - } - if size+offset < 0 { - return f.offset, syserror.EINVAL - } - f.offset = size + offset - - case linux.SEEK_DATA, linux.SEEK_HOLE: - // Modifying the offset in the host file table should not matter, since - // this is the only place where we use it. - // - // For reading and writing, we always rely on our internal offset. - n, err := unix.Seek(f.inode.hostFD, offset, int(whence)) - if err != nil { - return f.offset, err - } - f.offset = n - - default: - // Invalid whence. - return f.offset, syserror.EINVAL - } - - return f.offset, nil -} - -// Sync implements FileDescriptionImpl. -func (f *defaultFileFD) Sync(context.Context) error { - // TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything. - return unix.Fsync(f.inode.hostFD) -} - -// ConfigureMMap implements FileDescriptionImpl. -func (f *defaultFileFD) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error { - if !f.canMap { - return syserror.ENODEV - } - // TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface. - return syserror.ENODEV -} diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index c205e6a0b..1e53b5c1b 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -19,6 +19,7 @@ package host import ( "errors" "fmt" + "math" "syscall" "golang.org/x/sys/unix" @@ -27,45 +28,79 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) +// filesystemType implements vfs.FilesystemType. +type filesystemType struct{} + +// GetFilesystem implements FilesystemType.GetFilesystem. +func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + panic("host.filesystemType.GetFilesystem should never be called") +} + +// Name implements FilesystemType.Name. +func (filesystemType) Name() string { + return "none" +} + // filesystem implements vfs.FilesystemImpl. type filesystem struct { kernfs.Filesystem } +// NewFilesystem sets up and returns a new hostfs filesystem. +// +// Note that there should only ever be one instance of host.filesystem, +// a global mount for host fds. +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { + fs := &filesystem{} + fs.Init(vfsObj, filesystemType{}) + return fs.VFSFilesystem() +} + // ImportFD sets up and returns a vfs.FileDescription from a donated fd. -func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID, isTTY bool) (*vfs.FileDescription, error) { - // Must be importing to a mount of host.filesystem. - fs, ok := mnt.Filesystem().Impl().(*filesystem) +func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) { + fs, ok := mnt.Filesystem().Impl().(*kernfs.Filesystem) if !ok { return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) } // Retrieve metadata. - var s syscall.Stat_t - if err := syscall.Fstat(hostFD, &s); err != nil { + var s unix.Stat_t + if err := unix.Fstat(hostFD, &s); err != nil { return nil, err } fileMode := linux.FileMode(s.Mode) fileType := fileMode.FileType() - // Pipes, character devices, and sockets can return EWOULDBLOCK for - // operations that would block. - isStream := fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK + + // Determine if hostFD is seekable. If not, this syscall will return ESPIPE + // (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character + // devices. + _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR) + seekable := err != syserror.ESPIPE i := &inode{ hostFD: hostFD, - isStream: isStream, + seekable: seekable, isTTY: isTTY, + canMap: canMap(uint32(fileType)), ino: fs.NextIno(), - mode: fileMode, - uid: ownerUID, - gid: ownerGID, + // For simplicity, set offset to 0. Technically, we should use the existing + // offset on the host if the file is seekable. + offset: 0, + } + + // Non-seekable files can't be memory mapped, assert this. + if !i.seekable && i.canMap { + panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") } d := &kernfs.Dentry{} @@ -73,7 +108,7 @@ func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID // i.open will take a reference on d. defer d.DecRef() - return i.open(d.VFSDentry(), mnt) + return i.open(ctx, d.VFSDentry(), mnt) } // inode implements kernfs.Inode. @@ -90,32 +125,33 @@ type inode struct { // This field is initialized at creation time and is immutable. hostFD int - // isStream is true if the host fd points to a file representing a stream, + // seekable is false if the host fd points to a file representing a stream, // e.g. a socket or a pipe. Such files are not seekable and can return // EWOULDBLOCK for I/O operations. // // This field is initialized at creation time and is immutable. - isStream bool + seekable bool // isTTY is true if this file represents a TTY. // // This field is initialized at creation time and is immutable. isTTY bool + // canMap specifies whether we allow the file to be memory mapped. + // + // This field is initialized at creation time and is immutable. + canMap bool + // ino is an inode number unique within this filesystem. + // + // This field is initialized at creation time and is immutable. ino uint64 - // mu protects the inode metadata below. - mu sync.Mutex + // offsetMu protects offset. + offsetMu sync.Mutex - // mode is the file mode of this inode. Note that this value may become out - // of date if the mode is changed on the host, e.g. with chmod. - mode linux.FileMode - - // uid and gid of the file owner. Note that these refer to the owner of the - // file created on import, not the fd on the host. - uid auth.KUID - gid auth.KGID + // offset specifies the current file offset. + offset int64 } // Note that these flags may become out of date, since they can be modified @@ -132,55 +168,167 @@ func fileFlagsFromHostFD(fd int) (int, error) { } // CheckPermissions implements kernfs.Inode. -func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, atx vfs.AccessTypes) error { - return vfs.GenericCheckPermissions(creds, atx, false /* isDir */, uint16(i.mode), i.uid, i.gid) +func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { + var s syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &s); err != nil { + return err + } + return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid)) } // Mode implements kernfs.Inode. func (i *inode) Mode() linux.FileMode { - return i.mode + var s syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &s); err != nil { + // Retrieving the mode from the host fd using fstat(2) should not fail. + // If the syscall does not succeed, something is fundamentally wrong. + panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err)) + } + return linux.FileMode(s.Mode) } // Stat implements kernfs.Inode. func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + if opts.Mask&linux.STATX__RESERVED != 0 { + return linux.Statx{}, syserror.EINVAL + } + if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE { + return linux.Statx{}, syserror.EINVAL + } + + // Limit our host call only to known flags. + mask := opts.Mask & linux.STATX_ALL var s unix.Statx_t - if err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(opts.Mask), &s); err != nil { + err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s) + // Fallback to fstat(2), if statx(2) is not supported on the host. + // + // TODO(b/151263641): Remove fallback. + if err == syserror.ENOSYS { + return i.fstat(opts) + } else if err != nil { return linux.Statx{}, err } - ls := unixToLinuxStatx(s) - // Use our own internal inode number and file owner. + ls := linux.Statx{Mask: mask} + // Unconditionally fill blksize, attributes, and device numbers, as indicated + // by /include/uapi/linux/stat.h. + // + // RdevMajor/RdevMinor are left as zero, so as not to expose host device + // numbers. + // + // TODO(gvisor.dev/issue/1672): Use kernfs-specific, internally defined + // device numbers. If we use the device number from the host, it may collide + // with another sentry-internal device number. We handle device/inode + // numbers without relying on the host to prevent collisions. + ls.Blksize = s.Blksize + ls.Attributes = s.Attributes + ls.AttributesMask = s.Attributes_mask + + if mask&linux.STATX_TYPE != 0 { + ls.Mode |= s.Mode & linux.S_IFMT + } + if mask&linux.STATX_MODE != 0 { + ls.Mode |= s.Mode &^ linux.S_IFMT + } + if mask&linux.STATX_NLINK != 0 { + ls.Nlink = s.Nlink + } + if mask&linux.STATX_UID != 0 { + ls.UID = s.Uid + } + if mask&linux.STATX_GID != 0 { + ls.GID = s.Gid + } + if mask&linux.STATX_ATIME != 0 { + ls.Atime = unixToLinuxStatxTimestamp(s.Atime) + } + if mask&linux.STATX_BTIME != 0 { + ls.Btime = unixToLinuxStatxTimestamp(s.Btime) + } + if mask&linux.STATX_CTIME != 0 { + ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime) + } + if mask&linux.STATX_MTIME != 0 { + ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime) + } + if mask&linux.STATX_SIZE != 0 { + ls.Size = s.Size + } + if mask&linux.STATX_BLOCKS != 0 { + ls.Blocks = s.Blocks + } + + // Use our own internal inode number. + if mask&linux.STATX_INO != 0 { + ls.Ino = i.ino + } + + return ls, nil +} + +// fstat is a best-effort fallback for inode.Stat() if the host does not +// support statx(2). +// +// We ignore the mask and sync flags in opts and simply supply +// STATX_BASIC_STATS, as fstat(2) itself does not allow the specification +// of a mask or sync flags. fstat(2) does not provide any metadata +// equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so +// those fields remain empty. +func (i *inode) fstat(opts vfs.StatOptions) (linux.Statx, error) { + var s unix.Stat_t + if err := unix.Fstat(i.hostFD, &s); err != nil { + return linux.Statx{}, err + } + + // Note that rdev numbers are left as 0; do not expose host device numbers. + ls := linux.Statx{ + Mask: linux.STATX_BASIC_STATS, + Blksize: uint32(s.Blksize), + Nlink: uint32(s.Nlink), + UID: s.Uid, + GID: s.Gid, + Mode: uint16(s.Mode), + Size: uint64(s.Size), + Blocks: uint64(s.Blocks), + Atime: timespecToStatxTimestamp(s.Atim), + Ctime: timespecToStatxTimestamp(s.Ctim), + Mtime: timespecToStatxTimestamp(s.Mtim), + } + + // Use our own internal inode number. // // TODO(gvisor.dev/issue/1672): Use a kernfs-specific device number as well. // If we use the device number from the host, it may collide with another // sentry-internal device number. We handle device/inode numbers without // relying on the host to prevent collisions. ls.Ino = i.ino - ls.UID = uint32(i.uid) - ls.GID = uint32(i.gid) - - // Update file mode from the host. - i.mode = linux.FileMode(ls.Mode) return ls, nil } // SetStat implements kernfs.Inode. -func (i *inode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { +func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { s := opts.Stat m := s.Mask if m == 0 { return nil } - if m&(linux.STATX_UID|linux.STATX_GID) != 0 { + if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 { return syserror.EPERM } + var hostStat syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &hostStat); err != nil { + return err + } + if err := vfs.CheckSetStat(ctx, creds, &s, linux.FileMode(hostStat.Mode&linux.PermissionsMask), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil { + return err + } + if m&linux.STATX_MODE != 0 { if err := syscall.Fchmod(i.hostFD, uint32(s.Mode)); err != nil { return err } - i.mode = linux.FileMode(s.Mode) } if m&linux.STATX_SIZE != 0 { if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil { @@ -188,11 +336,11 @@ func (i *inode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { } } if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 { - timestamps := []unix.Timespec{ + ts := [2]syscall.Timespec{ toTimespec(s.Atime, m&linux.STATX_ATIME == 0), toTimespec(s.Mtime, m&linux.STATX_MTIME == 0), } - if err := unix.UtimesNanoAt(i.hostFD, "", timestamps, unix.AT_EMPTY_PATH); err != nil { + if err := setTimestamps(i.hostFD, &ts); err != nil { return err } } @@ -212,13 +360,16 @@ func (i *inode) Destroy() { } // Open implements kernfs.Inode. -func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - return i.open(vfsd, rp.Mount()) +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + return i.open(ctx, vfsd, rp.Mount()) } -func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) { - - fileType := i.mode.FileType() +func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) { + var s syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &s); err != nil { + return nil, err + } + fileType := s.Mode & linux.FileTypeMask if fileType == syscall.S_IFSOCK { if i.isTTY { return nil, errors.New("cannot use host socket as TTY") @@ -227,36 +378,42 @@ func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error return nil, errors.New("importing host sockets not supported") } + // TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that + // we don't allow importing arbitrary file types without proper support. + var ( + vfsfd *vfs.FileDescription + fdImpl vfs.FileDescriptionImpl + ) if i.isTTY { - // TODO(gvisor.dev/issue/1672): support importing host fd as TTY. - return nil, errors.New("importing host fd as TTY not supported") - } - - // For simplicity, set offset to 0. Technically, we should - // only set to 0 on files that are not seekable (sockets, pipes, etc.), - // and use the offset from the host fd otherwise. - fd := &defaultFileFD{ - fileDescription: fileDescription{ - inode: i, - }, - canMap: canMap(uint32(fileType)), - mu: sync.Mutex{}, - offset: 0, + fd := &ttyFD{ + fileDescription: fileDescription{inode: i}, + termios: linux.DefaultSlaveTermios, + } + vfsfd = &fd.vfsfd + fdImpl = fd + } else { + // For simplicity, set offset to 0. Technically, we should + // only set to 0 on files that are not seekable (sockets, pipes, etc.), + // and use the offset from the host fd otherwise. + fd := &fileDescription{inode: i} + vfsfd = &fd.vfsfd + fdImpl = fd } - vfsfd := &fd.vfsfd flags, err := fileFlagsFromHostFD(i.hostFD) if err != nil { return nil, err } - if err := vfsfd.Init(fd, uint32(flags), mnt, d, &vfs.FileDescriptionOptions{}); err != nil { + if err := vfsfd.Init(fdImpl, uint32(flags), mnt, d, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return vfsfd, nil } // fileDescription is embedded by host fd implementations of FileDescriptionImpl. +// +// TODO(gvisor.dev/issue/1672): Implement Waitable interface. type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -271,8 +428,9 @@ type fileDescription struct { } // SetStat implements vfs.FileDescriptionImpl. -func (f *fileDescription) SetStat(_ context.Context, opts vfs.SetStatOptions) error { - return f.inode.SetStat(nil, opts) +func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + creds := auth.CredentialsFromContext(ctx) + return f.inode.SetStat(ctx, nil, creds, opts) } // Stat implements vfs.FileDescriptionImpl. @@ -284,3 +442,168 @@ func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.S func (f *fileDescription) Release() { // noop } + +// PRead implements FileDescriptionImpl. +func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + i := f.inode + if !i.seekable { + return 0, syserror.ESPIPE + } + + return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags) +} + +// Read implements FileDescriptionImpl. +func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + i := f.inode + if !i.seekable { + n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags) + if isBlockError(err) { + // If we got any data at all, return it as a "completed" partial read + // rather than retrying until complete. + if n != 0 { + err = nil + } else { + err = syserror.ErrWouldBlock + } + } + return n, err + } + // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. + i.offsetMu.Lock() + n, err := readFromHostFD(ctx, i.hostFD, dst, i.offset, opts.Flags) + i.offset += n + i.offsetMu.Unlock() + return n, err +} + +func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) { + // TODO(gvisor.dev/issue/1672): Support select preadv2 flags. + if flags != 0 { + return 0, syserror.EOPNOTSUPP + } + reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) + n, err := dst.CopyOutFrom(ctx, reader) + hostfd.PutReadWriterAt(reader) + return int64(n), err +} + +// PWrite implements FileDescriptionImpl. +func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + i := f.inode + if !i.seekable { + return 0, syserror.ESPIPE + } + + return writeToHostFD(ctx, i.hostFD, src, offset, opts.Flags) +} + +// Write implements FileDescriptionImpl. +func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + i := f.inode + if !i.seekable { + n, err := writeToHostFD(ctx, i.hostFD, src, -1, opts.Flags) + if isBlockError(err) { + err = syserror.ErrWouldBlock + } + return n, err + } + // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. + // TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file. + i.offsetMu.Lock() + n, err := writeToHostFD(ctx, i.hostFD, src, i.offset, opts.Flags) + i.offset += n + i.offsetMu.Unlock() + return n, err +} + +func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags uint32) (int64, error) { + // TODO(gvisor.dev/issue/1672): Support select pwritev2 flags. + if flags != 0 { + return 0, syserror.EOPNOTSUPP + } + writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) + n, err := src.CopyInTo(ctx, writer) + hostfd.PutReadWriterAt(writer) + return int64(n), err +} + +// Seek implements FileDescriptionImpl. +// +// Note that we do not support seeking on directories, since we do not even +// allow directory fds to be imported at all. +func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) { + i := f.inode + if !i.seekable { + return 0, syserror.ESPIPE + } + + i.offsetMu.Lock() + defer i.offsetMu.Unlock() + + switch whence { + case linux.SEEK_SET: + if offset < 0 { + return i.offset, syserror.EINVAL + } + i.offset = offset + + case linux.SEEK_CUR: + // Check for overflow. Note that underflow cannot occur, since i.offset >= 0. + if offset > math.MaxInt64-i.offset { + return i.offset, syserror.EOVERFLOW + } + if i.offset+offset < 0 { + return i.offset, syserror.EINVAL + } + i.offset += offset + + case linux.SEEK_END: + var s syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &s); err != nil { + return i.offset, err + } + size := s.Size + + // Check for overflow. Note that underflow cannot occur, since size >= 0. + if offset > math.MaxInt64-size { + return i.offset, syserror.EOVERFLOW + } + if size+offset < 0 { + return i.offset, syserror.EINVAL + } + i.offset = size + offset + + case linux.SEEK_DATA, linux.SEEK_HOLE: + // Modifying the offset in the host file table should not matter, since + // this is the only place where we use it. + // + // For reading and writing, we always rely on our internal offset. + n, err := unix.Seek(i.hostFD, offset, int(whence)) + if err != nil { + return i.offset, err + } + i.offset = n + + default: + // Invalid whence. + return i.offset, syserror.EINVAL + } + + return i.offset, nil +} + +// Sync implements FileDescriptionImpl. +func (f *fileDescription) Sync(context.Context) error { + // TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything. + return unix.Fsync(f.inode.hostFD) +} + +// ConfigureMMap implements FileDescriptionImpl. +func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error { + if !f.inode.canMap { + return syserror.ENODEV + } + // TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface. + return syserror.ENODEV +} diff --git a/pkg/sentry/fsimpl/host/ioctl_unsafe.go b/pkg/sentry/fsimpl/host/ioctl_unsafe.go new file mode 100644 index 000000000..0983bf7d8 --- /dev/null +++ b/pkg/sentry/fsimpl/host/ioctl_unsafe.go @@ -0,0 +1,56 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/abi/linux" +) + +func ioctlGetTermios(fd int) (*linux.Termios, error) { + var t linux.Termios + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t))) + if errno != 0 { + return nil, errno + } + return &t, nil +} + +func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error { + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(unsafe.Pointer(t))) + if errno != 0 { + return errno + } + return nil +} + +func ioctlGetWinsize(fd int) (*linux.Winsize, error) { + var w linux.Winsize + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCGWINSZ, uintptr(unsafe.Pointer(&w))) + if errno != 0 { + return nil, errno + } + return &w, nil +} + +func ioctlSetWinsize(fd int, w *linux.Winsize) error { + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCSWINSZ, uintptr(unsafe.Pointer(w))) + if errno != 0 { + return errno + } + return nil +} diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go new file mode 100644 index 000000000..8936afb06 --- /dev/null +++ b/pkg/sentry/fsimpl/host/tty.go @@ -0,0 +1,379 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/unimpl" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// ttyFD implements vfs.FileDescriptionImpl for a host file descriptor +// that wraps a TTY FD. +type ttyFD struct { + fileDescription + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // session is the session attached to this ttyFD. + session *kernel.Session + + // fgProcessGroup is the foreground process group that is currently + // connected to this TTY. + fgProcessGroup *kernel.ProcessGroup + + // termios contains the terminal attributes for this TTY. + termios linux.KernelTermios +} + +// InitForegroundProcessGroup sets the foreground process group and session for +// the TTY. This should only be called once, after the foreground process group +// has been created, but before it has started running. +func (t *ttyFD) InitForegroundProcessGroup(pg *kernel.ProcessGroup) { + t.mu.Lock() + defer t.mu.Unlock() + if t.fgProcessGroup != nil { + panic("foreground process group is already set") + } + t.fgProcessGroup = pg + t.session = pg.Session() +} + +// ForegroundProcessGroup returns the foreground process for the TTY. +func (t *ttyFD) ForegroundProcessGroup() *kernel.ProcessGroup { + t.mu.Lock() + defer t.mu.Unlock() + return t.fgProcessGroup +} + +// Release implements fs.FileOperations.Release. +func (t *ttyFD) Release() { + t.mu.Lock() + t.fgProcessGroup = nil + t.mu.Unlock() + + t.fileDescription.Release() +} + +// PRead implements vfs.FileDescriptionImpl. +// +// Reading from a TTY is only allowed for foreground process groups. Background +// process groups will either get EIO or a SIGTTIN. +func (t *ttyFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Are we allowed to do the read? + // drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change(). + if err := t.checkChange(ctx, linux.SIGTTIN); err != nil { + return 0, err + } + + // Do the read. + return t.fileDescription.PRead(ctx, dst, offset, opts) +} + +// Read implements vfs.FileDescriptionImpl. +// +// Reading from a TTY is only allowed for foreground process groups. Background +// process groups will either get EIO or a SIGTTIN. +func (t *ttyFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Are we allowed to do the read? + // drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change(). + if err := t.checkChange(ctx, linux.SIGTTIN); err != nil { + return 0, err + } + + // Do the read. + return t.fileDescription.Read(ctx, dst, opts) +} + +// PWrite implements vfs.FileDescriptionImpl. +func (t *ttyFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Check whether TOSTOP is enabled. This corresponds to the check in + // drivers/tty/n_tty.c:n_tty_write(). + if t.termios.LEnabled(linux.TOSTOP) { + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + } + return t.fileDescription.PWrite(ctx, src, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl. +func (t *ttyFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Check whether TOSTOP is enabled. This corresponds to the check in + // drivers/tty/n_tty.c:n_tty_write(). + if t.termios.LEnabled(linux.TOSTOP) { + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + } + return t.fileDescription.Write(ctx, src, opts) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (t *ttyFD) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Ignore arg[0]. This is the real FD: + fd := t.inode.hostFD + ioctl := args[1].Uint64() + switch ioctl { + case linux.TCGETS: + termios, err := ioctlGetTermios(fd) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: + t.mu.Lock() + defer t.mu.Unlock() + + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + + var termios linux.Termios + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := ioctlSetTermios(fd, ioctl, &termios) + if err == nil { + t.termios.FromTermios(termios) + } + return 0, err + + case linux.TIOCGPGRP: + // Args: pid_t *argp + // When successful, equivalent to *argp = tcgetpgrp(fd). + // Get the process group ID of the foreground process group on this + // terminal. + + pidns := kernel.PIDNamespaceFromContext(ctx) + if pidns == nil { + return 0, syserror.ENOTTY + } + + t.mu.Lock() + defer t.mu.Unlock() + + // Map the ProcessGroup into a ProcessGroupID in the task's PID namespace. + pgID := pidns.IDOfProcessGroup(t.fgProcessGroup) + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TIOCSPGRP: + // Args: const pid_t *argp + // Equivalent to tcsetpgrp(fd, *argp). + // Set the foreground process group ID of this terminal. + + task := kernel.TaskFromContext(ctx) + if task == nil { + return 0, syserror.ENOTTY + } + + t.mu.Lock() + defer t.mu.Unlock() + + // Check that we are allowed to set the process group. + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + // drivers/tty/tty_io.c:tiocspgrp() converts -EIO from tty_check_change() + // to -ENOTTY. + if err == syserror.EIO { + return 0, syserror.ENOTTY + } + return 0, err + } + + // Check that calling task's process group is in the TTY session. + if task.ThreadGroup().Session() != t.session { + return 0, syserror.ENOTTY + } + + var pgID kernel.ProcessGroupID + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + + // pgID must be non-negative. + if pgID < 0 { + return 0, syserror.EINVAL + } + + // Process group with pgID must exist in this PID namespace. + pidns := task.PIDNamespace() + pg := pidns.ProcessGroupWithID(pgID) + if pg == nil { + return 0, syserror.ESRCH + } + + // Check that new process group is in the TTY session. + if pg.Session() != t.session { + return 0, syserror.EPERM + } + + t.fgProcessGroup = pg + return 0, nil + + case linux.TIOCGWINSZ: + // Args: struct winsize *argp + // Get window size. + winsize, err := ioctlGetWinsize(fd) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TIOCSWINSZ: + // Args: const struct winsize *argp + // Set window size. + + // Unlike setting the termios, any process group (even background ones) can + // set the winsize. + + var winsize linux.Winsize + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := ioctlSetWinsize(fd, &winsize) + return 0, err + + // Unimplemented commands. + case linux.TIOCSETD, + linux.TIOCSBRK, + linux.TIOCCBRK, + linux.TCSBRK, + linux.TCSBRKP, + linux.TIOCSTI, + linux.TIOCCONS, + linux.FIONBIO, + linux.TIOCEXCL, + linux.TIOCNXCL, + linux.TIOCGEXCL, + linux.TIOCNOTTY, + linux.TIOCSCTTY, + linux.TIOCGSID, + linux.TIOCGETD, + linux.TIOCVHANGUP, + linux.TIOCGDEV, + linux.TIOCMGET, + linux.TIOCMSET, + linux.TIOCMBIC, + linux.TIOCMBIS, + linux.TIOCGICOUNT, + linux.TCFLSH, + linux.TIOCSSERIAL, + linux.TIOCGPTPEER: + + unimpl.EmitUnimplementedEvent(ctx) + fallthrough + default: + return 0, syserror.ENOTTY + } +} + +// checkChange checks that the process group is allowed to read, write, or +// change the state of the TTY. +// +// This corresponds to Linux drivers/tty/tty_io.c:tty_check_change(). The logic +// is a bit convoluted, but documented inline. +// +// Preconditions: t.mu must be held. +func (t *ttyFD) checkChange(ctx context.Context, sig linux.Signal) error { + task := kernel.TaskFromContext(ctx) + if task == nil { + // No task? Linux does not have an analog for this case, but + // tty_check_change is more of a blacklist of cases than a + // whitelist, and is surprisingly permissive. Allowing the + // change seems most appropriate. + return nil + } + + tg := task.ThreadGroup() + pg := tg.ProcessGroup() + + // If the session for the task is different than the session for the + // controlling TTY, then the change is allowed. Seems like a bad idea, + // but that's exactly what linux does. + if tg.Session() != t.fgProcessGroup.Session() { + return nil + } + + // If we are the foreground process group, then the change is allowed. + if pg == t.fgProcessGroup { + return nil + } + + // We are not the foreground process group. + + // Is the provided signal blocked or ignored? + if (task.SignalMask()&linux.SignalSetOf(sig) != 0) || tg.SignalHandlers().IsIgnored(sig) { + // If the signal is SIGTTIN, then we are attempting to read + // from the TTY. Don't send the signal and return EIO. + if sig == linux.SIGTTIN { + return syserror.EIO + } + + // Otherwise, we are writing or changing terminal state. This is allowed. + return nil + } + + // If the process group is an orphan, return EIO. + if pg.IsOrphan() { + return syserror.EIO + } + + // Otherwise, send the signal to the process group and return ERESTARTSYS. + // + // Note that Linux also unconditionally sets TIF_SIGPENDING on current, + // but this isn't necessary in gVisor because the rationale given in + // 040b6362d58f "tty: fix leakage of -ERESTARTSYS to userland" doesn't + // apply: the sentry will handle -ERESTARTSYS in + // kernel.runApp.execute() even if the kernel.Task isn't interrupted. + // + // Linux ignores the result of kill_pgrp(). + _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) + return kernel.ERESTARTSYS +} diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go index e1ccacb4d..2bc757b1a 100644 --- a/pkg/sentry/fsimpl/host/util.go +++ b/pkg/sentry/fsimpl/host/util.go @@ -22,47 +22,27 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -func toTimespec(ts linux.StatxTimestamp, omit bool) unix.Timespec { +func toTimespec(ts linux.StatxTimestamp, omit bool) syscall.Timespec { if omit { - return unix.Timespec{ + return syscall.Timespec{ Sec: 0, Nsec: unix.UTIME_OMIT, } } - return unix.Timespec{ - Sec: int64(ts.Sec), + return syscall.Timespec{ + Sec: ts.Sec, Nsec: int64(ts.Nsec), } } -func unixToLinuxStatx(s unix.Statx_t) linux.Statx { - return linux.Statx{ - Mask: s.Mask, - Blksize: s.Blksize, - Attributes: s.Attributes, - Nlink: s.Nlink, - UID: s.Uid, - GID: s.Gid, - Mode: s.Mode, - Ino: s.Ino, - Size: s.Size, - Blocks: s.Blocks, - AttributesMask: s.Attributes_mask, - Atime: unixToLinuxStatxTimestamp(s.Atime), - Btime: unixToLinuxStatxTimestamp(s.Btime), - Ctime: unixToLinuxStatxTimestamp(s.Ctime), - Mtime: unixToLinuxStatxTimestamp(s.Mtime), - RdevMajor: s.Rdev_major, - RdevMinor: s.Rdev_minor, - DevMajor: s.Dev_major, - DevMinor: s.Dev_minor, - } -} - func unixToLinuxStatxTimestamp(ts unix.StatxTimestamp) linux.StatxTimestamp { return linux.StatxTimestamp{Sec: ts.Sec, Nsec: ts.Nsec} } +func timespecToStatxTimestamp(ts unix.Timespec) linux.StatxTimestamp { + return linux.StatxTimestamp{Sec: int64(ts.Sec), Nsec: uint32(ts.Nsec)} +} + // wouldBlock returns true for file types that can return EWOULDBLOCK // for blocking operations, e.g. pipes, character devices, and sockets. func wouldBlock(fileType uint32) bool { diff --git a/pkg/sentry/fsimpl/host/util_unsafe.go b/pkg/sentry/fsimpl/host/util_unsafe.go new file mode 100644 index 000000000..5136ac844 --- /dev/null +++ b/pkg/sentry/fsimpl/host/util_unsafe.go @@ -0,0 +1,34 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" +) + +func setTimestamps(fd int, ts *[2]syscall.Timespec) error { + _, _, errno := syscall.Syscall6( + syscall.SYS_UTIMENSAT, + uintptr(fd), + 0, /* path */ + uintptr(unsafe.Pointer(ts)), + 0, /* flags */ + 0, 0) + if errno != 0 { + return errno + } + return nil +} diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index e73f1f857..ef34cb28a 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -4,6 +4,17 @@ load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) go_template_instance( + name = "fstree", + out = "fstree.go", + package = "kernfs", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "Dentry", + }, +) + +go_template_instance( name = "slot_list", out = "slot_list.go", package = "kernfs", @@ -21,6 +32,7 @@ go_library( "dynamic_bytes_file.go", "fd_impl_util.go", "filesystem.go", + "fstree.go", "inode_impl_util.go", "kernfs.go", "slot_list.go", @@ -35,6 +47,7 @@ go_library( "//pkg/refs", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 1c026f4d8..c7779fc11 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -53,7 +53,7 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.Dy } // Open implements Inode.Open. -func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &DynamicBytesFD{} if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil { return nil, err @@ -61,9 +61,10 @@ func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vf return &fd.vfsfd, nil } -// SetStat implements Inode.SetStat. -func (f *DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { - // DynamicBytesFiles are immutable. +// SetStat implements Inode.SetStat. By default DynamicBytesFile doesn't allow +// inode attributes to be changed. Override SetStat() making it call +// f.InodeAttrs to allow it. +func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index da821d524..dd5806301 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -15,8 +15,11 @@ package kernfs import ( + "math" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" @@ -42,15 +45,27 @@ type GenericDirectoryFD struct { off int64 } -// Init initializes a GenericDirectoryFD. -func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) error { +// NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its +// dentry. +func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) { + fd := &GenericDirectoryFD{} + if err := fd.Init(children, opts); err != nil { + return nil, err + } + if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return fd, nil +} + +// Init initializes a GenericDirectoryFD. Use it when overriding +// GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the +// correct implementation. +func (fd *GenericDirectoryFD) Init(children *OrderedChildren, opts *vfs.OpenOptions) error { if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { // Can't open directories for writing. return syserror.EISDIR } - if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { - return err - } fd.children = children return nil } @@ -128,7 +143,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent // Handle "..". if fd.off == 1 { - parentInode := vfsd.ParentOrSelf().Impl().(*Dentry).inode + parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode stat, err := parentInode.Stat(vfsFS, opts) if err != nil { return err @@ -186,6 +201,10 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int // Use offset as given. case linux.SEEK_CUR: offset += fd.off + case linux.SEEK_END: + // TODO(gvisor.dev/issue/1193): This can prevent new files from showing up + // if they are added after SEEK_END. + offset = math.MaxInt64 default: return 0, syserror.EINVAL } @@ -205,7 +224,7 @@ func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (l // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - fs := fd.filesystem() + creds := auth.CredentialsFromContext(ctx) inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode - return inode.SetStat(fs, opts) + return inode.SetStat(ctx, fd.filesystem(), creds, opts) } diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 1d7e04ad4..9e8d80414 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -22,6 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -54,36 +56,48 @@ afterSymlink: return vfsd, nil } if name == ".." { - nextVFSD, err := rp.ResolveParent(vfsd) - if err != nil { + if isRoot, err := rp.CheckRoot(vfsd); err != nil { + return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return vfsd, nil + } + if err := rp.CheckMount(&d.parent.vfsd); err != nil { return nil, err } rp.Advance() - return nextVFSD, nil + return &d.parent.vfsd, nil + } + if len(name) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG } d.dirMu.Lock() - nextVFSD, err := rp.ResolveChild(vfsd, name) + next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, d.children[name]) + d.dirMu.Unlock() if err != nil { - d.dirMu.Unlock() return nil, err } - next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, nextVFSD) - d.dirMu.Unlock() - if err != nil { + if err := rp.CheckMount(&next.vfsd); err != nil { return nil, err } // Resolve any symlink at current path component. if rp.ShouldFollowSymlink() && next.isSymlink() { - // TODO: VFS2 needs something extra for /proc/[pid]/fd/ "magic symlinks". - target, err := next.inode.Readlink(ctx) + targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount()) if err != nil { return nil, err } - if err := rp.HandleSymlink(target); err != nil { - return nil, err + if targetVD.Ok() { + err := rp.HandleJump(targetVD) + targetVD.DecRef() + if err != nil { + return nil, err + } + } else { + if err := rp.HandleSymlink(targetPathname); err != nil { + return nil, err + } } goto afterSymlink - } rp.Advance() return &next.vfsd, nil @@ -97,17 +111,17 @@ afterSymlink: // parent.dirMu must be locked. parent.isDir(). name is not "." or "..". // // Postconditions: Caller must call fs.processDeferredDecRefs*. -func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, childVFSD *vfs.Dentry) (*Dentry, error) { - if childVFSD != nil { +func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) { + if child != nil { // Cached dentry exists, revalidate. - child := childVFSD.Impl().(*Dentry) if !child.inode.Valid(ctx) { - vfsObj.ForceDeleteDentry(childVFSD) - fs.deferDecRef(childVFSD) // Reference from Lookup. - childVFSD = nil + delete(parent.children, name) + vfsObj.InvalidateDentry(&child.vfsd) + fs.deferDecRef(&child.vfsd) // Reference from Lookup. + child = nil } } - if childVFSD == nil { + if child == nil { // Dentry isn't cached; it either doesn't exist or failed // revalidation. Attempt to resolve it via Lookup. // @@ -115,15 +129,15 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes // that all dentries in the filesystem are (kernfs.)Dentry and performs // vfs.DentryImpl casts accordingly. - var err error - childVFSD, err = parent.inode.Lookup(ctx, name) + childVFSD, err := parent.inode.Lookup(ctx, name) if err != nil { return nil, err } // Reference on childVFSD dropped by a corresponding Valid. - parent.insertChildLocked(name, childVFSD) + child = childVFSD.Impl().(*Dentry) + parent.insertChildLocked(name, child) } - return childVFSD.Impl().(*Dentry), nil + return child, nil } // walkExistingLocked resolves rp to an existing file. @@ -189,14 +203,14 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v if pc == "." || pc == ".." { return "", syserror.EEXIST } - childVFSD, err := rp.ResolveChild(parentVFSD, pc) - if err != nil { - return "", err + if len(pc) > linux.NAME_MAX { + return "", syserror.ENAMETOOLONG } - if childVFSD != nil { + // FIXME(gvisor.dev/issue/1193): Data race due to not holding dirMu. + if _, ok := parentVFSD.Impl().(*Dentry).children[pc]; ok { return "", syserror.EEXIST } - if parentVFSD.IsDisowned() { + if parentVFSD.IsDead() { return "", syserror.ENOENT } return pc, nil @@ -206,14 +220,14 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v // // Preconditions: Filesystem.mu must be locked for at least reading. func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error { - parentVFSD := vfsd.Parent() - if parentVFSD == nil { + parent := vfsd.Impl().(*Dentry).parent + if parent == nil { return syserror.EBUSY } - if parentVFSD.IsDisowned() { + if parent.vfsd.IsDead() { return syserror.ENOENT } - if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } return nil @@ -229,6 +243,19 @@ func (fs *Filesystem) Sync(ctx context.Context) error { return nil } +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + fs.mu.RLock() + defer fs.processDeferredDecRefs() + defer fs.mu.RUnlock() + + _, inode, err := fs.walkExistingLocked(ctx, rp) + if err != nil { + return err + } + return inode.CheckPermissions(ctx, creds, ats) +} + // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { fs.mu.RLock() @@ -294,11 +321,11 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EPERM } - child, err := parentInode.NewLink(ctx, pc, d.inode) + childVFSD, err := parentInode.NewLink(ctx, pc, d.inode) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) return nil } @@ -322,11 +349,11 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } defer rp.Mount().EndWrite() - child, err := parentInode.NewDir(ctx, pc, opts) + childVFSD, err := parentInode.NewDir(ctx, pc, opts) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) return nil } @@ -350,11 +377,11 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } defer rp.Mount().EndWrite() - new, err := parentInode.NewNode(ctx, pc, opts) + newVFSD, err := parentInode.NewNode(ctx, pc, opts) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, new) + parentVFSD.Impl().(*Dentry).InsertChild(pc, newVFSD.Impl().(*Dentry)) return nil } @@ -364,7 +391,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // O_NOFOLLOW have no effect here (they're handled by VFS by setting // appropriate bits in rp), but are returned by // FileDescriptionImpl.StatusFlags(). - opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW + opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK ats := vfs.AccessTypesForOpenFlags(&opts) // Do not create new file. @@ -379,7 +406,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return inode.Open(rp, vfsd, opts) + return inode.Open(ctx, rp, vfsd, opts) } // May create new file. @@ -398,7 +425,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return inode.Open(rp, vfsd, opts) + return inode.Open(ctx, rp, vfsd, opts) } afterTrailingSymlink: parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) @@ -418,12 +445,12 @@ afterTrailingSymlink: if pc == "." || pc == ".." { return nil, syserror.EISDIR } - // Determine whether or not we need to create a file. - childVFSD, err := rp.ResolveChild(parentVFSD, pc) - if err != nil { - return nil, err + if len(pc) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG } - if childVFSD == nil { + // Determine whether or not we need to create a file. + childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD) + if err == syserror.ENOENT { // Already checked for searchability above; now check for writability. if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err @@ -433,37 +460,46 @@ afterTrailingSymlink: } defer rp.Mount().EndWrite() // Create and open the child. - child, err := parentInode.NewFile(ctx, pc, opts) + childVFSD, err = parentInode.NewFile(ctx, pc, opts) if err != nil { return nil, err } + child := childVFSD.Impl().(*Dentry) parentVFSD.Impl().(*Dentry).InsertChild(pc, child) - return child.Impl().(*Dentry).inode.Open(rp, child, opts) + return child.inode.Open(ctx, rp, childVFSD, opts) + } + if err != nil { + return nil, err } // Open existing file or follow symlink. if mustCreate { return nil, syserror.EEXIST } - childDentry := childVFSD.Impl().(*Dentry) - childInode := childDentry.inode - if rp.ShouldFollowSymlink() { - if childDentry.isSymlink() { - target, err := childInode.Readlink(ctx) + child := childVFSD.Impl().(*Dentry) + if rp.ShouldFollowSymlink() && child.isSymlink() { + targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount()) + if err != nil { + return nil, err + } + if targetVD.Ok() { + err := rp.HandleJump(targetVD) + targetVD.DecRef() if err != nil { return nil, err } - if err := rp.HandleSymlink(target); err != nil { + } else { + if err := rp.HandleSymlink(targetPathname); err != nil { return nil, err } - // rp.Final() may no longer be true since we now need to resolve the - // symlink target. - goto afterTrailingSymlink } + // rp.Final() may no longer be true since we now need to resolve the + // symlink target. + goto afterTrailingSymlink } - if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { + if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return childInode.Open(rp, childVFSD, opts) + return child.inode.Open(ctx, rp, &child.vfsd, opts) } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. @@ -490,15 +526,16 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 fs.mu.Lock() - defer fs.mu.Lock() + defer fs.processDeferredDecRefsLocked() + defer fs.mu.Unlock() // Resolve the destination directory first to verify that it's on this // Mount. dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() if err != nil { return err } + dstDir := dstDirVFSD.Impl().(*Dentry) mnt := rp.Mount() if mnt != oldParentVD.Mount() { return syserror.EXDEV @@ -511,9 +548,8 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa srcDirVFSD := oldParentVD.Dentry() srcDir := srcDirVFSD.Impl().(*Dentry) srcDir.dirMu.Lock() - src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDirVFSD.Child(oldName)) + src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDir.children[oldName]) srcDir.dirMu.Unlock() - fs.processDeferredDecRefsLocked() if err != nil { return err } @@ -525,7 +561,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } // Can we create the dst dentry? - var dstVFSD *vfs.Dentry + var dst *Dentry pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode) switch err { case nil: @@ -535,38 +571,51 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // Won't overwrite existing node since RENAME_NOREPLACE was requested. return syserror.EEXIST } - dstVFSD, err = rp.ResolveChild(dstDirVFSD, pc) - if err != nil { + dst = dstDir.children[pc] + if dst == nil { panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD)) } default: return err } + var dstVFSD *vfs.Dentry + if dst != nil { + dstVFSD = &dst.vfsd + } mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef() virtfs := rp.VirtualFilesystem() - srcDirDentry := srcDirVFSD.Impl().(*Dentry) - dstDirDentry := dstDirVFSD.Impl().(*Dentry) - // We can't deadlock here due to lock ordering because we're protected from // concurrent renames by fs.mu held for writing. - srcDirDentry.dirMu.Lock() - defer srcDirDentry.dirMu.Unlock() - dstDirDentry.dirMu.Lock() - defer dstDirDentry.dirMu.Unlock() + srcDir.dirMu.Lock() + defer srcDir.dirMu.Unlock() + if srcDir != dstDir { + dstDir.dirMu.Lock() + defer dstDir.dirMu.Unlock() + } if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil { return err } - srcDirInode := srcDirDentry.inode - replaced, err := srcDirInode.Rename(ctx, srcVFSD.Name(), pc, srcVFSD, dstDirVFSD) + replaced, err := srcDir.inode.Rename(ctx, src.name, pc, srcVFSD, dstDirVFSD) if err != nil { virtfs.AbortRenameDentry(srcVFSD, dstVFSD) return err } - virtfs.CommitRenameReplaceDentry(srcVFSD, dstDirVFSD, pc, replaced) + delete(srcDir.children, src.name) + if srcDir != dstDir { + fs.deferDecRef(srcDirVFSD) + dstDir.IncRef() + } + src.parent = dstDir + src.name = pc + if dstDir.children == nil { + dstDir.children = make(map[string]*Dentry) + } + dstDir.children[pc] = src + virtfs.CommitRenameReplaceDentry(srcVFSD, replaced) return nil } @@ -586,14 +635,15 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { return err } - if !vfsd.Impl().(*Dentry).isDir() { + d := vfsd.Impl().(*Dentry) + if !d.isDir() { return syserror.ENOTDIR } if inode.HasChildren() { return syserror.ENOTEMPTY } virtfs := rp.VirtualFilesystem() - parentDentry := vfsd.Parent().Impl().(*Dentry) + parentDentry := d.parent parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() @@ -622,7 +672,7 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts if opts.Stat.Mask == 0 { return nil } - return inode.SetStat(fs.VFSFilesystem(), opts) + return inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts) } // StatAt implements vfs.FilesystemImpl.StatAt. @@ -646,7 +696,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu if err != nil { return linux.Statfs{}, err } - // TODO: actually implement statfs + // TODO(gvisor.dev/issue/1193): actually implement statfs. return linux.Statfs{}, syserror.ENOSYS } @@ -670,11 +720,11 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return err } defer rp.Mount().EndWrite() - child, err := parentInode.NewSymlink(ctx, pc, target) + childVFSD, err := parentInode.NewSymlink(ctx, pc, target) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) return nil } @@ -694,11 +744,12 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { return err } - if vfsd.Impl().(*Dentry).isDir() { + d := vfsd.Impl().(*Dentry) + if d.isDir() { return syserror.EISDIR } virtfs := rp.VirtualFilesystem() - parentDentry := vfsd.Parent().Impl().(*Dentry) + parentDentry := d.parent parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() mntns := vfs.MountNamespaceFromContext(ctx) @@ -714,8 +765,20 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return nil } +// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return nil, err + } + return nil, syserror.ECONNREFUSED +} + // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { +func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() @@ -728,7 +791,7 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([ } // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { +func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() @@ -770,5 +833,5 @@ func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() defer fs.mu.RUnlock() - return vfs.GenericPrependPath(vfsroot, vd, b) + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b) } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index d50018b18..615592d5f 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -181,6 +181,11 @@ func (InodeNotSymlink) Readlink(context.Context) (string, error) { return "", syserror.EINVAL } +// Getlink implements Inode.Getlink. +func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) { + return vfs.VirtualDentry{}, "", syserror.EINVAL +} + // InodeAttrs partially implements the Inode interface, specifically the // inodeMetadata sub interface. InodeAttrs provides functionality related to // inode attributes. @@ -211,6 +216,11 @@ func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMo atomic.StoreUint32(&a.nlink, nlink) } +// Ino returns the inode id. +func (a *InodeAttrs) Ino() uint64 { + return atomic.LoadUint64(&a.ino) +} + // Mode implements Inode.Mode. func (a *InodeAttrs) Mode() linux.FileMode { return linux.FileMode(atomic.LoadUint32(&a.mode)) @@ -228,13 +238,23 @@ func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) stat.GID = atomic.LoadUint32(&a.gid) stat.Nlink = atomic.LoadUint32(&a.nlink) - // TODO: Implement other stat fields like timestamps. + // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps. return stat, nil } // SetStat implements Inode.SetStat. -func (a *InodeAttrs) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { +func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + if opts.Stat.Mask == 0 { + return nil + } + if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 { + return syserror.EPERM + } + if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { + return err + } + stat := opts.Stat if stat.Mask&linux.STATX_MODE != 0 { for { @@ -256,19 +276,17 @@ func (a *InodeAttrs) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { // Note that not all fields are modifiable. For example, the file type and // inode numbers are immutable after node creation. - // TODO: Implement other stat fields like timestamps. + // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps. return nil } // CheckPermissions implements Inode.CheckPermissions. func (a *InodeAttrs) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { - mode := a.Mode() return vfs.GenericCheckPermissions( creds, ats, - mode.FileType() == linux.ModeDirectory, - uint16(mode), + a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid)), ) @@ -346,8 +364,8 @@ func (o *OrderedChildren) Destroy() { // cache. Populate returns the number of directories inserted, which the caller // may use to update the link count for the parent directory. // -// Precondition: d.Impl() must be a kernfs Dentry. d must represent a directory -// inode. children must not contain any conflicting entries already in o. +// Precondition: d must represent a directory inode. children must not contain +// any conflicting entries already in o. func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 { var links uint32 for name, child := range children { @@ -357,7 +375,7 @@ func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint3 if err := o.Insert(name, child.VFSDentry()); err != nil { panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d)) } - d.InsertChild(name, child.VFSDentry()) + d.InsertChild(name, child) } return links } @@ -507,7 +525,7 @@ type InodeSymlink struct { } // Open implements Inode.Open. -func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, syserror.ELOOP } @@ -549,8 +567,23 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.F } // Open implements kernfs.Inode. -func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd := &GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, &opts) +func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &opts) + if err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } + +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return syserror.EPERM +} + +// AlwaysValid partially implements kernfs.inodeDynamicLookup. +type AlwaysValid struct{} + +// Valid implements kernfs.inodeDynamicLookup. +func (*AlwaysValid) Valid(context.Context) bool { + return true +} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index a8ab2a2ba..732837933 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -63,9 +63,6 @@ import ( "gvisor.dev/gvisor/pkg/sync" ) -// FilesystemType implements vfs.FilesystemType. -type FilesystemType struct{} - // Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory // filesystem. Concrete implementations are expected to embed this in their own // Filesystem type. @@ -138,8 +135,8 @@ func (fs *Filesystem) processDeferredDecRefsLocked() { // Init initializes a kernfs filesystem. This should be called from during // vfs.FilesystemType.NewFilesystem for the concrete filesystem embedding // kernfs. -func (fs *Filesystem) Init(vfsObj *vfs.VirtualFilesystem) { - fs.vfsfs.Init(vfsObj, fs) +func (fs *Filesystem) Init(vfsObj *vfs.VirtualFilesystem, fsType vfs.FilesystemType) { + fs.vfsfs.Init(vfsObj, fsType, fs) } // VFSFilesystem returns the generic vfs filesystem object. @@ -171,17 +168,22 @@ const ( // // Must be initialized by Init prior to first use. type Dentry struct { - refs.AtomicRefCount + vfsd vfs.Dentry - vfsd vfs.Dentry - inode Inode + refs.AtomicRefCount // flags caches useful information about the dentry from the inode. See the // dflags* consts above. Must be accessed by atomic ops. flags uint32 - // dirMu protects vfsd.children for directory dentries. - dirMu sync.Mutex + parent *Dentry + name string + + // dirMu protects children and the names of child Dentries. + dirMu sync.Mutex + children map[string]*Dentry + + inode Inode } // Init initializes this dentry. @@ -225,8 +227,8 @@ func (d *Dentry) DecRef() { func (d *Dentry) destroy() { d.inode.DecRef() // IncRef from Init. d.inode = nil - if parent := d.vfsd.Parent(); parent != nil { - parent.DecRef() // IncRef from Dentry.InsertChild. + if d.parent != nil { + d.parent.DecRef() // IncRef from Dentry.InsertChild. } } @@ -236,7 +238,7 @@ func (d *Dentry) destroy() { // updates the link count on d if required. // // Precondition: d must represent a directory inode. -func (d *Dentry) InsertChild(name string, child *vfs.Dentry) { +func (d *Dentry) InsertChild(name string, child *Dentry) { d.dirMu.Lock() d.insertChildLocked(name, child) d.dirMu.Unlock() @@ -246,13 +248,17 @@ func (d *Dentry) InsertChild(name string, child *vfs.Dentry) { // preconditions. // // Precondition: d.dirMu must be locked. -func (d *Dentry) insertChildLocked(name string, child *vfs.Dentry) { +func (d *Dentry) insertChildLocked(name string, child *Dentry) { if !d.isDir() { panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d)) } - vfsDentry := d.VFSDentry() - vfsDentry.IncRef() // DecRef in child's Dentry.destroy. - vfsDentry.InsertChild(child, name) + d.IncRef() // DecRef in child's Dentry.destroy. + child.parent = d + child.name = name + if d.children == nil { + d.children = make(map[string]*Dentry) + } + d.children[name] = child } // The Inode interface maps filesystem-level operations that operate on paths to @@ -302,7 +308,7 @@ type Inode interface { // // Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing // the inode on which Open() is being called. - Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) + Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) } type inodeRefs interface { @@ -319,7 +325,7 @@ type inodeMetadata interface { // CheckPermissions checks that creds may access this inode for the // requested access type, per the the rules of // fs/namei.c:generic_permission(). - CheckPermissions(ctx context.Context, creds *auth.Credentials, atx vfs.AccessTypes) error + CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error // Mode returns the (struct stat)::st_mode value for this inode. This is // separated from Stat for performance. @@ -330,8 +336,10 @@ type inodeMetadata interface { Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) // SetStat updates the metadata for this inode. This corresponds to - // vfs.FilesystemImpl.SetStatAt. - SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error + // vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking + // if the operation can be performed (see vfs.CheckSetStat() for common + // checks). + SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error } // Precondition: All methods in this interface may only be called on directory @@ -415,7 +423,21 @@ type inodeDynamicLookup interface { } type inodeSymlink interface { - // Readlink resolves the target of a symbolic link. If an inode is not a + // Readlink returns the target of a symbolic link. If an inode is not a // symlink, the implementation should return EINVAL. Readlink(ctx context.Context) (string, error) + + // Getlink returns the target of a symbolic link, as used by path + // resolution: + // + // - If the inode is a "magic link" (a link whose target is most accurately + // represented as a VirtualDentry), Getlink returns (ok VirtualDentry, "", + // nil). A reference is taken on the returned VirtualDentry. + // + // - If the inode is an ordinary symlink, Getlink returns (zero-value + // VirtualDentry, symlink target, nil). + // + // - If the inode is not a symlink, Getlink returns (zero-value + // VirtualDentry, "", EINVAL). + Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 0459fb305..a9f671bc8 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -91,7 +91,7 @@ type attrs struct { kernfs.InodeAttrs } -func (a *attrs) SetStat(fs *vfs.Filesystem, opt vfs.SetStatOptions) error { +func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } @@ -116,9 +116,9 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod return &dir.dentry } -func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd := &kernfs.GenericDirectoryFD{} - if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts); err != nil { +func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) + if err != nil { return nil, err } return fd.VFSFileDescription(), nil @@ -146,9 +146,11 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte return &dir.dentry } -func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts) +func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) + if err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } @@ -187,9 +189,13 @@ func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, err return nil, syserror.EPERM } -func (fst *fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +func (fsType) Name() string { + return "kernfs" +} + +func (fst fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { fs := &filesystem{} - fs.Init(vfsObj) + fs.Init(vfsObj, &fst) root := fst.rootFn(creds, fs) return fs.VFSFilesystem(), root.VFSDentry(), nil } diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 0ee7eb9b7..0aa6dc979 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -18,6 +18,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" ) // StaticSymlink provides an Inode implementation for symlinks that point to @@ -52,3 +54,13 @@ func (s *StaticSymlink) Init(creds *auth.Credentials, ino uint64, target string) func (s *StaticSymlink) Readlink(_ context.Context) (string, error) { return s.target, nil } + +// Getlink implements Inode.Getlink. +func (s *StaticSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) { + return vfs.VirtualDentry{}, s.target, nil +} + +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*StaticSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return syserror.EPERM +} diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD new file mode 100644 index 000000000..0d411606f --- /dev/null +++ b/pkg/sentry/fsimpl/pipefs/BUILD @@ -0,0 +1,20 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "pipefs", + srcs = ["pipefs.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", + "//pkg/sentry/kernel/time", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go new file mode 100644 index 000000000..d6bd67467 --- /dev/null +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -0,0 +1,147 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pipefs provides the filesystem implementation backing +// Kernel.PipeMount. +package pipefs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +type filesystemType struct{} + +// Name implements vfs.FilesystemType.Name. +func (filesystemType) Name() string { + return "pipefs" +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + panic("pipefs.filesystemType.GetFilesystem should never be called") +} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + kernfs.Filesystem + + // TODO(gvisor.dev/issue/1193): + // + // - kernfs does not provide a way to implement statfs, from which we + // should indicate PIPEFS_MAGIC. + // + // - kernfs does not provide a way to override names for + // vfs.FilesystemImpl.PrependPath(); pipefs inodes should use synthetic + // name fmt.Sprintf("pipe:[%d]", inode.ino). +} + +// NewFilesystem sets up and returns a new vfs.Filesystem implemented by +// pipefs. +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { + fs := &filesystem{} + fs.Init(vfsObj, filesystemType{}) + return fs.VFSFilesystem() +} + +// inode implements kernfs.Inode. +type inode struct { + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + kernfs.InodeNoopRefCount + + pipe *pipe.VFSPipe + + ino uint64 + uid auth.KUID + gid auth.KGID + // We use the creation timestamp for all of atime, mtime, and ctime. + ctime ktime.Time +} + +func newInode(ctx context.Context, fs *kernfs.Filesystem) *inode { + creds := auth.CredentialsFromContext(ctx) + return &inode{ + pipe: pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize), + ino: fs.NextIno(), + uid: creds.EffectiveKUID, + gid: creds.EffectiveKGID, + ctime: ktime.NowFromContext(ctx), + } +} + +const pipeMode = 0600 | linux.S_IFIFO + +// CheckPermissions implements kernfs.Inode.CheckPermissions. +func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, pipeMode, i.uid, i.gid) +} + +// Mode implements kernfs.Inode.Mode. +func (i *inode) Mode() linux.FileMode { + return pipeMode +} + +// Stat implements kernfs.Inode.Stat. +func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds()) + return linux.Statx{ + Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, + Blksize: usermem.PageSize, + Nlink: 1, + UID: uint32(i.uid), + GID: uint32(i.gid), + Mode: pipeMode, + Ino: i.ino, + Size: 0, + Blocks: 0, + Atime: ts, + Ctime: ts, + Mtime: ts, + // TODO(gvisor.dev/issue/1197): Device number. + }, nil +} + +// SetStat implements kernfs.Inode.SetStat. +func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + if opts.Stat.Mask == 0 { + return nil + } + return syserror.EPERM +} + +// Open implements kernfs.Inode.Open. +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags) +} + +// NewConnectedPipeFDs returns a pair of FileDescriptions representing the read +// and write ends of a newly-created pipe, as for pipe(2) and pipe2(2). +// +// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). +func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, *vfs.FileDescription) { + fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) + inode := newInode(ctx, fs) + var d kernfs.Dentry + d.Init(inode) + defer d.DecRef() + return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags) +} diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index a83245866..17c1342b5 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -8,10 +8,11 @@ go_library( "filesystem.go", "subtasks.go", "task.go", + "task_fds.go", "task_files.go", + "task_net.go", "tasks.go", "tasks_files.go", - "tasks_net.go", "tasks_sys.go", ], visibility = ["//pkg/sentry:internal"], @@ -19,8 +20,9 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/log", + "//pkg/refs", "//pkg/safemem", - "//pkg/sentry/fs", + "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", @@ -53,6 +55,7 @@ go_test( "//pkg/fspath", "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/testutil", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index 5c19d5522..104fc9030 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -36,8 +36,13 @@ type FilesystemType struct{} var _ vfs.FilesystemType = (*FilesystemType)(nil) -// GetFilesystem implements vfs.FilesystemType. -func (ft *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { k := kernel.KernelFromContext(ctx) if k == nil { return nil, nil, fmt.Errorf("procfs requires a kernel") @@ -48,7 +53,7 @@ func (ft *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virtual } procfs := &kernfs.Filesystem{} - procfs.VFSFilesystem().Init(vfsObj, procfs) + procfs.VFSFilesystem().Init(vfsObj, &ft, procfs) var cgroups map[string]string if opts.InternalData != nil { diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 611645f3f..a5cfa8333 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -34,6 +35,7 @@ type subtasksInode struct { kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs kernfs.OrderedChildren + kernfs.AlwaysValid task *kernel.Task pidns *kernel.PIDNamespace @@ -61,11 +63,6 @@ func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen InoGenera return dentry } -// Valid implements kernfs.inodeDynamicLookup. -func (i *subtasksInode) Valid(ctx context.Context) bool { - return true -} - // Lookup implements kernfs.inodeDynamicLookup. func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { tid, err := strconv.ParseUint(name, 10, 32) @@ -91,6 +88,9 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb if len(tasks) == 0 { return offset, syserror.ENOENT } + if relOffset >= int64(len(tasks)) { + return offset, nil + } tids := make([]int, 0, len(tasks)) for _, tid := range tasks { @@ -113,10 +113,52 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb return offset, nil } +type subtasksFD struct { + kernfs.GenericDirectoryFD + + task *kernel.Task +} + +func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + if fd.task.ExitState() >= kernel.TaskExitZombie { + return syserror.ENOENT + } + return fd.GenericDirectoryFD.IterDirents(ctx, cb) +} + +// Seek implements vfs.FileDecriptionImpl.Seek. +func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + if fd.task.ExitState() >= kernel.TaskExitZombie { + return 0, syserror.ENOENT + } + return fd.GenericDirectoryFD.Seek(ctx, offset, whence) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *subtasksFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + if fd.task.ExitState() >= kernel.TaskExitZombie { + return linux.Statx{}, syserror.ENOENT + } + return fd.GenericDirectoryFD.Stat(ctx, opts) +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + if fd.task.ExitState() >= kernel.TaskExitZombie { + return syserror.ENOENT + } + return fd.GenericDirectoryFD.SetStat(ctx, opts) +} + // Open implements kernfs.Inode. -func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) +func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &subtasksFD{task: i.task} + if err := fd.Init(&i.OrderedChildren, &opts); err != nil { + return nil, err + } + if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } @@ -131,3 +173,8 @@ func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux. } return stat, nil } + +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return syserror.EPERM +} diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index c0d643f51..66419d91b 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -44,19 +44,21 @@ type taskInode struct { var _ kernfs.Inode = (*taskInode)(nil) func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry { + // TODO(gvisor.dev/issue/164): Fail with ESRCH if task exited. contents := map[string]*kernfs.Dentry{ - "auxv": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}), - "cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), - "comm": newComm(task, inoGen.NextIno(), 0444), - "environ": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), - //"exe": newExe(t, msrc), - //"fd": newFdDir(t, msrc), - //"fdinfo": newFdInfoDir(t, msrc), - "gid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: true}), - "io": newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)), - "maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}), - //"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), - //"mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), + "auxv": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}), + "cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), + "comm": newComm(task, inoGen.NextIno(), 0444), + "environ": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), + "exe": newExeSymlink(task, inoGen.NextIno()), + "fd": newFDDirInode(task, inoGen), + "fdinfo": newFDInfoDirInode(task, inoGen), + "gid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: true}), + "io": newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)), + "maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}), + "mountinfo": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mountInfoData{task: task}), + "mounts": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mountsData{task: task}), + "net": newTaskNetDir(task, inoGen), "ns": newTaskOwnedDir(task, inoGen.NextIno(), 0511, map[string]*kernfs.Dentry{ "net": newNamespaceSymlink(task, inoGen.NextIno(), "net"), "pid": newNamespaceSymlink(task, inoGen.NextIno(), "pid"), @@ -100,19 +102,17 @@ func (i *taskInode) Valid(ctx context.Context) bool { } // Open implements kernfs.Inode. -func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) +func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + if err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } -// SetStat implements kernfs.Inode. -func (i *taskInode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { - stat := opts.Stat - if stat.Mask&linux.STATX_MODE != 0 { - return syserror.EPERM - } - return nil +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return syserror.EPERM } // taskOwnedInode implements kernfs.Inode and overrides inode owner with task @@ -175,14 +175,7 @@ func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.S func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { mode := i.Mode() uid, gid := i.getOwner(mode) - return vfs.GenericCheckPermissions( - creds, - ats, - mode.FileType() == linux.ModeDirectory, - uint16(mode), - uid, - gid, - ) + return vfs.GenericCheckPermissions(creds, ats, mode, uid, gid) } func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) { @@ -224,22 +217,6 @@ func newIO(t *kernel.Task, isThreadGroup bool) *ioData { return &ioData{ioUsage: t} } -func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { - // Namespace symlinks should contain the namespace name and the inode number - // for the namespace instance, so for example user:[123456]. We currently fake - // the inode number by sticking the symlink inode in its place. - target := fmt.Sprintf("%s:[%d]", ns, ino) - - inode := &kernfs.StaticSymlink{} - // Note: credentials are overridden by taskOwnedInode. - inode.Init(task.Credentials(), ino, target) - - taskInode := &taskOwnedInode{Inode: inode, owner: task} - d := &kernfs.Dentry{} - d.Init(taskInode) - return d -} - // newCgroupData creates inode that shows cgroup information. // From man 7 cgroups: "For each cgroup hierarchy of which the process is a // member, there is one entry containing three colon-separated fields: diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go new file mode 100644 index 000000000..8ad976073 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -0,0 +1,306 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + "sort" + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) { + var ( + file *vfs.FileDescription + flags kernel.FDFlags + ) + t.WithMuLocked(func(t *kernel.Task) { + if fdt := t.FDTable(); fdt != nil { + file, flags = fdt.GetVFS2(fd) + } + }) + return file, flags +} + +func taskFDExists(t *kernel.Task, fd int32) bool { + file, _ := getTaskFD(t, fd) + if file == nil { + return false + } + file.DecRef() + return true +} + +type fdDir struct { + inoGen InoGenerator + task *kernel.Task + + // When produceSymlinks is set, dirents produces for the FDs are reported + // as symlink. Otherwise, they are reported as regular files. + produceSymlink bool +} + +// IterDirents implements kernfs.inodeDynamicLookup. +func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, absOffset, relOffset int64) (int64, error) { + var fds []int32 + i.task.WithMuLocked(func(t *kernel.Task) { + if fdTable := t.FDTable(); fdTable != nil { + fds = fdTable.GetFDs() + } + }) + + offset := absOffset + relOffset + typ := uint8(linux.DT_REG) + if i.produceSymlink { + typ = linux.DT_LNK + } + + // Find the appropriate starting point. + idx := sort.Search(len(fds), func(i int) bool { return fds[i] >= int32(relOffset) }) + if idx >= len(fds) { + return offset, nil + } + for _, fd := range fds[idx:] { + dirent := vfs.Dirent{ + Name: strconv.FormatUint(uint64(fd), 10), + Type: typ, + Ino: i.inoGen.NextIno(), + NextOff: offset + 1, + } + if err := cb.Handle(dirent); err != nil { + return offset, err + } + offset++ + } + return offset, nil +} + +// fdDirInode represents the inode for /proc/[pid]/fd directory. +// +// +stateify savable +type fdDirInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeAttrs + kernfs.OrderedChildren + kernfs.AlwaysValid + fdDir +} + +var _ kernfs.Inode = (*fdDirInode)(nil) + +func newFDDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { + inode := &fdDirInode{ + fdDir: fdDir{ + inoGen: inoGen, + task: task, + produceSymlink: true, + }, + } + inode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + + dentry := &kernfs.Dentry{} + dentry.Init(inode) + inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + + return dentry +} + +// Lookup implements kernfs.inodeDynamicLookup. +func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + fdInt, err := strconv.ParseInt(name, 10, 32) + if err != nil { + return nil, syserror.ENOENT + } + fd := int32(fdInt) + if !taskFDExists(i.task, fd) { + return nil, syserror.ENOENT + } + taskDentry := newFDSymlink(i.task, fd, i.inoGen.NextIno()) + return taskDentry.VFSDentry(), nil +} + +// Open implements kernfs.Inode. +func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + if err != nil { + return nil, err + } + return fd.VFSFileDescription(), nil +} + +// CheckPermissions implements kernfs.Inode. +// +// This is to match Linux, which uses a special permission handler to guarantee +// that a process can still access /proc/self/fd after it has executed +// setuid. See fs/proc/fd.c:proc_fd_permission. +func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { + err := i.InodeAttrs.CheckPermissions(ctx, creds, ats) + if err == nil { + // Access granted, no extra check needed. + return nil + } + if t := kernel.TaskFromContext(ctx); t != nil { + // Allow access if the task trying to access it is in the thread group + // corresponding to this directory. + if i.task.ThreadGroup() == t.ThreadGroup() { + // Access granted (overridden). + return nil + } + } + return err +} + +// fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file. +// +// +stateify savable +type fdSymlink struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeSymlink + + task *kernel.Task + fd int32 +} + +var _ kernfs.Inode = (*fdSymlink)(nil) + +func newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry { + inode := &fdSymlink{ + task: task, + fd: fd, + } + inode.Init(task.Credentials(), ino, linux.ModeSymlink|0777) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +func (s *fdSymlink) Readlink(ctx context.Context) (string, error) { + file, _ := getTaskFD(s.task, s.fd) + if file == nil { + return "", syserror.ENOENT + } + defer file.DecRef() + root := vfs.RootFromContext(ctx) + defer root.DecRef() + return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry()) +} + +func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { + file, _ := getTaskFD(s.task, s.fd) + if file == nil { + return vfs.VirtualDentry{}, "", syserror.ENOENT + } + defer file.DecRef() + vd := file.VirtualDentry() + vd.IncRef() + return vd, "", nil +} + +// fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory. +// +// +stateify savable +type fdInfoDirInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeAttrs + kernfs.OrderedChildren + kernfs.AlwaysValid + fdDir +} + +var _ kernfs.Inode = (*fdInfoDirInode)(nil) + +func newFDInfoDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { + inode := &fdInfoDirInode{ + fdDir: fdDir{ + inoGen: inoGen, + task: task, + }, + } + inode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + + dentry := &kernfs.Dentry{} + dentry.Init(inode) + inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + + return dentry +} + +// Lookup implements kernfs.inodeDynamicLookup. +func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + fdInt, err := strconv.ParseInt(name, 10, 32) + if err != nil { + return nil, syserror.ENOENT + } + fd := int32(fdInt) + if !taskFDExists(i.task, fd) { + return nil, syserror.ENOENT + } + data := &fdInfoData{ + task: i.task, + fd: fd, + } + dentry := newTaskOwnedFile(i.task, i.inoGen.NextIno(), 0444, data) + return dentry.VFSDentry(), nil +} + +// Open implements kernfs.Inode. +func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + if err != nil { + return nil, err + } + return fd.VFSFileDescription(), nil +} + +// fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd]. +// +// +stateify savable +type fdInfoData struct { + kernfs.DynamicBytesFile + refs.AtomicRefCount + + task *kernel.Task + fd int32 +} + +var _ dynamicInode = (*fdInfoData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { + file, descriptorFlags := getTaskFD(d.task, d.fd) + if file == nil { + return syserror.ENOENT + } + defer file.DecRef() + // TODO(b/121266871): Include pos, locks, and other data. For now we only + // have flags. + // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt + flags := uint(file.StatusFlags()) | descriptorFlags.ToLinuxFileFlags() + fmt.Fprintf(buf, "flags:\t0%o\n", flags) + return nil +} diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 5a231ac86..515f25327 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -63,6 +64,16 @@ func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) { return m, nil } +func checkTaskState(t *kernel.Task) error { + switch t.ExitState() { + case kernel.TaskExitZombie: + return syserror.EACCES + case kernel.TaskExitDead: + return syserror.ESRCH + } + return nil +} + type bufferWriter struct { buf *bytes.Buffer } @@ -100,17 +111,18 @@ func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error { } defer m.DecUsers(ctx) - // Space for buffer with AT_NULL (0) terminator at the end. auxv := m.Auxv() + // Space for buffer with AT_NULL (0) terminator at the end. buf.Grow((len(auxv) + 1) * 16) for _, e := range auxv { - var tmp [8]byte - usermem.ByteOrder.PutUint64(tmp[:], e.Key) - buf.Write(tmp[:]) - - usermem.ByteOrder.PutUint64(tmp[:], uint64(e.Value)) + var tmp [16]byte + usermem.ByteOrder.PutUint64(tmp[:8], e.Key) + usermem.ByteOrder.PutUint64(tmp[8:], uint64(e.Value)) buf.Write(tmp[:]) } + var atNull [16]byte + buf.Write(atNull[:]) + return nil } @@ -496,7 +508,7 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } -// ioUsage is the /proc/<pid>/io and /proc/<pid>/task/<tid>/io data provider. +// ioUsage is the /proc/[pid]/io and /proc/[pid]/task/[tid]/io data provider. type ioUsage interface { // IOUsage returns the io usage data. IOUsage() *usage.IO @@ -539,11 +551,10 @@ var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error { - adj, err := o.task.OOMScoreAdj() - if err != nil { - return err + if o.task.ExitState() == kernel.TaskExitDead { + return syserror.ESRCH } - fmt.Fprintf(buf, "%d\n", adj) + fmt.Fprintf(buf, "%d\n", o.task.OOMScoreAdj()) return nil } @@ -562,9 +573,260 @@ func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset return 0, err } + if o.task.ExitState() == kernel.TaskExitDead { + return 0, syserror.ESRCH + } if err := o.task.SetOOMScoreAdj(v); err != nil { return 0, err } return n, nil } + +// exeSymlink is an symlink for the /proc/[pid]/exe file. +// +// +stateify savable +type exeSymlink struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeSymlink + + task *kernel.Task +} + +var _ kernfs.Inode = (*exeSymlink)(nil) + +func newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry { + inode := &exeSymlink{task: task} + inode.Init(task.Credentials(), ino, linux.ModeSymlink|0777) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +// Readlink implements kernfs.Inode. +func (s *exeSymlink) Readlink(ctx context.Context) (string, error) { + if !kernel.ContextCanTrace(ctx, s.task, false) { + return "", syserror.EACCES + } + + // Pull out the executable for /proc/[pid]/exe. + exec, err := s.executable() + if err != nil { + return "", err + } + defer exec.DecRef() + + return exec.PathnameWithDeleted(ctx), nil +} + +// Getlink implements kernfs.Inode.Getlink. +func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { + if !kernel.ContextCanTrace(ctx, s.task, false) { + return vfs.VirtualDentry{}, "", syserror.EACCES + } + + exec, err := s.executable() + if err != nil { + return vfs.VirtualDentry{}, "", err + } + defer exec.DecRef() + + vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry() + vd.IncRef() + return vd, "", nil +} + +func (s *exeSymlink) executable() (file fsbridge.File, err error) { + if err := checkTaskState(s.task); err != nil { + return nil, err + } + + s.task.WithMuLocked(func(t *kernel.Task) { + mm := t.MemoryManager() + if mm == nil { + err = syserror.EACCES + return + } + + // The MemoryManager may be destroyed, in which case + // MemoryManager.destroy will simply set the executable to nil + // (with locks held). + file = mm.Executable() + if file == nil { + err = syserror.ESRCH + } + }) + return +} + +// mountInfoData is used to implement /proc/[pid]/mountinfo. +// +// +stateify savable +type mountInfoData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*mountInfoData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { + var fsctx *kernel.FSContext + i.task.WithMuLocked(func(t *kernel.Task) { + fsctx = t.FSContext() + }) + if fsctx == nil { + // The task has been destroyed. Nothing to show here. + return nil + } + rootDir := fsctx.RootDirectoryVFS2() + if !rootDir.Ok() { + // Root has been destroyed. Don't try to read mounts. + return nil + } + defer rootDir.DecRef() + i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf) + return nil +} + +// mountsData is used to implement /proc/[pid]/mounts. +// +// +stateify savable +type mountsData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*mountsData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { + var fsctx *kernel.FSContext + i.task.WithMuLocked(func(t *kernel.Task) { + fsctx = t.FSContext() + }) + if fsctx == nil { + // The task has been destroyed. Nothing to show here. + return nil + } + rootDir := fsctx.RootDirectoryVFS2() + if !rootDir.Ok() { + // Root has been destroyed. Don't try to read mounts. + return nil + } + defer rootDir.DecRef() + i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) + return nil +} + +type namespaceSymlink struct { + kernfs.StaticSymlink + + task *kernel.Task +} + +func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { + // Namespace symlinks should contain the namespace name and the inode number + // for the namespace instance, so for example user:[123456]. We currently fake + // the inode number by sticking the symlink inode in its place. + target := fmt.Sprintf("%s:[%d]", ns, ino) + + inode := &namespaceSymlink{task: task} + // Note: credentials are overridden by taskOwnedInode. + inode.Init(task.Credentials(), ino, target) + + taskInode := &taskOwnedInode{Inode: inode, owner: task} + d := &kernfs.Dentry{} + d.Init(taskInode) + return d +} + +// Readlink implements Inode. +func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) { + if err := checkTaskState(s.task); err != nil { + return "", err + } + return s.StaticSymlink.Readlink(ctx) +} + +// Getlink implements Inode.Getlink. +func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { + if err := checkTaskState(s.task); err != nil { + return vfs.VirtualDentry{}, "", err + } + + // Create a synthetic inode to represent the namespace. + dentry := &kernfs.Dentry{} + dentry.Init(&namespaceInode{}) + vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry()) + vd.IncRef() + dentry.DecRef() + return vd, "", nil +} + +// namespaceInode is a synthetic inode created to represent a namespace in +// /proc/[pid]/ns/*. +type namespaceInode struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink +} + +var _ kernfs.Inode = (*namespaceInode)(nil) + +// Init initializes a namespace inode. +func (i *namespaceInode) Init(creds *auth.Credentials, ino uint64, perm linux.FileMode) { + if perm&^linux.PermissionsMask != 0 { + panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) + } + i.InodeAttrs.Init(creds, ino, linux.ModeRegular|perm) +} + +// Open implements Inode.Open. +func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &namespaceFD{inode: i} + i.IncRef() + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// namespace FD is a synthetic file that represents a namespace in +// /proc/[pid]/ns/*. +type namespaceFD struct { + vfs.FileDescriptionDefaultImpl + + vfsfd vfs.FileDescription + inode *namespaceInode +} + +var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil) + +// Stat implements FileDescriptionImpl. +func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() + return fd.inode.Stat(vfs, opts) +} + +// SetStat implements FileDescriptionImpl. +func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() + creds := auth.CredentialsFromContext(ctx) + return fd.inode.SetStat(ctx, vfs, creds, opts) +} + +// Release implements FileDescriptionImpl. +func (fd *namespaceFD) Release() { + fd.inode.DecRef() +} + +// OnClose implements FileDescriptionImpl. +func (*namespaceFD) OnClose(context.Context) error { + return nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/task_net.go index d4e1812d8..6595fcee6 100644 --- a/pkg/sentry/fsimpl/proc/tasks_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -32,17 +31,19 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/usermem" ) -func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { +func newTaskNetDir(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { + k := task.Kernel() + pidns := task.PIDNamespace() + root := auth.NewRootCredentials(pidns.UserNamespace()) + var contents map[string]*kernfs.Dentry - // TODO(gvisor.dev/issue/1833): Support for using the network stack in the - // network namespace of the calling process. We should make this per-process, - // a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net. - if stack := k.RootNetworkNamespace().Stack(); stack != nil { + if stack := task.NetworkNamespace().Stack(); stack != nil { const ( arp = "IP address HW type Flags HW address Mask Device\n" netlink = "sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode\n" @@ -53,6 +54,8 @@ func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *k ) psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)) + // TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task + // network namespace. contents = map[string]*kernfs.Dentry{ "dev": newDentry(root, inoGen.NextIno(), 0444, &netDevData{stack: stack}), "snmp": newDentry(root, inoGen.NextIno(), 0444, &netSnmpData{stack: stack}), @@ -84,7 +87,7 @@ func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *k } } - return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, contents) + return newTaskOwnedDir(task, inoGen.NextIno(), 0555, contents) } // ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6. @@ -203,22 +206,21 @@ var _ dynamicInode = (*netUnixData)(nil) func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString("Num RefCount Protocol Flags Type St Inode Path\n") for _, se := range n.kernel.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock) + s := se.SockVFS2 + if !s.TryIncRef() { + log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s) continue } - sfile := s.(*fs.File) - if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX { + if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX { s.DecRef() // Not a unix socket. continue } - sops := sfile.FileOperations.(*unix.SocketOperations) + sops := s.Impl().(*unix.SocketVFS2) addr, err := sops.Endpoint().GetLocalAddress() if err != nil { - log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err) + log.Warningf("Failed to retrieve socket name from %+v: %v", s, err) addr.Addr = "<unknown>" } @@ -231,6 +233,15 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { } } + // Get inode number. + var ino uint64 + stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_INO}) + if statErr != nil || stat.Mask&linux.STATX_INO == 0 { + log.Warningf("Failed to retrieve ino for socket file: %v", statErr) + } else { + ino = stat.Ino + } + // In the socket entry below, the value for the 'Num' field requires // some consideration. Linux prints the address to the struct // unix_sock representing a socket in the kernel, but may redact the @@ -249,14 +260,14 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { // the definition of this struct changes over time. // // For now, we always redact this pointer. - fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d", + fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d", (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. - sfile.ReadRefs()-1, // RefCount, don't count our own ref. + s.Refs()-1, // RefCount, don't count our own ref. 0, // Protocol, always 0 for UDS. sockFlags, // Flags. sops.Endpoint().Type(), // Type. sops.State(), // State. - sfile.InodeID(), // Inode. + ino, // Inode. ) // Path @@ -338,15 +349,14 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, t := kernel.TaskFromContext(ctx) for _, se := range k.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID) + s := se.SockVFS2 + if !s.TryIncRef() { + log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s) continue } - sfile := s.(*fs.File) - sops, ok := sfile.FileOperations.(socket.Socket) + sops, ok := s.Impl().(socket.SocketVFS2) if !ok { - panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { s.DecRef() @@ -395,14 +405,15 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, // Unimplemented. fmt.Fprintf(buf, "%08X ", 0) + stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO}) + // Field: uid. - uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) - if err != nil { - log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) + if statErr != nil || stat.Mask&linux.STATX_UID == 0 { + log.Warningf("Failed to retrieve uid for socket file: %v", statErr) fmt.Fprintf(buf, "%5d ", 0) } else { creds := auth.CredentialsFromContext(ctx) - fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow())) + fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow())) } // Field: timeout; number of unanswered 0-window probes. @@ -410,11 +421,16 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, fmt.Fprintf(buf, "%8d ", 0) // Field: inode. - fmt.Fprintf(buf, "%8d ", sfile.InodeID()) + if statErr != nil || stat.Mask&linux.STATX_INO == 0 { + log.Warningf("Failed to retrieve inode for socket file: %v", statErr) + fmt.Fprintf(buf, "%8d ", 0) + } else { + fmt.Fprintf(buf, "%8d ", stat.Ino) + } // Field: refcount. Don't count the ref we obtain while deferencing // the weakref to this socket. - fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) + fmt.Fprintf(buf, "%d ", s.Refs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. @@ -496,15 +512,14 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { t := kernel.TaskFromContext(ctx) for _, se := range d.kernel.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID) + s := se.SockVFS2 + if !s.TryIncRef() { + log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s) continue } - sfile := s.(*fs.File) - sops, ok := sfile.FileOperations.(socket.Socket) + sops, ok := s.Impl().(socket.SocketVFS2) if !ok { - panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { s.DecRef() @@ -548,25 +563,31 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Field: retrnsmt. Always 0 for UDP. fmt.Fprintf(buf, "%08X ", 0) + stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO}) + // Field: uid. - uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) - if err != nil { - log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) + if statErr != nil || stat.Mask&linux.STATX_UID == 0 { + log.Warningf("Failed to retrieve uid for socket file: %v", statErr) fmt.Fprintf(buf, "%5d ", 0) } else { creds := auth.CredentialsFromContext(ctx) - fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow())) + fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow())) } // Field: timeout. Always 0 for UDP. fmt.Fprintf(buf, "%8d ", 0) // Field: inode. - fmt.Fprintf(buf, "%8d ", sfile.InodeID()) + if statErr != nil || stat.Mask&linux.STATX_INO == 0 { + log.Warningf("Failed to retrieve inode for socket file: %v", statErr) + fmt.Fprintf(buf, "%8d ", 0) + } else { + fmt.Fprintf(buf, "%8d ", stat.Ino) + } // Field: ref; reference count on the socket inode. Don't count the ref // we obtain while deferencing the weakref to this socket. - fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) + fmt.Fprintf(buf, "%d ", s.Refs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. @@ -667,9 +688,9 @@ func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error { if line.prefix == "Tcp" { tcp := stat.(*inet.StatSNMPTCP) // "Tcp" needs special processing because MaxConn is signed. RFC 2012. - fmt.Sprintf("%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:])) + fmt.Fprintf(buf, "%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:])) } else { - fmt.Sprintf("%s: %s\n", line.prefix, sprintSlice(toSlice(stat))) + fmt.Fprintf(buf, "%s: %s\n", line.prefix, sprintSlice(toSlice(stat))) } } return nil diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index b1e39c82f..5aeda8c9b 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -46,6 +46,7 @@ type tasksInode struct { kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs kernfs.OrderedChildren + kernfs.AlwaysValid inoGen InoGenerator pidns *kernel.PIDNamespace @@ -66,23 +67,23 @@ var _ kernfs.Inode = (*tasksInode)(nil) func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) { root := auth.NewRootCredentials(pidns.UserNamespace()) contents := map[string]*kernfs.Dentry{ - "cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))), - //"filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}), - "loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}), - "sys": newSysDir(root, inoGen, k), - "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), - "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"), - "net": newNetDir(root, inoGen, k), - "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{k: k}), - "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), - "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{k: k}), + "cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))), + "filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}), + "loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}), + "sys": newSysDir(root, inoGen, k), + "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), + "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"), + "net": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/net"), + "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{}), + "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), + "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}), } inode := &tasksInode{ pidns: pidns, inoGen: inoGen, - selfSymlink: newSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(), - threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(), + selfSymlink: newSelfSymlink(root, inoGen.NextIno(), pidns).VFSDentry(), + threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), pidns).VFSDentry(), cgroupControllers: cgroupControllers, } inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555) @@ -121,11 +122,6 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro return taskDentry.VFSDentry(), nil } -// Valid implements kernfs.inodeDynamicLookup. -func (i *tasksInode) Valid(ctx context.Context) bool { - return true -} - // IterDirents implements kernfs.inodeDynamicLookup. func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) { // fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256 @@ -205,9 +201,11 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback } // Open implements kernfs.Inode. -func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) +func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + if err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } @@ -229,6 +227,20 @@ func (i *tasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Sta return stat, nil } +// staticFileSetStat implements a special static file that allows inode +// attributes to be set. This is to support /proc files that are readonly, but +// allow attributes to be set. +type staticFileSetStat struct { + dynamicBytesFileSetAttr + vfs.StaticData +} + +var _ dynamicInode = (*staticFileSetStat)(nil) + +func newStaticFileSetStat(data string) *staticFileSetStat { + return &staticFileSetStat{StaticData: vfs.StaticData{Data: data}} +} + func cpuInfoData(k *kernel.Kernel) string { features := k.FeatureSet() if features == nil { diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 434998910..92007df81 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -40,9 +41,9 @@ type selfSymlink struct { var _ kernfs.Inode = (*selfSymlink)(nil) -func newSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry { +func newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { inode := &selfSymlink{pidns: pidns} - inode.Init(creds, ino, linux.ModeSymlink|perm) + inode.Init(creds, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) @@ -62,6 +63,16 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { return strconv.FormatUint(uint64(tgid), 10), nil } +func (s *selfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx) + return vfs.VirtualDentry{}, target, err +} + +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return syserror.EPERM +} + type threadSelfSymlink struct { kernfs.InodeAttrs kernfs.InodeNoopRefCount @@ -72,9 +83,9 @@ type threadSelfSymlink struct { var _ kernfs.Inode = (*threadSelfSymlink)(nil) -func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry { +func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { inode := &threadSelfSymlink{pidns: pidns} - inode.Init(creds, ino, linux.ModeSymlink|perm) + inode.Init(creds, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) @@ -95,6 +106,28 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { return fmt.Sprintf("%d/task/%d", tgid, tid), nil } +func (s *threadSelfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx) + return vfs.VirtualDentry{}, target, err +} + +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return syserror.EPERM +} + +// dynamicBytesFileSetAttr implements a special file that allows inode +// attributes to be set. This is to support /proc files that are readonly, but +// allow attributes to be set. +type dynamicBytesFileSetAttr struct { + kernfs.DynamicBytesFile +} + +// SetStat implements Inode.SetStat. +func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + return d.DynamicBytesFile.InodeAttrs.SetStat(ctx, fs, creds, opts) +} + // cpuStats contains the breakdown of CPU time for /proc/stat. type cpuStats struct { // user is time spent in userspace tasks with non-positive niceness. @@ -137,22 +170,20 @@ func (c cpuStats) String() string { // // +stateify savable type statData struct { - kernfs.DynamicBytesFile - - // k is the owning Kernel. - k *kernel.Kernel + dynamicBytesFileSetAttr } var _ dynamicInode = (*statData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. -func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error { +func (*statData) Generate(ctx context.Context, buf *bytes.Buffer) error { // TODO(b/37226836): We currently export only zero CPU stats. We could // at least provide some aggregate stats. var cpu cpuStats fmt.Fprintf(buf, "cpu %s\n", cpu) - for c, max := uint(0), s.k.ApplicationCores(); c < max; c++ { + k := kernel.KernelFromContext(ctx) + for c, max := uint(0), k.ApplicationCores(); c < max; c++ { fmt.Fprintf(buf, "cpu%d %s\n", c, cpu) } @@ -176,7 +207,7 @@ func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "ctxt 0\n") // CLOCK_REALTIME timestamp from boot, in seconds. - fmt.Fprintf(buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds()) + fmt.Fprintf(buf, "btime %d\n", k.Timekeeper().BootTime().Seconds()) // Total number of clones. // TODO(b/37226836): Count this. @@ -203,13 +234,13 @@ func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error { // // +stateify savable type loadavgData struct { - kernfs.DynamicBytesFile + dynamicBytesFileSetAttr } var _ dynamicInode = (*loadavgData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. -func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { +func (*loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { // TODO(b/62345059): Include real data in fields. // Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods. // Column 4-5: currently running processes and the total number of processes. @@ -222,17 +253,15 @@ func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { // // +stateify savable type meminfoData struct { - kernfs.DynamicBytesFile - - // k is the owning Kernel. - k *kernel.Kernel + dynamicBytesFileSetAttr } var _ dynamicInode = (*meminfoData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. -func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { - mf := d.k.MemoryFile() +func (*meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { + k := kernel.KernelFromContext(ctx) + mf := k.MemoryFile() mf.UpdateUsage() snapshot, totalUsage := usage.MemoryAccounting.Copy() totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) @@ -275,7 +304,7 @@ func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { // // +stateify savable type uptimeData struct { - kernfs.DynamicBytesFile + dynamicBytesFileSetAttr } var _ dynamicInode = (*uptimeData)(nil) @@ -294,17 +323,15 @@ func (*uptimeData) Generate(ctx context.Context, buf *bytes.Buffer) error { // // +stateify savable type versionData struct { - kernfs.DynamicBytesFile - - // k is the owning Kernel. - k *kernel.Kernel + dynamicBytesFileSetAttr } var _ dynamicInode = (*versionData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. -func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { - init := v.k.GlobalInit() +func (*versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { + k := kernel.KernelFromContext(ctx) + init := k.GlobalInit() if init == nil { // Attempted to read before the init Task is created. This can // only occur during startup, which should never need to read @@ -335,3 +362,19 @@ func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version) return nil } + +// filesystemsData backs /proc/filesystems. +// +// +stateify savable +type filesystemsData struct { + kernfs.DynamicBytesFile +} + +var _ dynamicInode = (*filesystemsData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *filesystemsData) Generate(ctx context.Context, buf *bytes.Buffer) error { + k := kernel.KernelFromContext(ctx) + k.VFS().GenerateProcFilesystems(buf) + return nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 3d5dc463c..f08668ca2 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -39,7 +39,7 @@ func newSysDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *k "shmmni": newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMMNI)), }), "vm": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "mmap_min_addr": newDentry(root, inoGen.NextIno(), 0444, &mmapMinAddrData{}), + "mmap_min_addr": newDentry(root, inoGen.NextIno(), 0444, &mmapMinAddrData{k: k}), "overcommit_memory": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0\n")), }), "net": newSysNetDir(root, inoGen, k), diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 0eb401619..19abb5034 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -47,10 +48,11 @@ var ( var ( tasksStaticFiles = map[string]testutil.DirentType{ "cpuinfo": linux.DT_REG, + "filesystems": linux.DT_REG, "loadavg": linux.DT_REG, "meminfo": linux.DT_REG, "mounts": linux.DT_LNK, - "net": linux.DT_DIR, + "net": linux.DT_LNK, "self": linux.DT_LNK, "stat": linux.DT_REG, "sys": linux.DT_DIR, @@ -68,9 +70,15 @@ var ( "cmdline": linux.DT_REG, "comm": linux.DT_REG, "environ": linux.DT_REG, + "exe": linux.DT_LNK, + "fd": linux.DT_DIR, + "fdinfo": linux.DT_DIR, "gid_map": linux.DT_REG, "io": linux.DT_REG, "maps": linux.DT_REG, + "mountinfo": linux.DT_REG, + "mounts": linux.DT_REG, + "net": linux.DT_DIR, "ns": linux.DT_DIR, "oom_score": linux.DT_REG, "oom_score_adj": linux.DT_REG, @@ -95,17 +103,37 @@ func setup(t *testing.T) *testutil.System { k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - fsOpts := vfs.GetFilesystemOptions{ - InternalData: &InternalData{ - Cgroups: map[string]string{ - "cpuset": "/foo/cpuset", - "memory": "/foo/memory", + + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{}) + if err != nil { + t.Fatalf("NewMountNamespace(): %v", err) + } + pop := &vfs.PathOperation{ + Root: mntns.Root(), + Start: mntns.Root(), + Path: fspath.Parse("/proc"), + } + if err := k.VFS().MkdirAt(ctx, creds, pop, &vfs.MkdirOptions{Mode: 0777}); err != nil { + t.Fatalf("MkDir(/proc): %v", err) + } + + pop = &vfs.PathOperation{ + Root: mntns.Root(), + Start: mntns.Root(), + Path: fspath.Parse("/proc"), + } + mntOpts := &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: &InternalData{ + Cgroups: map[string]string{ + "cpuset": "/foo/cpuset", + "memory": "/foo/memory", + }, }, }, } - mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", Name, &fsOpts) - if err != nil { - t.Fatalf("NewMountNamespace(): %v", err) + if err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil { + t.Fatalf("MountAt(/proc): %v", err) } return testutil.NewSystem(ctx, t, k.VFS(), mntns) } @@ -114,7 +142,7 @@ func TestTasksEmpty(t *testing.T) { s := setup(t) defer s.Destroy() - collector := s.ListDirents(s.PathOpAtRoot("/")) + collector := s.ListDirents(s.PathOpAtRoot("/proc")) s.AssertAllDirentTypes(collector, tasksStaticFiles) s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs) } @@ -140,7 +168,7 @@ func TestTasks(t *testing.T) { expectedDirents[fmt.Sprintf("%d", i+1)] = linux.DT_DIR } - collector := s.ListDirents(s.PathOpAtRoot("/")) + collector := s.ListDirents(s.PathOpAtRoot("/proc")) s.AssertAllDirentTypes(collector, expectedDirents) s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs) @@ -180,7 +208,7 @@ func TestTasks(t *testing.T) { } // Test lookup. - for _, path := range []string{"/1", "/2"} { + for _, path := range []string{"/proc/1", "/proc/2"} { fd, err := s.VFS.OpenAt( s.Ctx, s.Creds, @@ -190,6 +218,7 @@ func TestTasks(t *testing.T) { if err != nil { t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err) } + defer fd.DecRef() buf := make([]byte, 1) bufIOSeq := usermem.BytesIOSequence(buf) if _, err := fd.Read(s.Ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR { @@ -200,10 +229,10 @@ func TestTasks(t *testing.T) { if _, err := s.VFS.OpenAt( s.Ctx, s.Creds, - s.PathOpAtRoot("/9999"), + s.PathOpAtRoot("/proc/9999"), &vfs.OpenOptions{}, ); err != syserror.ENOENT { - t.Fatalf("wrong error from vfsfs.OpenAt(/9999): %v", err) + t.Fatalf("wrong error from vfsfs.OpenAt(/proc/9999): %v", err) } } @@ -301,12 +330,13 @@ func TestTasksOffset(t *testing.T) { fd, err := s.VFS.OpenAt( s.Ctx, s.Creds, - s.PathOpAtRoot("/"), + s.PathOpAtRoot("/proc"), &vfs.OpenOptions{}, ) if err != nil { t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) } + defer fd.DecRef() if _, err := fd.Seek(s.Ctx, tc.offset, linux.SEEK_SET); err != nil { t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err) } @@ -343,7 +373,7 @@ func TestTask(t *testing.T) { t.Fatalf("CreateTask(): %v", err) } - collector := s.ListDirents(s.PathOpAtRoot("/1")) + collector := s.ListDirents(s.PathOpAtRoot("/proc/1")) s.AssertAllDirentTypes(collector, taskStaticFiles) } @@ -361,14 +391,14 @@ func TestProcSelf(t *testing.T) { collector := s.WithTemporaryContext(task).ListDirents(&vfs.PathOperation{ Root: s.Root, Start: s.Root, - Path: fspath.Parse("/self/"), + Path: fspath.Parse("/proc/self/"), FollowFinalSymlink: true, }) s.AssertAllDirentTypes(collector, taskStaticFiles) } func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.FileDescription) { - t.Logf("Iterating: /proc%s", fd.MappedName(ctx)) + t.Logf("Iterating: %s", fd.MappedName(ctx)) var collector testutil.DirentCollector if err := fd.IterDirents(ctx, &collector); err != nil { @@ -385,35 +415,36 @@ func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.F if d.Name == "." || d.Name == ".." { continue } - childPath := path.Join(fd.MappedName(ctx), d.Name) + absPath := path.Join(fd.MappedName(ctx), d.Name) if d.Type == linux.DT_LNK { link, err := s.VFS.ReadlinkAt( ctx, auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(childPath)}, + &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(absPath)}, ) if err != nil { - t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", childPath, err) + t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", absPath, err) } else { - t.Logf("Skipping symlink: /proc%s => %s", childPath, link) + t.Logf("Skipping symlink: %s => %s", absPath, link) } continue } - t.Logf("Opening: /proc%s", childPath) + t.Logf("Opening: %s", absPath) child, err := s.VFS.OpenAt( ctx, auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(childPath)}, + &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(absPath)}, &vfs.OpenOptions{}, ) if err != nil { - t.Errorf("vfsfs.OpenAt(%v) failed: %v", childPath, err) + t.Errorf("vfsfs.OpenAt(%v) failed: %v", absPath, err) continue } + defer child.DecRef() stat, err := child.Stat(ctx, vfs.StatOptions{}) if err != nil { - t.Errorf("Stat(%v) failed: %v", childPath, err) + t.Errorf("Stat(%v) failed: %v", absPath, err) } if got := linux.FileMode(stat.Mode).DirentType(); got != d.Type { t.Errorf("wrong file mode, stat: %v, dirent: %v", got, d.Type) @@ -431,6 +462,22 @@ func TestTree(t *testing.T) { defer s.Destroy() k := kernel.KernelFromContext(s.Ctx) + + pop := &vfs.PathOperation{ + Root: s.Root, + Start: s.Root, + Path: fspath.Parse("test-file"), + } + opts := &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_CREAT, + Mode: 0777, + } + file, err := s.VFS.OpenAt(s.Ctx, s.Creds, pop, opts) + if err != nil { + t.Fatalf("failed to create test file: %v", err) + } + defer file.DecRef() + var tasks []*kernel.Task for i := 0; i < 5; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) @@ -438,6 +485,8 @@ func TestTree(t *testing.T) { if err != nil { t.Fatalf("CreateTask(): %v", err) } + // Add file to populate /proc/[pid]/fd and fdinfo directories. + task.FDTable().NewFDVFS2(task, 0, file, kernel.FDFlags{}) tasks = append(tasks, task) } @@ -445,11 +494,12 @@ func TestTree(t *testing.T) { fd, err := s.VFS.OpenAt( ctx, auth.CredentialsFromContext(s.Ctx), - &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse("/")}, + &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse("/proc")}, &vfs.OpenOptions{}, ) if err != nil { - t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) + t.Fatalf("vfsfs.OpenAt(/proc) failed: %v", err) } iterateDir(ctx, t, s, fd) + fd.DecRef() } diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD new file mode 100644 index 000000000..52084ddb5 --- /dev/null +++ b/pkg/sentry/fsimpl/sockfs/BUILD @@ -0,0 +1,17 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "sockfs", + srcs = ["sockfs.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go new file mode 100644 index 000000000..5ce50625b --- /dev/null +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -0,0 +1,98 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sockfs provides a filesystem implementation for anonymous sockets. +package sockfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// filesystemType implements vfs.FilesystemType. +type filesystemType struct{} + +// GetFilesystem implements FilesystemType.GetFilesystem. +func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + panic("sockfs.filesystemType.GetFilesystem should never be called") +} + +// Name implements FilesystemType.Name. +// +// Note that registering sockfs is unnecessary, except for the fact that it +// will not show up under /proc/filesystems as a result. This is a very minor +// discrepancy from Linux. +func (filesystemType) Name() string { + return "sockfs" +} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + kernfs.Filesystem +} + +// NewFilesystem sets up and returns a new sockfs filesystem. +// +// Note that there should only ever be one instance of sockfs.Filesystem, +// backing a global socket mount. +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { + fs := &filesystem{} + fs.Init(vfsObj, filesystemType{}) + return fs.VFSFilesystem() +} + +// inode implements kernfs.Inode. +// +// TODO(gvisor.dev/issue/1476): Add device numbers to this inode (which are +// not included in InodeAttrs) to store the numbers of the appropriate +// socket device. Override InodeAttrs.Stat() accordingly. +type inode struct { + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + kernfs.InodeAttrs + kernfs.InodeNoopRefCount +} + +// Open implements kernfs.Inode.Open. +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + return nil, syserror.ENXIO +} + +// InitSocket initializes a socket FileDescription, with a corresponding +// Dentry in mnt. +// +// fd should be the FileDescription associated with socketImpl, i.e. its first +// field. mnt should be the global socket mount, Kernel.socketMount. +func InitSocket(socketImpl vfs.FileDescriptionImpl, fd *vfs.FileDescription, mnt *vfs.Mount, creds *auth.Credentials) error { + fsimpl := mnt.Filesystem().Impl() + fs := fsimpl.(*kernfs.Filesystem) + + // File mode matches net/socket.c:sock_alloc. + filemode := linux.FileMode(linux.S_IFSOCK | 0600) + i := &inode{} + i.Init(creds, fs.NextIno(), filemode) + + d := &kernfs.Dentry{} + d.Init(i) + + opts := &vfs.FileDescriptionOptions{UseDentryMetadata: true} + if err := fd.Init(socketImpl, linux.O_RDWR, mnt, d.VFSDentry(), opts); err != nil { + return err + } + return nil +} diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index c36c4fa11..f8d25d35e 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -39,10 +39,15 @@ type filesystem struct { kernfs.Filesystem } +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. -func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { fs := &filesystem{} - fs.Filesystem.Init(vfsObj) + fs.Filesystem.Init(vfsObj, &fsType) k := kernel.KernelFromContext(ctx) maxCPUCores := k.ApplicationCores() defaultSysDirMode := linux.FileMode(0755) @@ -94,15 +99,17 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte return &d.dentry } -// SetStat implements kernfs.Inode.SetStat. -func (d *dir) SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error { +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } // Open implements kernfs.Inode.Open. -func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts) +func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) + if err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD index e4f36f4ae..0e4053a46 100644 --- a/pkg/sentry/fsimpl/testutil/BUILD +++ b/pkg/sentry/fsimpl/testutil/BUILD @@ -16,12 +16,14 @@ go_library( "//pkg/cpuid", "//pkg/fspath", "//pkg/memutil", + "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/sched", "//pkg/sentry/limits", "//pkg/sentry/loader", + "//pkg/sentry/mm", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/platform/kvm", diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index 488478e29..c16a36cdb 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -23,13 +23,16 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/time" @@ -123,10 +126,17 @@ func Boot() (*kernel.Kernel, error) { // CreateTask creates a new bare bones task for tests. func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) { k := kernel.KernelFromContext(ctx) + exe, err := newFakeExecutable(ctx, k.VFS(), auth.CredentialsFromContext(ctx), root) + if err != nil { + return nil, err + } + m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) + m.SetExecutable(fsbridge.NewVFSFile(exe)) + config := &kernel.TaskConfig{ Kernel: k, ThreadGroup: tc, - TaskContext: &kernel.TaskContext{Name: name}, + TaskContext: &kernel.TaskContext{Name: name, MemoryManager: m}, Credentials: auth.CredentialsFromContext(ctx), NetworkNamespace: k.RootNetworkNamespace(), AllowedCPUMask: sched.NewFullCPUSet(k.ApplicationCores()), @@ -135,10 +145,25 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), MountNamespaceVFS2: mntns, FSContext: kernel.NewFSContextVFS2(root, cwd, 0022), + FDTable: k.NewFDTable(), } return k.TaskSet().NewTask(config) } +func newFakeExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry) (*vfs.FileDescription, error) { + const name = "executable" + pop := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(name), + } + opts := &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_CREAT, + Mode: 0777, + } + return vfsObj.OpenAt(ctx, creds, pop, opts) +} + func createMemoryFile() (*pgalloc.MemoryFile, error) { const memfileName = "test-memory" memfd, err := memutil.CreateMemFD(memfileName, 0) diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go index e16808c63..0556af877 100644 --- a/pkg/sentry/fsimpl/testutil/testutil.go +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -162,6 +162,9 @@ func (s *System) ListDirents(pop *vfs.PathOperation) *DirentCollector { // exactly the specified set of expected entries. AssertAllDirentTypes respects // collector.skipDots, and implicitly checks for "." and ".." accordingly. func (s *System) AssertAllDirentTypes(collector *DirentCollector, expected map[string]DirentType) { + if expected == nil { + expected = make(map[string]DirentType) + } // Also implicitly check for "." and "..", if enabled. if !collector.skipDots { expected["."] = linux.DT_DIR diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 57abd5583..a2d9649e7 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -15,6 +15,17 @@ go_template_instance( }, ) +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "tmpfs", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "dentry", + }, +) + go_library( name = "tmpfs", srcs = [ @@ -22,8 +33,10 @@ go_library( "device_file.go", "directory.go", "filesystem.go", + "fstree.go", "named_pipe.go", "regular_file.go", + "socket_file.go", "symlink.go", "tmpfs.go", ], @@ -46,9 +59,11 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/sentry/vfs/lock", + "//pkg/sentry/vfs/memxattr", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index 383133e44..2fb5c4d84 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -168,7 +168,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { } } -func BenchmarkVFS2MemfsStat(b *testing.B) { +func BenchmarkVFS2TmpfsStat(b *testing.B) { for _, depth := range depths { b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { ctx := contexttest.Context(b) @@ -362,7 +362,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { } } -func BenchmarkVFS2MemfsMountStat(b *testing.B) { +func BenchmarkVFS2TmpfsMountStat(b *testing.B) { for _, depth := range depths { b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { ctx := contexttest.Context(b) @@ -438,13 +438,6 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { filePathBuilder.WriteByte('/') } - // Verify that we didn't create any directories under the mount - // point (i.e. they were all created on the submount). - firstDirName := fmt.Sprintf("%d", depth) - if child := mountPoint.Dentry().Child(firstDirName); child != nil { - b.Fatalf("created directory %q under root mount, not submount", firstDirName) - } - // Create the file that will be stat'd. fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ Root: root, diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go index 84b181b90..83bf885ee 100644 --- a/pkg/sentry/fsimpl/tmpfs/device_file.go +++ b/pkg/sentry/fsimpl/tmpfs/device_file.go @@ -15,6 +15,8 @@ package tmpfs import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -33,6 +35,14 @@ func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode major: major, minor: minor, } + switch kind { + case vfs.BlockDevice: + mode |= linux.S_IFBLK + case vfs.CharDevice: + mode |= linux.S_IFCHR + default: + panic(fmt.Sprintf("invalid DeviceKind: %v", kind)) + } file.inode.init(file, fs, creds, mode) file.inode.nlink = 1 // from parent directory return &file.inode diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index b4380af38..f2399981b 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -15,40 +15,77 @@ package tmpfs import ( + "sync/atomic" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) type directory struct { - inode inode + // Since directories can't be hard-linked, each directory can only be + // associated with a single dentry, which we can store in the directory + // struct. + dentry dentry + inode inode + + // childMap maps the names of the directory's children to their dentries. + // childMap is protected by filesystem.mu. + childMap map[string]*dentry - // childList is a list containing (1) child Dentries and (2) fake Dentries + // numChildren is len(childMap), but accessed using atomic memory + // operations to avoid locking in inode.statTo(). + numChildren int64 + + // childList is a list containing (1) child dentries and (2) fake dentries // (with inode == nil) that represent the iteration position of // directoryFDs. childList is used to support directoryFD.IterDirents() - // efficiently. childList is protected by filesystem.mu. + // efficiently. childList is protected by iterMu. + iterMu sync.Mutex childList dentryList } -func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode { +func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *directory { dir := &directory{} - dir.inode.init(dir, fs, creds, mode) + dir.inode.init(dir, fs, creds, linux.S_IFDIR|mode) dir.inode.nlink = 2 // from "." and parent directory or ".." for root - return &dir.inode + dir.dentry.inode = &dir.inode + dir.dentry.vfsd.Init(&dir.dentry) + return dir } -func (i *inode) isDir() bool { - _, ok := i.impl.(*directory) - return ok +// Preconditions: filesystem.mu must be locked for writing. dir must not +// already contain a child with the given name. +func (dir *directory) insertChildLocked(child *dentry, name string) { + child.parent = &dir.dentry + child.name = name + if dir.childMap == nil { + dir.childMap = make(map[string]*dentry) + } + dir.childMap[name] = child + atomic.AddInt64(&dir.numChildren, 1) + dir.iterMu.Lock() + dir.childList.PushBack(child) + dir.iterMu.Unlock() +} + +// Preconditions: filesystem.mu must be locked for writing. +func (dir *directory) removeChildLocked(child *dentry) { + delete(dir.childMap, child.name) + atomic.AddInt64(&dir.numChildren, -1) + dir.iterMu.Lock() + dir.childList.Remove(child) + dir.iterMu.Unlock() } type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl - // Protected by filesystem.mu. + // Protected by directory.iterMu. iter *dentry off int64 } @@ -56,11 +93,10 @@ type directoryFD struct { // Release implements vfs.FileDescriptionImpl.Release. func (fd *directoryFD) Release() { if fd.iter != nil { - fs := fd.filesystem() dir := fd.inode().impl.(*directory) - fs.mu.Lock() + dir.iterMu.Lock() dir.childList.Remove(fd.iter) - fs.mu.Unlock() + dir.iterMu.Unlock() fd.iter = nil } } @@ -68,24 +104,30 @@ func (fd *directoryFD) Release() { // IterDirents implements vfs.FileDescriptionImpl.IterDirents. func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { fs := fd.filesystem() - vfsd := fd.vfsfd.VirtualDentry().Dentry() + dir := fd.inode().impl.(*directory) - fs.mu.Lock() - defer fs.mu.Unlock() + // fs.mu is required to read d.parent and dentry.name. + fs.mu.RLock() + defer fs.mu.RUnlock() + dir.iterMu.Lock() + defer dir.iterMu.Unlock() + + fd.inode().touchAtime(fd.vfsfd.Mount()) if fd.off == 0 { if err := cb.Handle(vfs.Dirent{ Name: ".", Type: linux.DT_DIR, - Ino: vfsd.Impl().(*dentry).inode.ino, + Ino: dir.inode.ino, NextOff: 1, }); err != nil { return err } fd.off++ } + if fd.off == 1 { - parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode + parentInode := genericParentOrSelf(&dir.dentry).inode if err := cb.Handle(vfs.Dirent{ Name: "..", Type: parentInode.direntType(), @@ -97,7 +139,6 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fd.off++ } - dir := vfsd.Impl().(*dentry).inode.impl.(*directory) var child *dentry if fd.iter == nil { // Start iteration at the beginning of dir. @@ -112,7 +153,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba // Skip other directoryFD iterators. if child.inode != nil { if err := cb.Handle(vfs.Dirent{ - Name: child.vfsd.Name(), + Name: child.name, Type: child.inode.direntType(), Ino: child.inode.ino, NextOff: fd.off + 1, @@ -130,9 +171,9 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - fs := fd.filesystem() - fs.mu.Lock() - defer fs.mu.Unlock() + dir := fd.inode().impl.(*directory) + dir.iterMu.Lock() + defer dir.iterMu.Unlock() switch whence { case linux.SEEK_SET: @@ -160,8 +201,6 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in remChildren = offset - 2 } - dir := fd.inode().impl.(*directory) - // Ensure that fd.iter exists and is not linked into dir.childList. if fd.iter == nil { fd.iter = &dentry{} diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index e1b551422..5b62f9ebb 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -20,6 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -37,25 +39,44 @@ func (fs *filesystem) Sync(ctx context.Context) error { // // Preconditions: filesystem.mu must be locked. !rp.Done(). func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { - if !d.inode.isDir() { + dir, ok := d.inode.impl.(*directory) + if !ok { return nil, syserror.ENOTDIR } - if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } afterSymlink: - nextVFSD, err := rp.ResolveComponent(&d.vfsd) - if err != nil { - return nil, err + name := rp.Component() + if name == "." { + rp.Advance() + return d, nil + } + if name == ".." { + if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return d, nil + } + if err := rp.CheckMount(&d.parent.vfsd); err != nil { + return nil, err + } + rp.Advance() + return d.parent, nil + } + if len(name) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG } - if nextVFSD == nil { - // Since the Dentry tree is the sole source of truth for tmpfs, if it's - // not in the Dentry tree, it doesn't exist. + child, ok := dir.childMap[name] + if !ok { return nil, syserror.ENOENT } - next := nextVFSD.Impl().(*dentry) - if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { - // TODO(gvisor.dev/issues/1197): Symlink traversals updates + if err := rp.CheckMount(&child.vfsd); err != nil { + return nil, err + } + if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { + // TODO(gvisor.dev/issue/1197): Symlink traversals updates // access time. if err := rp.HandleSymlink(symlink.target); err != nil { return nil, err @@ -63,7 +84,7 @@ afterSymlink: goto afterSymlink // don't check the current directory again } rp.Advance() - return next, nil + return child, nil } // walkParentDirLocked resolves all but the last path component of rp to an @@ -75,7 +96,7 @@ afterSymlink: // fs/namei.c:path_parentat(). // // Preconditions: filesystem.mu must be locked. !rp.Done(). -func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { +func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) { for !rp.Final() { next, err := stepLocked(rp, d) if err != nil { @@ -83,10 +104,11 @@ func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { } d = next } - if !d.inode.isDir() { + dir, ok := d.inode.impl.(*directory) + if !ok { return nil, syserror.ENOTDIR } - return d, nil + return dir, nil } // resolveLocked resolves rp to an existing file. @@ -117,33 +139,32 @@ func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) { // // Preconditions: !rp.Done(). For the final path component in rp, // !rp.ShouldFollowSymlink(). -func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error { +func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { fs.mu.Lock() defer fs.mu.Unlock() - parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return err } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } name := rp.Component() if name == "." || name == ".." { return syserror.EEXIST } - // Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(), - // because if the child exists we want to return EEXIST immediately instead - // of attempting symlink/mount traversal. - if parent.vfsd.Child(name) != nil { + if len(name) > linux.NAME_MAX { + return syserror.ENAMETOOLONG + } + if _, ok := parentDir.childMap[name]; ok { return syserror.EEXIST } if !dir && rp.MustBeDir() { return syserror.ENOENT } - // In memfs, the only way to cause a dentry to be disowned is by removing - // it from the filesystem, so this check is equivalent to checking if - // parent has been removed. - if parent.vfsd.IsDisowned() { + // tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only + // be dead if it was deleted. + if parentDir.dentry.vfsd.IsDead() { return syserror.ENOENT } mnt := rp.Mount() @@ -151,7 +172,22 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa return err } defer mnt.EndWrite() - return create(parent, name) + if err := create(parentDir, name); err != nil { + return err + } + parentDir.inode.touchCMtime() + return nil +} + +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return err + } + return d.inode.checkPermissions(creds, ats) } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. @@ -166,7 +202,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op if !d.inode.isDir() { return nil, syserror.ENOTDIR } - if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil { + if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } @@ -178,17 +214,17 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + dir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return nil, err } - d.IncRef() - return &d.vfsd, nil + dir.dentry.IncRef() + return &dir.dentry.vfsd, nil } // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { if rp.Mount() != vd.Mount() { return syserror.EXDEV } @@ -203,30 +239,27 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EMLINK } d.inode.incLinksLocked() - child := fs.newDentry(d.inode) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) + parentDir.insertChildLocked(fs.newDentry(d.inode), name) return nil }) } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { - return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error { - if parent.inode.nlink == maxLinks { + return fs.doCreateAt(rp, true /* dir */, func(parentDir *directory, name string) error { + if parentDir.inode.nlink == maxLinks { return syserror.EMLINK } - parent.inode.incLinksLocked() // from child's ".." - child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode)) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) + parentDir.inode.incLinksLocked() // from child's ".." + childDir := fs.newDirectory(rp.Credentials(), opts.Mode) + parentDir.insertChildLocked(&childDir.dentry, name) return nil }) } // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { var childInode *inode switch opts.Mode.FileType() { case 0, linux.S_IFREG: @@ -238,14 +271,12 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v case linux.S_IFCHR: childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor) case linux.S_IFSOCK: - // Not yet supported. - return syserror.EPERM + childInode = fs.newSocketFile(rp.Credentials(), opts.Mode, opts.Endpoint) default: return syserror.EINVAL } child := fs.newDentry(childInode) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) + parentDir.insertChildLocked(child, name) return nil }) } @@ -284,12 +315,12 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf return start.open(ctx, rp, &opts, false /* afterCreate */) } afterTrailingSymlink: - parent, err := walkParentDirLocked(rp, start) + parentDir, err := walkParentDirLocked(rp, start) if err != nil { return nil, err } // Check for search permission in the parent directory. - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. @@ -300,11 +331,14 @@ afterTrailingSymlink: if name == "." || name == ".." { return nil, syserror.EISDIR } + if len(name) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG + } // Determine whether or not we need to create a file. - child, err := stepLocked(rp, parent) - if err == syserror.ENOENT { + child, ok := parentDir.childMap[name] + if !ok { // Already checked for searchability above; now check for writability. - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if err := rp.Mount().CheckBeginWrite(); err != nil { @@ -313,16 +347,26 @@ afterTrailingSymlink: defer rp.Mount().EndWrite() // Create and open the child. child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode)) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) - return child.open(ctx, rp, &opts, true) + parentDir.insertChildLocked(child, name) + fd, err := child.open(ctx, rp, &opts, true) + if err != nil { + return nil, err + } + parentDir.inode.touchCMtime() + return fd, nil } - if err != nil { + // Is the file mounted over? + if err := rp.CheckMount(&child.vfsd); err != nil { return nil, err } // Do we need to resolve a trailing symlink? - if !rp.Done() { - start = parent + if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { + // TODO(gvisor.dev/issue/1197): Symlink traversals updates + // access time. + if err := rp.HandleSymlink(symlink.target); err != nil { + return nil, err + } + start = &parentDir.dentry goto afterTrailingSymlink } // Open existing file. @@ -335,7 +379,7 @@ afterTrailingSymlink: func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) if !afterCreate { - if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil { + if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } } @@ -365,9 +409,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open // Can't open symlinks without O_PATH (which is unimplemented). return nil, syserror.ELOOP case *namedPipe: - return newNamedPipeFD(ctx, impl, rp, &d.vfsd, opts.Flags) + return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags) case *deviceFile: return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts) + case *socketFile: + return nil, syserror.ENXIO default: panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl)) } @@ -385,6 +431,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st if !ok { return "", syserror.EINVAL } + symlink.inode.touchAtime(rp.Mount()) return symlink.target, nil } @@ -398,7 +445,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // Resolve newParent first to verify that it's on this Mount. fs.mu.Lock() defer fs.mu.Unlock() - newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + newParentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -415,25 +462,24 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } defer mnt.EndWrite() - oldParent := oldParentVD.Dentry().Impl().(*dentry) - if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + oldParentDir := oldParentVD.Dentry().Impl().(*dentry).inode.impl.(*directory) + if err := oldParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } - // Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(), - // because if the existing child is a symlink or mount point then we want - // to rename over it rather than follow it. - renamedVFSD := oldParent.vfsd.Child(oldName) - if renamedVFSD == nil { + renamed, ok := oldParentDir.childMap[oldName] + if !ok { return syserror.ENOENT } - renamed := renamedVFSD.Impl().(*dentry) + // Note that we don't need to call rp.CheckMount(), since if renamed is a + // mount point then we want to rename the mount point, not anything in the + // mounted filesystem. if renamed.inode.isDir() { - if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) { + if renamed == &newParentDir.dentry || genericIsAncestorDentry(renamed, &newParentDir.dentry) { return syserror.EINVAL } - if oldParent != newParent { + if oldParentDir != newParentDir { // Writability is needed to change renamed's "..". - if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil { + if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return err } } @@ -443,18 +489,17 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } } - if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + if err := newParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } - replacedVFSD := newParent.vfsd.Child(newName) - var replaced *dentry - if replacedVFSD != nil { - replaced = replacedVFSD.Impl().(*dentry) - if replaced.inode.isDir() { + replaced, ok := newParentDir.childMap[newName] + if ok { + replacedDir, ok := replaced.inode.impl.(*directory) + if ok { if !renamed.inode.isDir() { return syserror.EISDIR } - if replaced.vfsd.HasChildren() { + if len(replacedDir.childMap) != 0 { return syserror.ENOTEMPTY } } else { @@ -466,11 +511,13 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } } } else { - if renamed.inode.isDir() && newParent.inode.nlink == maxLinks { + if renamed.inode.isDir() && newParentDir.inode.nlink == maxLinks { return syserror.EMLINK } } - if newParent.vfsd.IsDisowned() { + // tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can + // only be dead if it was deleted. + if newParentDir.dentry.vfsd.IsDead() { return syserror.ENOENT } @@ -478,33 +525,38 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // simplicity, under the assumption that applications are not intentionally // doing noop renames expecting them to succeed where non-noop renames // would fail. - if renamedVFSD == replacedVFSD { + if renamed == replaced { return nil } vfsObj := rp.VirtualFilesystem() - oldParentDir := oldParent.inode.impl.(*directory) - newParentDir := newParent.inode.impl.(*directory) mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef() - if err := vfsObj.PrepareRenameDentry(mntns, renamedVFSD, replacedVFSD); err != nil { + var replacedVFSD *vfs.Dentry + if replaced != nil { + replacedVFSD = &replaced.vfsd + } + if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { return err } if replaced != nil { - newParentDir.childList.Remove(replaced) + newParentDir.removeChildLocked(replaced) if replaced.inode.isDir() { - newParent.inode.decLinksLocked() // from replaced's ".." + newParentDir.inode.decLinksLocked() // from replaced's ".." } replaced.inode.decLinksLocked() } - oldParentDir.childList.Remove(renamed) - newParentDir.childList.PushBack(renamed) - if renamed.inode.isDir() { - oldParent.inode.decLinksLocked() - newParent.inode.incLinksLocked() + oldParentDir.removeChildLocked(renamed) + newParentDir.insertChildLocked(renamed, newName) + vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) + oldParentDir.inode.touchCMtime() + if oldParentDir != newParentDir { + if renamed.inode.isDir() { + oldParentDir.inode.decLinksLocked() + newParentDir.inode.incLinksLocked() + } + newParentDir.inode.touchCMtime() } - // TODO(gvisor.dev/issues/1197): Update timestamps and parent directory - // sizes. - vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD) + renamed.inode.touchCtime() return nil } @@ -512,11 +564,11 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return err } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } name := rp.Component() @@ -526,15 +578,15 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error if name == ".." { return syserror.ENOTEMPTY } - childVFSD := parent.vfsd.Child(name) - if childVFSD == nil { + child, ok := parentDir.childMap[name] + if !ok { return syserror.ENOENT } - child := childVFSD.Impl().(*dentry) - if !child.inode.isDir() { + childDir, ok := child.inode.impl.(*directory) + if !ok { return syserror.ENOTDIR } - if childVFSD.HasChildren() { + if len(childDir.childMap) != 0 { return syserror.ENOTEMPTY } mnt := rp.Mount() @@ -545,13 +597,14 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef() - if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } - parent.inode.impl.(*directory).childList.Remove(child) - parent.inode.decLinksLocked() // from child's ".." + parentDir.removeChildLocked(child) + parentDir.inode.decLinksLocked() // from child's ".." child.inode.decLinksLocked() - vfsObj.CommitDeleteDentry(childVFSD) + vfsObj.CommitDeleteDentry(&child.vfsd) + parentDir.inode.touchCMtime() return nil } @@ -563,7 +616,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts if err != nil { return err } - return d.inode.setStat(opts.Stat) + return d.inode.setStat(ctx, rp.Credentials(), &opts.Stat) } // StatAt implements vfs.FilesystemImpl.StatAt. @@ -587,16 +640,15 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu if err != nil { return linux.Statfs{}, err } - // TODO(gvisor.dev/issues/1197): Actually implement statfs. + // TODO(gvisor.dev/issue/1197): Actually implement statfs. return linux.Statfs{}, syserror.ENOSYS } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { child := fs.newDentry(fs.newSymlink(rp.Credentials(), target)) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) + parentDir.insertChildLocked(child, name) return nil }) } @@ -605,22 +657,21 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return err } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } name := rp.Component() if name == "." || name == ".." { return syserror.EISDIR } - childVFSD := parent.vfsd.Child(name) - if childVFSD == nil { + child, ok := parentDir.childMap[name] + if !ok { return syserror.ENOENT } - child := childVFSD.Impl().(*dentry) if child.inode.isDir() { return syserror.EISDIR } @@ -635,66 +686,79 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef() - if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } - parent.inode.impl.(*directory).childList.Remove(child) + parentDir.removeChildLocked(child) child.inode.decLinksLocked() - vfsObj.CommitDeleteDentry(childVFSD) + vfsObj.CommitDeleteDentry(&child.vfsd) + parentDir.inode.touchCMtime() return nil } +// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return nil, err + } + switch impl := d.inode.impl.(type) { + case *socketFile: + return impl.ep, nil + default: + return nil, syserror.ECONNREFUSED + } +} + // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - _, err := resolveLocked(rp) + d, err := resolveLocked(rp) if err != nil { return nil, err } - // TODO(b/127675828): support extended attributes - return nil, syserror.ENOTSUP + return d.inode.listxattr(size) } // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - _, err := resolveLocked(rp) + d, err := resolveLocked(rp) if err != nil { return "", err } - // TODO(b/127675828): support extended attributes - return "", syserror.ENOTSUP + return d.inode.getxattr(rp.Credentials(), &opts) } // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { fs.mu.RLock() defer fs.mu.RUnlock() - _, err := resolveLocked(rp) + d, err := resolveLocked(rp) if err != nil { return err } - // TODO(b/127675828): support extended attributes - return syserror.ENOTSUP + return d.inode.setxattr(rp.Credentials(), &opts) } // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() defer fs.mu.RUnlock() - _, err := resolveLocked(rp) + d, err := resolveLocked(rp) if err != nil { return err } - // TODO(b/127675828): support extended attributes - return syserror.ENOTSUP + return d.inode.removexattr(rp.Credentials(), name) } // PrependPath implements vfs.FilesystemImpl.PrependPath. func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() defer fs.mu.RUnlock() - return vfs.GenericPrependPath(vfsroot, vd, b) + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) } diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go index 0c57fdca3..8d77b3fa8 100644 --- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -16,10 +16,8 @@ package tmpfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" - "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) @@ -33,27 +31,8 @@ type namedPipe struct { // * fs.mu must be locked. // * rp.Mount().CheckBeginWrite() has been called successfully. func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode { - file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)} - file.inode.init(file, fs, creds, mode) + file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)} + file.inode.init(file, fs, creds, linux.S_IFIFO|mode) file.inode.nlink = 1 // Only the parent has a link. return &file.inode } - -// namedPipeFD implements vfs.FileDescriptionImpl. Methods are implemented -// entirely via struct embedding. -type namedPipeFD struct { - fileDescription - - *pipe.VFSPipeFD -} - -func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { - var err error - var fd namedPipeFD - fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, vfsd, &fd.vfsfd, flags) - if err != nil { - return nil, err - } - fd.vfsfd.Init(&fd, flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}) - return &fd.vfsfd, nil -} diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 711442424..57e5e28ec 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -89,7 +89,7 @@ func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMod file := ®ularFile{ memFile: fs.memFile, } - file.inode.init(file, fs, creds, mode) + file.inode.init(file, fs, creds, linux.S_IFREG|mode) file.inode.nlink = 1 // from parent directory return &file.inode } @@ -286,7 +286,8 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs rw := getRegularFileReadWriter(f, offset) n, err := dst.CopyOutFrom(ctx, rw) putRegularFileReadWriter(rw) - return int64(n), err + fd.inode().touchAtime(fd.vfsfd.Mount()) + return n, err } // Read implements vfs.FileDescriptionImpl.Read. @@ -308,14 +309,22 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, nil } f := fd.inode().impl.(*regularFile) - end := offset + srclen - if end < offset { + if end := offset + srclen; end < offset { // Overflow. return 0, syserror.EFBIG } + + var err error + srclen, err = vfs.CheckLimit(ctx, offset, srclen) + if err != nil { + return 0, err + } + src = src.TakeFirst64(srclen) + f.inode.mu.Lock() rw := getRegularFileReadWriter(f, offset) n, err := src.CopyInTo(ctx, rw) + fd.inode().touchCMtimeLocked() f.inode.mu.Unlock() putRegularFileReadWriter(rw) return n, err diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go new file mode 100644 index 000000000..25c2321af --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go @@ -0,0 +1,34 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" +) + +// socketFile is a socket (=S_IFSOCK) tmpfs file. +type socketFile struct { + inode inode + ep transport.BoundEndpoint +} + +func (fs *filesystem) newSocketFile(creds *auth.Credentials, mode linux.FileMode, ep transport.BoundEndpoint) *inode { + file := &socketFile{ep: ep} + file.inode.init(file, fs, creds, mode) + file.inode.nlink = 1 // from parent directory + return &file.inode +} diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go index ebe035dee..60c2c980e 100644 --- a/pkg/sentry/fsimpl/tmpfs/stat_test.go +++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go @@ -29,7 +29,7 @@ func TestStatAfterCreate(t *testing.T) { mode := linux.FileMode(0644) // Run with different file types. - // TODO(gvisor.dev/issues/1197): Also test symlinks and sockets. + // TODO(gvisor.dev/issue/1197): Also test symlinks and sockets. for _, typ := range []string{"file", "dir", "pipe"} { t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { var ( @@ -71,9 +71,15 @@ func TestStatAfterCreate(t *testing.T) { t.Errorf("got btime %d, want 0", got.Btime.ToNsec()) } - // Size should be 0. - if got.Size != 0 { - t.Errorf("got size %d, want 0", got.Size) + // Size should be 0 (except for directories, which make up a size + // of 20 per entry, including the "." and ".." entries present in + // otherwise-empty directories). + wantSize := uint64(0) + if typ == "dir" { + wantSize = 40 + } + if got.Size != wantSize { + t.Errorf("got size %d, want %d", got.Size, wantSize) } // Nlink should be 1 for files, 2 for dirs. @@ -140,7 +146,7 @@ func TestSetStatAtime(t *testing.T) { Mask: 0, Atime: linux.NsecToStatxTimestamp(100), }}); err != nil { - t.Errorf("SetStat atime without mask failed: %v") + t.Errorf("SetStat atime without mask failed: %v", err) } // Atime should be unchanged. if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { @@ -155,7 +161,7 @@ func TestSetStatAtime(t *testing.T) { Atime: linux.NsecToStatxTimestamp(100), } if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil { - t.Errorf("SetStat atime with mask failed: %v") + t.Errorf("SetStat atime with mask failed: %v", err) } if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { t.Errorf("Stat got error: %v", err) @@ -169,7 +175,7 @@ func TestSetStat(t *testing.T) { mode := linux.FileMode(0644) // Run with different file types. - // TODO(gvisor.dev/issues/1197): Also test symlinks and sockets. + // TODO(gvisor.dev/issue/1197): Also test symlinks and sockets. for _, typ := range []string{"file", "dir", "pipe"} { t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { var ( @@ -205,7 +211,7 @@ func TestSetStat(t *testing.T) { Mask: 0, Atime: linux.NsecToStatxTimestamp(100), }}); err != nil { - t.Errorf("SetStat atime without mask failed: %v") + t.Errorf("SetStat atime without mask failed: %v", err) } // Atime should be unchanged. if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { @@ -220,7 +226,7 @@ func TestSetStat(t *testing.T) { Atime: linux.NsecToStatxTimestamp(100), } if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil { - t.Errorf("SetStat atime with mask failed: %v") + t.Errorf("SetStat atime with mask failed: %v", err) } if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { t.Errorf("Stat got error: %v", err) diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go index 5246aca84..47e075ed4 100644 --- a/pkg/sentry/fsimpl/tmpfs/symlink.go +++ b/pkg/sentry/fsimpl/tmpfs/symlink.go @@ -15,6 +15,7 @@ package tmpfs import ( + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) @@ -27,7 +28,7 @@ func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode link := &symlink{ target: target, } - link.inode.init(link, fs, creds, 0777) + link.inode.init(link, fs, creds, linux.S_IFLNK|0777) link.inode.nlink = 1 // from parent directory return &link.inode } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 521206305..efc931468 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -12,21 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package tmpfs provides a filesystem implementation that behaves like tmpfs: -// the Dentry tree is the sole source of truth for the state of the filesystem. +// Package tmpfs provides an in-memory filesystem whose contents are +// application-mutable, consistent with Linux's tmpfs. // // Lock order: // // filesystem.mu // inode.mu // regularFileFD.offMu +// *** "memmap.Mappable locks" below this point // regularFile.mapsMu +// *** "memmap.Mappable locks taken by Translate" below this point // regularFile.dataMu +// directory.iterMu package tmpfs import ( "fmt" "math" + "strings" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -37,8 +41,10 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/vfs/lock" + "gvisor.dev/gvisor/pkg/sentry/vfs/memxattr" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Name is the default filesystem name. @@ -63,6 +69,27 @@ type filesystem struct { nextInoMinusOne uint64 // accessed using atomic memory operations } +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// FilesystemOpts is used to pass configuration data to tmpfs. +type FilesystemOpts struct { + // RootFileType is the FileType of the filesystem root. Valid values + // are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR. + RootFileType uint16 + + // RootSymlinkTarget is the target of the root symlink. Only valid if + // RootFileType == S_IFLNK. + RootSymlinkTarget string + + // FilesystemType allows setting a different FilesystemType for this + // tmpfs filesystem. This allows tmpfs to "impersonate" other + // filesystems, like ramdiskfs and cgroupfs. + FilesystemType vfs.FilesystemType +} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx) @@ -74,8 +101,32 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt memFile: memFileProvider.MemoryFile(), clock: clock, } - fs.vfsfs.Init(vfsObj, &fs) - root := fs.newDentry(fs.newDirectory(creds, 01777)) + + rootFileType := uint16(linux.S_IFDIR) + newFSType := vfs.FilesystemType(&fstype) + tmpfsOpts, ok := opts.InternalData.(FilesystemOpts) + if ok { + if tmpfsOpts.RootFileType != 0 { + rootFileType = tmpfsOpts.RootFileType + } + if tmpfsOpts.FilesystemType != nil { + newFSType = tmpfsOpts.FilesystemType + } + } + + fs.vfsfs.Init(vfsObj, newFSType, &fs) + + var root *dentry + switch rootFileType { + case linux.S_IFREG: + root = fs.newDentry(fs.newRegularFile(creds, 0777)) + case linux.S_IFLNK: + root = fs.newDentry(fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget)) + case linux.S_IFDIR: + root = &fs.newDirectory(creds, 01777).dentry + default: + return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) + } return &fs.vfsfs, &root.vfsd, nil } @@ -87,20 +138,29 @@ func (fs *filesystem) Release() { type dentry struct { vfsd vfs.Dentry + // parent is this dentry's parent directory. Each referenced dentry holds a + // reference on parent.dentry. If this dentry is a filesystem root, parent + // is nil. parent is protected by filesystem.mu. + parent *dentry + + // name is the name of this dentry in its parent. If this dentry is a + // filesystem root, name is the empty string. name is protected by + // filesystem.mu. + name string + + // dentryEntry (ugh) links dentries into their parent directory.childList. + dentryEntry + // inode is the inode represented by this dentry. Multiple Dentries may // share a single non-directory inode (with hard links). inode is // immutable. - inode *inode - + // // tmpfs doesn't count references on dentries; because the dentry tree is // the sole source of truth, it is by definition always consistent with the // state of the filesystem. However, it does count references on inodes, // because inode resources are released when all references are dropped. - // (tmpfs doesn't really have resources to release, but we implement - // reference counting because tmpfs regular files will.) - - // dentryEntry (ugh) links dentries into their parent directory.childList. - dentryEntry + // dentry therefore forwards reference counting directly to inode. + inode *inode } func (fs *filesystem) newDentry(inode *inode) *dentry { @@ -141,10 +201,15 @@ type inode struct { // filesystem.RmdirAt() drops the reference. refs int64 + // xattrs implements extended attributes. + // + // TODO(b/148380782): Support xattrs other than user.* + xattrs memxattr.SimpleExtendedAttributes + // Inode metadata. Writing multiple fields atomically requires holding // mu, othewise atomic operations can be used. mu sync.Mutex - mode uint32 // excluding file type bits, which are based on impl + mode uint32 // file type and mode nlink uint32 // protected by filesystem.mu instead of inode.mu uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic gid uint32 // auth.KGID, but ... @@ -155,10 +220,6 @@ type inode struct { ctime int64 // nanoseconds mtime int64 // nanoseconds - // Only meaningful for device special files. - rdevMajor uint32 - rdevMinor uint32 - // Advisory file locks, which lock at the inode level. locks lock.FileLocks @@ -168,6 +229,9 @@ type inode struct { const maxLinks = math.MaxUint32 func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) { + if mode.FileType() == 0 { + panic("file type is required in FileMode") + } i.clock = fs.clock i.refs = 1 i.mode = uint32(mode) @@ -175,7 +239,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, i.gid = uint32(creds.EffectiveKGID) i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) // Tmpfs creation sets atime, ctime, and mtime to current time. - now := i.clock.Now().Nanoseconds() + now := fs.clock.Now().Nanoseconds() i.atime = now i.ctime = now i.mtime = now @@ -192,7 +256,7 @@ func (i *inode) incLinksLocked() { panic("tmpfs.inode.incLinksLocked() called with no existing links") } if i.nlink == maxLinks { - panic("memfs.inode.incLinksLocked() called with maximum link count") + panic("tmpfs.inode.incLinksLocked() called with maximum link count") } atomic.AddUint32(&i.nlink, 1) } @@ -228,22 +292,19 @@ func (i *inode) tryIncRef() bool { func (i *inode) decRef() { if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { if regFile, ok := i.impl.(*regularFile); ok { - // Hold inode.mu and regFile.dataMu while mutating - // size. - i.mu.Lock() - regFile.dataMu.Lock() + // Release memory used by regFile to store data. Since regFile is + // no longer usable, we don't need to grab any locks or update any + // metadata. regFile.data.DropAll(regFile.memFile) - atomic.StoreUint64(®File.size, 0) - regFile.dataMu.Unlock() - i.mu.Unlock() } } else if refs < 0 { panic("tmpfs.inode.decRef() called without holding a reference") } } -func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { - return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))) +func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + mode := linux.FileMode(atomic.LoadUint32(&i.mode)) + return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))) } // Go won't inline this function, and returning linux.Statx (which is quite @@ -254,44 +315,37 @@ func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, i // a concurrent modification), so we do not require holding inode.mu. func (i *inode) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | - linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_ATIME | - linux.STATX_BTIME | linux.STATX_CTIME | linux.STATX_MTIME - stat.Blksize = 1 // usermem.PageSize in tmpfs + linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | + linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME | + linux.STATX_MTIME + stat.Blksize = usermem.PageSize stat.Nlink = atomic.LoadUint32(&i.nlink) stat.UID = atomic.LoadUint32(&i.uid) stat.GID = atomic.LoadUint32(&i.gid) stat.Mode = uint16(atomic.LoadUint32(&i.mode)) stat.Ino = i.ino - // Linux's tmpfs has no concept of btime, so zero-value is returned. stat.Atime = linux.NsecToStatxTimestamp(i.atime) stat.Ctime = linux.NsecToStatxTimestamp(i.ctime) stat.Mtime = linux.NsecToStatxTimestamp(i.mtime) - // TODO(gvisor.dev/issues/1197): Device number. + // TODO(gvisor.dev/issue/1197): Device number. switch impl := i.impl.(type) { case *regularFile: - stat.Mode |= linux.S_IFREG stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS stat.Size = uint64(atomic.LoadUint64(&impl.size)) - // In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in - // a uint64 accessed using atomic memory operations to avoid taking - // locks). + // TODO(jamieliu): This should be impl.data.Span() / 512, but this is + // too expensive to compute here. Cache it in regularFile. stat.Blocks = allocatedBlocksForSize(stat.Size) case *directory: - stat.Mode |= linux.S_IFDIR + // "20" is mm/shmem.c:BOGO_DIRENT_SIZE. + stat.Size = 20 * (2 + uint64(atomic.LoadInt64(&impl.numChildren))) + // stat.Blocks is 0. case *symlink: - stat.Mode |= linux.S_IFLNK - stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS stat.Size = uint64(len(impl.target)) - stat.Blocks = allocatedBlocksForSize(stat.Size) - case *namedPipe: - stat.Mode |= linux.S_IFIFO + // stat.Blocks is 0. + case *namedPipe, *socketFile: + // stat.Size and stat.Blocks are 0. case *deviceFile: - switch impl.kind { - case vfs.BlockDevice: - stat.Mode |= linux.S_IFBLK - case vfs.CharDevice: - stat.Mode |= linux.S_IFCHR - } + // stat.Size and stat.Blocks are 0. stat.RdevMajor = impl.major stat.RdevMinor = impl.minor default: @@ -299,18 +353,27 @@ func (i *inode) statTo(stat *linux.Statx) { } } -func (i *inode) setStat(stat linux.Statx) error { +func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx) error { if stat.Mask == 0 { return nil } + if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 { + return syserror.EPERM + } + mode := linux.FileMode(atomic.LoadUint32(&i.mode)) + if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { + return err + } i.mu.Lock() + defer i.mu.Unlock() var ( needsMtimeBump bool needsCtimeBump bool ) mask := stat.Mask if mask&linux.STATX_MODE != 0 { - atomic.StoreUint32(&i.mode, uint32(stat.Mode)) + ft := atomic.LoadUint32(&i.mode) & linux.S_IFMT + atomic.StoreUint32(&i.mode, ft|uint32(stat.Mode&^linux.S_IFMT)) needsCtimeBump = true } if mask&linux.STATX_UID != 0 { @@ -338,29 +401,41 @@ func (i *inode) setStat(stat linux.Statx) error { return syserror.EINVAL } } + now := i.clock.Now().Nanoseconds() if mask&linux.STATX_ATIME != 0 { - atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped()) + if stat.Atime.Nsec == linux.UTIME_NOW { + atomic.StoreInt64(&i.atime, now) + } else { + atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped()) + } needsCtimeBump = true } if mask&linux.STATX_MTIME != 0 { - atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped()) + if stat.Mtime.Nsec == linux.UTIME_NOW { + atomic.StoreInt64(&i.mtime, now) + } else { + atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped()) + } needsCtimeBump = true // Ignore the mtime bump, since we just set it ourselves. needsMtimeBump = false } if mask&linux.STATX_CTIME != 0 { - atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped()) + if stat.Ctime.Nsec == linux.UTIME_NOW { + atomic.StoreInt64(&i.ctime, now) + } else { + atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped()) + } // Ignore the ctime bump, since we just set it ourselves. needsCtimeBump = false } - now := i.clock.Now().Nanoseconds() if needsMtimeBump { atomic.StoreInt64(&i.mtime, now) } if needsCtimeBump { atomic.StoreInt64(&i.ctime, now) } - i.mu.Unlock() + return nil } @@ -419,6 +494,8 @@ func (i *inode) direntType() uint8 { return linux.DT_DIR case *symlink: return linux.DT_LNK + case *socketFile: + return linux.DT_SOCK case *deviceFile: switch impl.kind { case vfs.BlockDevice: @@ -433,6 +510,96 @@ func (i *inode) direntType() uint8 { } } +func (i *inode) isDir() bool { + return linux.FileMode(i.mode).FileType() == linux.S_IFDIR +} + +func (i *inode) touchAtime(mnt *vfs.Mount) { + if err := mnt.CheckBeginWrite(); err != nil { + return + } + now := i.clock.Now().Nanoseconds() + i.mu.Lock() + atomic.StoreInt64(&i.atime, now) + i.mu.Unlock() + mnt.EndWrite() +} + +// Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). +func (i *inode) touchCtime() { + now := i.clock.Now().Nanoseconds() + i.mu.Lock() + atomic.StoreInt64(&i.ctime, now) + i.mu.Unlock() +} + +// Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). +func (i *inode) touchCMtime() { + now := i.clock.Now().Nanoseconds() + i.mu.Lock() + atomic.StoreInt64(&i.mtime, now) + atomic.StoreInt64(&i.ctime, now) + i.mu.Unlock() +} + +// Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds +// inode.mu. +func (i *inode) touchCMtimeLocked() { + now := i.clock.Now().Nanoseconds() + atomic.StoreInt64(&i.mtime, now) + atomic.StoreInt64(&i.ctime, now) +} + +func (i *inode) listxattr(size uint64) ([]string, error) { + return i.xattrs.Listxattr(size) +} + +func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { + if err := i.checkPermissions(creds, vfs.MayRead); err != nil { + return "", err + } + if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { + return "", syserror.EOPNOTSUPP + } + if !i.userXattrSupported() { + return "", syserror.ENODATA + } + return i.xattrs.Getxattr(opts) +} + +func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error { + if err := i.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + if !i.userXattrSupported() { + return syserror.EPERM + } + return i.xattrs.Setxattr(opts) +} + +func (i *inode) removexattr(creds *auth.Credentials, name string) error { + if err := i.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + if !i.userXattrSupported() { + return syserror.EPERM + } + return i.xattrs.Removexattr(name) +} + +// Extended attributes in the user.* namespace are only supported for regular +// files and directories. +func (i *inode) userXattrSupported() bool { + filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode) + return filetype == linux.S_IFREG || filetype == linux.S_IFDIR +} + // fileDescription is embedded by tmpfs implementations of // vfs.FileDescriptionImpl. type fileDescription struct { @@ -457,5 +624,26 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - return fd.inode().setStat(opts.Stat) + creds := auth.CredentialsFromContext(ctx) + return fd.inode().setStat(ctx, creds, &opts.Stat) +} + +// Listxattr implements vfs.FileDescriptionImpl.Listxattr. +func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { + return fd.inode().listxattr(size) +} + +// Getxattr implements vfs.FileDescriptionImpl.Getxattr. +func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) { + return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts) +} + +// Setxattr implements vfs.FileDescriptionImpl.Setxattr. +func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { + return fd.inode().setxattr(auth.CredentialsFromContext(ctx), &opts) +} + +// Removexattr implements vfs.FileDescriptionImpl.Removexattr. +func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { + return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name) } |