diff options
Diffstat (limited to 'pkg/sentry/fsimpl')
142 files changed, 31839 insertions, 0 deletions
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD new file mode 100644 index 000000000..93512c9b6 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/BUILD @@ -0,0 +1,44 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +licenses(["notice"]) + +go_library( + name = "devpts", + srcs = [ + "devpts.go", + "line_discipline.go", + "master.go", + "queue.go", + "slave.go", + "terminal.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/safemem", + "//pkg/sentry/arch", + "//pkg/sentry/fs/lock", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/unimpl", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) + +go_test( + name = "devpts_test", + size = "small", + srcs = ["devpts_test.go"], + library = ":devpts", + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/contexttest", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go new file mode 100644 index 000000000..e6fda2b4f --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -0,0 +1,233 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package devpts provides a filesystem implementation that behaves like +// devpts. 
package devpts

import (
	"fmt"
	"math"
	"sort"
	"strconv"
	"sync"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
)

// Name is the filesystem name.
const Name = "devpts"

// FilesystemType implements vfs.FilesystemType.
type FilesystemType struct{}

// Name implements vfs.FilesystemType.Name.
func (FilesystemType) Name() string {
	return Name
}

var _ vfs.FilesystemType = (*FilesystemType)(nil)

// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
//
// It rejects any mount data with EINVAL and otherwise returns the VFS
// filesystem and root dentry produced by newFilesystem.
func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
	// No data allowed.
	if opts.Data != "" {
		return nil, nil, syserror.EINVAL
	}

	fs, root, err := fstype.newFilesystem(vfsObj, creds)
	if err != nil {
		return nil, nil, err
	}
	return fs.Filesystem.VFSFilesystem(), root.VFSDentry(), nil
}

// filesystem is the devpts filesystem. It wraps kernfs.Filesystem and
// remembers the anonymous block device minor number it was given so that
// Release can return it.
type filesystem struct {
	kernfs.Filesystem

	devMinor uint32
}

// newFilesystem creates a new devpts filesystem with root directory and ptmx
// master inode. It returns the filesystem and root Dentry.
//
// The only failure visible here is GetAnonBlockDevMinor returning an error,
// which is passed through unchanged.
func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) {
	devMinor, err := vfsObj.GetAnonBlockDevMinor()
	if err != nil {
		return nil, nil, err
	}

	fs := &filesystem{
		devMinor: devMinor,
	}
	fs.Filesystem.VFSFilesystem().Init(vfsObj, fstype, fs)

	// Construct the root directory. This is always inode id 1.
	root := &rootInode{
		slaves: make(map[uint32]*slaveInode),
	}
	root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555)
	root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
	root.dentry.Init(root)

	// Construct the pts master inode and dentry. Linux always uses inode
	// id 2 for ptmx. See fs/devpts/inode.c:mknod_ptmx.
	master := &masterInode{
		root: root,
	}
	master.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666)
	master.dentry.Init(master)

	// Add the master as a child of the root.
	links := root.OrderedChildren.Populate(&root.dentry, map[string]*kernfs.Dentry{
		"ptmx": &master.dentry,
	})
	root.IncLinks(links)

	return fs, &root.dentry, nil
}

// Release implements vfs.FilesystemImpl.Release.
//
// It returns the device minor number obtained in newFilesystem before
// releasing the embedded kernfs.Filesystem.
func (fs *filesystem) Release() {
	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
	fs.Filesystem.Release()
}

// rootInode is the root directory inode for the devpts mounts.
type rootInode struct {
	kernfs.AlwaysValid
	kernfs.InodeAttrs
	kernfs.InodeDirectoryNoNewChildren
	kernfs.InodeNotSymlink
	kernfs.OrderedChildren

	locks vfs.FileLocks

	// Keep a reference to this inode's dentry.
	dentry kernfs.Dentry

	// master is the master pty inode. Immutable.
	//
	// NOTE(review): newFilesystem does not assign this field; confirm
	// whether it is set elsewhere or is unused.
	master *masterInode

	// root is the root directory inode for this filesystem. Immutable.
	//
	// NOTE(review): newFilesystem does not assign this field; confirm
	// whether it is set elsewhere or is unused.
	root *rootInode

	// mu protects the fields below.
	mu sync.Mutex

	// slaves maps pty ids to slave inodes.
	slaves map[uint32]*slaveInode

	// nextIdx is the next pty index to use. It is read and incremented
	// with plain (non-atomic) operations, so mu must be held.
	//
	// TODO(b/29356795): reuse indices when ptys are closed.
	nextIdx uint32
}

var _ kernfs.Inode = (*rootInode)(nil)

// allocateTerminal creates a new Terminal and installs a pts node for it.
+func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error) { + i.mu.Lock() + defer i.mu.Unlock() + if i.nextIdx == math.MaxUint32 { + return nil, syserror.ENOMEM + } + idx := i.nextIdx + i.nextIdx++ + + // Sanity check that slave with idx does not exist. + if _, ok := i.slaves[idx]; ok { + panic(fmt.Sprintf("pty index collision; index %d already exists", idx)) + } + + // Create the new terminal and slave. + t := newTerminal(idx) + slave := &slaveInode{ + root: i, + t: t, + } + // Linux always uses pty index + 3 as the inode id. See + // fs/devpts/inode.c:devpts_pty_new(). + slave.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600) + slave.dentry.Init(slave) + i.slaves[idx] = slave + + return t, nil +} + +// masterClose is called when the master end of t is closed. +func (i *rootInode) masterClose(t *Terminal) { + i.mu.Lock() + defer i.mu.Unlock() + + // Sanity check that slave with idx exists. + if _, ok := i.slaves[t.n]; !ok { + panic(fmt.Sprintf("pty with index %d does not exist", t.n)) + } + delete(i.slaves, t.n) +} + +// Open implements kernfs.Inode.Open. +func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) + if err != nil { + return nil, err + } + return fd.VFSFileDescription(), nil +} + +// Lookup implements kernfs.Inode.Lookup. +func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + idx, err := strconv.ParseUint(name, 10, 32) + if err != nil { + return nil, syserror.ENOENT + } + i.mu.Lock() + defer i.mu.Unlock() + if si, ok := i.slaves[uint32(idx)]; ok { + si.dentry.IncRef() + return si.dentry.VFSDentry(), nil + + } + return nil, syserror.ENOENT +} + +// IterDirents implements kernfs.Inode.IterDirents. 
+func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + i.mu.Lock() + defer i.mu.Unlock() + ids := make([]int, 0, len(i.slaves)) + for id := range i.slaves { + ids = append(ids, int(id)) + } + sort.Ints(ids) + for _, id := range ids[relOffset:] { + dirent := vfs.Dirent{ + Name: strconv.FormatUint(uint64(id), 10), + Type: linux.DT_CHR, + Ino: i.slaves[uint32(id)].InodeAttrs.Ino(), + NextOff: offset + 1, + } + if err := cb.Handle(dirent); err != nil { + return offset, err + } + offset++ + } + return offset, nil +} diff --git a/pkg/sentry/fsimpl/devpts/devpts_test.go b/pkg/sentry/fsimpl/devpts/devpts_test.go new file mode 100644 index 000000000..b7c149047 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/devpts_test.go @@ -0,0 +1,56 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/usermem" +) + +func TestSimpleMasterToSlave(t *testing.T) { + ld := newLineDiscipline(linux.DefaultSlaveTermios) + ctx := contexttest.Context(t) + inBytes := []byte("hello, tty\n") + src := usermem.BytesIOSequence(inBytes) + outBytes := make([]byte, 32) + dst := usermem.BytesIOSequence(outBytes) + + // Write to the input queue. 
+ nw, err := ld.inputQueueWrite(ctx, src) + if err != nil { + t.Fatalf("error writing to input queue: %v", err) + } + if nw != int64(len(inBytes)) { + t.Fatalf("wrote wrong length: got %d, want %d", nw, len(inBytes)) + } + + // Read from the input queue. + nr, err := ld.inputQueueRead(ctx, dst) + if err != nil { + t.Fatalf("error reading from input queue: %v", err) + } + if nr != int64(len(inBytes)) { + t.Fatalf("read wrong length: got %d, want %d", nr, len(inBytes)) + } + + outStr := string(outBytes[:nr]) + inStr := string(inBytes) + if outStr != inStr { + t.Fatalf("written and read strings do not match: got %q, want %q", outStr, inStr) + } +} diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go new file mode 100644 index 000000000..f7bc325d1 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/line_discipline.go @@ -0,0 +1,445 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "bytes" + "unicode/utf8" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +const ( + // canonMaxBytes is the number of bytes that fit into a single line of + // terminal input in canonical mode. This corresponds to N_TTY_BUF_SIZE + // in include/linux/tty.h. 
+ canonMaxBytes = 4096 + + // nonCanonMaxBytes is the maximum number of bytes that can be read at + // a time in noncanonical mode. + nonCanonMaxBytes = canonMaxBytes - 1 + + spacesPerTab = 8 +) + +// lineDiscipline dictates how input and output are handled between the +// pseudoterminal (pty) master and slave. It can be configured to alter I/O, +// modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man +// pages are good resources for how to affect the line discipline: +// +// * termios(3) +// * tty_ioctl(4) +// +// This file corresponds most closely to drivers/tty/n_tty.c. +// +// lineDiscipline has a simple structure but supports a multitude of options +// (see the above man pages). It consists of two queues of bytes: one from the +// terminal master to slave (the input queue) and one from slave to master (the +// output queue). When bytes are written to one end of the pty, the line +// discipline reads the bytes, modifies them or takes special action if +// required, and enqueues them to be read by the other end of the pty: +// +// input from terminal +-------------+ input to process (e.g. bash) +// +------------------------>| input queue |---------------------------+ +// | (inputQueueWrite) +-------------+ (inputQueueRead) | +// | | +// | v +// masterFD slaveFD +// ^ | +// | | +// | output to terminal +--------------+ output from process | +// +------------------------| output queue |<--------------------------+ +// (outputQueueRead) +--------------+ (outputQueueWrite) +// +// Lock order: +// termiosMu +// inQueue.mu +// outQueue.mu +// +// +stateify savable +type lineDiscipline struct { + // sizeMu protects size. + sizeMu sync.Mutex `state:"nosave"` + + // size is the terminal size (width and height). + size linux.WindowSize + + // inQueue is the input queue of the terminal. + inQueue queue + + // outQueue is the output queue of the terminal. + outQueue queue + + // termiosMu protects termios. 
+ termiosMu sync.RWMutex `state:"nosave"` + + // termios is the terminal configuration used by the lineDiscipline. + termios linux.KernelTermios + + // column is the location in a row of the cursor. This is important for + // handling certain special characters like backspace. + column int + + // masterWaiter is used to wait on the master end of the TTY. + masterWaiter waiter.Queue `state:"zerovalue"` + + // slaveWaiter is used to wait on the slave end of the TTY. + slaveWaiter waiter.Queue `state:"zerovalue"` +} + +func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline { + ld := lineDiscipline{termios: termios} + ld.inQueue.transformer = &inputQueueTransformer{} + ld.outQueue.transformer = &outputQueueTransformer{} + return &ld +} + +// getTermios gets the linux.Termios for the tty. +func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() + // We must copy a Termios struct, not KernelTermios. + t := l.termios.ToTermios() + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err +} + +// setTermios sets a linux.Termios for the tty. +func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + l.termiosMu.Lock() + defer l.termiosMu.Unlock() + oldCanonEnabled := l.termios.LEnabled(linux.ICANON) + // We must copy a Termios struct, not KernelTermios. + var t linux.Termios + _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{ + AddressSpaceActive: true, + }) + l.termios.FromTermios(t) + + // If canonical mode is turned off, move bytes from inQueue's wait + // buffer to its read buffer. Anything already in the read buffer is + // now readable. 
+ if oldCanonEnabled && !l.termios.LEnabled(linux.ICANON) { + l.inQueue.mu.Lock() + l.inQueue.pushWaitBufLocked(l) + l.inQueue.readable = true + l.inQueue.mu.Unlock() + l.slaveWaiter.Notify(waiter.EventIn) + } + + return 0, err +} + +func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + l.sizeMu.Lock() + defer l.sizeMu.Unlock() + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return err +} + +func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + l.sizeMu.Lock() + defer l.sizeMu.Unlock() + _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return err +} + +func (l *lineDiscipline) masterReadiness() waiter.EventMask { + // We don't have to lock a termios because the default master termios + // is immutable. + return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios) +} + +func (l *lineDiscipline) slaveReadiness() waiter.EventMask { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() + return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios) +} + +func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + return l.inQueue.readableSize(ctx, io, args) +} + +func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() + n, pushed, err := l.inQueue.read(ctx, dst, l) + if err != nil { + return 0, err + } + if n > 0 { + l.masterWaiter.Notify(waiter.EventOut) + if pushed { + l.slaveWaiter.Notify(waiter.EventIn) + } + return n, nil + } + return 0, syserror.ErrWouldBlock +} + +func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { + l.termiosMu.RLock() + defer 
l.termiosMu.RUnlock() + n, err := l.inQueue.write(ctx, src, l) + if err != nil { + return 0, err + } + if n > 0 { + l.slaveWaiter.Notify(waiter.EventIn) + return n, nil + } + return 0, syserror.ErrWouldBlock +} + +func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + return l.outQueue.readableSize(ctx, io, args) +} + +func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() + n, pushed, err := l.outQueue.read(ctx, dst, l) + if err != nil { + return 0, err + } + if n > 0 { + l.slaveWaiter.Notify(waiter.EventOut) + if pushed { + l.masterWaiter.Notify(waiter.EventIn) + } + return n, nil + } + return 0, syserror.ErrWouldBlock +} + +func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() + n, err := l.outQueue.write(ctx, src, l) + if err != nil { + return 0, err + } + if n > 0 { + l.masterWaiter.Notify(waiter.EventIn) + return n, nil + } + return 0, syserror.ErrWouldBlock +} + +// transformer is a helper interface to make it easier to stateify queue. +type transformer interface { + // transform functions require queue's mutex to be held. + transform(*lineDiscipline, *queue, []byte) int +} + +// outputQueueTransformer implements transformer. It performs line discipline +// transformations on the output queue. +// +// +stateify savable +type outputQueueTransformer struct{} + +// transform does output processing for one end of the pty. See +// drivers/tty/n_tty.c:do_output_char for an analogous kernel function. +// +// Preconditions: +// * l.termiosMu must be held for reading. +// * q.mu must be held. +func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int { + // transformOutput is effectively always in noncanonical mode, as the + // master termios never has ICANON set. 
+ + if !l.termios.OEnabled(linux.OPOST) { + q.readBuf = append(q.readBuf, buf...) + if len(q.readBuf) > 0 { + q.readable = true + } + return len(buf) + } + + var ret int + for len(buf) > 0 { + size := l.peek(buf) + cBytes := append([]byte{}, buf[:size]...) + ret += size + buf = buf[size:] + // We're guaranteed that cBytes has at least one element. + switch cBytes[0] { + case '\n': + if l.termios.OEnabled(linux.ONLRET) { + l.column = 0 + } + if l.termios.OEnabled(linux.ONLCR) { + q.readBuf = append(q.readBuf, '\r', '\n') + continue + } + case '\r': + if l.termios.OEnabled(linux.ONOCR) && l.column == 0 { + continue + } + if l.termios.OEnabled(linux.OCRNL) { + cBytes[0] = '\n' + if l.termios.OEnabled(linux.ONLRET) { + l.column = 0 + } + break + } + l.column = 0 + case '\t': + spaces := spacesPerTab - l.column%spacesPerTab + if l.termios.OutputFlags&linux.TABDLY == linux.XTABS { + l.column += spaces + q.readBuf = append(q.readBuf, bytes.Repeat([]byte{' '}, spacesPerTab)...) + continue + } + l.column += spaces + case '\b': + if l.column > 0 { + l.column-- + } + default: + l.column++ + } + q.readBuf = append(q.readBuf, cBytes...) + } + if len(q.readBuf) > 0 { + q.readable = true + } + return ret +} + +// inputQueueTransformer implements transformer. It performs line discipline +// transformations on the input queue. +// +// +stateify savable +type inputQueueTransformer struct{} + +// transform does input processing for one end of the pty. Characters read are +// transformed according to flags set in the termios struct. See +// drivers/tty/n_tty.c:n_tty_receive_char_special for an analogous kernel +// function. +// +// Preconditions: +// * l.termiosMu must be held for reading. +// * q.mu must be held. +func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int { + // If there's a line waiting to be read in canonical mode, don't write + // anything else to the read buffer. 
+ if l.termios.LEnabled(linux.ICANON) && q.readable { + return 0 + } + + maxBytes := nonCanonMaxBytes + if l.termios.LEnabled(linux.ICANON) { + maxBytes = canonMaxBytes + } + + var ret int + for len(buf) > 0 && len(q.readBuf) < canonMaxBytes { + size := l.peek(buf) + cBytes := append([]byte{}, buf[:size]...) + // We're guaranteed that cBytes has at least one element. + switch cBytes[0] { + case '\r': + if l.termios.IEnabled(linux.IGNCR) { + buf = buf[size:] + ret += size + continue + } + if l.termios.IEnabled(linux.ICRNL) { + cBytes[0] = '\n' + } + case '\n': + if l.termios.IEnabled(linux.INLCR) { + cBytes[0] = '\r' + } + } + + // In canonical mode, we discard non-terminating characters + // after the first 4095. + if l.shouldDiscard(q, cBytes) { + buf = buf[size:] + ret += size + continue + } + + // Stop if the buffer would be overfilled. + if len(q.readBuf)+size > maxBytes { + break + } + buf = buf[size:] + ret += size + + // If we get EOF, make the buffer available for reading. + if l.termios.LEnabled(linux.ICANON) && l.termios.IsEOF(cBytes[0]) { + q.readable = true + break + } + + q.readBuf = append(q.readBuf, cBytes...) + + // Anything written to the readBuf will have to be echoed. + if l.termios.LEnabled(linux.ECHO) { + l.outQueue.writeBytes(cBytes, l) + l.masterWaiter.Notify(waiter.EventIn) + } + + // If we finish a line, make it available for reading. + if l.termios.LEnabled(linux.ICANON) && l.termios.IsTerminating(cBytes) { + q.readable = true + break + } + } + + // In noncanonical mode, everything is readable. + if !l.termios.LEnabled(linux.ICANON) && len(q.readBuf) > 0 { + q.readable = true + } + + return ret +} + +// shouldDiscard returns whether c should be discarded. In canonical mode, if +// too many bytes are enqueued, we keep reading input and discarding it until +// we find a terminating character. Signal/echo processing still occurs. +// +// Precondition: +// * l.termiosMu must be held for reading. +// * q.mu must be held. 
func (l *lineDiscipline) shouldDiscard(q *queue, cBytes []byte) bool {
	// Discard only when all three hold: canonical mode is on, appending
	// cBytes would reach canonMaxBytes, and cBytes does not terminate the
	// line.
	return l.termios.LEnabled(linux.ICANON) && len(q.readBuf)+len(cBytes) >= canonMaxBytes && !l.termios.IsTerminating(cBytes)
}

// peek returns the size in bytes of the next character to process. As long as
// b isn't empty, peek returns a value of at least 1.
func (l *lineDiscipline) peek(b []byte) int {
	size := 1
	// If UTF-8 support is enabled, runes might be multiple bytes.
	if l.termios.IEnabled(linux.IUTF8) {
		_, size = utf8.DecodeRune(b)
	}
	return size
}
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go new file mode 100644 index 000000000..69879498a --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -0,0 +1,237 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package devpts

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/unimpl"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

// masterInode is the inode for the master end of the Terminal.
+type masterInode struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + + locks vfs.FileLocks + + // Keep a reference to this inode's dentry. + dentry kernfs.Dentry + + // root is the devpts root inode. + root *rootInode +} + +var _ kernfs.Inode = (*masterInode)(nil) + +// Open implements kernfs.Inode.Open. +func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + t, err := mi.root.allocateTerminal(rp.Credentials()) + if err != nil { + return nil, err + } + + mi.IncRef() + fd := &masterFileDescription{ + inode: mi, + t: t, + } + fd.LockFD.Init(&mi.locks) + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { + mi.DecRef() + return nil, err + } + return &fd.vfsfd, nil +} + +// Stat implements kernfs.Inode.Stat. +func (mi *masterInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + statx, err := mi.InodeAttrs.Stat(vfsfs, opts) + if err != nil { + return linux.Statx{}, err + } + statx.Blksize = 1024 + statx.RdevMajor = linux.TTYAUX_MAJOR + statx.RdevMinor = linux.PTMX_MINOR + return statx, nil +} + +// SetStat implements kernfs.Inode.SetStat +func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + if opts.Stat.Mask&linux.STATX_SIZE != 0 { + return syserror.EINVAL + } + return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) +} + +type masterFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.LockFD + + inode *masterInode + t *Terminal +} + +var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil) + +// Release implements vfs.FileDescriptionImpl.Release. +func (mfd *masterFileDescription) Release() { + mfd.inode.root.masterClose(mfd.t) + mfd.inode.DecRef() +} + +// EventRegister implements waiter.Waitable.EventRegister. 
func (mfd *masterFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
	mfd.t.ld.masterWaiter.EventRegister(e, mask)
}

// EventUnregister implements waiter.Waitable.EventUnregister.
func (mfd *masterFileDescription) EventUnregister(e *waiter.Entry) {
	mfd.t.ld.masterWaiter.EventUnregister(e)
}

// Readiness implements waiter.Waitable.Readiness.
//
// mask is not consulted; the full readiness set of the master end is
// returned.
func (mfd *masterFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
	return mfd.t.ld.masterReadiness()
}

// Read implements vfs.FileDescriptionImpl.Read.
//
// Reading the master drains the output queue.
func (mfd *masterFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
	return mfd.t.ld.outputQueueRead(ctx, dst)
}

// Write implements vfs.FileDescriptionImpl.Write.
//
// Writing to the master feeds the input queue.
func (mfd *masterFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
	return mfd.t.ld.inputQueueWrite(ctx, src)
}

// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
//
// The request number is taken from args[1] and the user pointer, where one is
// needed, from args[2]. Requests not handled here return ENOTTY after
// optionally emitting an unimplemented-event notification.
func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
	switch cmd := args[1].Uint(); cmd {
	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
		// Get the number of bytes in the output queue read buffer.
		return 0, mfd.t.ld.outputQueueReadSize(ctx, io, args)
	case linux.TCGETS:
		// N.B. TCGETS on the master actually returns the configuration
		// of the slave end.
		return mfd.t.ld.getTermios(ctx, io, args)
	case linux.TCSETS:
		// N.B. TCSETS on the master actually affects the configuration
		// of the slave end.
		return mfd.t.ld.setTermios(ctx, io, args)
	case linux.TCSETSW:
		// TODO(b/29356795): This should drain the output queue first.
		return mfd.t.ld.setTermios(ctx, io, args)
	case linux.TIOCGPTN:
		// Report the pty index of this terminal.
		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mfd.t.n), usermem.IOOpts{
			AddressSpaceActive: true,
		})
		return 0, err
	case linux.TIOCSPTLCK:
		// TODO(b/29356795): Implement pty locking. For now just pretend we do.
		return 0, nil
	case linux.TIOCGWINSZ:
		return 0, mfd.t.ld.windowSize(ctx, io, args)
	case linux.TIOCSWINSZ:
		return 0, mfd.t.ld.setWindowSize(ctx, io, args)
	case linux.TIOCSCTTY:
		// Make the given terminal the controlling terminal of the
		// calling process.
		return 0, mfd.t.setControllingTTY(ctx, io, args, true /* isMaster */)
	case linux.TIOCNOTTY:
		// Release this process's controlling terminal.
		return 0, mfd.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
	case linux.TIOCGPGRP:
		// Get the foreground process group.
		return mfd.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
	case linux.TIOCSPGRP:
		// Set the foreground process group.
		return mfd.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
	default:
		maybeEmitUnimplementedEvent(ctx, cmd)
		return 0, syserror.ENOTTY
	}
}

// SetStat implements vfs.FileDescriptionImpl.SetStat.
//
// It forwards to the inode using the credentials found in ctx.
func (mfd *masterFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
	creds := auth.CredentialsFromContext(ctx)
	fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem()
	return mfd.inode.SetStat(ctx, fs, creds, opts)
}

// Stat implements vfs.FileDescriptionImpl.Stat.
func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
	fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem()
	return mfd.inode.Stat(fs, opts)
}

// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
func (mfd *masterFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
	return mfd.Locks().LockPOSIX(ctx, &mfd.vfsfd, uid, t, start, length, whence, block)
}

// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
func (mfd *masterFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
	return mfd.Locks().UnlockPOSIX(ctx, &mfd.vfsfd, uid, start, length, whence)
}

// maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid.
//
// "Valid" means cmd is one of the terminal ioctl requests listed below; any
// other value is ignored.
func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) {
	switch cmd {
	case linux.TCGETS,
		linux.TCSETS,
		linux.TCSETSW,
		linux.TCSETSF,
		linux.TIOCGWINSZ,
		linux.TIOCSWINSZ,
		linux.TIOCSETD,
		linux.TIOCSBRK,
		linux.TIOCCBRK,
		linux.TCSBRK,
		linux.TCSBRKP,
		linux.TIOCSTI,
		linux.TIOCCONS,
		linux.FIONBIO,
		linux.TIOCEXCL,
		linux.TIOCNXCL,
		linux.TIOCGEXCL,
		linux.TIOCGSID,
		linux.TIOCGETD,
		linux.TIOCVHANGUP,
		linux.TIOCGDEV,
		linux.TIOCMGET,
		linux.TIOCMSET,
		linux.TIOCMBIC,
		linux.TIOCMBIS,
		linux.TIOCGICOUNT,
		linux.TCFLSH,
		linux.TIOCSSERIAL,
		linux.TIOCGPTPEER:

		unimpl.EmitUnimplementedEvent(ctx)
	}
}
diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go new file mode 100644 index 000000000..dffb4232c --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/queue.go @@ -0,0 +1,236 @@
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// waitBufMaxBytes is the maximum size of a wait buffer. It is based on +// TTYB_DEFAULT_MEM_LIMIT. +const waitBufMaxBytes = 131072 + +// queue represents one of the input or output queues between a pty master and +// slave. Bytes written to a queue are added to the read buffer until it is +// full, at which point they are written to the wait buffer. Bytes are +// processed (i.e. undergo termios transformations) as they are added to the +// read buffer. The read buffer is readable when its length is nonzero and +// readable is true. +// +// +stateify savable +type queue struct { + // mu protects everything in queue. + mu sync.Mutex `state:"nosave"` + + // readBuf is buffer of data ready to be read when readable is true. + // This data has been processed. + readBuf []byte + + // waitBuf contains data that can't fit into readBuf. It is put here + // until it can be loaded into the read buffer. waitBuf contains data + // that hasn't been processed. + waitBuf [][]byte + waitBufLen uint64 + + // readable indicates whether the read buffer can be read from. In + // canonical mode, there can be an unterminated line in the read buffer, + // so readable must be checked. + readable bool + + // transform is the the queue's function for transforming bytes + // entering the queue. For example, transform might convert all '\r's + // entering the queue to '\n's. + transformer +} + +// readReadiness returns whether q is ready to be read from. 
+func (q *queue) readReadiness(t *linux.KernelTermios) waiter.EventMask { + q.mu.Lock() + defer q.mu.Unlock() + if len(q.readBuf) > 0 && q.readable { + return waiter.EventIn + } + return waiter.EventMask(0) +} + +// writeReadiness returns whether q is ready to be written to. +func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask { + q.mu.Lock() + defer q.mu.Unlock() + if q.waitBufLen < waitBufMaxBytes { + return waiter.EventOut + } + return waiter.EventMask(0) +} + +// readableSize writes the number of readable bytes to userspace. +func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + q.mu.Lock() + defer q.mu.Unlock() + var size int32 + if q.readable { + size = int32(len(q.readBuf)) + } + + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return err + +} + +// read reads from q to userspace. It returns the number of bytes read as well +// as whether the read caused more readable data to become available (whether +// data was pushed from the wait buffer to the read buffer). +// +// Preconditions: +// * l.termiosMu must be held for reading. +func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) { + q.mu.Lock() + defer q.mu.Unlock() + + if !q.readable { + return 0, false, syserror.ErrWouldBlock + } + + if dst.NumBytes() > canonMaxBytes { + dst = dst.TakeFirst(canonMaxBytes) + } + + n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dst safemem.BlockSeq) (uint64, error) { + src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(q.readBuf)) + n, err := safemem.CopySeq(dst, src) + if err != nil { + return 0, err + } + q.readBuf = q.readBuf[n:] + + // If we read everything, this queue is no longer readable. 
+ if len(q.readBuf) == 0 { + q.readable = false + } + + return n, nil + })) + if err != nil { + return 0, false, err + } + + // Move data from the queue's wait buffer to its read buffer. + nPushed := q.pushWaitBufLocked(l) + + return int64(n), nPushed > 0, nil +} + +// write writes to q from userspace. +// +// Preconditions: +// * l.termiosMu must be held for reading. +func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) { + q.mu.Lock() + defer q.mu.Unlock() + + // Copy data into the wait buffer. + n, err := src.CopyInTo(ctx, safemem.WriterFunc(func(src safemem.BlockSeq) (uint64, error) { + copyLen := src.NumBytes() + room := waitBufMaxBytes - q.waitBufLen + // If out of room, return EAGAIN. + if room == 0 && copyLen > 0 { + return 0, syserror.ErrWouldBlock + } + // Cap the size of the wait buffer. + if copyLen > room { + copyLen = room + src = src.TakeFirst64(room) + } + buf := make([]byte, copyLen) + + // Copy the data into the wait buffer. + dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)) + n, err := safemem.CopySeq(dst, src) + if err != nil { + return 0, err + } + q.waitBufAppend(buf) + + return n, nil + })) + if err != nil { + return 0, err + } + + // Push data from the wait to the read buffer. + q.pushWaitBufLocked(l) + + return n, nil +} + +// writeBytes writes to q from b. +// +// Preconditions: +// * l.termiosMu must be held for reading. +func (q *queue) writeBytes(b []byte, l *lineDiscipline) { + q.mu.Lock() + defer q.mu.Unlock() + + // Write to the wait buffer. + q.waitBufAppend(b) + q.pushWaitBufLocked(l) +} + +// pushWaitBufLocked fills the queue's read buffer with data from the wait +// buffer. +// +// Preconditions: +// * l.termiosMu must be held for reading. +// * q.mu must be locked. +func (q *queue) pushWaitBufLocked(l *lineDiscipline) int { + if q.waitBufLen == 0 { + return 0 + } + + // Move data from the wait to the read buffer. 
+ var total int + var i int + for i = 0; i < len(q.waitBuf); i++ { + n := q.transform(l, q, q.waitBuf[i]) + total += n + if n != len(q.waitBuf[i]) { + // The read buffer filled up without consuming the + // entire buffer. + q.waitBuf[i] = q.waitBuf[i][n:] + break + } + } + + // Update wait buffer based on consumed data. + q.waitBuf = q.waitBuf[i:] + q.waitBufLen -= uint64(total) + + return total +} + +// Precondition: q.mu must be locked. +func (q *queue) waitBufAppend(b []byte) { + q.waitBuf = append(q.waitBuf, b) + q.waitBufLen += uint64(len(b)) +} diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go new file mode 100644 index 000000000..cf1a0f0ac --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/slave.go @@ -0,0 +1,197 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// slaveInode is the inode for the slave end of the Terminal. 
+type slaveInode struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + + locks vfs.FileLocks + + // Keep a reference to this inode's dentry. + dentry kernfs.Dentry + + // root is the devpts root inode. + root *rootInode + + // t is the connected Terminal. + t *Terminal +} + +var _ kernfs.Inode = (*slaveInode)(nil) + +// Open implements kernfs.Inode.Open. +func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + si.IncRef() + fd := &slaveFileDescription{ + inode: si, + } + fd.LockFD.Init(&si.locks) + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { + si.DecRef() + return nil, err + } + return &fd.vfsfd, nil + +} + +// Valid implements kernfs.Inode.Valid. +func (si *slaveInode) Valid(context.Context) bool { + // Return valid if the slave still exists. + si.root.mu.Lock() + defer si.root.mu.Unlock() + _, ok := si.root.slaves[si.t.n] + return ok +} + +// Stat implements kernfs.Inode.Stat. +func (si *slaveInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + statx, err := si.InodeAttrs.Stat(vfsfs, opts) + if err != nil { + return linux.Statx{}, err + } + statx.Blksize = 1024 + statx.RdevMajor = linux.UNIX98_PTY_SLAVE_MAJOR + statx.RdevMinor = si.t.n + return statx, nil +} + +// SetStat implements kernfs.Inode.SetStat +func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + if opts.Stat.Mask&linux.STATX_SIZE != 0 { + return syserror.EINVAL + } + return si.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) +} + +type slaveFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.LockFD + + inode *slaveInode +} + +var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil) + +// Release implements fs.FileOperations.Release. 
+func (sfd *slaveFileDescription) Release() { + sfd.inode.DecRef() +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (sfd *slaveFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + sfd.inode.t.ld.slaveWaiter.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (sfd *slaveFileDescription) EventUnregister(e *waiter.Entry) { + sfd.inode.t.ld.slaveWaiter.EventUnregister(e) +} + +// Readiness implements waiter.Waitable.Readiness. +func (sfd *slaveFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + return sfd.inode.t.ld.slaveReadiness() +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (sfd *slaveFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + return sfd.inode.t.ld.inputQueueRead(ctx, dst) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { + return sfd.inode.t.ld.outputQueueWrite(ctx, src) +} + +// Ioctl implements vfs.FileDescripionImpl.Ioctl. +func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch cmd := args[1].Uint(); cmd { + case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ + // Get the number of bytes in the input queue read buffer. + return 0, sfd.inode.t.ld.inputQueueReadSize(ctx, io, args) + case linux.TCGETS: + return sfd.inode.t.ld.getTermios(ctx, io, args) + case linux.TCSETS: + return sfd.inode.t.ld.setTermios(ctx, io, args) + case linux.TCSETSW: + // TODO(b/29356795): This should drain the output queue first. 
+ return sfd.inode.t.ld.setTermios(ctx, io, args) + case linux.TIOCGPTN: + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sfd.inode.t.n), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + case linux.TIOCGWINSZ: + return 0, sfd.inode.t.ld.windowSize(ctx, io, args) + case linux.TIOCSWINSZ: + return 0, sfd.inode.t.ld.setWindowSize(ctx, io, args) + case linux.TIOCSCTTY: + // Make the given terminal the controlling terminal of the + // calling process. + return 0, sfd.inode.t.setControllingTTY(ctx, io, args, false /* isMaster */) + case linux.TIOCNOTTY: + // Release this process's controlling terminal. + return 0, sfd.inode.t.releaseControllingTTY(ctx, io, args, false /* isMaster */) + case linux.TIOCGPGRP: + // Get the foreground process group. + return sfd.inode.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */) + case linux.TIOCSPGRP: + // Set the foreground process group. + return sfd.inode.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */) + default: + maybeEmitUnimplementedEvent(ctx, cmd) + return 0, syserror.ENOTTY + } +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + creds := auth.CredentialsFromContext(ctx) + fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem() + return sfd.inode.SetStat(ctx, fs, creds, opts) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem() + return sfd.inode.Stat(fs, opts) +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. 
+func (sfd *slaveFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return sfd.Locks().LockPOSIX(ctx, &sfd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (sfd *slaveFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return sfd.Locks().UnlockPOSIX(ctx, &sfd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go new file mode 100644 index 000000000..7d2781c54 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/terminal.go @@ -0,0 +1,120 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Terminal is a pseudoterminal. +// +// +stateify savable +type Terminal struct { + // n is the terminal index. It is immutable. + n uint32 + + // ld is the line discipline of the terminal. It is immutable. + ld *lineDiscipline + + // masterKTTY contains the controlling process of the master end of + // this terminal. This field is immutable. 
+ masterKTTY *kernel.TTY + + // slaveKTTY contains the controlling process of the slave end of this + // terminal. This field is immutable. + slaveKTTY *kernel.TTY +} + +func newTerminal(n uint32) *Terminal { + termios := linux.DefaultSlaveTermios + t := Terminal{ + n: n, + ld: newLineDiscipline(termios), + masterKTTY: &kernel.TTY{Index: n}, + slaveKTTY: &kernel.TTY{Index: n}, + } + return &t +} + +// setControllingTTY makes tm the controlling terminal of the calling thread +// group. +func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { + task := kernel.TaskFromContext(ctx) + if task == nil { + panic("setControllingTTY must be called from a task context") + } + + return task.ThreadGroup().SetControllingTTY(tm.tty(isMaster), args[2].Int()) +} + +// releaseControllingTTY removes tm as the controlling terminal of the calling +// thread group. +func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { + task := kernel.TaskFromContext(ctx) + if task == nil { + panic("releaseControllingTTY must be called from a task context") + } + + return task.ThreadGroup().ReleaseControllingTTY(tm.tty(isMaster)) +} + +// foregroundProcessGroup gets the process group ID of tm's foreground process. +func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { + task := kernel.TaskFromContext(ctx) + if task == nil { + panic("foregroundProcessGroup must be called from a task context") + } + + ret, err := task.ThreadGroup().ForegroundProcessGroup(tm.tty(isMaster)) + if err != nil { + return 0, err + } + + // Write it out to *arg. + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err +} + +// foregroundProcessGroup sets tm's foreground process. 
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { + task := kernel.TaskFromContext(ctx) + if task == nil { + panic("setForegroundProcessGroup must be called from a task context") + } + + // Read in the process group ID. + var pgid int32 + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + + ret, err := task.ThreadGroup().SetForegroundProcessGroup(tm.tty(isMaster), kernel.ProcessGroupID(pgid)) + return uintptr(ret), err +} + +func (tm *Terminal) tty(isMaster bool) *kernel.TTY { + if isMaster { + return tm.masterKTTY + } + return tm.slaveKTTY +} diff --git a/pkg/sentry/fsimpl/devtmpfs/BUILD b/pkg/sentry/fsimpl/devtmpfs/BUILD new file mode 100644 index 000000000..aa0c2ad8c --- /dev/null +++ b/pkg/sentry/fsimpl/devtmpfs/BUILD @@ -0,0 +1,33 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +licenses(["notice"]) + +go_library( + name = "devtmpfs", + srcs = ["devtmpfs.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/sync", + ], +) + +go_test( + name = "devtmpfs_test", + size = "small", + srcs = ["devtmpfs_test.go"], + library = ":devtmpfs", + deps = [ + "//pkg/abi/linux", + "//pkg/fspath", + "//pkg/sentry/contexttest", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + ], +) diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go new file mode 100644 index 000000000..d0e06cdc0 --- /dev/null +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -0,0 +1,219 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package devtmpfs provides an implementation of /dev based on tmpfs, +// analogous to Linux's devtmpfs. +package devtmpfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" +) + +// Name is the default filesystem name. +const Name = "devtmpfs" + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct { + initOnce sync.Once + initErr error + + // fs is the tmpfs filesystem that backs all mounts of this FilesystemType. + // root is fs' root. fs and root are immutable. + fs *vfs.Filesystem + root *vfs.Dentry +} + +// Name implements vfs.FilesystemType.Name. +func (*FilesystemType) Name() string { + return Name +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. 
+func (fst *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + fst.initOnce.Do(func() { + fs, root, err := tmpfs.FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, "" /* source */, vfs.GetFilesystemOptions{ + Data: "mode=0755", // opts from drivers/base/devtmpfs.c:devtmpfs_init() + }) + if err != nil { + fst.initErr = err + return + } + fst.fs = fs + fst.root = root + }) + if fst.initErr != nil { + return nil, nil, fst.initErr + } + fst.fs.IncRef() + fst.root.IncRef() + return fst.fs, fst.root, nil +} + +// Accessor allows devices to create device special files in devtmpfs. +type Accessor struct { + vfsObj *vfs.VirtualFilesystem + mntns *vfs.MountNamespace + root vfs.VirtualDentry + creds *auth.Credentials +} + +// NewAccessor returns an Accessor that supports creation of device special +// files in the devtmpfs instance registered with name fsTypeName in vfsObj. +func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, fsTypeName string) (*Accessor, error) { + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.GetFilesystemOptions{}) + if err != nil { + return nil, err + } + return &Accessor{ + vfsObj: vfsObj, + mntns: mntns, + root: mntns.Root(), + creds: creds, + }, nil +} + +// Release must be called when a is no longer in use. +func (a *Accessor) Release() { + a.root.DecRef() + a.mntns.DecRef() +} + +// accessorContext implements context.Context by extending an existing +// context.Context with an Accessor's values for VFS-relevant state. +type accessorContext struct { + context.Context + a *Accessor +} + +func (a *Accessor) wrapContext(ctx context.Context) *accessorContext { + return &accessorContext{ + Context: ctx, + a: a, + } +} + +// Value implements context.Context.Value. 
+func (ac *accessorContext) Value(key interface{}) interface{} { + switch key { + case vfs.CtxMountNamespace: + ac.a.mntns.IncRef() + return ac.a.mntns + case vfs.CtxRoot: + ac.a.root.IncRef() + return ac.a.root + default: + return ac.Context.Value(key) + } +} + +func (a *Accessor) pathOperationAt(pathname string) *vfs.PathOperation { + return &vfs.PathOperation{ + Root: a.root, + Start: a.root, + Path: fspath.Parse(pathname), + } +} + +// CreateDeviceFile creates a device special file at the given pathname in the +// devtmpfs instance accessed by the Accessor. +func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind vfs.DeviceKind, major, minor uint32, perms uint16) error { + actx := a.wrapContext(ctx) + + mode := (linux.FileMode)(perms) + switch kind { + case vfs.BlockDevice: + mode |= linux.S_IFBLK + case vfs.CharDevice: + mode |= linux.S_IFCHR + default: + panic(fmt.Sprintf("invalid vfs.DeviceKind: %v", kind)) + } + + // Create any parent directories. See + // devtmpfs.c:handle_create()=>path_create(). + for it := fspath.Parse(pathname).Begin; it.NextOk(); it = it.Next() { + pop := a.pathOperationAt(it.String()) + if err := a.vfsObj.MkdirAt(actx, a.creds, pop, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + return fmt.Errorf("failed to create directory %q: %v", it.String(), err) + } + } + + // NOTE: Linux's devtmpfs refuses to automatically delete files it didn't + // create, which it recognizes by storing a pointer to the kdevtmpfs struct + // thread in struct inode::i_private. Accessor doesn't yet support deletion + // of files at all, and probably won't as long as we don't need to support + // kernel modules, so this is moot for now. 
+ return a.vfsObj.MknodAt(actx, a.creds, a.pathOperationAt(pathname), &vfs.MknodOptions{ + Mode: mode, + DevMajor: major, + DevMinor: minor, + }) +} + +// UserspaceInit creates symbolic links and mount points in the devtmpfs +// instance accessed by the Accessor that are created by userspace in Linux. It +// does not create mounts. +func (a *Accessor) UserspaceInit(ctx context.Context) error { + actx := a.wrapContext(ctx) + + // Initialize symlinks. + for _, symlink := range []struct { + source string + target string + }{ + // systemd: src/shared/dev-setup.c:dev_setup() + {source: "fd", target: "/proc/self/fd"}, + {source: "stdin", target: "/proc/self/fd/0"}, + {source: "stdout", target: "/proc/self/fd/1"}, + {source: "stderr", target: "/proc/self/fd/2"}, + // /proc/kcore is not implemented. + + // Linux implements /dev/ptmx as a device node, but advises + // container implementations to create /dev/ptmx as a symlink + // to pts/ptmx (Documentation/filesystems/devpts.txt). Systemd + // follows this advice (src/nspawn/nspawn.c:setup_pts()), while + // LXC tries to create a bind mount and falls back to a symlink + // (src/lxc/conf.c:lxc_setup_devpts()). 
+ {source: "ptmx", target: "pts/ptmx"}, + } { + if err := a.vfsObj.SymlinkAt(actx, a.creds, a.pathOperationAt(symlink.source), symlink.target); err != nil { + return fmt.Errorf("failed to create symlink %q => %q: %v", symlink.source, symlink.target, err) + } + } + + // systemd: src/core/mount-setup.c:mount_table + for _, dir := range []string{ + "shm", + "pts", + } { + if err := a.vfsObj.MkdirAt(actx, a.creds, a.pathOperationAt(dir), &vfs.MkdirOptions{ + // systemd: src/core/mount-setup.c:mount_one() + Mode: 0755, + }); err != nil { + return fmt.Errorf("failed to create directory %q: %v", dir, err) + } + } + + return nil +} diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go new file mode 100644 index 000000000..b6d52c015 --- /dev/null +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go @@ -0,0 +1,122 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package devtmpfs + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func TestDevtmpfs(t *testing.T) { + ctx := contexttest.Context(t) + creds := auth.CredentialsFromContext(ctx) + + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + // Register tmpfs just so that we can have a root filesystem that isn't + // devtmpfs. + vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + vfsObj.MustRegisterFilesystemType("devtmpfs", &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + + // Create a test mount namespace with devtmpfs mounted at "/dev". + const devPath = "/dev" + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.GetFilesystemOptions{}) + if err != nil { + t.Fatalf("failed to create tmpfs root mount: %v", err) + } + defer mntns.DecRef() + root := mntns.Root() + defer root.DecRef() + devpop := vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(devPath), + } + if err := vfsObj.MkdirAt(ctx, creds, &devpop, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + t.Fatalf("failed to create mount point: %v", err) + } + if err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil { + t.Fatalf("failed to mount devtmpfs: %v", err) + } + + a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs") + if err != nil { + t.Fatalf("failed to create devtmpfs.Accessor: %v", err) + } + defer a.Release() + + // Create "userspace-initialized" files using a devtmpfs.Accessor. 
+ if err := a.UserspaceInit(ctx); err != nil { + t.Fatalf("failed to userspace-initialize devtmpfs: %v", err) + } + // Created files should be visible in the test mount namespace. + abspath := devPath + "/fd" + target, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(abspath), + }) + if want := "/proc/self/fd"; err != nil || target != want { + t.Fatalf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, target, err, want) + } + + // Create a dummy device special file using a devtmpfs.Accessor. + const ( + pathInDev = "dummy" + kind = vfs.CharDevice + major = 12 + minor = 34 + perms = 0600 + wantMode = linux.S_IFCHR | perms + ) + if err := a.CreateDeviceFile(ctx, pathInDev, kind, major, minor, perms); err != nil { + t.Fatalf("failed to create device file: %v", err) + } + // The device special file should be visible in the test mount namespace. + abspath = devPath + "/" + pathInDev + stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(abspath), + }, &vfs.StatOptions{ + Mask: linux.STATX_TYPE | linux.STATX_MODE, + }) + if err != nil { + t.Fatalf("failed to stat device file at %q: %v", abspath, err) + } + if stat.Mode != wantMode { + t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode) + } + if stat.RdevMajor != major { + t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, major) + } + if stat.RdevMinor != minor { + t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, minor) + } +} diff --git a/pkg/sentry/fsimpl/eventfd/BUILD b/pkg/sentry/fsimpl/eventfd/BUILD new file mode 100644 index 000000000..ea167d38c --- /dev/null +++ b/pkg/sentry/fsimpl/eventfd/BUILD @@ -0,0 +1,33 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +licenses(["notice"]) + +go_library( + name = "eventfd", + srcs = ["eventfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + 
"//pkg/fdnotifier", + "//pkg/log", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) + +go_test( + name = "eventfd_test", + size = "small", + srcs = ["eventfd_test.go"], + library = ":eventfd", + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/contexttest", + "//pkg/sentry/vfs", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go new file mode 100644 index 000000000..d12d78b84 --- /dev/null +++ b/pkg/sentry/fsimpl/eventfd/eventfd.go @@ -0,0 +1,285 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package eventfd implements event fds. +package eventfd + +import ( + "math" + "sync" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// EventFileDescription implements FileDescriptionImpl for file-based event +// notification (eventfd). Eventfds are usually internal to the Sentry but in +// certain situations they may be converted into a host-backed eventfd. 
+type EventFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD + + // queue is used to notify interested parties when the event object + // becomes readable or writable. + queue waiter.Queue `state:"zerovalue"` + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // val is the current value of the event counter. + val uint64 + + // semMode specifies whether the event is in "semaphore" mode. + semMode bool + + // hostfd indicates whether this eventfd is passed through to the host. + hostfd int +} + +var _ vfs.FileDescriptionImpl = (*EventFileDescription)(nil) + +// New creates a new event fd. +func New(vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) { + vd := vfsObj.NewAnonVirtualDentry("[eventfd]") + defer vd.DecRef() + efd := &EventFileDescription{ + val: initVal, + semMode: semMode, + hostfd: -1, + } + if err := efd.vfsfd.Init(efd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &efd.vfsfd, nil +} + +// HostFD returns the host eventfd associated with this event. 
+func (efd *EventFileDescription) HostFD() (int, error) { + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + return efd.hostfd, nil + } + + flags := linux.EFD_NONBLOCK + if efd.semMode { + flags |= linux.EFD_SEMAPHORE + } + + fd, _, errno := syscall.Syscall(syscall.SYS_EVENTFD2, uintptr(efd.val), uintptr(flags), 0) + if errno != 0 { + return -1, errno + } + + if err := fdnotifier.AddFD(int32(fd), &efd.queue); err != nil { + if closeErr := syscall.Close(int(fd)); closeErr != nil { + log.Warningf("close(%d) eventfd failed: %v", fd, closeErr) + } + return -1, err + } + + efd.hostfd = int(fd) + return efd.hostfd, nil +} + +// Release implements FileDescriptionImpl.Release() +func (efd *EventFileDescription) Release() { + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + fdnotifier.RemoveFD(int32(efd.hostfd)) + if closeErr := syscall.Close(int(efd.hostfd)); closeErr != nil { + log.Warningf("close(%d) eventfd failed: %v", efd.hostfd, closeErr) + } + efd.hostfd = -1 + } +} + +// Read implements FileDescriptionImpl.Read. +func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + if dst.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := efd.read(ctx, dst); err != nil { + return 0, err + } + return 8, nil +} + +// Write implements FileDescriptionImpl.Write. +func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { + if src.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := efd.write(ctx, src); err != nil { + return 0, err + } + return 8, nil +} + +// Preconditions: Must be called with efd.mu locked. 
+func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem.IOSequence) error { + var buf [8]byte + if _, err := syscall.Read(efd.hostfd, buf[:]); err != nil { + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err + } + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequence) error { + efd.mu.Lock() + if efd.hostfd >= 0 { + defer efd.mu.Unlock() + return efd.hostReadLocked(ctx, dst) + } + + // We can't complete the read if the value is currently zero. + if efd.val == 0 { + efd.mu.Unlock() + return syserror.ErrWouldBlock + } + + // Update the value based on the mode the event is operating in. + var val uint64 + if efd.semMode { + val = 1 + // Consistent with Linux, this is done even if writing to memory fails. + efd.val-- + } else { + val = efd.val + efd.val = 0 + } + + efd.mu.Unlock() + + // Notify writers. We do this even if we were already writable because + // it is possible that a writer is waiting to write the maximum value + // to the event. + efd.queue.Notify(waiter.EventOut) + + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +// Preconditions: Must be called with efd.mu locked. +func (efd *EventFileDescription) hostWriteLocked(val uint64) error { + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := syscall.Write(efd.hostfd, buf[:]) + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err +} + +func (efd *EventFileDescription) write(ctx context.Context, src usermem.IOSequence) error { + var buf [8]byte + if _, err := src.CopyIn(ctx, buf[:]); err != nil { + return err + } + val := usermem.ByteOrder.Uint64(buf[:]) + + return efd.Signal(val) +} + +// Signal is an internal function to signal the event fd. 
+func (efd *EventFileDescription) Signal(val uint64) error { + // math.MaxUint64 is never an acceptable increment; it is rejected with + // EINVAL before any state is touched. + if val == math.MaxUint64 { + return syscall.EINVAL + } + + efd.mu.Lock() + + // A host eventfd is attached: forward the write to it instead of + // updating efd.val. + if efd.hostfd >= 0 { + defer efd.mu.Unlock() + return efd.hostWriteLocked(val) + } + + // We only allow writes that won't cause the value to go over the max + // uint64 minus 1. The check is written as a subtraction so that + // efd.val+val is never computed before it is known to fit. + if val > math.MaxUint64-1-efd.val { + efd.mu.Unlock() + return syserror.ErrWouldBlock + } + + efd.val += val + efd.mu.Unlock() + + // Always trigger a notification. mu has been released above, so Notify + // runs without the lock held. + efd.queue.Notify(waiter.EventIn) + + return nil +} + +// Readiness implements waiter.Waitable.Readiness. With a host eventfd attached +// the answer comes from polling that descriptor; otherwise it is derived from +// efd.val. +func (efd *EventFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + efd.mu.Lock() + defer efd.mu.Unlock() + + if efd.hostfd >= 0 { + return fdnotifier.NonBlockingPoll(int32(efd.hostfd), mask) + } + + // Readable whenever the counter is non-zero. + ready := waiter.EventMask(0) + if efd.val > 0 { + ready |= waiter.EventIn + } + + // Writable while the counter is below its cap of math.MaxUint64-1. + if efd.val < math.MaxUint64-1 { + ready |= waiter.EventOut + } + + return mask & ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (efd *EventFileDescription) EventRegister(entry *waiter.Entry, mask waiter.EventMask) { + efd.queue.EventRegister(entry, mask) + + efd.mu.Lock() + defer efd.mu.Unlock() + // NOTE(review): presumably UpdateFD makes fdnotifier re-read the waiter + // set of efd.queue for the host fd; confirm against fdnotifier. + if efd.hostfd >= 0 { + fdnotifier.UpdateFD(int32(efd.hostfd)) + } +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (efd *EventFileDescription) EventUnregister(entry *waiter.Entry) { + efd.queue.EventUnregister(entry) + + efd.mu.Lock() + defer efd.mu.Unlock() + // Mirrors EventRegister: the host fd is refreshed after the waiter set + // changes. + if efd.hostfd >= 0 { + fdnotifier.UpdateFD(int32(efd.hostfd)) + } +} diff --git a/pkg/sentry/fsimpl/eventfd/eventfd_test.go b/pkg/sentry/fsimpl/eventfd/eventfd_test.go new file mode 100644 index 000000000..20e3adffc --- /dev/null +++ b/pkg/sentry/fsimpl/eventfd/eventfd_test.go @@ -0,0 +1,97 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package eventfd + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +func TestEventFD(t *testing.T) { + initVals := []uint64{ + 0, + // Using a non-zero initial value verifies that writing to an + // eventfd signals when the eventfd's counter was already + // non-zero. + 343, + } + + for _, initVal := range initVals { + ctx := contexttest.Context(t) + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + + // Make a new eventfd that is writable. + eventfd, err := New(vfsObj, initVal, false, linux.O_RDWR) + if err != nil { + t.Fatalf("New() failed: %v", err) + } + defer eventfd.DecRef() + + // Register a callback for a write event. + w, ch := waiter.NewChannelEntry(nil) + eventfd.EventRegister(&w, waiter.EventIn) + defer eventfd.EventUnregister(&w) + + data := []byte("00000124") + // Create and submit a write request. + n, err := eventfd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + t.Fatal(err) + } + if n != 8 { + t.Errorf("eventfd.write wrote %d bytes, not full int64", n) + } + + // Check if the callback fired due to the write event. 
+ select { + case <-ch: + default: + t.Errorf("Didn't get notified of EventIn after write") + } + } +} + +func TestEventFDStat(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + + // Make a new eventfd that is writable. + eventfd, err := New(vfsObj, 0, false, linux.O_RDWR) + if err != nil { + t.Fatalf("New() failed: %v", err) + } + defer eventfd.DecRef() + + statx, err := eventfd.Stat(ctx, vfs.StatOptions{ + Mask: linux.STATX_BASIC_STATS, + }) + if err != nil { + t.Fatalf("eventfd.Stat failed: %v", err) + } + if statx.Size != 0 { + t.Errorf("eventfd size should be 0") + } +} diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD new file mode 100644 index 000000000..ef24f8159 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -0,0 +1,102 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "dirent_list", + out = "dirent_list.go", + package = "ext", + prefix = "dirent", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*dirent", + "Linker": "*dirent", + }, +) + +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "ext", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "dentry", + }, +) + +go_library( + name = "ext", + srcs = [ + "block_map_file.go", + "dentry.go", + "directory.go", + "dirent_list.go", + "ext.go", + "extent_file.go", + "file_description.go", + "filesystem.go", + "fstree.go", + "inode.go", + "regular_file.go", + "symlink.go", + "utils.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/context", + "//pkg/fd", + "//pkg/fspath", + "//pkg/log", + "//pkg/safemem", + "//pkg/sentry/arch", + "//pkg/sentry/fs", + "//pkg/sentry/fs/lock", + 
"//pkg/sentry/fsimpl/ext/disklayout", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/syscalls/linux", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) + +go_test( + name = "ext_test", + size = "small", + srcs = [ + "block_map_test.go", + "ext_test.go", + "extent_test.go", + ], + data = [ + "//pkg/sentry/fsimpl/ext:assets/bigfile.txt", + "//pkg/sentry/fsimpl/ext:assets/file.txt", + "//pkg/sentry/fsimpl/ext:assets/tiny.ext2", + "//pkg/sentry/fsimpl/ext:assets/tiny.ext3", + "//pkg/sentry/fsimpl/ext:assets/tiny.ext4", + ], + library = ":ext", + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/contexttest", + "//pkg/sentry/fsimpl/ext/disklayout", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/test/testutil", + "//pkg/usermem", + "@com_github_google_go-cmp//cmp:go_default_library", + "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", + ], +) diff --git a/pkg/sentry/fsimpl/ext/README.md b/pkg/sentry/fsimpl/ext/README.md new file mode 100644 index 000000000..af00cfda8 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/README.md @@ -0,0 +1,117 @@ +## EXT(2/3/4) File System + +This is a filesystem driver which supports ext2, ext3 and ext4 filesystems. +Linux has specialized drivers for each variant but none which supports all. This +library takes advantage of ext's backward compatibility and understands the +internal organization of on-disk structures to support all variants. + +This driver implementation diverges from the Linux implementations in being more +forgiving about versioning. For instance, if a filesystem contains both extent +based inodes and classical block map based inodes, this driver will not complain +and interpret them both correctly. While in Linux this would be an issue. This +blurs the line between the three ext fs variants. 
+ +Ext2 is considered deprecated as of Red Hat Enterprise Linux 7, and ext3 has +been superseded by ext4, which brings large performance gains. Thus it is +recommended to upgrade older filesystem images to ext4 using e2fsprogs for +better performance. + +### Read Only + +This driver currently only allows read-only operations. A lot of the design +decisions are based on this feature. There are plans to implement write (the +process for which is documented in the future work section). + +### Performance + +One of the biggest wins about this driver is that it directly talks to the +underlying block device (or whatever persistent storage is being used), instead +of making expensive RPCs to a gofer. + +Another advantage is that ext fs supports fast concurrent reads. Currently the +device is represented using an `io.ReaderAt` which allows for concurrent reads. +All reads are directly passed to the device driver which intelligently serves +the read requests in the optimal order. There is no congestion due to locking +while reading at the filesystem level. + +Reads are optimized further in the way file data is transferred over to user +memory. Ext fs directly copies over file data from disk into user memory with no +additional allocations on the way. We can only get faster by preloading file +data into memory (see future work section). + +The internal structures used to represent files, inodes and file descriptors use +a lot of inheritance. With the level of indirection that an interface adds with +an internal pointer, it can quickly fragment a structure across memory. As this +runs alongside a full-blown kernel (which is memory intensive), having a +fragmented struct might hurt performance. Hence these internal structures, +though interfaced, are tightly packed in memory using the same inheritance +pattern that pkg/sentry/vfs uses. The pkg/sentry/fsimpl/ext/disklayout package +makes an exception to this pattern for reasons documented in the package. 
+ +### Security + +This driver also intends to help sandbox the container better by reducing the +surface of the host kernel that the application touches. It prevents the +application from exploiting vulnerabilities in the host filesystem driver. All +`io.ReaderAt.ReadAt()` calls are translated to `pread(2)` calls, which are +passed directly to the device driver in the kernel. Hence this reduces the +surface for attack. + +The application cannot affect any host filesystems other than the one passed +via block device by the user. + +### Future Work + +#### Write + +To support write operations we would need to modify the block device underneath. +Currently, the driver does not modify the device at all, not even for updating +the access times for reads. Modifying the filesystem incorrectly can corrupt it +and render it unreadable for other correct ext(x) drivers. Hence caution must be +maintained while modifying metadata structures. + +Ext4 specifically is built for performance and has added a lot of complexity as +to how metadata structures are modified. For instance, files are organized via +an extent tree, which must be balanced, and file data blocks must be placed in +the same extent as much as possible to increase locality. Such properties must +be maintained while modifying the tree. + +Ext filesystems boast a lot about locality, which plays a big role in them being +performant. The block allocation algorithm in Linux does a good job of keeping +related data together. This behavior must be maintained as much as possible, +else we might end up degrading the filesystem performance over time. + +Ext4 also supports a wide variety of features which are specialized for varying +use cases. Implementing all of them can get difficult very quickly. + +Ext(x) checksums all its metadata structures to check for corruption, so +modification of any metadata struct must correspond with re-checksumming the +struct. 
Linux filesystem drivers also order on-disk updates intelligently to not +corrupt the filesystem and also remain performant. The in-memory metadata +structures must be kept in sync with what is on disk. + +There is also replication of some important structures across the filesystem. +All replicas must be updated when their original copy is updated. There is also +provisioning for snapshotting which must be kept in mind, although it should not +affect this implementation unless we allow users to create filesystem snapshots. + +Ext4 also introduced journaling (jbd2). The journal must be updated +appropriately. + +#### Performance + +To improve performance we should implement a buffer cache, and optionally, +read-ahead for small files. While doing so we must also keep in mind the memory +usage and have a reasonable cap on how much file data we want to hold in memory. + +#### Features + +Our current implementation will work with most ext4 filesystems for read-only +purposes. However, the following features are not supported yet: + +- Journal +- Snapshotting +- Extended Attributes +- Hash Tree Directories +- Meta Block Groups +- Multiple Mount Protection +- Bigalloc diff --git a/pkg/sentry/fsimpl/ext/assets/README.md b/pkg/sentry/fsimpl/ext/assets/README.md new file mode 100644 index 000000000..6f1e81b3a --- /dev/null +++ b/pkg/sentry/fsimpl/ext/assets/README.md @@ -0,0 +1,36 @@ +### Tiny Ext(2/3/4) Images + +The images are 64 KiB in size, which holds 64 1 KiB blocks and 16 inodes. This +is the smallest size mkfs.ext(2/3/4) works with. + +These images were generated using the following commands. + +```bash +fallocate -l 64K tiny.ext$VERSION +mkfs.ext$VERSION -j tiny.ext$VERSION +``` + +where `VERSION` is `2`, `3` or `4`. 
+ +You can mount it using: + +```bash +sudo mount -o loop tiny.ext$VERSION $MOUNTPOINT +``` + +`file.txt`, `bigfile.txt` and `symlink.txt` were added to this image by just +mounting it and copying (while preserving links) those files to the mountpoint +directory using: + +```bash +sudo cp -P {file.txt,symlink.txt,bigfile.txt} $MOUNTPOINT +``` + +The files in this directory mirror the contents and organisation of the files +stored in the image. + +You can umount the filesystem using: + +```bash +sudo umount $MOUNTPOINT +``` diff --git a/pkg/sentry/fsimpl/ext/assets/bigfile.txt b/pkg/sentry/fsimpl/ext/assets/bigfile.txt new file mode 100644 index 000000000..3857cf516 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/assets/bigfile.txt @@ -0,0 +1,41 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus faucibus eleifend orci, ut ornare nibh faucibus eu. Cras at condimentum massa. Nullam luctus, elit non porttitor congue, sapien diam feugiat sapien, sed eleifend nulla mauris non arcu. Sed lacinia mauris magna, eu mollis libero varius sit amet. Donec mollis, quam convallis commodo posuere, dolor nisi placerat nisi, in faucibus augue mi eu lorem. In pharetra consectetur faucibus. Ut euismod ex efficitur egestas tincidunt. Maecenas condimentum ut ante in rutrum. Vivamus sed arcu tempor, faucibus turpis et, lacinia diam. + +Sed in lacus vel nisl interdum bibendum in sed justo. Nunc tellus risus, molestie vitae arcu sed, molestie tempus ligula. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Nunc risus neque, volutpat et ante non, ullamcorper condimentum ante. Aliquam sed metus in urna condimentum convallis. Vivamus ut libero mauris. Proin mollis posuere consequat. Vestibulum placerat mollis est et pulvinar. + +Donec rutrum odio ac diam pharetra, id fermentum magna cursus. Pellentesque in dapibus elit, et condimentum orci. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. 
Suspendisse euismod dapibus est, id vestibulum mauris. Nulla facilisi. Nulla cursus gravida nisi. Phasellus vestibulum rutrum lectus, a dignissim mauris hendrerit vitae. In at elementum mauris. Integer vel efficitur velit. Nullam fringilla sapien mi, quis luctus neque efficitur ac. Aenean nec quam dapibus nunc commodo pharetra. Proin sapien mi, fermentum aliquet vulputate non, aliquet porttitor diam. Quisque lacinia, urna et finibus fermentum, nunc lacus vehicula ex, sed congue metus lectus ac quam. Aliquam erat volutpat. Suspendisse sodales, dolor ut tincidunt finibus, augue erat varius tellus, a interdum erat sem at nunc. Vestibulum cursus iaculis sapien, vitae feugiat dui auctor quis. + +Pellentesque nec maximus nulla, eu blandit diam. Maecenas quis arcu ornare, congue ante at, vehicula ipsum. Praesent feugiat mauris rutrum sem fermentum, nec luctus ipsum placerat. Pellentesque placerat ipsum at dignissim fringilla. Vivamus et posuere sem, eget hendrerit felis. Aenean vulputate, augue vel mollis feugiat, justo ipsum mollis dolor, eu mollis elit neque ut ipsum. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Fusce bibendum sem quam, vulputate laoreet mi dapibus imperdiet. Sed a purus non nibh pretium aliquet. Integer eget luctus augue, vitae tincidunt magna. Ut eros enim, egestas eu nulla et, lobortis egestas arcu. Cras id ipsum ac justo lacinia rutrum. Vivamus lectus leo, ultricies sed justo at, pellentesque feugiat magna. Ut sollicitudin neque elit, vel ornare mauris commodo id. + +Duis dapibus orci et sapien finibus finibus. Mauris eleifend, lacus at vestibulum maximus, quam ligula pharetra erat, sit amet dapibus neque elit vitae neque. In bibendum sollicitudin erat, eget ultricies tortor malesuada at. Sed sit amet orci turpis. Donec feugiat ligula nibh, molestie tincidunt lectus elementum id. Donec volutpat maximus nibh, in vulputate felis posuere eu. Cras tincidunt ullamcorper lacus. 
Phasellus porta lorem auctor, congue magna a, commodo elit. + +Etiam auctor mi quis elit sodales, eu pulvinar arcu condimentum. Aenean imperdiet risus et dapibus tincidunt. Nullam tincidunt dictum dui, sed commodo urna rutrum id. Ut mollis libero vel elit laoreet bibendum. Quisque arcu arcu, tincidunt at ultricies id, vulputate nec metus. In tristique posuere quam sit amet volutpat. Vivamus scelerisque et nunc at dapibus. Fusce finibus libero ut ligula pretium rhoncus. Mauris non elit in arcu finibus imperdiet. Pellentesque nec massa odio. Proin rutrum mauris non sagittis efficitur. Aliquam auctor quam at dignissim faucibus. Ut eget ligula in magna posuere ultricies vitae sit amet turpis. Duis maximus odio nulla. Donec gravida sem tristique tempus scelerisque. + +Interdum et malesuada fames ac ante ipsum primis in faucibus. Fusce pharetra magna vulputate aliquet tempus. Duis id hendrerit arcu. Quisque ut ex elit. Integer velit orci, venenatis ut sapien ac, placerat porttitor dui. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc hendrerit cursus diam, hendrerit finibus ipsum scelerisque ut. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. + +Nulla non euismod neque. Phasellus vel sapien eu metus pulvinar rhoncus. Suspendisse eu mollis tellus, quis vestibulum tortor. Maecenas interdum dolor sed nulla fermentum maximus. Donec imperdiet ullamcorper condimentum. Nam quis nibh ante. Praesent quis tellus ut tortor pulvinar blandit sit amet ut sapien. Vestibulum est orci, pellentesque vitae tristique sit amet, tristique non felis. + +Vivamus sodales pellentesque varius. Sed vel tempus ligula. Nulla tristique nisl vel dui facilisis, ac sodales augue hendrerit. Proin augue nisi, vestibulum quis augue nec, sagittis tincidunt velit. Vestibulum euismod, nulla nec sodales faucibus, urna sapien vulputate magna, id varius metus sapien ut neque. Duis in mollis urna, in scelerisque enim. 
Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Nunc condimentum dictum turpis, et egestas neque dapibus eget. Quisque fringilla, dui eu venenatis eleifend, erat nibh lacinia urna, at lacinia lacus sapien eu dui. Duis eu erat ut mi lacinia convallis a sed ex. + +Fusce elit metus, tincidunt nec eleifend a, hendrerit nec ligula. Duis placerat finibus sollicitudin. In euismod porta tellus, in luctus justo bibendum bibendum. Maecenas at magna eleifend lectus tincidunt suscipit ut a ligula. Nulla tempor accumsan felis, fermentum dapibus est eleifend vitae. Mauris urna sem, fringilla at ultricies non, ultrices in arcu. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Nam vehicula nunc at laoreet imperdiet. Nunc tristique ut risus id aliquet. Integer eleifend massa orci. + +Vestibulum sed ante sollicitudin nisi fringilla bibendum nec vel quam. Sed pretium augue eu ligula congue pulvinar. Donec vitae magna tincidunt, pharetra lacus id, convallis nulla. Cras viverra nisl nisl, varius convallis leo vulputate nec. Morbi at consequat dui, sed aliquet metus. Sed suscipit fermentum mollis. Maecenas nec mi sodales, tincidunt purus in, tristique mauris. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec interdum mi in velit efficitur, quis ultrices ex imperdiet. Sed vestibulum, magna ut tristique pretium, mi ipsum placerat tellus, non tempor enim augue et ex. Pellentesque eget felis quis ante sodales viverra ac sed lacus. Donec suscipit tempus massa, eget laoreet massa molestie at. + +Aenean fringilla dui non aliquet consectetur. Fusce cursus quam nec orci hendrerit faucibus. Donec consequat suscipit enim, non volutpat lectus auctor interdum. Proin lorem purus, maximus vel orci vitae, suscipit egestas turpis. Donec risus urna, congue a sem eu, aliquet placerat odio. Morbi gravida tristique turpis, quis efficitur enim. 
Nunc interdum gravida ipsum vel facilisis. Nunc congue finibus sollicitudin. Quisque euismod aliquet lectus et tincidunt. Curabitur ultrices sem ut mi fringilla fermentum. Morbi pretium, nisi sit amet dapibus congue, dolor enim consectetur risus, a interdum ligula odio sed odio. Quisque facilisis, mi at suscipit gravida, nunc sapien cursus justo, ut luctus odio nulla quis leo. Integer condimentum lobortis mauris, non egestas tellus lobortis sit amet. + +In sollicitudin velit ac ante vehicula, vitae varius tortor mollis. In hac habitasse platea dictumst. Quisque et orci lorem. Integer malesuada fringilla luctus. Pellentesque malesuada, mi non lobortis porttitor, ante ligula vulputate ante, nec dictum risus eros sit amet sapien. Nulla aliquam lorem libero, ac varius nulla tristique eget. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut pellentesque mauris orci, vel consequat mi varius a. Ut sit amet elit vulputate, lacinia metus non, fermentum nisl. Pellentesque eu nisi sed quam egestas blandit. Duis sit amet lobortis dolor. Donec consectetur sem interdum, tristique elit sit amet, sodales lacus. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Fusce id aliquam augue. Sed pretium congue risus vitae lacinia. Vestibulum non vulputate risus, ut malesuada justo. + +Sed odio elit, consectetur ac mauris quis, consequat commodo libero. Fusce sodales velit vulputate pulvinar fermentum. Donec iaculis nec nisl eget faucibus. Mauris at dictum velit. Donec fermentum lectus eu viverra volutpat. Aliquam consequat facilisis lorem, cursus consequat dui bibendum ullamcorper. Pellentesque nulla magna, imperdiet at magna et, cursus egestas enim. Nullam semper molestie lectus sit amet semper. Duis eget tincidunt est. Integer id neque risus. Integer ultricies hendrerit vestibulum. Donec blandit blandit sagittis. Nunc consectetur vitae nisi consectetur volutpat. 
+ +Nulla id lorem fermentum, efficitur magna a, hendrerit dui. Vivamus sagittis orci gravida, bibendum quam eget, molestie est. Phasellus nec enim tincidunt, volutpat sapien non, laoreet diam. Nulla posuere enim nec porttitor lobortis. Donec auctor odio ut orci eleifend, ut eleifend purus convallis. Interdum et malesuada fames ac ante ipsum primis in faucibus. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Ut hendrerit, purus eget viverra tincidunt, sem magna imperdiet libero, et aliquam turpis neque vitae elit. Maecenas semper varius iaculis. Cras non lorem quis quam bibendum eleifend in et libero. Curabitur at purus mauris. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus porta diam sed elit eleifend gravida. + +Nulla facilisi. Ut ultricies diam vel diam consectetur, vel porta augue molestie. Fusce interdum sapien et metus facilisis pellentesque. Nulla convallis sem at nunc vehicula facilisis. Nam ac rutrum purus. Nunc bibendum, dolor sit amet tempus ullamcorper, lorem leo tempor sem, id fringilla nunc augue scelerisque augue. Nullam sit amet rutrum nisl. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Donec sed mauris gravida eros vehicula sagittis at eget orci. Cras elementum, eros at accumsan bibendum, libero neque blandit purus, vitae vestibulum libero massa ac nibh. Integer at placerat nulla. Mauris eu eleifend orci. Aliquam consequat ligula vitae erat porta lobortis. Duis fermentum elit ac aliquet ornare. + +Mauris eget cursus tellus, eget sodales purus. Aliquam malesuada, augue id vulputate finibus, nisi ex bibendum nisl, sit amet laoreet quam urna a dolor. Nullam ultricies, sapien eu laoreet consequat, erat eros dignissim diam, ultrices sodales lectus mauris et leo. Morbi lacinia eu ante at tempus. Sed iaculis finibus magna malesuada efficitur. Donec faucibus erat sit amet elementum feugiat. Praesent a placerat nisi. Etiam lacinia gravida diam, et sollicitudin sapien tincidunt ut. 
+ +Maecenas felis quam, tincidunt vitae venenatis scelerisque, viverra vitae odio. Phasellus enim neque, ultricies suscipit malesuada sit amet, vehicula sit amet purus. Nulla placerat sit amet dui vel tincidunt. Nam quis neque vel magna commodo egestas. Vestibulum sagittis rutrum lorem ut congue. Maecenas vel ultrices tellus. Donec efficitur, urna ac consequat iaculis, lorem felis pharetra eros, eget faucibus orci lectus sit amet arcu. + +Ut a tempus nisi. Nulla facilisi. Praesent vulputate maximus mi et dapibus. Sed sit amet libero ac augue hendrerit efficitur in a sapien. Mauris placerat velit sit amet tellus sollicitudin faucibus. Donec egestas a magna ac suscipit. Duis enim sapien, mollis sed egestas et, vestibulum vel leo. + +Proin quis dapibus dui. Donec eu tincidunt nunc. Vivamus eget purus consectetur, maximus ante vitae, tincidunt elit. Aenean mattis dolor a gravida aliquam. Praesent quis tellus id sem maximus vulputate nec sed nulla. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur metus nulla, volutpat volutpat est eu, hendrerit congue erat. Aliquam sollicitudin augue ante. Sed sollicitudin, magna eu consequat elementum, mi augue ullamcorper felis, molestie imperdiet erat metus iaculis est. Proin ac tortor nisi. Pellentesque quis nisi risus. Integer enim sapien, tincidunt quis tortor id, accumsan venenatis mi. Nulla facilisi. + +Cras pretium sit amet quam congue maximus. Morbi lacus libero, imperdiet commodo massa sed, scelerisque placerat libero. Cras nisl nisi, consectetur sed bibendum eu, venenatis at enim. Proin sodales justo at quam aliquam, a consectetur mi ornare. Donec porta ac est sit amet efficitur. Suspendisse vestibulum tortor id neque imperdiet, id lacinia risus vehicula. Phasellus ac eleifend purus. Mauris vel gravida ante. Aliquam vitae lobortis risus. Sed vehicula consectetur tincidunt. Nam et justo vitae purus molestie consequat. Pellentesque ipsum ex, convallis quis blandit non, gravida et urna. Donec diam ligula amet. 
diff --git a/pkg/sentry/fsimpl/ext/assets/file.txt b/pkg/sentry/fsimpl/ext/assets/file.txt new file mode 100644 index 000000000..980a0d5f1 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/assets/file.txt @@ -0,0 +1 @@ +Hello World! diff --git a/pkg/sentry/fsimpl/ext/assets/symlink.txt b/pkg/sentry/fsimpl/ext/assets/symlink.txt new file mode 120000 index 000000000..4c330738c --- /dev/null +++ b/pkg/sentry/fsimpl/ext/assets/symlink.txt @@ -0,0 +1 @@ +file.txt
\ No newline at end of file diff --git a/pkg/sentry/fsimpl/ext/assets/tiny.ext2 b/pkg/sentry/fsimpl/ext/assets/tiny.ext2 Binary files differnew file mode 100644 index 000000000..381ade9bf --- /dev/null +++ b/pkg/sentry/fsimpl/ext/assets/tiny.ext2 diff --git a/pkg/sentry/fsimpl/ext/assets/tiny.ext3 b/pkg/sentry/fsimpl/ext/assets/tiny.ext3 Binary files differnew file mode 100644 index 000000000..0e97a324c --- /dev/null +++ b/pkg/sentry/fsimpl/ext/assets/tiny.ext3 diff --git a/pkg/sentry/fsimpl/ext/assets/tiny.ext4 b/pkg/sentry/fsimpl/ext/assets/tiny.ext4 Binary files differnew file mode 100644 index 000000000..a6859736d --- /dev/null +++ b/pkg/sentry/fsimpl/ext/assets/tiny.ext4 diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD new file mode 100644 index 000000000..6c5a559fd --- /dev/null +++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD @@ -0,0 +1,17 @@ +load("//tools:defs.bzl", "go_test") + +package(licenses = ["notice"]) + +go_test( + name = "benchmark_test", + size = "small", + srcs = ["benchmark_test.go"], + deps = [ + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/contexttest", + "//pkg/sentry/fsimpl/ext", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + ], +) diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go new file mode 100644 index 000000000..89caee3df --- /dev/null +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -0,0 +1,206 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// These benchmarks emulate memfs benchmarks. Ext4 images must be created +// before this benchmark is run using the `make_deep_ext4.sh` script at +// /tmp/image-{depth}.ext4 for all the depths tested below. +// +// The benchmark itself cannot run the script because the script requires +// sudo privileges to create the file system images. +package benchmark_test + +import ( + "fmt" + "os" + "runtime" + "strings" + "testing" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +var depths = []int{1, 2, 3, 8, 64, 100} + +const filename = "file.txt" + +// setUp opens imagePath as an ext Filesystem and returns all necessary +// elements required to run tests. If error is nil, it also returns a tear +// down function which must be called after the test is run for clean up. +func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesystem, *vfs.VirtualDentry, func(), error) { + f, err := os.Open(imagePath) + if err != nil { + return nil, nil, nil, nil, err + } + + ctx := contexttest.Context(b) + creds := auth.CredentialsFromContext(ctx) + + // Create VFS. 
+ vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + return nil, nil, nil, nil, err + } + vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) + if err != nil { + f.Close() + return nil, nil, nil, nil, err + } + + root := mntns.Root() + + tearDown := func() { + root.DecRef() + + if err := f.Close(); err != nil { + b.Fatalf("tearDown failed: %v", err) + } + } + return ctx, vfsObj, &root, tearDown, nil +} + +// mount mounts extfs at the path operation passed. Returns a tear down +// function which must be called after the test is run for clean up. +func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vfs.PathOperation) func() { + b.Helper() + + f, err := os.Open(imagePath) + if err != nil { + b.Fatalf("could not open image at %s: %v", imagePath, err) + } + + ctx := contexttest.Context(b) + creds := auth.CredentialsFromContext(ctx) + + if err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: int(f.Fd()), + }, + }); err != nil { + b.Fatalf("failed to mount tmpfs submount: %v", err) + } + return func() { + if err := f.Close(); err != nil { + b.Fatalf("tearDown failed: %v", err) + } + } +} + +// BenchmarkVFS2Ext4fsStat emulates BenchmarkVFS2MemfsStat. 
+func BenchmarkVFS2Ext4fsStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + ctx, vfsfs, root, tearDown, err := setUp(b, fmt.Sprintf("/tmp/image-%d.ext4", depth)) + if err != nil { + b.Fatalf("setUp failed: %v", err) + } + defer tearDown() + + creds := auth.CredentialsFromContext(ctx) + var filePathBuilder strings.Builder + filePathBuilder.WriteByte('/') + for i := 1; i <= depth; i++ { + filePathBuilder.WriteString(fmt.Sprintf("%d", i)) + filePathBuilder.WriteByte('/') + } + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{ + Root: *root, + Start: *root, + Path: fspath.Parse(filePath), + FollowFinalSymlink: true, + }, &vfs.StatOptions{}) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + // Sanity check. + if stat.Size > 0 { + b.Fatalf("got wrong file size (%d)", stat.Size) + } + } + }) + } +} + +// BenchmarkVFS2ExtfsMountStat emulates BenchmarkVFS2MemfsMountStat. +func BenchmarkVFS2ExtfsMountStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + // Create root extfs with depth 1 so we can mount extfs again at /1/. + ctx, vfsfs, root, tearDown, err := setUp(b, fmt.Sprintf("/tmp/image-%d.ext4", 1)) + if err != nil { + b.Fatalf("setUp failed: %v", err) + } + defer tearDown() + + creds := auth.CredentialsFromContext(ctx) + mountPointName := "/1/" + pop := vfs.PathOperation{ + Root: *root, + Start: *root, + Path: fspath.Parse(mountPointName), + } + + // Save the mount point for later use. + mountPoint, err := vfsfs.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + b.Fatalf("failed to walk to mount point: %v", err) + } + defer mountPoint.DecRef() + + // Create extfs submount. 
+ mountTearDown := mount(b, fmt.Sprintf("/tmp/image-%d.ext4", depth), vfsfs, &pop) + defer mountTearDown() + + var filePathBuilder strings.Builder + filePathBuilder.WriteString(mountPointName) + for i := 1; i <= depth; i++ { + filePathBuilder.WriteString(fmt.Sprintf("%d", i)) + filePathBuilder.WriteByte('/') + } + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{ + Root: *root, + Start: *root, + Path: fspath.Parse(filePath), + FollowFinalSymlink: true, + }, &vfs.StatOptions{}) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + // Sanity check. touch(1) always creates files of size 0 (empty). + if stat.Size > 0 { + b.Fatalf("got wrong file size (%d)", stat.Size) + } + } + }) + } +} diff --git a/pkg/sentry/fsimpl/ext/benchmark/make_deep_ext4.sh b/pkg/sentry/fsimpl/ext/benchmark/make_deep_ext4.sh new file mode 100755 index 000000000..d0910da1f --- /dev/null +++ b/pkg/sentry/fsimpl/ext/benchmark/make_deep_ext4.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# Copyright 2019 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script creates an ext4 image with $1 depth of directories and a file in +# the inner most directory. The created file is at path /1/2/.../depth/file.txt. +# The ext4 image is written to $2. The image is temporarily mounted at +# /tmp/mountpoint. 
This script must be run with sudo privileges. + +# Usage: +# sudo bash make_deep_ext4.sh {depth} {output path} + +# Check positional arguments. +if [ "$#" -ne 2 ]; then + echo "Usage: sudo bash make_deep_ext4.sh {depth} {output path}" + exit 1 +fi + +# Make sure depth is a non-negative number. +if ! [[ "$1" =~ ^[0-9]+$ ]]; then + echo "Depth must be a non-negative number." + exit 1 +fi + +# Create a 1 MB filesystem image at the requested output path. +rm -f $2 +fallocate -l 1M $2 +if [ $? -ne 0 ]; then + echo "fallocate failed" + exit $? +fi + +# Convert that blank into an ext4 image. +mkfs.ext4 -j $2 +if [ $? -ne 0 ]; then + echo "mkfs.ext4 failed" + exit $? +fi + +# Mount the image. +MOUNTPOINT=/tmp/mountpoint +mkdir -p $MOUNTPOINT +mount -o loop $2 $MOUNTPOINT +if [ $? -ne 0 ]; then + echo "mount failed" + exit $? +fi + +# Create nested directories and the file. +if [ "$1" -eq 0 ]; then + FILEPATH=$MOUNTPOINT/file.txt +else + FILEPATH=$MOUNTPOINT/$(seq -s '/' 1 $1)/file.txt +fi +mkdir -p $(dirname $FILEPATH) || exit +touch $FILEPATH + +# Clean up. +umount $MOUNTPOINT +rm -rf $MOUNTPOINT diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go new file mode 100644 index 000000000..8bb104ff0 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/block_map_file.go @@ -0,0 +1,201 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package ext + +import ( + "io" + "math" + + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/syserror" +) + +const ( + // numDirectBlks is the number of direct blocks in ext block map inodes. + numDirectBlks = 12 +) + +// blockMapFile is a type of regular file which uses direct/indirect block +// addressing to store file data. This was deprecated in ext4. +type blockMapFile struct { + regFile regularFile + + // directBlks are the direct blocks numbers. The physical blocks pointed by + // these holds file data. Contains file blocks 0 to 11. + directBlks [numDirectBlks]uint32 + + // indirectBlk is the physical block which contains (blkSize/4) direct block + // numbers (as uint32 integers). + indirectBlk uint32 + + // doubleIndirectBlk is the physical block which contains (blkSize/4) indirect + // block numbers (as uint32 integers). + doubleIndirectBlk uint32 + + // tripleIndirectBlk is the physical block which contains (blkSize/4) doubly + // indirect block numbers (as uint32 integers). + tripleIndirectBlk uint32 + + // coverage at (i)th index indicates the amount of file data a node at + // height (i) covers. Height 0 is the direct block. + coverage [4]uint64 +} + +// Compiles only if blockMapFile implements io.ReaderAt. +var _ io.ReaderAt = (*blockMapFile)(nil) + +// newBlockMapFile is the blockMapFile constructor. It initializes the file to +// physical blocks map with (at most) the first 12 (direct) blocks. 
+func newBlockMapFile(args inodeArgs) (*blockMapFile, error) { + file := &blockMapFile{} + file.regFile.impl = file + file.regFile.inode.init(args, &file.regFile) + + for i := uint(0); i < 4; i++ { + file.coverage[i] = getCoverage(file.regFile.inode.blkSize, i) + } + + blkMap := file.regFile.inode.diskInode.Data() + binary.Unmarshal(blkMap[:numDirectBlks*4], binary.LittleEndian, &file.directBlks) + binary.Unmarshal(blkMap[numDirectBlks*4:(numDirectBlks+1)*4], binary.LittleEndian, &file.indirectBlk) + binary.Unmarshal(blkMap[(numDirectBlks+1)*4:(numDirectBlks+2)*4], binary.LittleEndian, &file.doubleIndirectBlk) + binary.Unmarshal(blkMap[(numDirectBlks+2)*4:(numDirectBlks+3)*4], binary.LittleEndian, &file.tripleIndirectBlk) + return file, nil +} + +// ReadAt implements io.ReaderAt.ReadAt. +func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) { + if len(dst) == 0 { + return 0, nil + } + + if off < 0 { + return 0, syserror.EINVAL + } + + offset := uint64(off) + size := f.regFile.inode.diskInode.Size() + if offset >= size { + return 0, io.EOF + } + + // dirBlksEnd is the file offset until which direct blocks cover file data. + // Direct blocks cover 0 <= file offset < dirBlksEnd. + dirBlksEnd := numDirectBlks * f.coverage[0] + + // indirBlkEnd is the file offset until which the indirect block covers file + // data. The indirect block covers dirBlksEnd <= file offset < indirBlkEnd. + indirBlkEnd := dirBlksEnd + f.coverage[1] + + // doubIndirBlkEnd is the file offset until which the double indirect block + // covers file data. The double indirect block covers the range + // indirBlkEnd <= file offset < doubIndirBlkEnd. + doubIndirBlkEnd := indirBlkEnd + f.coverage[2] + + read := 0 + toRead := len(dst) + if uint64(toRead)+offset > size { + toRead = int(size - offset) + } + for read < toRead { + var err error + var curR int + + // Figure out which block to delegate the read to. + switch { + case offset < dirBlksEnd: + // Direct block. 
+ curR, err = f.read(f.directBlks[offset/f.regFile.inode.blkSize], offset%f.regFile.inode.blkSize, 0, dst[read:]) + case offset < indirBlkEnd: + // Indirect block. + curR, err = f.read(f.indirectBlk, offset-dirBlksEnd, 1, dst[read:]) + case offset < doubIndirBlkEnd: + // Doubly indirect block. + curR, err = f.read(f.doubleIndirectBlk, offset-indirBlkEnd, 2, dst[read:]) + default: + // Triply indirect block. + curR, err = f.read(f.tripleIndirectBlk, offset-doubIndirBlkEnd, 3, dst[read:]) + } + + read += curR + offset += uint64(curR) + if err != nil { + return read, err + } + } + + if read < len(dst) { + return read, io.EOF + } + return read, nil +} + +// read is the recursive step of the ReadAt function. It relies on knowing the +// current node's location on disk (curPhyBlk) and its height in the block map +// tree. A height of 0 shows that the current node is actually holding file +// data. relFileOff tells the offset from which we need to start to reading +// under the current node. It is completely relative to the current node. +func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, dst []byte) (int, error) { + curPhyBlkOff := int64(curPhyBlk) * int64(f.regFile.inode.blkSize) + if height == 0 { + toRead := int(f.regFile.inode.blkSize - relFileOff) + if len(dst) < toRead { + toRead = len(dst) + } + + n, _ := f.regFile.inode.fs.dev.ReadAt(dst[:toRead], curPhyBlkOff+int64(relFileOff)) + if n < toRead { + return n, syserror.EIO + } + return n, nil + } + + childCov := f.coverage[height-1] + startIdx := relFileOff / childCov + endIdx := f.regFile.inode.blkSize / 4 // This is exclusive. + wantEndIdx := (relFileOff + uint64(len(dst))) / childCov + wantEndIdx++ // Make this exclusive. 
+ if wantEndIdx < endIdx { + endIdx = wantEndIdx + } + + read := 0 + curChildOff := relFileOff % childCov + for i := startIdx; i < endIdx; i++ { + var childPhyBlk uint32 + err := readFromDisk(f.regFile.inode.fs.dev, curPhyBlkOff+int64(i*4), &childPhyBlk) + if err != nil { + return read, err + } + + n, err := f.read(childPhyBlk, curChildOff, height-1, dst[read:]) + read += n + if err != nil { + return read, err + } + + curChildOff = 0 + } + + return read, nil +} + +// getCoverage returns the number of bytes a node at the given height covers. +// Height 0 is the file data block itself. Height 1 is the indirect block. +// +// Formula: blkSize * ((blkSize / 4)^height) +func getCoverage(blkSize uint64, height uint) uint64 { + return blkSize * uint64(math.Pow(float64(blkSize/4), float64(height))) +} diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go new file mode 100644 index 000000000..6fa84e7aa --- /dev/null +++ b/pkg/sentry/fsimpl/ext/block_map_test.go @@ -0,0 +1,156 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "bytes" + "math/rand" + "testing" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" +) + +// These consts are for mocking the block map tree. 
+const ( + mockBMBlkSize = uint32(16) + mockBMDiskSize = 2500 +) + +// TestBlockMapReader stress tests block map reader functionality. It performs +// random length reads from all possible positions in the block map structure. +func TestBlockMapReader(t *testing.T) { + mockBMFile, want := blockMapSetUp(t) + n := len(want) + + for from := 0; from < n; from++ { + got := make([]byte, n-from) + + if read, err := mockBMFile.ReadAt(got, int64(from)); err != nil { + t.Fatalf("file read operation from offset %d to %d only read %d bytes: %v", from, n, read, err) + } + + if diff := cmp.Diff(got, want[from:]); diff != "" { + t.Fatalf("file data from offset %d to %d mismatched (-want +got):\n%s", from, n, diff) + } + } +} + +// blkNumGen is a number generator which gives block numbers for building the +// block map file on disk. It gives unique numbers in a random order which +// facilitates in creating an extremely fragmented filesystem. +type blkNumGen struct { + nums []uint32 +} + +// newBlkNumGen is the blkNumGen constructor. +func newBlkNumGen() *blkNumGen { + blkNums := &blkNumGen{} + lim := mockBMDiskSize / mockBMBlkSize + blkNums.nums = make([]uint32, lim) + for i := uint32(0); i < lim; i++ { + blkNums.nums[i] = i + } + + rand.Shuffle(int(lim), func(i, j int) { + blkNums.nums[i], blkNums.nums[j] = blkNums.nums[j], blkNums.nums[i] + }) + return blkNums +} + +// next returns the next random block number. +func (n *blkNumGen) next() uint32 { + ret := n.nums[0] + n.nums = n.nums[1:] + return ret +} + +// blockMapSetUp creates a mock disk and a block map file. It initializes the +// block map file with 12 direct block, 1 indirect block, 1 double indirect +// block and 1 triple indirect block (basically fill it till the rim). It +// initializes the disk to reflect the inode. Also returns the file data that +// the inode covers and that is written to disk. 
+func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) { + mockDisk := make([]byte, mockBMDiskSize) + var fileData []byte + blkNums := newBlkNumGen() + var data []byte + + // Write the direct blocks. + for i := 0; i < numDirectBlks; i++ { + curBlkNum := blkNums.next() + data = binary.Marshal(data, binary.LittleEndian, curBlkNum) + fileData = append(fileData, writeFileDataToBlock(mockDisk, curBlkNum, 0, blkNums)...) + } + + // Write to indirect block. + indirectBlk := blkNums.next() + data = binary.Marshal(data, binary.LittleEndian, indirectBlk) + fileData = append(fileData, writeFileDataToBlock(mockDisk, indirectBlk, 1, blkNums)...) + + // Write to indirect block. + doublyIndirectBlk := blkNums.next() + data = binary.Marshal(data, binary.LittleEndian, doublyIndirectBlk) + fileData = append(fileData, writeFileDataToBlock(mockDisk, doublyIndirectBlk, 2, blkNums)...) + + // Write to indirect block. + triplyIndirectBlk := blkNums.next() + data = binary.Marshal(data, binary.LittleEndian, triplyIndirectBlk) + fileData = append(fileData, writeFileDataToBlock(mockDisk, triplyIndirectBlk, 3, blkNums)...) + + args := inodeArgs{ + fs: &filesystem{ + dev: bytes.NewReader(mockDisk), + }, + diskInode: &disklayout.InodeNew{ + InodeOld: disklayout.InodeOld{ + SizeLo: getMockBMFileFize(), + }, + }, + blkSize: uint64(mockBMBlkSize), + } + copy(args.diskInode.Data(), data) + + mockFile, err := newBlockMapFile(args) + if err != nil { + t.Fatalf("newBlockMapFile failed: %v", err) + } + return mockFile, fileData +} + +// writeFileDataToBlock writes random bytes to the block on disk. 
+func writeFileDataToBlock(disk []byte, blkNum uint32, height uint, blkNums *blkNumGen) []byte { + if height == 0 { + start := blkNum * mockBMBlkSize + end := start + mockBMBlkSize + rand.Read(disk[start:end]) + return disk[start:end] + } + + var fileData []byte + for off := blkNum * mockBMBlkSize; off < (blkNum+1)*mockBMBlkSize; off += 4 { + curBlkNum := blkNums.next() + copy(disk[off:off+4], binary.Marshal(nil, binary.LittleEndian, curBlkNum)) + fileData = append(fileData, writeFileDataToBlock(disk, curBlkNum, height-1, blkNums)...) + } + return fileData +} + +// getMockBMFileFize gets the size of the mock block map file which is used for +// testing. +func getMockBMFileFize() uint32 { + return uint32(numDirectBlks*getCoverage(uint64(mockBMBlkSize), 0) + getCoverage(uint64(mockBMBlkSize), 1) + getCoverage(uint64(mockBMBlkSize), 2) + getCoverage(uint64(mockBMBlkSize), 3)) +} diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go new file mode 100644 index 000000000..55902322a --- /dev/null +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -0,0 +1,79 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// dentry implements vfs.DentryImpl. +type dentry struct { + vfsd vfs.Dentry + + // Protected by filesystem.mu. + parent *dentry + name string + + // inode is the inode represented by this dentry. 
Multiple Dentries may + // share a single non-directory Inode (with hard links). inode is + // immutable. + inode *inode +} + +// Compiles only if dentry implements vfs.DentryImpl. +var _ vfs.DentryImpl = (*dentry)(nil) + +// newDentry is the dentry constructor. +func newDentry(in *inode) *dentry { + d := &dentry{ + inode: in, + } + d.vfsd.Init(d) + return d +} + +// IncRef implements vfs.DentryImpl.IncRef. +func (d *dentry) IncRef() { + d.inode.incRef() +} + +// TryIncRef implements vfs.DentryImpl.TryIncRef. +func (d *dentry) TryIncRef() bool { + return d.inode.tryIncRef() +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *dentry) DecRef() { + // FIXME(b/134676337): filesystem.mu may not be locked as required by + // inode.decRef(). + d.inode.decRef() +} + +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +// +// TODO(b/134676337): Implement inotify. +func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {} + +// Watches implements vfs.DentryImpl.Watches. +// +// TODO(b/134676337): Implement inotify. +func (d *dentry) Watches() *vfs.Watches { + return nil +} + +// OnZeroWatches implements vfs.Dentry.OnZeroWatches. +// +// TODO(b/134676337): Implement inotify. +func (d *dentry) OnZeroWatches() {} diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go new file mode 100644 index 000000000..357512c7e --- /dev/null +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -0,0 +1,318 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fs" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// directory represents a directory inode. It holds the childList in memory. +type directory struct { + inode inode + + // childCache maps filenames to dentries for children for which dentries + // have been instantiated. childCache is protected by filesystem.mu. + childCache map[string]*dentry + + // mu serializes the changes to childList. + // Lock Order (outermost locks must be taken first): + // directory.mu + // filesystem.mu + mu sync.Mutex + + // childList is a list containing (1) child dirents and (2) fake dirents + // (with diskDirent == nil) that represent the iteration position of + // directoryFDs. childList is used to support directoryFD.IterDirents() + // efficiently. childList is protected by mu. + childList direntList + + // childMap maps the child's filename to the dirent structure stored in + // childList. This adds some data replication but helps in faster path + // traversal. For consistency, key == childMap[key].diskDirent.FileName(). + // Immutable. + childMap map[string]*dirent +} + +// newDirectory is the directory constructor. +func newDirectory(args inodeArgs, newDirent bool) (*directory, error) { + file := &directory{ + childCache: make(map[string]*dentry), + childMap: make(map[string]*dirent), + } + file.inode.init(args, file) + + // Initialize childList by reading dirents from the underlying file. + if args.diskInode.Flags().Index { + // TODO(b/134676337): Support hash tree directories. Currently only the '.' + // and '..' 
entries are read in. + + // Users cannot navigate this hash tree directory yet. + log.Warningf("hash tree directory being used which is unsupported") + return file, nil + } + + // The dirents are organized in a linear array in the file data. + // Extract the file data and decode the dirents. + regFile, err := newRegularFile(args) + if err != nil { + return nil, err + } + + // buf is used as scratch space for reading in dirents from disk and + // unmarshalling them into dirent structs. + buf := make([]byte, disklayout.DirentSize) + size := args.diskInode.Size() + for off, inc := uint64(0), uint64(0); off < size; off += inc { + toRead := size - off + if toRead > disklayout.DirentSize { + toRead = disklayout.DirentSize + } + if n, err := regFile.impl.ReadAt(buf[:toRead], int64(off)); uint64(n) < toRead { + return nil, err + } + + var curDirent dirent + if newDirent { + curDirent.diskDirent = &disklayout.DirentNew{} + } else { + curDirent.diskDirent = &disklayout.DirentOld{} + } + binary.Unmarshal(buf, binary.LittleEndian, curDirent.diskDirent) + + if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 { + // Inode number and name length fields being set to 0 is used to indicate + // an unused dirent. + file.childList.PushBack(&curDirent) + file.childMap[curDirent.diskDirent.FileName()] = &curDirent + } + + // The next dirent is placed exactly after this dirent record on disk. + inc = uint64(curDirent.diskDirent.RecordSize()) + } + + return file, nil +} + +func (i *inode) isDir() bool { + _, ok := i.impl.(*directory) + return ok +} + +// dirent is the directory.childList node. +type dirent struct { + diskDirent disklayout.Dirent + + // direntEntry links dirents into their parent directory.childList. + direntEntry +} + +// directoryFD represents a directory file description. It implements +// vfs.FileDescriptionImpl. +type directoryFD struct { + fileDescription + vfs.DirectoryFileDescriptionDefaultImpl + + // Protected by directory.mu. 
+ iter *dirent + off int64 +} + +// Compiles only if directoryFD implements vfs.FileDescriptionImpl. +var _ vfs.FileDescriptionImpl = (*directoryFD)(nil) + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *directoryFD) Release() { + if fd.iter == nil { + return + } + + dir := fd.inode().impl.(*directory) + dir.mu.Lock() + dir.childList.Remove(fd.iter) + dir.mu.Unlock() + fd.iter = nil +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + extfs := fd.filesystem() + dir := fd.inode().impl.(*directory) + + dir.mu.Lock() + defer dir.mu.Unlock() + + // Ensure that fd.iter exists and is not linked into dir.childList. + var child *dirent + if fd.iter == nil { + // Start iteration at the beginning of dir. + child = dir.childList.Front() + fd.iter = &dirent{} + } else { + // Continue iteration from where we left off. + child = fd.iter.Next() + dir.childList.Remove(fd.iter) + } + for ; child != nil; child = child.Next() { + // Skip other directoryFD iterators. + if child.diskDirent != nil { + childType, ok := child.diskDirent.FileType() + if !ok { + // We will need to read the inode off disk. Do not increment + // ref count here because this inode is not being added to the + // dentry tree. + extfs.mu.Lock() + childInode, err := extfs.getOrCreateInodeLocked(child.diskDirent.Inode()) + extfs.mu.Unlock() + if err != nil { + // Usage of the file description after the error is + // undefined. This implementation would continue reading + // from the next dirent. 
+ fd.off++ + dir.childList.InsertAfter(child, fd.iter) + return err + } + childType = fs.ToInodeType(childInode.diskInode.Mode().FileType()) + } + + if err := cb.Handle(vfs.Dirent{ + Name: child.diskDirent.FileName(), + Type: fs.ToDirentType(childType), + Ino: uint64(child.diskDirent.Inode()), + NextOff: fd.off + 1, + }); err != nil { + dir.childList.InsertBefore(child, fd.iter) + return err + } + fd.off++ + } + } + dir.childList.PushBack(fd.iter) + return nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + if whence != linux.SEEK_SET && whence != linux.SEEK_CUR { + return 0, syserror.EINVAL + } + + dir := fd.inode().impl.(*directory) + + dir.mu.Lock() + defer dir.mu.Unlock() + + // Find resulting offset. + if whence == linux.SEEK_CUR { + offset += fd.off + } + + if offset < 0 { + // lseek(2) specifies that EINVAL should be returned if the resulting offset + // is negative. + return 0, syserror.EINVAL + } + + n := int64(len(dir.childMap)) + realWantOff := offset + if realWantOff > n { + realWantOff = n + } + realCurOff := fd.off + if realCurOff > n { + realCurOff = n + } + + // Ensure that fd.iter exists and is linked into dir.childList so we can + // intelligently seek from the optimal position. + if fd.iter == nil { + fd.iter = &dirent{} + dir.childList.PushFront(fd.iter) + } + + // Guess that iterating from the current position is optimal. + child := fd.iter + diff := realWantOff - realCurOff // Shows direction and magnitude of travel. + + // See if starting from the beginning or end is better. + abDiff := diff + if diff < 0 { + abDiff = -diff + } + if abDiff > realWantOff { + // Starting from the beginning is best. + child = dir.childList.Front() + diff = realWantOff + } else if abDiff > (n - realWantOff) { + // Starting from the end is best. + child = dir.childList.Back() + // (n - 1) because the last non-nil dirent represents the (n-1)th offset. 
+ diff = realWantOff - (n - 1) + } + + for child != nil { + // Skip other directoryFD iterators. + if child.diskDirent != nil { + if diff == 0 { + if child != fd.iter { + dir.childList.Remove(fd.iter) + dir.childList.InsertBefore(child, fd.iter) + } + + fd.off = offset + return offset, nil + } + + if diff < 0 { + diff++ + child = child.Prev() + } else { + diff-- + child = child.Next() + } + continue + } + + if diff < 0 { + child = child.Prev() + } else { + child = child.Next() + } + } + + // Reaching here indicates that the offset is beyond the end of the childList. + dir.childList.Remove(fd.iter) + dir.childList.PushBack(fd.iter) + fd.off = offset + return offset, nil +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *directoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
+func (fd *directoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD new file mode 100644 index 000000000..9bd9c76c0 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD @@ -0,0 +1,47 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "disklayout", + srcs = [ + "block_group.go", + "block_group_32.go", + "block_group_64.go", + "dirent.go", + "dirent_new.go", + "dirent_old.go", + "disklayout.go", + "extent.go", + "inode.go", + "inode_new.go", + "inode_old.go", + "superblock.go", + "superblock_32.go", + "superblock_64.go", + "superblock_old.go", + "test_utils.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + ], +) + +go_test( + name = "disklayout_test", + size = "small", + srcs = [ + "block_group_test.go", + "dirent_test.go", + "extent_test.go", + "inode_test.go", + "superblock_test.go", + ], + library = ":disklayout", + deps = ["//pkg/sentry/kernel/time"], +) diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group.go b/pkg/sentry/fsimpl/ext/disklayout/block_group.go new file mode 100644 index 000000000..ad6f4fef8 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group.go @@ -0,0 +1,137 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// BlockGroup represents a Linux ext block group descriptor. An ext file system +// is split into a series of block groups. This provides an access layer to +// information needed to access and use a block group. +// +// Location: +// - The block group descriptor table is always placed in the blocks +// immediately after the block containing the superblock. +// - The 1st block group descriptor in the original table is in the +// (sb.FirstDataBlock() + 1)th block. +// - See SuperBlock docs to see where the block group descriptor table is +// replicated. +// - sb.BgDescSize() must be used as the block group descriptor entry size +// while reading the table from disk. +// +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors. +type BlockGroup interface { + // InodeTable returns the absolute block number of the block containing the + // inode table. This points to an array of Inode structs. Inode tables are + // statically allocated at mkfs time. The superblock records the number of + // inodes per group (length of this table) and the size of each inode struct. + InodeTable() uint64 + + // BlockBitmap returns the absolute block number of the block containing the + // block bitmap. This bitmap tracks the usage of data blocks within this block + // group and has its own checksum. + BlockBitmap() uint64 + + // InodeBitmap returns the absolute block number of the block containing the + // inode bitmap. 
This bitmap tracks the usage of this group's inode table + // entries and has its own checksum. + InodeBitmap() uint64 + + // ExclusionBitmap returns the absolute block number of the snapshot exclusion + // bitmap. + ExclusionBitmap() uint64 + + // FreeBlocksCount returns the number of free blocks in the group. + FreeBlocksCount() uint32 + + // FreeInodesCount returns the number of free inodes in the group. + FreeInodesCount() uint32 + + // DirectoryCount returns the number of inodes that represent directories + // under this block group. + DirectoryCount() uint32 + + // UnusedInodeCount returns the number of unused inodes beyond the last used + // inode in this group's inode table. As a result, we needn’t scan past the + // (InodesPerGroup - UnusedInodeCount())th entry in the inode table. + UnusedInodeCount() uint32 + + // BlockBitmapChecksum returns the block bitmap checksum. This is calculated + // using crc32c(FS UUID + group number + entire bitmap). + BlockBitmapChecksum() uint32 + + // InodeBitmapChecksum returns the inode bitmap checksum. This is calculated + // using crc32c(FS UUID + group number + entire bitmap). + InodeBitmapChecksum() uint32 + + // Checksum returns this block group's checksum. + // + // If SbMetadataCsum feature is set: + // - checksum is crc32c(FS UUID + group number + group descriptor + // structure) & 0xFFFF. + // + // If SbGdtCsum feature is set: + // - checksum is crc16(FS UUID + group number + group descriptor + // structure). + // + // SbMetadataCsum and SbGdtCsum should not be both set. + // If they are, Linux warns and asks to run fsck. + Checksum() uint16 + + // Flags returns BGFlags which represents the block group flags. + Flags() BGFlags +} + +// These are the different block group flags. +const ( + // BgInodeUninit indicates that inode table and bitmap are not initialized. + BgInodeUninit uint16 = 0x1 + + // BgBlockUninit indicates that block bitmap is not initialized. 
+ BgBlockUninit uint16 = 0x2 + + // BgInodeZeroed indicates that inode table is zeroed. + BgInodeZeroed uint16 = 0x4 +) + +// BGFlags represents all the different combinations of block group flags. +type BGFlags struct { + InodeUninit bool + BlockUninit bool + InodeZeroed bool +} + +// ToInt converts a BGFlags struct back to its 16-bit representation. +func (f BGFlags) ToInt() uint16 { + var res uint16 + + if f.InodeUninit { + res |= BgInodeUninit + } + if f.BlockUninit { + res |= BgBlockUninit + } + if f.InodeZeroed { + res |= BgInodeZeroed + } + + return res +} + +// BGFlagsFromInt converts the 16-bit flag representation to a BGFlags struct. +func BGFlagsFromInt(flags uint16) BGFlags { + return BGFlags{ + InodeUninit: flags&BgInodeUninit > 0, + BlockUninit: flags&BgBlockUninit > 0, + InodeZeroed: flags&BgInodeZeroed > 0, + } +} diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go new file mode 100644 index 000000000..3e16c76db --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go @@ -0,0 +1,72 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// BlockGroup32Bit emulates the first half of struct ext4_group_desc in +// fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and +// 32-bit ext4 filesystems. It implements BlockGroup interface. 
+type BlockGroup32Bit struct { + BlockBitmapLo uint32 + InodeBitmapLo uint32 + InodeTableLo uint32 + FreeBlocksCountLo uint16 + FreeInodesCountLo uint16 + UsedDirsCountLo uint16 + FlagsRaw uint16 + ExcludeBitmapLo uint32 + BlockBitmapChecksumLo uint16 + InodeBitmapChecksumLo uint16 + ItableUnusedLo uint16 + ChecksumRaw uint16 +} + +// Compiles only if BlockGroup32Bit implements BlockGroup. +var _ BlockGroup = (*BlockGroup32Bit)(nil) + +// InodeTable implements BlockGroup.InodeTable. +func (bg *BlockGroup32Bit) InodeTable() uint64 { return uint64(bg.InodeTableLo) } + +// BlockBitmap implements BlockGroup.BlockBitmap. +func (bg *BlockGroup32Bit) BlockBitmap() uint64 { return uint64(bg.BlockBitmapLo) } + +// InodeBitmap implements BlockGroup.InodeBitmap. +func (bg *BlockGroup32Bit) InodeBitmap() uint64 { return uint64(bg.InodeBitmapLo) } + +// ExclusionBitmap implements BlockGroup.ExclusionBitmap. +func (bg *BlockGroup32Bit) ExclusionBitmap() uint64 { return uint64(bg.ExcludeBitmapLo) } + +// FreeBlocksCount implements BlockGroup.FreeBlocksCount. +func (bg *BlockGroup32Bit) FreeBlocksCount() uint32 { return uint32(bg.FreeBlocksCountLo) } + +// FreeInodesCount implements BlockGroup.FreeInodesCount. +func (bg *BlockGroup32Bit) FreeInodesCount() uint32 { return uint32(bg.FreeInodesCountLo) } + +// DirectoryCount implements BlockGroup.DirectoryCount. +func (bg *BlockGroup32Bit) DirectoryCount() uint32 { return uint32(bg.UsedDirsCountLo) } + +// UnusedInodeCount implements BlockGroup.UnusedInodeCount. +func (bg *BlockGroup32Bit) UnusedInodeCount() uint32 { return uint32(bg.ItableUnusedLo) } + +// BlockBitmapChecksum implements BlockGroup.BlockBitmapChecksum. +func (bg *BlockGroup32Bit) BlockBitmapChecksum() uint32 { return uint32(bg.BlockBitmapChecksumLo) } + +// InodeBitmapChecksum implements BlockGroup.InodeBitmapChecksum. +func (bg *BlockGroup32Bit) InodeBitmapChecksum() uint32 { return uint32(bg.InodeBitmapChecksumLo) } + +// Checksum implements BlockGroup.Checksum. 
+func (bg *BlockGroup32Bit) Checksum() uint16 { return bg.ChecksumRaw } + +// Flags implements BlockGroup.Flags. +func (bg *BlockGroup32Bit) Flags() BGFlags { return BGFlagsFromInt(bg.FlagsRaw) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go new file mode 100644 index 000000000..9a809197a --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go @@ -0,0 +1,93 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// BlockGroup64Bit emulates struct ext4_group_desc in fs/ext4/ext4.h. +// It is the block group descriptor struct for 64-bit ext4 filesystems. +// It implements BlockGroup interface. It is an extension of the 32-bit +// version of BlockGroup. +type BlockGroup64Bit struct { + // We embed the 32-bit struct here because 64-bit version is just an extension + // of the 32-bit version. + BlockGroup32Bit + + // 64-bit specific fields. + BlockBitmapHi uint32 + InodeBitmapHi uint32 + InodeTableHi uint32 + FreeBlocksCountHi uint16 + FreeInodesCountHi uint16 + UsedDirsCountHi uint16 + ItableUnusedHi uint16 + ExcludeBitmapHi uint32 + BlockBitmapChecksumHi uint16 + InodeBitmapChecksumHi uint16 + _ uint32 // Padding to 64 bytes. +} + +// Compiles only if BlockGroup64Bit implements BlockGroup. +var _ BlockGroup = (*BlockGroup64Bit)(nil) + +// Methods to override. Checksum() and Flags() are not overridden. 
+ +// InodeTable implements BlockGroup.InodeTable. +func (bg *BlockGroup64Bit) InodeTable() uint64 { + return (uint64(bg.InodeTableHi) << 32) | uint64(bg.InodeTableLo) +} + +// BlockBitmap implements BlockGroup.BlockBitmap. +func (bg *BlockGroup64Bit) BlockBitmap() uint64 { + return (uint64(bg.BlockBitmapHi) << 32) | uint64(bg.BlockBitmapLo) +} + +// InodeBitmap implements BlockGroup.InodeBitmap. +func (bg *BlockGroup64Bit) InodeBitmap() uint64 { + return (uint64(bg.InodeBitmapHi) << 32) | uint64(bg.InodeBitmapLo) +} + +// ExclusionBitmap implements BlockGroup.ExclusionBitmap. +func (bg *BlockGroup64Bit) ExclusionBitmap() uint64 { + return (uint64(bg.ExcludeBitmapHi) << 32) | uint64(bg.ExcludeBitmapLo) +} + +// FreeBlocksCount implements BlockGroup.FreeBlocksCount. +func (bg *BlockGroup64Bit) FreeBlocksCount() uint32 { + return (uint32(bg.FreeBlocksCountHi) << 16) | uint32(bg.FreeBlocksCountLo) +} + +// FreeInodesCount implements BlockGroup.FreeInodesCount. +func (bg *BlockGroup64Bit) FreeInodesCount() uint32 { + return (uint32(bg.FreeInodesCountHi) << 16) | uint32(bg.FreeInodesCountLo) +} + +// DirectoryCount implements BlockGroup.DirectoryCount. +func (bg *BlockGroup64Bit) DirectoryCount() uint32 { + return (uint32(bg.UsedDirsCountHi) << 16) | uint32(bg.UsedDirsCountLo) +} + +// UnusedInodeCount implements BlockGroup.UnusedInodeCount. +func (bg *BlockGroup64Bit) UnusedInodeCount() uint32 { + return (uint32(bg.ItableUnusedHi) << 16) | uint32(bg.ItableUnusedLo) +} + +// BlockBitmapChecksum implements BlockGroup.BlockBitmapChecksum. +func (bg *BlockGroup64Bit) BlockBitmapChecksum() uint32 { + return (uint32(bg.BlockBitmapChecksumHi) << 16) | uint32(bg.BlockBitmapChecksumLo) +} + +// InodeBitmapChecksum implements BlockGroup.InodeBitmapChecksum. 
+func (bg *BlockGroup64Bit) InodeBitmapChecksum() uint32 { + return (uint32(bg.InodeBitmapChecksumHi) << 16) | uint32(bg.InodeBitmapChecksumLo) +} diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go new file mode 100644 index 000000000..0ef4294c0 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go @@ -0,0 +1,26 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "testing" +) + +// TestBlockGroupSize tests that the block group descriptor structs are of the +// correct size. +func TestBlockGroupSize(t *testing.T) { + assertSize(t, BlockGroup32Bit{}, 32) + assertSize(t, BlockGroup64Bit{}, 64) +} diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent.go b/pkg/sentry/fsimpl/ext/disklayout/dirent.go new file mode 100644 index 000000000..417b6cf65 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent.go @@ -0,0 +1,72 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "gvisor.dev/gvisor/pkg/sentry/fs" +) + +const ( + // MaxFileName is the maximum length of an ext fs file's name. + MaxFileName = 255 + + // DirentSize is the size of ext dirent structures. + DirentSize = 263 +) + +var ( + // inodeTypeByFileType maps ext4 file types to vfs inode types. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#ftype. + inodeTypeByFileType = map[uint8]fs.InodeType{ + 0: fs.Anonymous, + 1: fs.RegularFile, + 2: fs.Directory, + 3: fs.CharacterDevice, + 4: fs.BlockDevice, + 5: fs.Pipe, + 6: fs.Socket, + 7: fs.Symlink, + } +) + +// The Dirent interface should be implemented by structs representing ext +// directory entries. These are for the linear classical directories which +// just store a list of dirent structs. A directory is a series of data blocks +// where each data block contains a linear array of dirents. The last entry +// of the block has a record size that takes it to the end of the block. The +// end of the directory is when you read dirInode.Size() bytes from the blocks. +// +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#linear-classic-directories. +type Dirent interface { + // Inode returns the absolute inode number of the underlying inode. + // Inode number 0 signifies an unused dirent. + Inode() uint32 + + // RecordSize returns the record length of this dirent on disk. The next + // dirent in the dirent list should be read after this many bytes from + // the current dirent. Must be a multiple of 4. 
+ RecordSize() uint16 + + // FileName returns the name of the file. Can be at most 255 bytes in length. + FileName() string + + // FileType returns the inode type of the underlying inode. This is a + // performance hack so that we do not have to read the underlying inode struct + // to know the type of inode. This will only work when the SbDirentFileType + // feature is set. If not, the second returned value will be false indicating + // that user code has to use the inode mode to extract the file type. + FileType() (fs.InodeType, bool) +} diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go new file mode 100644 index 000000000..29ae4a5c2 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go @@ -0,0 +1,61 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/fs" +) + +// DirentNew represents the ext4 directory entry struct. This emulates Linux's +// ext4_dir_entry_2 struct. The FileName can not be more than 255 bytes so we +// only need 8 bits to store the NameLength. As a result, NameLength has been +// shortened and the other 8 bits are used to encode the file type. Use the +// FileTypeRaw field only if the SbDirentFileType feature is set. +// +// Note: This struct can be of variable size on disk. 
The one described below +// is of maximum size and the FileName beyond NameLength bytes might contain +// garbage. +type DirentNew struct { + InodeNumber uint32 + RecordLength uint16 + NameLength uint8 + FileTypeRaw uint8 + FileNameRaw [MaxFileName]byte +} + +// Compiles only if DirentNew implements Dirent. +var _ Dirent = (*DirentNew)(nil) + +// Inode implements Dirent.Inode. +func (d *DirentNew) Inode() uint32 { return d.InodeNumber } + +// RecordSize implements Dirent.RecordSize. +func (d *DirentNew) RecordSize() uint16 { return d.RecordLength } + +// FileName implements Dirent.FileName. It returns the first NameLength bytes of FileNameRaw as a string. +func (d *DirentNew) FileName() string { + return string(d.FileNameRaw[:d.NameLength]) +} + +// FileType implements Dirent.FileType. It reports true with the inodeTypeByFileType entry for FileTypeRaw and panics if FileTypeRaw has no entry in that table. +func (d *DirentNew) FileType() (fs.InodeType, bool) { + if inodeType, ok := inodeTypeByFileType[d.FileTypeRaw]; ok { + return inodeType, true + } + + panic(fmt.Sprintf("unknown file type %v", d.FileTypeRaw)) +} diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go new file mode 100644 index 000000000..6fff12a6e --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go @@ -0,0 +1,49 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package disklayout + +import "gvisor.dev/gvisor/pkg/sentry/fs" + +// DirentOld represents the old directory entry struct which does not contain +// the file type. This emulates Linux's ext4_dir_entry struct. 
+// +// Note: This struct can be of variable size on disk. The one described below +// is of maximum size and the FileName beyond NameLength bytes might contain +// garbage. +type DirentOld struct { + InodeNumber uint32 + RecordLength uint16 + NameLength uint16 + FileNameRaw [MaxFileName]byte +} + +// Compiles only if DirentOld implements Dirent. +var _ Dirent = (*DirentOld)(nil) + +// Inode implements Dirent.Inode. +func (d *DirentOld) Inode() uint32 { return d.InodeNumber } + +// RecordSize implements Dirent.RecordSize. +func (d *DirentOld) RecordSize() uint16 { return d.RecordLength } + +// FileName implements Dirent.FileName. It returns the first NameLength bytes of FileNameRaw as a string. NOTE(review): NameLength is a uint16, so a value above len(FileNameRaw) makes the slice expression panic; confirm it is validated before this is called. +func (d *DirentOld) FileName() string { + return string(d.FileNameRaw[:d.NameLength]) +} + +// FileType implements Dirent.FileType. It always returns (fs.Anonymous, false) since this struct has no file type field. +func (d *DirentOld) FileType() (fs.InodeType, bool) { + return fs.Anonymous, false +} diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go new file mode 100644 index 000000000..934919f8a --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go @@ -0,0 +1,26 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package disklayout + +import ( + "testing" +) + +// TestDirentSize tests that the dirent structs are of the correct +// size. 
+func TestDirentSize(t *testing.T) { + assertSize(t, DirentOld{}, uintptr(DirentSize)) + assertSize(t, DirentNew{}, uintptr(DirentSize)) +} diff --git a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go new file mode 100644 index 000000000..bdf4e2132 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go @@ -0,0 +1,50 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package disklayout provides Linux ext file system's disk level structures +// which can be directly read into from the underlying device. Structs aim to +// emulate structures `exactly` how they are laid out on disk. +// +// This library aims to be compatible with all ext(2/3/4) systems so it +// provides a generic interface for all major structures and various +// implementations (for different versions). The user code is responsible for +// using appropriate implementations based on the underlying device. +// +// Interfacing all major structures here serves a few purposes: +// - Abstracts away the complexity of the underlying structure from client +// code. The client only has to figure out versioning on set up and then +// can use these as black boxes and pass it higher up the stack. +// - Having pointer receivers forces the user to use pointers to these +// heavy structs. Hence, prevents the client code from unintentionally +// copying these by value while passing the interface around. 
+// - Version-based implementation selection is resolved on set up hence +// avoiding per call overhead of choosing implementation. +// - All interface methods are pretty light weight (do not take in any +// parameters by design). Passing pointer arguments to interface methods +// can lead to heap allocation as the compiler won't be able to perform +// escape analysis on an unknown implementation at compile time. +// +// Notes: +// - All fields in these structs are exported because binary.Read would +// panic otherwise. +// - All structures on disk are in little-endian order. Only jbd2 (journal) +// structures are in big-endian order. +// - All OS dependent fields in these structures will be interpreted using +// the Linux version of that field. +// - The suffix `Lo` in field names stands for lower bits of that field. +// - The suffix `Hi` in field names stands for upper bits of that field. +// - The suffix `Raw` has been added to indicate that the field is not split +// into Lo and Hi fields and also to resolve name collision with the +// respective interface. +package disklayout diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go new file mode 100644 index 000000000..4110649ab --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/extent.go @@ -0,0 +1,143 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package disklayout + +// Extents were introduced in ext4 and provide huge performance gains in terms +// of data locality and reduced metadata block usage. Extents are organized in +// extent trees. The root node is contained in inode.BlocksRaw. +// +// Terminology: +// - Physical Block: +// Filesystem data block which is addressed normally wrt the entire +// filesystem (addressed with 48 bits). +// +// - File Block: +// Data block containing *only* file data and addressed wrt the file +// with only 32 bits. The (i)th file block contains file data from +// byte (i * sb.BlockSize()) to ((i+1) * sb.BlockSize()). + +const ( + // ExtentHeaderSize is the size of the header of an extent tree node. + ExtentHeaderSize = 12 + + // ExtentEntrySize is the size of an entry in an extent tree node. + // This size is the same for both leaf and internal nodes. + ExtentEntrySize = 12 + + // ExtentMagic is the magic number which must be present in the header. + ExtentMagic = 0xf30a +) + +// ExtentEntryPair couples an in-memory ExtentNode with the ExtentEntry that +// points to it. We want to cache these structs in memory to avoid repeated +// disk reads. +// +// Note: This struct itself does not represent an on-disk struct. +type ExtentEntryPair struct { + // Entry points to the child node on disk. + Entry ExtentEntry + // Node points to child node in memory. Is nil if the current node is a leaf. + Node *ExtentNode +} + +// ExtentNode represents an extent tree node. For internal nodes, all Entries +// will be ExtentIdxs. For leaf nodes, they will all be Extents. +// +// Note: This struct itself does not represent an on-disk struct. +type ExtentNode struct { + Header ExtentHeader + Entries []ExtentEntryPair +} + +// ExtentEntry represents an extent tree node entry. The entry can either be +// an ExtentIdx or Extent itself. This exists to simplify navigation logic. +type ExtentEntry interface { + // FileBlock returns the first file block number covered by this entry. 
+ FileBlock() uint32 + + // PhysicalBlock returns the child physical block that this entry points to. + PhysicalBlock() uint64 +} + +// ExtentHeader emulates the ext4_extent_header struct in ext4. Each extent +// tree node begins with this and is followed by `NumEntries` number of: +// - Extent if `Height` == 0 +// - ExtentIdx otherwise +type ExtentHeader struct { + // Magic is the extent magic number, must be 0xf30a. + Magic uint16 + + // NumEntries indicates the number of valid entries following the header. + NumEntries uint16 + + // MaxEntries that could follow the header. Used while adding entries. + MaxEntries uint16 + + // Height represents the distance of this node from the farthest leaf. Please + // note that Linux incorrectly calls this `Depth` (which means the distance + // of the node from the root). + Height uint16 + _ uint32 +} + +// ExtentIdx emulates the ext4_extent_idx struct in ext4. Only present in +// internal nodes. Sorted in ascending order based on FirstFileBlock since +// Linux does a binary search on this. This points to a block containing the +// child node. +type ExtentIdx struct { + FirstFileBlock uint32 + ChildBlockLo uint32 + ChildBlockHi uint16 + _ uint16 +} + +// Compiles only if ExtentIdx implements ExtentEntry. +var _ ExtentEntry = (*ExtentIdx)(nil) + +// FileBlock implements ExtentEntry.FileBlock. +func (ei *ExtentIdx) FileBlock() uint32 { + return ei.FirstFileBlock +} + +// PhysicalBlock implements ExtentEntry.PhysicalBlock. It returns the +// physical block number of the child block. +func (ei *ExtentIdx) PhysicalBlock() uint64 { + return (uint64(ei.ChildBlockHi) << 32) | uint64(ei.ChildBlockLo) +} + +// Extent represents the ext4_extent struct in ext4. Only present in leaf +// nodes. Sorted in ascending order based on FirstFileBlock since Linux does a +// binary search on this. This points to an array of data blocks containing the +// file data. It covers `Length` data blocks starting from `StartBlock`. 
+type Extent struct { + FirstFileBlock uint32 + Length uint16 + StartBlockHi uint16 + StartBlockLo uint32 +} + +// Compiles only if Extent implements ExtentEntry. +var _ ExtentEntry = (*Extent)(nil) + +// FileBlock implements ExtentEntry.FileBlock. +func (e *Extent) FileBlock() uint32 { + return e.FirstFileBlock +} + +// PhysicalBlock implements ExtentEntry.PhysicalBlock. It returns the +// physical block number of the first data block this extent covers. +func (e *Extent) PhysicalBlock() uint64 { + return (uint64(e.StartBlockHi) << 32) | uint64(e.StartBlockLo) +} diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go new file mode 100644 index 000000000..8762b90db --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go @@ -0,0 +1,27 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "testing" +) + +// TestExtentSize tests that the extent structs are of the correct +// size. +func TestExtentSize(t *testing.T) { + assertSize(t, ExtentHeader{}, ExtentHeaderSize) + assertSize(t, ExtentIdx{}, ExtentEntrySize) + assertSize(t, Extent{}, ExtentEntrySize) +} diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode.go b/pkg/sentry/fsimpl/ext/disklayout/inode.go new file mode 100644 index 000000000..88ae913f5 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/inode.go @@ -0,0 +1,274 @@ +// Copyright 2019 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" +) + +// Special inodes. See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#special-inodes. +const ( + // RootDirInode is the inode number of the root directory inode. + RootDirInode = 2 +) + +// The Inode interface must be implemented by structs representing ext inodes. +// The inode stores all the metadata pertaining to the file (except for the +// file name which is held by the directory entry). It does NOT expose all +// fields and should be extended if need be. +// +// Some file systems (e.g. FAT) use the directory entry to store all this +// information. Ext file systems do not so that they can support hard links. +// However, ext4 cheats a little bit and duplicates the file type in the +// directory entry for performance gains. +// +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes. +type Inode interface { + // Mode returns the linux file mode which is majorly used to extract + // information like: + // - File permissions (read/write/execute by user/group/others). + // - Sticky, set UID and GID bits. + // - File type. + // + // Masks to extract this information are provided in pkg/abi/linux/file.go. + Mode() linux.FileMode + + // UID returns the owner UID. 
+ UID() auth.KUID + + // GID returns the owner GID. + GID() auth.KGID + + // Size returns the size of the file in bytes. + Size() uint64 + + // InodeSize returns the size of this inode struct in bytes. + // In ext2 and ext3, the inode struct and inode disk record size was fixed at + // 128 bytes. Ext4 makes it possible for the inode struct to be bigger. + // However, accessing any field beyond the 128 bytes marker must be verified + // using this method. + InodeSize() uint16 + + // AccessTime returns the last access time. Shows when the file was last read. + // + // If InExtendedAttr is set, then this should NOT be used because the + // underlying field is used to store the extended attribute value checksum. + AccessTime() time.Time + + // ChangeTime returns the last change time. Shows when the file meta data + // (like permissions) was last changed. + // + // If InExtendedAttr is set, then this should NOT be used because the + // underlying field is used to store the lower 32 bits of the attribute + // value’s reference count. + ChangeTime() time.Time + + // ModificationTime returns the last modification time. Shows when the file + // content was last modified. + // + // If InExtendedAttr is set, then this should NOT be used because + // the underlying field contains the number of the inode that owns the + // extended attribute. + ModificationTime() time.Time + + // DeletionTime returns the deletion time. Inodes are marked as deleted by + // writing to the underlying field. FS tools can restore files until they are + // actually overwritten. + DeletionTime() time.Time + + // LinksCount returns the number of hard links to this inode. + // + // Normally there is an upper limit on the number of hard links: + // - ext2/ext3 = 32,000 + // - ext4 = 65,000 + // + // This implies that an ext4 directory cannot have more than 64,998 + // subdirectories because each subdirectory will have a hard link to the + // directory via the `..` entry. 
The directory has hard link via the `.` entry + // of its own. And finally the inode is initiated with 1 hard link (itself). + // + // The underlying value is reset to 1 if all the following hold: + // - Inode is a directory. + // - SbDirNlink is enabled. + // - Number of hard links is incremented past 64,999. + // Hard link value of 1 for a directory would indicate that the number of hard + // links is unknown because a directory can have minimum 2 hard links (itself + // and `.` entry). + LinksCount() uint16 + + // Flags returns InodeFlags which represents the inode flags. + Flags() InodeFlags + + // Data returns the underlying inode.i_block array as a slice so it's + // modifiable. This field is special and is used to store various kinds of + // things depending on the filesystem version and inode type. The underlying + // field name in Linux is a little misleading. + // - In ext2/ext3, it contains the block map. + // - In ext4, it contains the extent tree root node. + // - For inline files, it contains the file contents. + // - For symlinks, it contains the link path (if it fits here). + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#the-contents-of-inode-i-block. + Data() []byte +} + +// Inode flags. This is not comprehensive and flags which were not used in +// the Linux kernel have been excluded. +const ( + // InSync indicates that all writes to the file must be synchronous. + InSync = 0x8 + + // InImmutable indicates that this file is immutable. + InImmutable = 0x10 + + // InAppend indicates that this file can only be appended to. + InAppend = 0x20 + + // InNoDump indicates that the dump(1) utility should not dump this file. + InNoDump = 0x40 + + // InNoAccessTime indicates that the access time of this inode must not be + // updated. + InNoAccessTime = 0x80 + + // InIndex indicates that this directory has hashed indexes.
+ InIndex = 0x1000 + + // InJournalData indicates that file data must always be written through a + // journal device. + InJournalData = 0x4000 + + // InDirSync indicates that all the directory entry data must be written + // synchronously. + InDirSync = 0x10000 + + // InTopDir indicates that this inode is at the top of the directory hierarchy. + InTopDir = 0x20000 + + // InHugeFile indicates that this is a huge file. + InHugeFile = 0x40000 + + // InExtents indicates that this inode uses extents. + InExtents = 0x80000 + + // InExtendedAttr indicates that this inode stores a large extended attribute + // value in its data blocks. + InExtendedAttr = 0x200000 + + // InInline indicates that this inode has inline data. + InInline = 0x10000000 + + // InReserved indicates that this inode is reserved for the ext4 library. + InReserved = 0x80000000 +) + +// InodeFlags represents all possible combinations of inode flags. It aims to +// cover the bit masks and provide a more user-friendly interface. +type InodeFlags struct { + Sync bool + Immutable bool + Append bool + NoDump bool + NoAccessTime bool + Index bool + JournalData bool + DirSync bool + TopDir bool + HugeFile bool + Extents bool + ExtendedAttr bool + Inline bool + Reserved bool +} + +// ToInt converts inode flags back to its 32-bit rep.
+func (f InodeFlags) ToInt() uint32 { + var res uint32 + + if f.Sync { + res |= InSync + } + if f.Immutable { + res |= InImmutable + } + if f.Append { + res |= InAppend + } + if f.NoDump { + res |= InNoDump + } + if f.NoAccessTime { + res |= InNoAccessTime + } + if f.Index { + res |= InIndex + } + if f.JournalData { + res |= InJournalData + } + if f.DirSync { + res |= InDirSync + } + if f.TopDir { + res |= InTopDir + } + if f.HugeFile { + res |= InHugeFile + } + if f.Extents { + res |= InExtents + } + if f.ExtendedAttr { + res |= InExtendedAttr + } + if f.Inline { + res |= InInline + } + if f.Reserved { + res |= InReserved + } + + return res +} + +// InodeFlagsFromInt converts the integer representation of inode flags to +// a InodeFlags struct. +func InodeFlagsFromInt(f uint32) InodeFlags { + return InodeFlags{ + Sync: f&InSync > 0, + Immutable: f&InImmutable > 0, + Append: f&InAppend > 0, + NoDump: f&InNoDump > 0, + NoAccessTime: f&InNoAccessTime > 0, + Index: f&InIndex > 0, + JournalData: f&InJournalData > 0, + DirSync: f&InDirSync > 0, + TopDir: f&InTopDir > 0, + HugeFile: f&InHugeFile > 0, + Extents: f&InExtents > 0, + ExtendedAttr: f&InExtendedAttr > 0, + Inline: f&InInline > 0, + Reserved: f&InReserved > 0, + } +} + +// These masks define how users can view/modify inode flags. The rest of the +// flags are for internal kernel usage only. +const ( + InUserReadFlagMask = 0x4BDFFF + InUserWriteFlagMask = 0x4B80FF +) diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go new file mode 100644 index 000000000..8f9f574ce --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go @@ -0,0 +1,96 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import "gvisor.dev/gvisor/pkg/sentry/kernel/time" + +// InodeNew represents ext4 inode structure which can be bigger than +// OldInodeSize. The actual size of this struct should be determined using +// inode.ExtraInodeSize. Accessing any field here should be verified with the +// actual size. The extra space between the end of the inode struct and end of +// the inode record can be used to store extended attr. +// +// If the TimeExtra fields are in scope, the lower 2 bits of those are used +// to extend their counterpart to be 34 bits wide; the rest (upper) 30 bits +// are used to provide nanosecond precision. Hence, these timestamps will now +// overflow in May 2446. +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. +type InodeNew struct { + InodeOld + + ExtraInodeSize uint16 + ChecksumHi uint16 + ChangeTimeExtra uint32 + ModificationTimeExtra uint32 + AccessTimeExtra uint32 + CreationTime uint32 + CreationTimeExtra uint32 + VersionHi uint32 + ProjectID uint32 +} + +// Compiles only if InodeNew implements Inode. +var _ Inode = (*InodeNew)(nil) + +// fromExtraTime decodes the extra time and constructs the kernel time struct +// with nanosecond precision. +func fromExtraTime(lo int32, extra uint32) time.Time { + // See description above InodeNew for format. + seconds := (int64(extra&0x3) << 32) + int64(lo) + nanoseconds := int64(extra >> 2) + return time.FromUnix(seconds, nanoseconds) +} + +// Only override methods which change due to ext4 specific fields.
+ +// Size implements Inode.Size. +func (in *InodeNew) Size() uint64 { + return (uint64(in.SizeHi) << 32) | uint64(in.SizeLo) +} + +// InodeSize implements Inode.InodeSize. +func (in *InodeNew) InodeSize() uint16 { + return OldInodeSize + in.ExtraInodeSize +} + +// ChangeTime implements Inode.ChangeTime. +func (in *InodeNew) ChangeTime() time.Time { + // Apply new timestamp logic if inode.ChangeTimeExtra is in scope. + if in.ExtraInodeSize >= 8 { + return fromExtraTime(in.ChangeTimeRaw, in.ChangeTimeExtra) + } + + return in.InodeOld.ChangeTime() +} + +// ModificationTime implements Inode.ModificationTime. +func (in *InodeNew) ModificationTime() time.Time { + // Apply new timestamp logic if inode.ModificationTimeExtra is in scope. + if in.ExtraInodeSize >= 12 { + return fromExtraTime(in.ModificationTimeRaw, in.ModificationTimeExtra) + } + + return in.InodeOld.ModificationTime() +} + +// AccessTime implements Inode.AccessTime. +func (in *InodeNew) AccessTime() time.Time { + // Apply new timestamp logic if inode.AccessTimeExtra is in scope. + if in.ExtraInodeSize >= 16 { + return fromExtraTime(in.AccessTimeRaw, in.AccessTimeExtra) + } + + return in.InodeOld.AccessTime() +} diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go new file mode 100644 index 000000000..db25b11b6 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go @@ -0,0 +1,117 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" +) + +const ( + // OldInodeSize is the inode size in ext2/ext3. + OldInodeSize = 128 +) + +// InodeOld implements Inode interface. It emulates ext2/ext3 inode struct. +// Inode struct size and record size are both 128 bytes for this. +// +// All fields representing time are in seconds since the epoch. Which means that +// they will overflow in January 2038. +type InodeOld struct { + ModeRaw uint16 + UIDLo uint16 + SizeLo uint32 + + // The time fields are signed integers because they could be negative to + // represent time before the epoch. + AccessTimeRaw int32 + ChangeTimeRaw int32 + ModificationTimeRaw int32 + DeletionTimeRaw int32 + + GIDLo uint16 + LinksCountRaw uint16 + BlocksCountLo uint32 + FlagsRaw uint32 + VersionLo uint32 // This is OS dependent. + DataRaw [60]byte + Generation uint32 + FileACLLo uint32 + SizeHi uint32 + ObsoFaddr uint32 + + // OS dependent fields have been inlined here. + BlocksCountHi uint16 + FileACLHi uint16 + UIDHi uint16 + GIDHi uint16 + ChecksumLo uint16 + _ uint16 +} + +// Compiles only if InodeOld implements Inode. +var _ Inode = (*InodeOld)(nil) + +// Mode implements Inode.Mode. +func (in *InodeOld) Mode() linux.FileMode { return linux.FileMode(in.ModeRaw) } + +// UID implements Inode.UID. +func (in *InodeOld) UID() auth.KUID { + return auth.KUID((uint32(in.UIDHi) << 16) | uint32(in.UIDLo)) +} + +// GID implements Inode.GID. +func (in *InodeOld) GID() auth.KGID { + return auth.KGID((uint32(in.GIDHi) << 16) | uint32(in.GIDLo)) +} + +// Size implements Inode.Size. +func (in *InodeOld) Size() uint64 { + // In ext2/ext3, in.SizeHi did not exist, it was instead named in.DirACL. + return uint64(in.SizeLo) +} + +// InodeSize implements Inode.InodeSize. 
+func (in *InodeOld) InodeSize() uint16 { return OldInodeSize } + +// AccessTime implements Inode.AccessTime. +func (in *InodeOld) AccessTime() time.Time { + return time.FromUnix(int64(in.AccessTimeRaw), 0) +} + +// ChangeTime implements Inode.ChangeTime. +func (in *InodeOld) ChangeTime() time.Time { + return time.FromUnix(int64(in.ChangeTimeRaw), 0) +} + +// ModificationTime implements Inode.ModificationTime. +func (in *InodeOld) ModificationTime() time.Time { + return time.FromUnix(int64(in.ModificationTimeRaw), 0) +} + +// DeletionTime implements Inode.DeletionTime. +func (in *InodeOld) DeletionTime() time.Time { + return time.FromUnix(int64(in.DeletionTimeRaw), 0) +} + +// LinksCount implements Inode.LinksCount. +func (in *InodeOld) LinksCount() uint16 { return in.LinksCountRaw } + +// Flags implements Inode.Flags. +func (in *InodeOld) Flags() InodeFlags { return InodeFlagsFromInt(in.FlagsRaw) } + +// Data implements Inode.Data. +func (in *InodeOld) Data() []byte { return in.DataRaw[:] } diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go new file mode 100644 index 000000000..dd03ee50e --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go @@ -0,0 +1,222 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package disklayout + +import ( + "fmt" + "strconv" + "testing" + + "gvisor.dev/gvisor/pkg/sentry/kernel/time" +) + +// TestInodeSize tests that the inode structs are of the correct size. +func TestInodeSize(t *testing.T) { + assertSize(t, InodeOld{}, OldInodeSize) + + // This was updated from 156 bytes to 160 bytes in Oct 2015. + assertSize(t, InodeNew{}, 160) +} + +// TestTimestampSeconds tests that the seconds part of [a/c/m] timestamps in +// ext4 inode structs are decoded correctly. +// +// These tests are derived from the table under https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. +func TestTimestampSeconds(t *testing.T) { + type timestampTest struct { + // msbSet tells if the most significant bit of InodeOld.[X]TimeRaw is set. + // If this is set then the 32-bit time is negative. + msbSet bool + + // lowerBound tells if we should take the lowest possible value of + // InodeOld.[X]TimeRaw while satisfying test.msbSet condition. If set to + // false it tells to take the highest possible value. + lowerBound bool + + // extraBits is InodeNew.[X]TimeExtra. + extraBits uint32 + + // want is the kernel time struct that is expected. 
+ want time.Time + } + + tests := []timestampTest{ + // 1901-12-13 + { + msbSet: true, + lowerBound: true, + extraBits: 0, + want: time.FromUnix(int64(-0x80000000), 0), + }, + + // 1969-12-31 + { + msbSet: true, + lowerBound: false, + extraBits: 0, + want: time.FromUnix(int64(-1), 0), + }, + + // 1970-01-01 + { + msbSet: false, + lowerBound: true, + extraBits: 0, + want: time.FromUnix(int64(0), 0), + }, + + // 2038-01-19 + { + msbSet: false, + lowerBound: false, + extraBits: 0, + want: time.FromUnix(int64(0x7fffffff), 0), + }, + + // 2038-01-19 + { + msbSet: true, + lowerBound: true, + extraBits: 1, + want: time.FromUnix(int64(0x80000000), 0), + }, + + // 2106-02-07 + { + msbSet: true, + lowerBound: false, + extraBits: 1, + want: time.FromUnix(int64(0xffffffff), 0), + }, + + // 2106-02-07 + { + msbSet: false, + lowerBound: true, + extraBits: 1, + want: time.FromUnix(int64(0x100000000), 0), + }, + + // 2174-02-25 + { + msbSet: false, + lowerBound: false, + extraBits: 1, + want: time.FromUnix(int64(0x17fffffff), 0), + }, + + // 2174-02-25 + { + msbSet: true, + lowerBound: true, + extraBits: 2, + want: time.FromUnix(int64(0x180000000), 0), + }, + + // 2242-03-16 + { + msbSet: true, + lowerBound: false, + extraBits: 2, + want: time.FromUnix(int64(0x1ffffffff), 0), + }, + + // 2242-03-16 + { + msbSet: false, + lowerBound: true, + extraBits: 2, + want: time.FromUnix(int64(0x200000000), 0), + }, + + // 2310-04-04 + { + msbSet: false, + lowerBound: false, + extraBits: 2, + want: time.FromUnix(int64(0x27fffffff), 0), + }, + + // 2310-04-04 + { + msbSet: true, + lowerBound: true, + extraBits: 3, + want: time.FromUnix(int64(0x280000000), 0), + }, + + // 2378-04-22 + { + msbSet: true, + lowerBound: false, + extraBits: 3, + want: time.FromUnix(int64(0x2ffffffff), 0), + }, + + // 2378-04-22 + { + msbSet: false, + lowerBound: true, + extraBits: 3, + want: time.FromUnix(int64(0x300000000), 0), + }, + + // 2446-05-10 + { + msbSet: false, + lowerBound: false, + extraBits: 3, + want: 
time.FromUnix(int64(0x37fffffff), 0), + }, + } + + lowerMSB0 := int32(0) // binary: 00000000 00000000 00000000 00000000 + upperMSB0 := int32(0x7fffffff) // binary: 01111111 11111111 11111111 11111111 + lowerMSB1 := int32(-0x80000000) // binary: 10000000 00000000 00000000 00000000 + upperMSB1 := int32(-1) // binary: 11111111 11111111 11111111 11111111 + + get32BitTime := func(test timestampTest) int32 { + if test.msbSet { + if test.lowerBound { + return lowerMSB1 + } + + return upperMSB1 + } + + if test.lowerBound { + return lowerMSB0 + } + + return upperMSB0 + } + + getTestName := func(test timestampTest) string { + return fmt.Sprintf( + "Tests time decoding with epoch bits 0b%s and 32-bit raw time: MSB set=%t, lower bound=%t", + strconv.FormatInt(int64(test.extraBits), 2), + test.msbSet, + test.lowerBound, + ) + } + + for _, test := range tests { + t.Run(getTestName(test), func(t *testing.T) { + if got := fromExtraTime(get32BitTime(test), test.extraBits); got != test.want { + t.Errorf("Expected: %v, Got: %v", test.want, got) + } + }) + } +} diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock.go b/pkg/sentry/fsimpl/ext/disklayout/superblock.go new file mode 100644 index 000000000..8bb327006 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock.go @@ -0,0 +1,471 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package disklayout + +const ( + // SbOffset is the absolute offset at which the superblock is placed. + SbOffset = 1024 +) + +// SuperBlock should be implemented by structs representing the ext superblock. +// The superblock holds a lot of information about the enclosing filesystem. +// This interface aims to provide access methods to important information held +// by the superblock. It does NOT expose all fields of the superblock, only the +// ones necessary. This can be expanded when need be. +// +// Location and replication: +// - The superblock is located at offset 1024 in block group 0. +// - Redundant copies of the superblock and group descriptors are kept in +// all groups if SbSparse feature flag is NOT set. If it is set, the +// replicas only exist in groups whose group number is either 0 or a +// power of 3, 5, or 7. +// - There is also a sparse superblock feature v2 in which there are just +// two replicas saved in the block groups pointed by sb.s_backup_bgs. +// +// Replicas should eventually be updated if the superblock is updated. +// +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#super-block. +type SuperBlock interface { + // InodesCount returns the total number of inodes in this filesystem. + InodesCount() uint32 + + // BlocksCount returns the total number of data blocks in this filesystem. + BlocksCount() uint64 + + // FreeBlocksCount returns the number of free blocks in this filesystem. + FreeBlocksCount() uint64 + + // FreeInodesCount returns the number of free inodes in this filesystem. + FreeInodesCount() uint32 + + // MountCount returns the number of mounts since the last fsck. + MountCount() uint16 + + // MaxMountCount returns the number of mounts allowed beyond which a fsck is + // needed. + MaxMountCount() uint16 + + // FirstDataBlock returns the absolute block number of the first data block, + // which contains the super block itself. 
+ // + // If the filesystem has 1kb data blocks then this should return 1. For all + // other configurations, this typically returns 0. + FirstDataBlock() uint32 + + // BlockSize returns the size of one data block in this filesystem. + // This can be calculated by 2^(10 + sb.s_log_block_size). This ensures that + // the smallest block size is 1kb. + BlockSize() uint64 + + // BlocksPerGroup returns the number of data blocks in a block group. + BlocksPerGroup() uint32 + + // ClusterSize returns block cluster size (set during mkfs time by admin). + // This can be calculated by 2^(10 + sb.s_log_cluster_size). This ensures that + // the smallest cluster size is 1kb. + // + // sb.s_log_cluster_size must equal sb.s_log_block_size if bigalloc feature + // is NOT set and consequently BlockSize() = ClusterSize() in that case. + ClusterSize() uint64 + + // ClustersPerGroup returns: + // - number of clusters per group if bigalloc is enabled. + // - BlocksPerGroup() otherwise. + ClustersPerGroup() uint32 + + // InodeSize returns the inode disk record size in bytes. Use this + // to iterate over inode arrays on disk. + // + // In ext2 and ext3: + // - Each inode had a disk record of 128 bytes. + // - The inode struct size was fixed at 128 bytes. + // + // In ext4 it is possible to allocate larger on-disk inodes: + // - Inode disk record size = sb.s_inode_size (function return value). + // = 256 (default) + // - Inode struct size = 128 + inode.i_extra_isize. + // = 128 + 32 = 160 (default) + InodeSize() uint16 + + // InodesPerGroup returns the number of inodes in a block group. + InodesPerGroup() uint32 + + // BgDescSize returns the size of the block group descriptor struct. + // + // In ext2, ext3, ext4 (without 64-bit feature), the block group descriptor + // is only 32 bytes long. + // In ext4 with 64-bit feature, the block group descriptor expands to AT LEAST + // 64 bytes. It might be bigger than that.
+ BgDescSize() uint16 + + // CompatibleFeatures returns the CompatFeatures struct which holds all the + // compatible features this fs supports. + CompatibleFeatures() CompatFeatures + + // IncompatibleFeatures returns the IncompatFeatures struct which holds all the + // incompatible features this fs supports. + IncompatibleFeatures() IncompatFeatures + + // ReadOnlyCompatibleFeatures returns the RoCompatFeatures struct which holds all the + // readonly compatible features this fs supports. + ReadOnlyCompatibleFeatures() RoCompatFeatures + + // Magic returns the magic signature which must be 0xef53. + Magic() uint16 + + // Revision returns the superblock revision. Superblock struct fields from + // offset 0x54 till 0x150 should only be used if superblock has DynamicRev. + Revision() SbRevision +} + +// SbRevision is the type for superblock revisions. +type SbRevision uint32 + +// Super block revisions. +const ( + // OldRev is the good old (original) format. + OldRev SbRevision = 0 + + // DynamicRev is v2 format w/ dynamic inode sizes. + DynamicRev SbRevision = 1 +) + +// Superblock compatible features. +// This is not exhaustive, unused features are not listed. +const ( + // SbDirPrealloc indicates directory preallocation. + SbDirPrealloc = 0x1 + + // SbHasJournal indicates the presence of a journal. jbd2 should only work + // with this being set. + SbHasJournal = 0x4 + + // SbExtAttr indicates extended attributes support. + SbExtAttr = 0x8 + + // SbResizeInode indicates that the fs has reserved GDT blocks (right after + // group descriptors) for fs expansion. + SbResizeInode = 0x10 + + // SbDirIndex indicates that the fs has directory indices. + SbDirIndex = 0x20 + + // SbSparseV2 stands for Sparse superblock version 2. + SbSparseV2 = 0x200 +) + +// CompatFeatures represents a superblock's compatible feature set. If the +// kernel does not understand any of these features, it can still read/write +// to this fs.
+type CompatFeatures struct { + DirPrealloc bool + HasJournal bool + ExtAttr bool + ResizeInode bool + DirIndex bool + SparseV2 bool +} + +// ToInt converts superblock compatible features back to its 32-bit rep. +func (f CompatFeatures) ToInt() uint32 { + var res uint32 + + if f.DirPrealloc { + res |= SbDirPrealloc + } + if f.HasJournal { + res |= SbHasJournal + } + if f.ExtAttr { + res |= SbExtAttr + } + if f.ResizeInode { + res |= SbResizeInode + } + if f.DirIndex { + res |= SbDirIndex + } + if f.SparseV2 { + res |= SbSparseV2 + } + + return res +} + +// CompatFeaturesFromInt converts the integer representation of superblock +// compatible features to CompatFeatures struct. +func CompatFeaturesFromInt(f uint32) CompatFeatures { + return CompatFeatures{ + DirPrealloc: f&SbDirPrealloc > 0, + HasJournal: f&SbHasJournal > 0, + ExtAttr: f&SbExtAttr > 0, + ResizeInode: f&SbResizeInode > 0, + DirIndex: f&SbDirIndex > 0, + SparseV2: f&SbSparseV2 > 0, + } +} + +// Superblock incompatible features. +// This is not exhaustive, unused features are not listed. +const ( + // SbDirentFileType indicates that directory entries record the file type. + // We should use struct DirentNew for dirents then. + SbDirentFileType = 0x2 + + // SbRecovery indicates that the filesystem needs recovery. + SbRecovery = 0x4 + + // SbJournalDev indicates that the filesystem has a separate journal device. + SbJournalDev = 0x8 + + // SbMetaBG indicates that the filesystem is using Meta block groups. Moves + // the group descriptors from the congested first block group into the first + // group of each metablock group to increase the maximum block groups limit + // and hence support much larger filesystems. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#meta-block-groups. + SbMetaBG = 0x10 + + // SbExtents indicates that the filesystem uses extents. Must be set in ext4 + // filesystems. 
+ SbExtents = 0x40 + + // SbIs64Bit indicates that this filesystem addresses blocks with 64-bits. + // Hence can support 2^64 data blocks. + SbIs64Bit = 0x80 + + // SbMMP indicates that this filesystem has multiple mount protection. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#multiple-mount-protection. + SbMMP = 0x100 + + // SbFlexBg indicates that this filesystem has flexible block groups. Several + // block groups are tied into one logical block group so that all the metadata + // for the block groups (bitmaps and inode tables) are close together for + // faster loading. Consequently, large files will be continuous on disk. + // However, this does not affect the placement of redundant superblocks and + // group descriptors. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#flexible-block-groups. + SbFlexBg = 0x200 + + // SbLargeDir indicates that large directories are enabled. Directory htree can be 3 + // levels deep. Directory htrees are allowed to be 2 levels deep otherwise. + SbLargeDir = 0x4000 + + // SbInlineData allows inline data in inodes for really small files. + SbInlineData = 0x8000 + + // SbEncrypted indicates that this fs contains encrypted inodes. + SbEncrypted = 0x10000 +) + +// IncompatFeatures represents a superblock's incompatible feature set. If the +// kernel does not understand any of these features, it should refuse to mount. +type IncompatFeatures struct { + DirentFileType bool + Recovery bool + JournalDev bool + MetaBG bool + Extents bool + Is64Bit bool + MMP bool + FlexBg bool + LargeDir bool + InlineData bool + Encrypted bool +} + +// ToInt converts superblock incompatible features back to its 32-bit rep.
+func (f IncompatFeatures) ToInt() uint32 { + var res uint32 + + if f.DirentFileType { + res |= SbDirentFileType + } + if f.Recovery { + res |= SbRecovery + } + if f.JournalDev { + res |= SbJournalDev + } + if f.MetaBG { + res |= SbMetaBG + } + if f.Extents { + res |= SbExtents + } + if f.Is64Bit { + res |= SbIs64Bit + } + if f.MMP { + res |= SbMMP + } + if f.FlexBg { + res |= SbFlexBg + } + if f.LargeDir { + res |= SbLargeDir + } + if f.InlineData { + res |= SbInlineData + } + if f.Encrypted { + res |= SbEncrypted + } + + return res +} + +// IncompatFeaturesFromInt converts the integer representation of superblock +// incompatible features to IncompatFeatures struct. +func IncompatFeaturesFromInt(f uint32) IncompatFeatures { + return IncompatFeatures{ + DirentFileType: f&SbDirentFileType > 0, + Recovery: f&SbRecovery > 0, + JournalDev: f&SbJournalDev > 0, + MetaBG: f&SbMetaBG > 0, + Extents: f&SbExtents > 0, + Is64Bit: f&SbIs64Bit > 0, + MMP: f&SbMMP > 0, + FlexBg: f&SbFlexBg > 0, + LargeDir: f&SbLargeDir > 0, + InlineData: f&SbInlineData > 0, + Encrypted: f&SbEncrypted > 0, + } +} + +// Superblock readonly compatible features. +// This is not exhaustive, unused features are not listed. +const ( + // SbSparse indicates sparse superblocks. Only groups with number either 0 or + // a power of 3, 5, or 7 will have redundant copies of the superblock and + // group descriptors. + SbSparse = 0x1 + + // SbLargeFile indicates that this fs has been used to store a file >= 2GiB. + SbLargeFile = 0x2 + + // SbHugeFile indicates that this fs contains files whose sizes are + // represented in units of logical blocks, not 512-byte sectors. + SbHugeFile = 0x8 + + // SbGdtCsum indicates that group descriptors have checksums. + SbGdtCsum = 0x10 + + // SbDirNlink indicates that the new subdirectory limit is 64,999. Ext3 has a + // 32,000 subdirectory limit. + SbDirNlink = 0x20 + + // SbExtraIsize indicates that large inodes exist on this filesystem.
+ SbExtraIsize = 0x40 + + // SbHasSnapshot indicates the existence of a snapshot. + SbHasSnapshot = 0x80 + + // SbQuota enables usage tracking for all quota types. + SbQuota = 0x100 + + // SbBigalloc maps to the bigalloc feature. When set, the minimum allocation + // unit becomes a cluster rather than a data block. Then block bitmaps track + // clusters, not data blocks. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#bigalloc. + SbBigalloc = 0x200 + + // SbMetadataCsum indicates that the fs supports metadata checksumming. + SbMetadataCsum = 0x400 + + // SbReadOnly marks this filesystem as readonly. Should refuse to mount in + // read/write mode. + SbReadOnly = 0x1000 +) + +// RoCompatFeatures represents a superblock's readonly compatible feature set. +// If the kernel does not understand any of these feature, it can still mount +// readonly. But if the user wants to mount read/write, the kernel should +// refuse to mount. +type RoCompatFeatures struct { + Sparse bool + LargeFile bool + HugeFile bool + GdtCsum bool + DirNlink bool + ExtraIsize bool + HasSnapshot bool + Quota bool + Bigalloc bool + MetadataCsum bool + ReadOnly bool +} + +// ToInt converts superblock readonly compatible features to its 32-bit rep. +func (f RoCompatFeatures) ToInt() uint32 { + var res uint32 + + if f.Sparse { + res |= SbSparse + } + if f.LargeFile { + res |= SbLargeFile + } + if f.HugeFile { + res |= SbHugeFile + } + if f.GdtCsum { + res |= SbGdtCsum + } + if f.DirNlink { + res |= SbDirNlink + } + if f.ExtraIsize { + res |= SbExtraIsize + } + if f.HasSnapshot { + res |= SbHasSnapshot + } + if f.Quota { + res |= SbQuota + } + if f.Bigalloc { + res |= SbBigalloc + } + if f.MetadataCsum { + res |= SbMetadataCsum + } + if f.ReadOnly { + res |= SbReadOnly + } + + return res +} + +// RoCompatFeaturesFromInt converts the integer representation of superblock +// readonly compatible features to RoCompatFeatures struct. 
+func RoCompatFeaturesFromInt(f uint32) RoCompatFeatures { + return RoCompatFeatures{ + Sparse: f&SbSparse > 0, + LargeFile: f&SbLargeFile > 0, + HugeFile: f&SbHugeFile > 0, + GdtCsum: f&SbGdtCsum > 0, + DirNlink: f&SbDirNlink > 0, + ExtraIsize: f&SbExtraIsize > 0, + HasSnapshot: f&SbHasSnapshot > 0, + Quota: f&SbQuota > 0, + Bigalloc: f&SbBigalloc > 0, + MetadataCsum: f&SbMetadataCsum > 0, + ReadOnly: f&SbReadOnly > 0, + } +} diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go new file mode 100644 index 000000000..53e515fd3 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go @@ -0,0 +1,76 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// SuperBlock32Bit implements SuperBlock and represents the 32-bit version of +// the ext4_super_block struct in fs/ext4/ext4.h. Should be used only if +// RevLevel = DynamicRev and 64-bit feature is disabled. +type SuperBlock32Bit struct { + // We embed the old superblock struct here because the 32-bit version is just + // an extension of the old version. 
+ SuperBlockOld + + FirstInode uint32 + InodeSizeRaw uint16 + BlockGroupNumber uint16 + FeatureCompat uint32 + FeatureIncompat uint32 + FeatureRoCompat uint32 + UUID [16]byte + VolumeName [16]byte + LastMounted [64]byte + AlgoUsageBitmap uint32 + PreallocBlocks uint8 + PreallocDirBlocks uint8 + ReservedGdtBlocks uint16 + JournalUUID [16]byte + JournalInum uint32 + JournalDev uint32 + LastOrphan uint32 + HashSeed [4]uint32 + DefaultHashVersion uint8 + JnlBackupType uint8 + BgDescSizeRaw uint16 + DefaultMountOpts uint32 + FirstMetaBg uint32 + MkfsTime uint32 + JnlBlocks [17]uint32 +} + +// Compiles only if SuperBlock32Bit implements SuperBlock. +var _ SuperBlock = (*SuperBlock32Bit)(nil) + +// Only override methods which change based on the additional fields above. +// Not overriding SuperBlock.BgDescSize because it would still return 32 here. + +// InodeSize implements SuperBlock.InodeSize. +func (sb *SuperBlock32Bit) InodeSize() uint16 { + return sb.InodeSizeRaw +} + +// CompatibleFeatures implements SuperBlock.CompatibleFeatures. +func (sb *SuperBlock32Bit) CompatibleFeatures() CompatFeatures { + return CompatFeaturesFromInt(sb.FeatureCompat) +} + +// IncompatibleFeatures implements SuperBlock.IncompatibleFeatures. +func (sb *SuperBlock32Bit) IncompatibleFeatures() IncompatFeatures { + return IncompatFeaturesFromInt(sb.FeatureIncompat) +} + +// ReadOnlyCompatibleFeatures implements SuperBlock.ReadOnlyCompatibleFeatures. +func (sb *SuperBlock32Bit) ReadOnlyCompatibleFeatures() RoCompatFeatures { + return RoCompatFeaturesFromInt(sb.FeatureRoCompat) +} diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go new file mode 100644 index 000000000..7c1053fb4 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go @@ -0,0 +1,95 @@ +// Copyright 2019 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// SuperBlock64Bit implements SuperBlock and represents the 64-bit version of +// the ext4_super_block struct in fs/ext4/ext4.h. This sums up to be exactly +// 1024 bytes (smallest possible block size) and hence the superblock always +// fits in no more than one data block. Should only be used when the 64-bit +// feature is set. +type SuperBlock64Bit struct { + // We embed the 32-bit struct here because 64-bit version is just an extension + // of the 32-bit version. 
+ SuperBlock32Bit + + BlocksCountHi uint32 + ReservedBlocksCountHi uint32 + FreeBlocksCountHi uint32 + MinInodeSize uint16 + WantInodeSize uint16 + Flags uint32 + RaidStride uint16 + MmpInterval uint16 + MmpBlock uint64 + RaidStripeWidth uint32 + LogGroupsPerFlex uint8 + ChecksumType uint8 + _ uint16 + KbytesWritten uint64 + SnapshotInum uint32 + SnapshotID uint32 + SnapshotRsrvBlocksCount uint64 + SnapshotList uint32 + ErrorCount uint32 + FirstErrorTime uint32 + FirstErrorInode uint32 + FirstErrorBlock uint64 + FirstErrorFunction [32]byte + FirstErrorLine uint32 + LastErrorTime uint32 + LastErrorInode uint32 + LastErrorLine uint32 + LastErrorBlock uint64 + LastErrorFunction [32]byte + MountOpts [64]byte + UserQuotaInum uint32 + GroupQuotaInum uint32 + OverheadBlocks uint32 + BackupBgs [2]uint32 + EncryptAlgos [4]uint8 + EncryptPwSalt [16]uint8 + LostFoundInode uint32 + ProjectQuotaInode uint32 + ChecksumSeed uint32 + WtimeHi uint8 + MtimeHi uint8 + MkfsTimeHi uint8 + LastCheckHi uint8 + FirstErrorTimeHi uint8 + LastErrorTimeHi uint8 + _ [2]uint8 + Encoding uint16 + EncodingFlags uint16 + _ [95]uint32 + Checksum uint32 +} + +// Compiles only if SuperBlock64Bit implements SuperBlock. +var _ SuperBlock = (*SuperBlock64Bit)(nil) + +// Only override methods which change based on the 64-bit feature. + +// BlocksCount implements SuperBlock.BlocksCount. +func (sb *SuperBlock64Bit) BlocksCount() uint64 { + return (uint64(sb.BlocksCountHi) << 32) | uint64(sb.BlocksCountLo) +} + +// FreeBlocksCount implements SuperBlock.FreeBlocksCount. +func (sb *SuperBlock64Bit) FreeBlocksCount() uint64 { + return (uint64(sb.FreeBlocksCountHi) << 32) | uint64(sb.FreeBlocksCountLo) +} + +// BgDescSize implements SuperBlock.BgDescSize. 
+func (sb *SuperBlock64Bit) BgDescSize() uint16 { return sb.BgDescSizeRaw } diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go new file mode 100644 index 000000000..9221e0251 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go @@ -0,0 +1,105 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// SuperBlockOld implements SuperBlock and represents the old version of the +// superblock struct. Should be used only if RevLevel = OldRev. +type SuperBlockOld struct { + InodesCountRaw uint32 + BlocksCountLo uint32 + ReservedBlocksCount uint32 + FreeBlocksCountLo uint32 + FreeInodesCountRaw uint32 + FirstDataBlockRaw uint32 + LogBlockSize uint32 + LogClusterSize uint32 + BlocksPerGroupRaw uint32 + ClustersPerGroupRaw uint32 + InodesPerGroupRaw uint32 + Mtime uint32 + Wtime uint32 + MountCountRaw uint16 + MaxMountCountRaw uint16 + MagicRaw uint16 + State uint16 + Errors uint16 + MinorRevLevel uint16 + LastCheck uint32 + CheckInterval uint32 + CreatorOS uint32 + RevLevel uint32 + DefResUID uint16 + DefResGID uint16 +} + +// Compiles only if SuperBlockOld implements SuperBlock. +var _ SuperBlock = (*SuperBlockOld)(nil) + +// InodesCount implements SuperBlock.InodesCount. +func (sb *SuperBlockOld) InodesCount() uint32 { return sb.InodesCountRaw } + +// BlocksCount implements SuperBlock.BlocksCount. 
+func (sb *SuperBlockOld) BlocksCount() uint64 { return uint64(sb.BlocksCountLo) } + +// FreeBlocksCount implements SuperBlock.FreeBlocksCount. +func (sb *SuperBlockOld) FreeBlocksCount() uint64 { return uint64(sb.FreeBlocksCountLo) } + +// FreeInodesCount implements SuperBlock.FreeInodesCount. +func (sb *SuperBlockOld) FreeInodesCount() uint32 { return sb.FreeInodesCountRaw } + +// MountCount implements SuperBlock.MountCount. +func (sb *SuperBlockOld) MountCount() uint16 { return sb.MountCountRaw } + +// MaxMountCount implements SuperBlock.MaxMountCount. +func (sb *SuperBlockOld) MaxMountCount() uint16 { return sb.MaxMountCountRaw } + +// FirstDataBlock implements SuperBlock.FirstDataBlock. +func (sb *SuperBlockOld) FirstDataBlock() uint32 { return sb.FirstDataBlockRaw } + +// BlockSize implements SuperBlock.BlockSize. +func (sb *SuperBlockOld) BlockSize() uint64 { return 1 << (10 + sb.LogBlockSize) } + +// BlocksPerGroup implements SuperBlock.BlocksPerGroup. +func (sb *SuperBlockOld) BlocksPerGroup() uint32 { return sb.BlocksPerGroupRaw } + +// ClusterSize implements SuperBlock.ClusterSize. +func (sb *SuperBlockOld) ClusterSize() uint64 { return 1 << (10 + sb.LogClusterSize) } + +// ClustersPerGroup implements SuperBlock.ClustersPerGroup. +func (sb *SuperBlockOld) ClustersPerGroup() uint32 { return sb.ClustersPerGroupRaw } + +// InodeSize implements SuperBlock.InodeSize. +func (sb *SuperBlockOld) InodeSize() uint16 { return OldInodeSize } + +// InodesPerGroup implements SuperBlock.InodesPerGroup. +func (sb *SuperBlockOld) InodesPerGroup() uint32 { return sb.InodesPerGroupRaw } + +// BgDescSize implements SuperBlock.BgDescSize. +func (sb *SuperBlockOld) BgDescSize() uint16 { return 32 } + +// CompatibleFeatures implements SuperBlock.CompatibleFeatures. +func (sb *SuperBlockOld) CompatibleFeatures() CompatFeatures { return CompatFeatures{} } + +// IncompatibleFeatures implements SuperBlock.IncompatibleFeatures. 
+func (sb *SuperBlockOld) IncompatibleFeatures() IncompatFeatures { return IncompatFeatures{} } + +// ReadOnlyCompatibleFeatures implements SuperBlock.ReadOnlyCompatibleFeatures. +func (sb *SuperBlockOld) ReadOnlyCompatibleFeatures() RoCompatFeatures { return RoCompatFeatures{} } + +// Magic implements SuperBlock.Magic. +func (sb *SuperBlockOld) Magic() uint16 { return sb.MagicRaw } + +// Revision implements SuperBlock.Revision. +func (sb *SuperBlockOld) Revision() SbRevision { return SbRevision(sb.RevLevel) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go new file mode 100644 index 000000000..463b5ba21 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go @@ -0,0 +1,27 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "testing" +) + +// TestSuperBlockSize tests that the superblock structs are of the correct +// size. +func TestSuperBlockSize(t *testing.T) { + assertSize(t, SuperBlockOld{}, 84) + assertSize(t, SuperBlock32Bit{}, 336) + assertSize(t, SuperBlock64Bit{}, 1024) +} diff --git a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go new file mode 100644 index 000000000..9c63f04c0 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go @@ -0,0 +1,30 @@ +// Copyright 2019 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "reflect" + "testing" + + "gvisor.dev/gvisor/pkg/binary" +) + +func assertSize(t *testing.T, v interface{}, want uintptr) { + t.Helper() + + if got := binary.Size(v); got != want { + t.Errorf("struct %s should be exactly %d bytes but is %d bytes", reflect.TypeOf(v).Name(), want, got) + } +} diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go new file mode 100644 index 000000000..dac6effbf --- /dev/null +++ b/pkg/sentry/fsimpl/ext/ext.go @@ -0,0 +1,157 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package ext implements readonly ext(2/3/4) filesystems. 
+package ext + +import ( + "errors" + "fmt" + "io" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fd" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Name is the name of this filesystem. +const Name = "ext" + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// Compiles only if FilesystemType implements vfs.FilesystemType. +var _ vfs.FilesystemType = (*FilesystemType)(nil) + +// getDeviceFd returns an io.ReaderAt to the underlying device. +// Currently there are two ways of mounting an ext(2/3/4) fs: +// 1. Specify a mount with our internal special MountType in the OCI spec. +// 2. Expose the device to the container and mount it from application layer. +func getDeviceFd(source string, opts vfs.GetFilesystemOptions) (io.ReaderAt, error) { + if opts.InternalData == nil { + // User mount call. + // TODO(b/134676337): Open the device specified by `source` and return that. + panic("unimplemented") + } + + // GetFilesystem call originated from within the sentry. + devFd, ok := opts.InternalData.(int) + if !ok { + return nil, errors.New("internal data for ext fs must be an int containing the file descriptor to device") + } + + if devFd < 0 { + return nil, fmt.Errorf("ext device file descriptor is not valid: %d", devFd) + } + + // The fd.ReadWriter returned from fd.NewReadWriter() does not take ownership + // of the file descriptor and hence will not close it when it is garbage + // collected. + return fd.NewReadWriter(devFd), nil +} + +// isCompatible checks if the superblock has feature sets which are compatible. +// We only need to check the superblock incompatible feature set since we are +// mounting readonly. We will also need to check readonly compatible feature +// set when mounting for read/write. 
+func isCompatible(sb disklayout.SuperBlock) bool { + // Please note that what is being checked is limited based on the fact that we + // are mounting readonly and that we are not journaling. When mounting + // read/write or with a journal, this must be reevaluated. + incompatFeatures := sb.IncompatibleFeatures() + if incompatFeatures.MetaBG { + log.Warningf("ext fs: meta block groups are not supported") + return false + } + if incompatFeatures.MMP { + log.Warningf("ext fs: multiple mount protection is not supported") + return false + } + if incompatFeatures.Encrypted { + log.Warningf("ext fs: encrypted inodes not supported") + return false + } + if incompatFeatures.InlineData { + log.Warningf("ext fs: inline files not supported") + return false + } + return true +} + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + // TODO(b/134676337): Ensure that the user is mounting readonly. If not, + // EACCESS should be returned according to mount(2). Filesystem independent + // flags (like readonly) are currently not available in pkg/sentry/vfs. + + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + + dev, err := getDeviceFd(source, opts) + if err != nil { + return nil, nil, err + } + + fs := filesystem{ + dev: dev, + inodeCache: make(map[uint32]*inode), + devMinor: devMinor, + } + fs.vfsfs.Init(vfsObj, &fsType, &fs) + fs.sb, err = readSuperBlock(dev) + if err != nil { + fs.vfsfs.DecRef() + return nil, nil, err + } + + if fs.sb.Magic() != linux.EXT_SUPER_MAGIC { + // mount(2) specifies that EINVAL should be returned if the superblock is + // invalid. 
+ fs.vfsfs.DecRef() + return nil, nil, syserror.EINVAL + } + + // Refuse to mount if the filesystem is incompatible. + if !isCompatible(fs.sb) { + fs.vfsfs.DecRef() + return nil, nil, syserror.EINVAL + } + + fs.bgs, err = readBlockGroups(dev, fs.sb) + if err != nil { + fs.vfsfs.DecRef() + return nil, nil, err + } + + rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode) + if err != nil { + fs.vfsfs.DecRef() + return nil, nil, err + } + rootInode.incRef() + + return &fs.vfsfs, &newDentry(rootInode).vfsd, nil +} diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go new file mode 100644 index 000000000..64e9a579f --- /dev/null +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -0,0 +1,921 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package ext + +import ( + "fmt" + "io" + "os" + "path" + "sort" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/test/testutil" + "gvisor.dev/gvisor/pkg/usermem" +) + +const ( + assetsDir = "pkg/sentry/fsimpl/ext/assets" +) + +var ( + ext2ImagePath = path.Join(assetsDir, "tiny.ext2") + ext3ImagePath = path.Join(assetsDir, "tiny.ext3") + ext4ImagePath = path.Join(assetsDir, "tiny.ext4") +) + +// setUp opens imagePath as an ext Filesystem and returns all necessary +// elements required to run tests. If error is non-nil, it also returns a tear +// down function which must be called after the test is run for clean up. +func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesystem, *vfs.VirtualDentry, func(), error) { + localImagePath, err := testutil.FindFile(imagePath) + if err != nil { + return nil, nil, nil, nil, fmt.Errorf("failed to open local image at path %s: %v", imagePath, err) + } + + f, err := os.Open(localImagePath) + if err != nil { + return nil, nil, nil, nil, err + } + + ctx := contexttest.Context(t) + creds := auth.CredentialsFromContext(ctx) + + // Create VFS. 
+ vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) + if err != nil { + f.Close() + return nil, nil, nil, nil, err + } + + root := mntns.Root() + + tearDown := func() { + root.DecRef() + + if err := f.Close(); err != nil { + t.Fatalf("tearDown failed: %v", err) + } + } + return ctx, vfsObj, &root, tearDown, nil +} + +// TODO(b/134676337): Test vfs.FilesystemImpl.ReadlinkAt and +// vfs.FilesystemImpl.StatFSAt which are not implemented in +// vfs.VirtualFilesystem yet. + +// TestSeek tests vfs.FileDescriptionImpl.Seek functionality. +func TestSeek(t *testing.T) { + type seekTest struct { + name string + image string + path string + } + + tests := []seekTest{ + { + name: "ext4 root dir seek", + image: ext4ImagePath, + path: "/", + }, + { + name: "ext3 root dir seek", + image: ext3ImagePath, + path: "/", + }, + { + name: "ext2 root dir seek", + image: ext2ImagePath, + path: "/", + }, + { + name: "ext4 reg file seek", + image: ext4ImagePath, + path: "/file.txt", + }, + { + name: "ext3 reg file seek", + image: ext3ImagePath, + path: "/file.txt", + }, + { + name: "ext2 reg file seek", + image: ext2ImagePath, + path: "/file.txt", + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ctx, vfsfs, root, tearDown, err := setUp(t, test.image) + if err != nil { + t.Fatalf("setUp failed: %v", err) + } + defer tearDown() + + fd, err := vfsfs.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt failed: %v", err) + } + + if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil { + 
t.Errorf("expected seek position 0, got %d and error %v", n, err) + } + + stat, err := fd.Stat(ctx, vfs.StatOptions{}) + if err != nil { + t.Errorf("fd.stat failed for file %s in image %s: %v", test.path, test.image, err) + } + + // We should be able to seek beyond the end of file. + size := int64(stat.Size) + if n, err := fd.Seek(ctx, size, linux.SEEK_SET); n != size || err != nil { + t.Errorf("expected seek position %d, got %d and error %v", size, n, err) + } + + // EINVAL should be returned if the resulting offset is negative. + if _, err := fd.Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL { + t.Errorf("expected error EINVAL but got %v", err) + } + + if n, err := fd.Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil { + t.Errorf("expected seek position %d, got %d and error %v", size+3, n, err) + } + + // Make sure negative offsets work with SEEK_CUR. + if n, err := fd.Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil { + t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err) + } + + // EINVAL should be returned if the resulting offset is negative. + if _, err := fd.Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL { + t.Errorf("expected error EINVAL but got %v", err) + } + + // Make sure SEEK_END works with regular files. + if _, ok := fd.Impl().(*regularFileFD); ok { + // Seek back to 0. + if n, err := fd.Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil { + t.Errorf("expected seek position %d, got %d and error %v", 0, n, err) + } + + // Seek forward beyond EOF. + if n, err := fd.Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil { + t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err) + } + + // EINVAL should be returned if the resulting offset is negative. + if _, err := fd.Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL { + t.Errorf("expected error EINVAL but got %v", err) + } + } + }) + } +} + +// TestStatAt tests filesystem.StatAt functionality. 
+func TestStatAt(t *testing.T) { + type statAtTest struct { + name string + image string + path string + want linux.Statx + } + + tests := []statAtTest{ + { + name: "ext4 statx small file", + image: ext4ImagePath, + path: "/file.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 0, + Mode: 0644 | linux.ModeRegular, + Size: 13, + }, + }, + { + name: "ext3 statx small file", + image: ext3ImagePath, + path: "/file.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 0, + Mode: 0644 | linux.ModeRegular, + Size: 13, + }, + }, + { + name: "ext2 statx small file", + image: ext2ImagePath, + path: "/file.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 0, + Mode: 0644 | linux.ModeRegular, + Size: 13, + }, + }, + { + name: "ext4 statx big file", + image: ext4ImagePath, + path: "/bigfile.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 0, + Mode: 0644 | linux.ModeRegular, + Size: 13042, + }, + }, + { + name: "ext3 statx big file", + image: ext3ImagePath, + path: "/bigfile.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 0, + Mode: 0644 | linux.ModeRegular, + Size: 13042, + }, + }, + { + name: "ext2 statx big file", + image: ext2ImagePath, + path: "/bigfile.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 0, + Mode: 0644 | linux.ModeRegular, + Size: 13042, + }, + }, + { + name: "ext4 statx symlink file", + image: ext4ImagePath, + path: "/symlink.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 0, + Mode: 0777 | linux.ModeSymlink, + Size: 8, + }, + }, + { + name: "ext3 statx symlink file", + image: ext3ImagePath, + path: "/symlink.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 0, + Mode: 0777 | linux.ModeSymlink, + Size: 8, + }, + }, + { + name: "ext2 statx symlink file", + image: ext2ImagePath, + path: "/symlink.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 
0, + Mode: 0777 | linux.ModeSymlink, + Size: 8, + }, + }, + } + + // Ignore the fields that are not supported by filesystem.StatAt yet and + // those which are likely to change as the image does. + ignoredFields := map[string]bool{ + "Attributes": true, + "AttributesMask": true, + "Atime": true, + "Blocks": true, + "Btime": true, + "Ctime": true, + "DevMajor": true, + "DevMinor": true, + "Ino": true, + "Mask": true, + "Mtime": true, + "RdevMajor": true, + "RdevMinor": true, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ctx, vfsfs, root, tearDown, err := setUp(t, test.image) + if err != nil { + t.Fatalf("setUp failed: %v", err) + } + defer tearDown() + + got, err := vfsfs.StatAt(ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)}, + &vfs.StatOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.StatAt failed for file %s in image %s: %v", test.path, test.image, err) + } + + cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool { + _, ok := ignoredFields[p.String()] + return ok + }, cmp.Ignore()) + if diff := cmp.Diff(got, test.want, cmpIgnoreFields, cmpopts.IgnoreUnexported(linux.Statx{})); diff != "" { + t.Errorf("stat mismatch (-want +got):\n%s", diff) + } + }) + } +} + +// TestRead tests the read functionality for vfs file descriptions. 
+func TestRead(t *testing.T) { + type readTest struct { + name string + image string + absPath string + } + + tests := []readTest{ + { + name: "ext4 read small file", + image: ext4ImagePath, + absPath: "/file.txt", + }, + { + name: "ext3 read small file", + image: ext3ImagePath, + absPath: "/file.txt", + }, + { + name: "ext2 read small file", + image: ext2ImagePath, + absPath: "/file.txt", + }, + { + name: "ext4 read big file", + image: ext4ImagePath, + absPath: "/bigfile.txt", + }, + { + name: "ext3 read big file", + image: ext3ImagePath, + absPath: "/bigfile.txt", + }, + { + name: "ext2 read big file", + image: ext2ImagePath, + absPath: "/bigfile.txt", + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ctx, vfsfs, root, tearDown, err := setUp(t, test.image) + if err != nil { + t.Fatalf("setUp failed: %v", err) + } + defer tearDown() + + fd, err := vfsfs.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.absPath)}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt failed: %v", err) + } + + // Get a local file descriptor and compare its functionality with a vfs file + // description for the same file. + localFile, err := testutil.FindFile(path.Join(assetsDir, test.absPath)) + if err != nil { + t.Fatalf("testutil.FindFile failed for %s: %v", test.absPath, err) + } + + f, err := os.Open(localFile) + if err != nil { + t.Fatalf("os.Open failed for %s: %v", localFile, err) + } + defer f.Close() + + // Read the entire file by reading one byte repeatedly. Doing this stress + // tests the underlying file reader implementation. + got := make([]byte, 1) + want := make([]byte, 1) + for { + n, err := f.Read(want) + fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}) + + if diff := cmp.Diff(got, want); diff != "" { + t.Errorf("file data mismatch (-want +got):\n%s", diff) + } + + // Make sure there is no more file data left after getting EOF. 
+ if n == 0 || err == io.EOF { + if n, _ := fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 { + t.Errorf("extra unexpected file data in file %s in image %s", test.absPath, test.image) + } + + break + } + + if err != nil { + t.Fatalf("read failed: %v", err) + } + } + }) + } +} + +// iterDirentsCb is a simple callback which just keeps adding the dirents to an +// internal list. Implements vfs.IterDirentsCallback. +type iterDirentsCb struct { + dirents []vfs.Dirent +} + +// Compiles only if iterDirentCb implements vfs.IterDirentsCallback. +var _ vfs.IterDirentsCallback = (*iterDirentsCb)(nil) + +// newIterDirentsCb is the iterDirent +func newIterDirentCb() *iterDirentsCb { + return &iterDirentsCb{dirents: make([]vfs.Dirent, 0)} +} + +// Handle implements vfs.IterDirentsCallback.Handle. +func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) error { + cb.dirents = append(cb.dirents, dirent) + return nil +} + +// TestIterDirents tests the FileDescriptionImpl.IterDirents functionality. 
+func TestIterDirents(t *testing.T) { + type iterDirentTest struct { + name string + image string + path string + want []vfs.Dirent + } + + wantDirents := []vfs.Dirent{ + { + Name: ".", + Type: linux.DT_DIR, + }, + { + Name: "..", + Type: linux.DT_DIR, + }, + { + Name: "lost+found", + Type: linux.DT_DIR, + }, + { + Name: "file.txt", + Type: linux.DT_REG, + }, + { + Name: "bigfile.txt", + Type: linux.DT_REG, + }, + { + Name: "symlink.txt", + Type: linux.DT_LNK, + }, + } + tests := []iterDirentTest{ + { + name: "ext4 root dir iteration", + image: ext4ImagePath, + path: "/", + want: wantDirents, + }, + { + name: "ext3 root dir iteration", + image: ext3ImagePath, + path: "/", + want: wantDirents, + }, + { + name: "ext2 root dir iteration", + image: ext2ImagePath, + path: "/", + want: wantDirents, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ctx, vfsfs, root, tearDown, err := setUp(t, test.image) + if err != nil { + t.Fatalf("setUp failed: %v", err) + } + defer tearDown() + + fd, err := vfsfs.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt failed: %v", err) + } + + cb := &iterDirentsCb{} + if err = fd.IterDirents(ctx, cb); err != nil { + t.Fatalf("dir fd.IterDirents() failed: %v", err) + } + + sort.Slice(cb.dirents, func(i int, j int) bool { return cb.dirents[i].Name < cb.dirents[j].Name }) + sort.Slice(test.want, func(i int, j int) bool { return test.want[i].Name < test.want[j].Name }) + + // Ignore the inode number and offset of dirents because those are likely to + // change as the underlying image changes. 
+ cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool { + return p.String() == "Ino" || p.String() == "NextOff" + }, cmp.Ignore()) + if diff := cmp.Diff(cb.dirents, test.want, cmpIgnoreFields); diff != "" { + t.Errorf("dirents mismatch (-want +got):\n%s", diff) + } + }) + } +} + +// TestRootDir tests that the root directory inode is correctly initialized and +// returned from setUp. +func TestRootDir(t *testing.T) { + type inodeProps struct { + Mode linux.FileMode + UID auth.KUID + GID auth.KGID + Size uint64 + InodeSize uint16 + Links uint16 + Flags disklayout.InodeFlags + } + + type rootDirTest struct { + name string + image string + wantInode inodeProps + } + + tests := []rootDirTest{ + { + name: "ext4 root dir", + image: ext4ImagePath, + wantInode: inodeProps{ + Mode: linux.ModeDirectory | 0755, + Size: 0x400, + InodeSize: 0x80, + Links: 3, + Flags: disklayout.InodeFlags{Extents: true}, + }, + }, + { + name: "ext3 root dir", + image: ext3ImagePath, + wantInode: inodeProps{ + Mode: linux.ModeDirectory | 0755, + Size: 0x400, + InodeSize: 0x80, + Links: 3, + }, + }, + { + name: "ext2 root dir", + image: ext2ImagePath, + wantInode: inodeProps{ + Mode: linux.ModeDirectory | 0755, + Size: 0x400, + InodeSize: 0x80, + Links: 3, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + _, _, vd, tearDown, err := setUp(t, test.image) + if err != nil { + t.Fatalf("setUp failed: %v", err) + } + defer tearDown() + + d, ok := vd.Dentry().Impl().(*dentry) + if !ok { + t.Fatalf("ext dentry of incorrect type: %T", vd.Dentry().Impl()) + } + + // Offload inode contents into local structs for comparison. 
+ gotInode := inodeProps{ + Mode: d.inode.diskInode.Mode(), + UID: d.inode.diskInode.UID(), + GID: d.inode.diskInode.GID(), + Size: d.inode.diskInode.Size(), + InodeSize: d.inode.diskInode.InodeSize(), + Links: d.inode.diskInode.LinksCount(), + Flags: d.inode.diskInode.Flags(), + } + + if diff := cmp.Diff(gotInode, test.wantInode); diff != "" { + t.Errorf("inode mismatch (-want +got):\n%s", diff) + } + }) + } +} + +// TestFilesystemInit tests that the filesystem superblock and block group +// descriptors are correctly read in and initialized. +func TestFilesystemInit(t *testing.T) { + // sb only contains the immutable properties of the superblock. + type sb struct { + InodesCount uint32 + BlocksCount uint64 + MaxMountCount uint16 + FirstDataBlock uint32 + BlockSize uint64 + BlocksPerGroup uint32 + ClusterSize uint64 + ClustersPerGroup uint32 + InodeSize uint16 + InodesPerGroup uint32 + BgDescSize uint16 + Magic uint16 + Revision disklayout.SbRevision + CompatFeatures disklayout.CompatFeatures + IncompatFeatures disklayout.IncompatFeatures + RoCompatFeatures disklayout.RoCompatFeatures + } + + // bg only contains the immutable properties of the block group descriptor. 
+ type bg struct { + InodeTable uint64 + BlockBitmap uint64 + InodeBitmap uint64 + ExclusionBitmap uint64 + Flags disklayout.BGFlags + } + + type fsInitTest struct { + name string + image string + wantSb sb + wantBgs []bg + } + + tests := []fsInitTest{ + { + name: "ext4 filesystem init", + image: ext4ImagePath, + wantSb: sb{ + InodesCount: 0x10, + BlocksCount: 0x40, + MaxMountCount: 0xffff, + FirstDataBlock: 0x1, + BlockSize: 0x400, + BlocksPerGroup: 0x2000, + ClusterSize: 0x400, + ClustersPerGroup: 0x2000, + InodeSize: 0x80, + InodesPerGroup: 0x10, + BgDescSize: 0x40, + Magic: linux.EXT_SUPER_MAGIC, + Revision: disklayout.DynamicRev, + CompatFeatures: disklayout.CompatFeatures{ + ExtAttr: true, + ResizeInode: true, + DirIndex: true, + }, + IncompatFeatures: disklayout.IncompatFeatures{ + DirentFileType: true, + Extents: true, + Is64Bit: true, + FlexBg: true, + }, + RoCompatFeatures: disklayout.RoCompatFeatures{ + Sparse: true, + LargeFile: true, + HugeFile: true, + DirNlink: true, + ExtraIsize: true, + MetadataCsum: true, + }, + }, + wantBgs: []bg{ + { + InodeTable: 0x23, + BlockBitmap: 0x3, + InodeBitmap: 0x13, + Flags: disklayout.BGFlags{ + InodeZeroed: true, + }, + }, + }, + }, + { + name: "ext3 filesystem init", + image: ext3ImagePath, + wantSb: sb{ + InodesCount: 0x10, + BlocksCount: 0x40, + MaxMountCount: 0xffff, + FirstDataBlock: 0x1, + BlockSize: 0x400, + BlocksPerGroup: 0x2000, + ClusterSize: 0x400, + ClustersPerGroup: 0x2000, + InodeSize: 0x80, + InodesPerGroup: 0x10, + BgDescSize: 0x20, + Magic: linux.EXT_SUPER_MAGIC, + Revision: disklayout.DynamicRev, + CompatFeatures: disklayout.CompatFeatures{ + ExtAttr: true, + ResizeInode: true, + DirIndex: true, + }, + IncompatFeatures: disklayout.IncompatFeatures{ + DirentFileType: true, + }, + RoCompatFeatures: disklayout.RoCompatFeatures{ + Sparse: true, + LargeFile: true, + }, + }, + wantBgs: []bg{ + { + InodeTable: 0x5, + BlockBitmap: 0x3, + InodeBitmap: 0x4, + Flags: disklayout.BGFlags{ + InodeZeroed: true, 
+ }, + }, + }, + }, + { + name: "ext2 filesystem init", + image: ext2ImagePath, + wantSb: sb{ + InodesCount: 0x10, + BlocksCount: 0x40, + MaxMountCount: 0xffff, + FirstDataBlock: 0x1, + BlockSize: 0x400, + BlocksPerGroup: 0x2000, + ClusterSize: 0x400, + ClustersPerGroup: 0x2000, + InodeSize: 0x80, + InodesPerGroup: 0x10, + BgDescSize: 0x20, + Magic: linux.EXT_SUPER_MAGIC, + Revision: disklayout.DynamicRev, + CompatFeatures: disklayout.CompatFeatures{ + ExtAttr: true, + ResizeInode: true, + DirIndex: true, + }, + IncompatFeatures: disklayout.IncompatFeatures{ + DirentFileType: true, + }, + RoCompatFeatures: disklayout.RoCompatFeatures{ + Sparse: true, + LargeFile: true, + }, + }, + wantBgs: []bg{ + { + InodeTable: 0x5, + BlockBitmap: 0x3, + InodeBitmap: 0x4, + Flags: disklayout.BGFlags{ + InodeZeroed: true, + }, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + _, _, vd, tearDown, err := setUp(t, test.image) + if err != nil { + t.Fatalf("setUp failed: %v", err) + } + defer tearDown() + + fs, ok := vd.Mount().Filesystem().Impl().(*filesystem) + if !ok { + t.Fatalf("ext filesystem of incorrect type: %T", vd.Mount().Filesystem().Impl()) + } + + // Offload superblock and block group descriptors contents into + // local structs for comparison. 
+ totalFreeInodes := uint32(0) + totalFreeBlocks := uint64(0) + gotSb := sb{ + InodesCount: fs.sb.InodesCount(), + BlocksCount: fs.sb.BlocksCount(), + MaxMountCount: fs.sb.MaxMountCount(), + FirstDataBlock: fs.sb.FirstDataBlock(), + BlockSize: fs.sb.BlockSize(), + BlocksPerGroup: fs.sb.BlocksPerGroup(), + ClusterSize: fs.sb.ClusterSize(), + ClustersPerGroup: fs.sb.ClustersPerGroup(), + InodeSize: fs.sb.InodeSize(), + InodesPerGroup: fs.sb.InodesPerGroup(), + BgDescSize: fs.sb.BgDescSize(), + Magic: fs.sb.Magic(), + Revision: fs.sb.Revision(), + CompatFeatures: fs.sb.CompatibleFeatures(), + IncompatFeatures: fs.sb.IncompatibleFeatures(), + RoCompatFeatures: fs.sb.ReadOnlyCompatibleFeatures(), + } + gotNumBgs := len(fs.bgs) + gotBgs := make([]bg, gotNumBgs) + for i := 0; i < gotNumBgs; i++ { + gotBgs[i].InodeTable = fs.bgs[i].InodeTable() + gotBgs[i].BlockBitmap = fs.bgs[i].BlockBitmap() + gotBgs[i].InodeBitmap = fs.bgs[i].InodeBitmap() + gotBgs[i].ExclusionBitmap = fs.bgs[i].ExclusionBitmap() + gotBgs[i].Flags = fs.bgs[i].Flags() + + totalFreeInodes += fs.bgs[i].FreeInodesCount() + totalFreeBlocks += uint64(fs.bgs[i].FreeBlocksCount()) + } + + if diff := cmp.Diff(gotSb, test.wantSb); diff != "" { + t.Errorf("superblock mismatch (-want +got):\n%s", diff) + } + + if diff := cmp.Diff(gotBgs, test.wantBgs); diff != "" { + t.Errorf("block group descriptors mismatch (-want +got):\n%s", diff) + } + + if diff := cmp.Diff(totalFreeInodes, fs.sb.FreeInodesCount()); diff != "" { + t.Errorf("total free inodes mismatch (-want +got):\n%s", diff) + } + + if diff := cmp.Diff(totalFreeBlocks, fs.sb.FreeBlocksCount()); diff != "" { + t.Errorf("total free blocks mismatch (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go new file mode 100644 index 000000000..c36225a7c --- /dev/null +++ b/pkg/sentry/fsimpl/ext/extent_file.go @@ -0,0 +1,238 @@ +// Copyright 2019 The gVisor Authors. 
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ext

import (
	"io"
	"sort"

	"gvisor.dev/gvisor/pkg/binary"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
	"gvisor.dev/gvisor/pkg/syserror"
)

// extentFile is a type of regular file which uses extents to store file data.
type extentFile struct {
	// regFile is the regular-file state of this file; its impl field points
	// back at this extentFile (see newExtentFile).
	regFile regularFile

	// root is the root extent node. This lives in the 60 byte diskInode.Data().
	// Immutable.
	root disklayout.ExtentNode
}

// Compiles only if extentFile implements io.ReaderAt.
var _ io.ReaderAt = (*extentFile)(nil)

// newExtentFile is the extent file constructor. It reads the entire extent
// tree into memory and returns an error if the tree cannot be built.
// TODO(b/134676337): Build extent tree on demand to reduce memory usage.
func newExtentFile(args inodeArgs) (*extentFile, error) {
	file := &extentFile{}
	file.regFile.impl = file
	// The inode must be initialized before building the tree, because
	// buildExtTree reads the root node out of the inode's on-disk data.
	file.regFile.inode.init(args, &file.regFile)
	err := file.buildExtTree()
	if err != nil {
		return nil, err
	}
	return file, nil
}

// buildExtTree builds the extent tree by running a simple DFS. It first reads
// the root node from the inode struct in memory. Then it recursively builds
// the rest of the tree by reading it off disk.
//
// Precondition: inode flag InExtents must be set.
+func (f *extentFile) buildExtTree() error { + rootNodeData := f.regFile.inode.diskInode.Data() + + binary.Unmarshal(rootNodeData[:disklayout.ExtentHeaderSize], binary.LittleEndian, &f.root.Header) + + // Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries. + if f.root.Header.NumEntries > 4 { + // read(2) specifies that EINVAL should be returned if the file is unsuitable + // for reading. + return syserror.EINVAL + } + + f.root.Entries = make([]disklayout.ExtentEntryPair, f.root.Header.NumEntries) + for i, off := uint16(0), disklayout.ExtentEntrySize; i < f.root.Header.NumEntries; i, off = i+1, off+disklayout.ExtentEntrySize { + var curEntry disklayout.ExtentEntry + if f.root.Header.Height == 0 { + // Leaf node. + curEntry = &disklayout.Extent{} + } else { + // Internal node. + curEntry = &disklayout.ExtentIdx{} + } + binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentEntrySize], binary.LittleEndian, curEntry) + f.root.Entries[i].Entry = curEntry + } + + // If this node is internal, perform DFS. + if f.root.Header.Height > 0 { + for i := uint16(0); i < f.root.Header.NumEntries; i++ { + var err error + if f.root.Entries[i].Node, err = f.buildExtTreeFromDisk(f.root.Entries[i].Entry); err != nil { + return err + } + } + } + + return nil +} + +// buildExtTreeFromDisk reads the extent tree nodes from disk and recursively +// builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to +// by the ExtentEntry. 
+func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*disklayout.ExtentNode, error) { + var header disklayout.ExtentHeader + off := entry.PhysicalBlock() * f.regFile.inode.blkSize + err := readFromDisk(f.regFile.inode.fs.dev, int64(off), &header) + if err != nil { + return nil, err + } + + entries := make([]disklayout.ExtentEntryPair, header.NumEntries) + for i, off := uint16(0), off+disklayout.ExtentEntrySize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentEntrySize { + var curEntry disklayout.ExtentEntry + if header.Height == 0 { + // Leaf node. + curEntry = &disklayout.Extent{} + } else { + // Internal node. + curEntry = &disklayout.ExtentIdx{} + } + + err := readFromDisk(f.regFile.inode.fs.dev, int64(off), curEntry) + if err != nil { + return nil, err + } + entries[i].Entry = curEntry + } + + // If this node is internal, perform DFS. + if header.Height > 0 { + for i := uint16(0); i < header.NumEntries; i++ { + var err error + entries[i].Node, err = f.buildExtTreeFromDisk(entries[i].Entry) + if err != nil { + return nil, err + } + } + } + + return &disklayout.ExtentNode{header, entries}, nil +} + +// ReadAt implements io.ReaderAt.ReadAt. +func (f *extentFile) ReadAt(dst []byte, off int64) (int, error) { + if len(dst) == 0 { + return 0, nil + } + + if off < 0 { + return 0, syserror.EINVAL + } + + if uint64(off) >= f.regFile.inode.diskInode.Size() { + return 0, io.EOF + } + + n, err := f.read(&f.root, uint64(off), dst) + if n < len(dst) && err == nil { + err = io.EOF + } + return n, err +} + +// read is the recursive step of extentFile.ReadAt which traverses the extent +// tree from the node passed and reads file data. +func (f *extentFile) read(node *disklayout.ExtentNode, off uint64, dst []byte) (int, error) { + // Perform a binary search for the node covering bytes starting at r.fileOff. + // A highly fragmented filesystem can have upto 340 entries and so linear + // search should be avoided. 
Finds the first entry which does not cover the + // file block we want and subtracts 1 to get the desired index. + fileBlk := uint32(off / f.regFile.inode.blkSize) + n := len(node.Entries) + found := sort.Search(n, func(i int) bool { + return node.Entries[i].Entry.FileBlock() > fileBlk + }) - 1 + + // We should be in this recursive step only if the data we want exists under + // the current node. + if found < 0 { + panic("searching for a file block in an extent entry which does not cover it") + } + + read := 0 + toRead := len(dst) + var curR int + var err error + for i := found; i < n && read < toRead; i++ { + if node.Header.Height == 0 { + curR, err = f.readFromExtent(node.Entries[i].Entry.(*disklayout.Extent), off, dst[read:]) + } else { + curR, err = f.read(node.Entries[i].Node, off, dst[read:]) + } + + read += curR + off += uint64(curR) + if err != nil { + return read, err + } + } + + return read, nil +} + +// readFromExtent reads file data from the extent. It takes advantage of the +// sequential nature of extents and reads file data from multiple blocks in one +// call. +// +// A non-nil error indicates that this is a partial read and there is probably +// more to read from this extent. The caller should propagate the error upward +// and not move to the next extent in the tree. +// +// A subsequent call to extentReader.Read should continue reading from where we +// left off as expected. +func (f *extentFile) readFromExtent(ex *disklayout.Extent, off uint64, dst []byte) (int, error) { + curFileBlk := uint32(off / f.regFile.inode.blkSize) + exFirstFileBlk := ex.FileBlock() + exLastFileBlk := exFirstFileBlk + uint32(ex.Length) // This is exclusive. + + // We should be in this recursive step only if the data we want exists under + // the current extent. 
+ if curFileBlk < exFirstFileBlk || exLastFileBlk <= curFileBlk { + panic("searching for a file block in an extent which does not cover it") + } + + curPhyBlk := uint64(curFileBlk-exFirstFileBlk) + ex.PhysicalBlock() + readStart := curPhyBlk*f.regFile.inode.blkSize + (off % f.regFile.inode.blkSize) + + endPhyBlk := ex.PhysicalBlock() + uint64(ex.Length) + extentEnd := endPhyBlk * f.regFile.inode.blkSize // This is exclusive. + + toRead := int(extentEnd - readStart) + if len(dst) < toRead { + toRead = len(dst) + } + + n, _ := f.regFile.inode.fs.dev.ReadAt(dst[:toRead], int64(readStart)) + if n < toRead { + return n, syserror.EIO + } + return n, nil +} diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go new file mode 100644 index 000000000..cd10d46ee --- /dev/null +++ b/pkg/sentry/fsimpl/ext/extent_test.go @@ -0,0 +1,265 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "bytes" + "math/rand" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" +) + +const ( + // mockExtentBlkSize is the mock block size used for testing. + // No block has more than 1 header + 4 entries. 
+ mockExtentBlkSize = uint64(64) +) + +// The tree described below looks like: +// +// 0.{Head}[Idx][Idx] +// / \ +// / \ +// 1.{Head}[Ext][Ext] 2.{Head}[Idx] +// / | \ +// [Phy] [Phy, Phy] 3.{Head}[Ext] +// | +// [Phy, Phy, Phy] +// +// Legend: +// - Head = ExtentHeader +// - Idx = ExtentIdx +// - Ext = Extent +// - Phy = Physical Block +// +// Please note that ext4 might not construct extent trees looking like this. +// This is purely for testing the tree traversal logic. +var ( + node3 = &disklayout.ExtentNode{ + Header: disklayout.ExtentHeader{ + Magic: disklayout.ExtentMagic, + NumEntries: 1, + MaxEntries: 4, + Height: 0, + }, + Entries: []disklayout.ExtentEntryPair{ + { + Entry: &disklayout.Extent{ + FirstFileBlock: 3, + Length: 3, + StartBlockLo: 6, + }, + Node: nil, + }, + }, + } + + node2 = &disklayout.ExtentNode{ + Header: disklayout.ExtentHeader{ + Magic: disklayout.ExtentMagic, + NumEntries: 1, + MaxEntries: 4, + Height: 1, + }, + Entries: []disklayout.ExtentEntryPair{ + { + Entry: &disklayout.ExtentIdx{ + FirstFileBlock: 3, + ChildBlockLo: 2, + }, + Node: node3, + }, + }, + } + + node1 = &disklayout.ExtentNode{ + Header: disklayout.ExtentHeader{ + Magic: disklayout.ExtentMagic, + NumEntries: 2, + MaxEntries: 4, + Height: 0, + }, + Entries: []disklayout.ExtentEntryPair{ + { + Entry: &disklayout.Extent{ + FirstFileBlock: 0, + Length: 1, + StartBlockLo: 3, + }, + Node: nil, + }, + { + Entry: &disklayout.Extent{ + FirstFileBlock: 1, + Length: 2, + StartBlockLo: 4, + }, + Node: nil, + }, + }, + } + + node0 = &disklayout.ExtentNode{ + Header: disklayout.ExtentHeader{ + Magic: disklayout.ExtentMagic, + NumEntries: 2, + MaxEntries: 4, + Height: 2, + }, + Entries: []disklayout.ExtentEntryPair{ + { + Entry: &disklayout.ExtentIdx{ + FirstFileBlock: 0, + ChildBlockLo: 0, + }, + Node: node1, + }, + { + Entry: &disklayout.ExtentIdx{ + FirstFileBlock: 3, + ChildBlockLo: 1, + }, + Node: node2, + }, + }, + } +) + +// TestExtentReader stress tests extentReader 
functionality. It performs random +// length reads from all possible positions in the extent tree. +func TestExtentReader(t *testing.T) { + mockExtentFile, want := extentTreeSetUp(t, node0) + n := len(want) + + for from := 0; from < n; from++ { + got := make([]byte, n-from) + + if read, err := mockExtentFile.ReadAt(got, int64(from)); err != nil { + t.Fatalf("file read operation from offset %d to %d only read %d bytes: %v", from, n, read, err) + } + + if diff := cmp.Diff(got, want[from:]); diff != "" { + t.Fatalf("file data from offset %d to %d mismatched (-want +got):\n%s", from, n, diff) + } + } +} + +// TestBuildExtentTree tests the extent tree building logic. +func TestBuildExtentTree(t *testing.T) { + mockExtentFile, _ := extentTreeSetUp(t, node0) + + opt := cmpopts.IgnoreUnexported(disklayout.ExtentIdx{}, disklayout.ExtentHeader{}) + if diff := cmp.Diff(&mockExtentFile.root, node0, opt); diff != "" { + t.Errorf("extent tree mismatch (-want +got):\n%s", diff) + } +} + +// extentTreeSetUp writes the passed extent tree to a mock disk as an extent +// tree. It also constucts a mock extent file with the same tree built in it. +// It also writes random data file data and returns it. 
+func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, []byte) { + t.Helper() + + mockDisk := make([]byte, mockExtentBlkSize*10) + mockExtentFile := &extentFile{} + args := inodeArgs{ + fs: &filesystem{ + dev: bytes.NewReader(mockDisk), + }, + diskInode: &disklayout.InodeNew{ + InodeOld: disklayout.InodeOld{ + SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root), + }, + }, + blkSize: mockExtentBlkSize, + } + mockExtentFile.regFile.inode.init(args, &mockExtentFile.regFile) + + fileData := writeTree(&mockExtentFile.regFile.inode, mockDisk, node0, mockExtentBlkSize) + + if err := mockExtentFile.buildExtTree(); err != nil { + t.Fatalf("inode.buildExtTree failed: %v", err) + } + return mockExtentFile, fileData +} + +// writeTree writes the tree represented by `root` to the inode and disk. It +// also writes random file data on disk. +func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBlkSize uint64) []byte { + rootData := binary.Marshal(nil, binary.LittleEndian, root.Header) + for _, ep := range root.Entries { + rootData = binary.Marshal(rootData, binary.LittleEndian, ep.Entry) + } + + copy(in.diskInode.Data(), rootData) + + var fileData []byte + for _, ep := range root.Entries { + if root.Header.Height == 0 { + fileData = append(fileData, writeFileDataToExtent(disk, ep.Entry.(*disklayout.Extent))...) + } else { + fileData = append(fileData, writeTreeToDisk(disk, ep)...) + } + } + return fileData +} + +// writeTreeToDisk is the recursive step for writeTree which writes the tree +// on the disk only. Also writes random file data on disk. 
+func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair) []byte { + nodeData := binary.Marshal(nil, binary.LittleEndian, curNode.Node.Header) + for _, ep := range curNode.Node.Entries { + nodeData = binary.Marshal(nodeData, binary.LittleEndian, ep.Entry) + } + + copy(disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:], nodeData) + + var fileData []byte + for _, ep := range curNode.Node.Entries { + if curNode.Node.Header.Height == 0 { + fileData = append(fileData, writeFileDataToExtent(disk, ep.Entry.(*disklayout.Extent))...) + } else { + fileData = append(fileData, writeTreeToDisk(disk, ep)...) + } + } + return fileData +} + +// writeFileDataToExtent writes random bytes to the blocks on disk that the +// passed extent points to. +func writeFileDataToExtent(disk []byte, ex *disklayout.Extent) []byte { + phyExStartBlk := ex.PhysicalBlock() + phyExStartOff := phyExStartBlk * mockExtentBlkSize + phyExEndOff := phyExStartOff + uint64(ex.Length)*mockExtentBlkSize + rand.Read(disk[phyExStartOff:phyExEndOff]) + return disk[phyExStartOff:phyExEndOff] +} + +// getNumPhyBlks returns the number of physical blocks covered under the node. +func getNumPhyBlks(node *disklayout.ExtentNode) uint32 { + var res uint32 + for _, ep := range node.Entries { + if node.Header.Height == 0 { + res += uint32(ep.Entry.(*disklayout.Extent).Length) + } else { + res += getNumPhyBlks(ep.Node) + } + } + return res +} diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go new file mode 100644 index 000000000..90b086468 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/file_description.go @@ -0,0 +1,65 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ext

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
)

// fileDescription is embedded by ext implementations of
// vfs.FileDescriptionImpl.
type fileDescription struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD
}

// filesystem returns the ext filesystem that fd's mount belongs to. It panics
// if the mount's filesystem implementation is not *filesystem.
func (fd *fileDescription) filesystem() *filesystem {
	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
}

// inode returns the inode behind fd's dentry. It panics if the dentry
// implementation is not *dentry.
func (fd *fileDescription) inode() *inode {
	return fd.vfsfd.Dentry().Impl().(*dentry).inode
}

// Stat implements vfs.FileDescriptionImpl.Stat.
func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
	var stat linux.Statx
	fd.inode().statTo(&stat)
	return stat, nil
}

// SetStat implements vfs.FileDescriptionImpl.SetStat. A request that changes
// nothing (empty mask) succeeds; any other request fails with EPERM.
func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
	if opts.Stat.Mask == 0 {
		return nil
	}
	return syserror.EPERM
}

// StatFS implements vfs.FileDescriptionImpl.StatFS.
func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
	var stat linux.Statfs
	fd.filesystem().statTo(&stat)
	return stat, nil
}

// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *fileDescription) Sync(ctx context.Context) error { + return nil +} diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go new file mode 100644 index 000000000..557963e03 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -0,0 +1,548 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "errors" + "io" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +var ( + // errResolveDirent indicates that the vfs.ResolvingPath.Component() does + // not exist on the dentry tree but does exist on disk. So it has to be read in + // using the in-memory dirent and added to the dentry tree. Usually indicates + // the need to lock filesystem.mu for writing. + errResolveDirent = errors.New("resolve path component using dirent") +) + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + vfsfs vfs.Filesystem + + // mu serializes changes to the Dentry tree. + mu sync.RWMutex + + // dev represents the underlying fs device. It does not require protection + // because io.ReaderAt permits concurrent read calls to it. 
It translates to + // the pread syscall which passes on the read request directly to the device + // driver. Device drivers are intelligent in serving multiple concurrent read + // requests in the optimal order (taking locality into consideration). + dev io.ReaderAt + + // inodeCache maps absolute inode numbers to the corresponding Inode struct. + // Inodes should be removed from this once their reference count hits 0. + // + // Protected by mu because most additions (see IterDirents) and all removals + // from this corresponds to a change in the dentry tree. + inodeCache map[uint32]*inode + + // sb represents the filesystem superblock. Immutable after initialization. + sb disklayout.SuperBlock + + // bgs represents all the block group descriptors for the filesystem. + // Immutable after initialization. + bgs []disklayout.BlockGroup + + // devMinor is this filesystem's device minor number. Immutable after + // initialization. + devMinor uint32 +} + +// Compiles only if filesystem implements vfs.FilesystemImpl. +var _ vfs.FilesystemImpl = (*filesystem)(nil) + +// stepLocked resolves rp.Component() in parent directory vfsd. The write +// parameter passed tells if the caller has acquired filesystem.mu for writing +// or not. If set to true, an existing inode on disk can be added to the dentry +// tree if not present already. +// +// stepLocked is loosely analogous to fs/namei.c:walk_component(). +// +// Preconditions: +// - filesystem.mu must be locked (for writing if write param is true). +// - !rp.Done(). +// - inode == vfsd.Impl().(*Dentry).inode. +func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) { + if !inode.isDir() { + return nil, nil, syserror.ENOTDIR + } + if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, nil, err + } + + for { + name := rp.Component() + if name == "." 
{ + rp.Advance() + return vfsd, inode, nil + } + d := vfsd.Impl().(*dentry) + if name == ".." { + isRoot, err := rp.CheckRoot(vfsd) + if err != nil { + return nil, nil, err + } + if isRoot || d.parent == nil { + rp.Advance() + return vfsd, inode, nil + } + if err := rp.CheckMount(&d.parent.vfsd); err != nil { + return nil, nil, err + } + rp.Advance() + return &d.parent.vfsd, d.parent.inode, nil + } + + dir := inode.impl.(*directory) + child, ok := dir.childCache[name] + if !ok { + // We may need to instantiate a new dentry for this child. + childDirent, ok := dir.childMap[name] + if !ok { + // The underlying inode does not exist on disk. + return nil, nil, syserror.ENOENT + } + + if !write { + // filesystem.mu must be held for writing to add to the dentry tree. + return nil, nil, errResolveDirent + } + + // Create and add the component's dirent to the dentry tree. + fs := rp.Mount().Filesystem().Impl().(*filesystem) + childInode, err := fs.getOrCreateInodeLocked(childDirent.diskDirent.Inode()) + if err != nil { + return nil, nil, err + } + // incRef because this is being added to the dentry tree. + childInode.incRef() + child = newDentry(childInode) + child.parent = d + child.name = name + dir.childCache[name] = child + } + if err := rp.CheckMount(&child.vfsd); err != nil { + return nil, nil, err + } + if child.inode.isSymlink() && rp.ShouldFollowSymlink() { + if err := rp.HandleSymlink(child.inode.impl.(*symlink).target); err != nil { + return nil, nil, err + } + continue + } + rp.Advance() + return &child.vfsd, child.inode, nil + } +} + +// walkLocked resolves rp to an existing file. The write parameter +// passed tells if the caller has acquired filesystem.mu for writing or not. +// If set to true, additions can be made to the dentry tree while walking. +// If errResolveDirent is returned, the walk needs to be continued with an +// upgraded filesystem.mu. +// +// walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). 
+// +// Preconditions: +// - filesystem.mu must be locked (for writing if write param is true). +func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { + vfsd := rp.Start() + inode := vfsd.Impl().(*dentry).inode + for !rp.Done() { + var err error + vfsd, inode, err = stepLocked(rp, vfsd, inode, write) + if err != nil { + return nil, nil, err + } + } + if rp.MustBeDir() && !inode.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, inode, nil +} + +// walkParentLocked resolves all but the last path component of rp to an +// existing directory. It does not check that the returned directory is +// searchable by the provider of rp. The write parameter passed tells if the +// caller has acquired filesystem.mu for writing or not. If set to true, +// additions can be made to the dentry tree while walking. +// If errResolveDirent is returned, the walk needs to be continued with an +// upgraded filesystem.mu. +// +// walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat(). +// +// Preconditions: +// - filesystem.mu must be locked (for writing if write param is true). +// - !rp.Done(). +func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { + vfsd := rp.Start() + inode := vfsd.Impl().(*dentry).inode + for !rp.Final() { + var err error + vfsd, inode, err = stepLocked(rp, vfsd, inode, write) + if err != nil { + return nil, nil, err + } + } + if !inode.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, inode, nil +} + +// walk resolves rp to an existing file. If parent is set to true, it resolves +// the rp till the parent of the last component which should be an existing +// directory. If parent is false then resolves rp entirely. Attemps to resolve +// the path as far as it can with a read lock and upgrades the lock if needed. 
+func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) { + var ( + vfsd *vfs.Dentry + inode *inode + err error + ) + + // Try walking with the hopes that all dentries have already been pulled out + // of disk. This reduces congestion (allows concurrent walks). + fs.mu.RLock() + if parent { + vfsd, inode, err = walkParentLocked(rp, false) + } else { + vfsd, inode, err = walkLocked(rp, false) + } + fs.mu.RUnlock() + + if err == errResolveDirent { + // Upgrade lock and continue walking. Lock upgrading in the middle of the + // walk is fine as this is a read only filesystem. + fs.mu.Lock() + if parent { + vfsd, inode, err = walkParentLocked(rp, true) + } else { + vfsd, inode, err = walkLocked(rp, true) + } + fs.mu.Unlock() + } + + return vfsd, inode, err +} + +// getOrCreateInodeLocked gets the inode corresponding to the inode number passed in. +// It creates a new one with the given inode number if one does not exist. +// The caller must increment the ref count if adding this to the dentry tree. +// +// Precondition: must be holding fs.mu for writing. +func (fs *filesystem) getOrCreateInodeLocked(inodeNum uint32) (*inode, error) { + if in, ok := fs.inodeCache[inodeNum]; ok { + return in, nil + } + + in, err := newInode(fs, inodeNum) + if err != nil { + return nil, err + } + + fs.inodeCache[inodeNum] = in + return in, nil +} + +// statTo writes the statfs fields to the output parameter. +func (fs *filesystem) statTo(stat *linux.Statfs) { + stat.Type = uint64(fs.sb.Magic()) + stat.BlockSize = int64(fs.sb.BlockSize()) + stat.Blocks = fs.sb.BlocksCount() + stat.BlocksFree = fs.sb.FreeBlocksCount() + stat.BlocksAvailable = fs.sb.FreeBlocksCount() + stat.Files = uint64(fs.sb.InodesCount()) + stat.FilesFree = uint64(fs.sb.FreeInodesCount()) + stat.NameLength = disklayout.MaxFileName + stat.FragmentSize = int64(fs.sb.BlockSize()) + // TODO(b/134676337): Set Statfs.Flags and Statfs.FSID. 
+} + +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + _, inode, err := fs.walk(rp, false) + if err != nil { + return err + } + return inode.checkPermissions(rp.Credentials(), ats) +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + vfsd, inode, err := fs.walk(rp, false) + if err != nil { + return nil, err + } + + if opts.CheckSearchable { + if !inode.isDir() { + return nil, syserror.ENOTDIR + } + if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + } + + inode.incRef() + return vfsd, nil +} + +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + vfsd, inode, err := fs.walk(rp, true) + if err != nil { + return nil, err + } + inode.incRef() + return vfsd, nil +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + vfsd, inode, err := fs.walk(rp, false) + if err != nil { + return nil, err + } + + // EROFS is returned if write access is needed. + if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 { + return nil, syserror.EROFS + } + return inode.open(rp, vfsd, &opts) +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. 
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + _, inode, err := fs.walk(rp, false) + if err != nil { + return "", err + } + symlink, ok := inode.impl.(*symlink) + if !ok { + return "", syserror.EINVAL + } + return symlink.target, nil +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + _, inode, err := fs.walk(rp, false) + if err != nil { + return linux.Statx{}, err + } + var stat linux.Statx + inode.statTo(&stat) + return stat, nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + if _, _, err := fs.walk(rp, false); err != nil { + return linux.Statfs{}, err + } + + var stat linux.Statfs + fs.statTo(&stat) + return stat, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) +} + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *filesystem) Sync(ctx context.Context) error { + // This is a readonly filesystem for now. + return nil +} + +// The vfs.FilesystemImpl functions below return EROFS because their respective +// man pages say that EROFS must be returned if the path resolves to a file on +// this read-only filesystem. + +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + if rp.Done() { + return syserror.EEXIST + } + + if _, _, err := fs.walk(rp, true); err != nil { + return err + } + + return syserror.EROFS +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. 
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + if rp.Done() { + return syserror.EEXIST + } + + if _, _, err := fs.walk(rp, true); err != nil { + return err + } + + return syserror.EROFS +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. +func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + if rp.Done() { + return syserror.EEXIST + } + + _, _, err := fs.walk(rp, true) + if err != nil { + return err + } + + return syserror.EROFS +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { + if rp.Done() { + return syserror.ENOENT + } + + _, _, err := fs.walk(rp, false) + if err != nil { + return err + } + + return syserror.EROFS +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + _, inode, err := fs.walk(rp, false) + if err != nil { + return err + } + + if !inode.isDir() { + return syserror.ENOTDIR + } + + return syserror.EROFS +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + _, _, err := fs.walk(rp, false) + if err != nil { + return err + } + + return syserror.EROFS +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + if rp.Done() { + return syserror.EEXIST + } + + _, _, err := fs.walk(rp, true) + if err != nil { + return err + } + + return syserror.EROFS +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. 
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + _, inode, err := fs.walk(rp, false) + if err != nil { + return err + } + + if inode.isDir() { + return syserror.EISDIR + } + + return syserror.EROFS +} + +// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { + _, inode, err := fs.walk(rp, false) + if err != nil { + return nil, err + } + if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } + + // TODO(b/134676337): Support sockets. + return nil, syserror.ECONNREFUSED +} + +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { + _, _, err := fs.walk(rp, false) + if err != nil { + return nil, err + } + return nil, syserror.ENOTSUP +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { + _, _, err := fs.walk(rp, false) + if err != nil { + return "", err + } + return "", syserror.ENOTSUP +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + _, _, err := fs.walk(rp, false) + if err != nil { + return err + } + return syserror.ENOTSUP +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. +func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + _, _, err := fs.walk(rp, false) + if err != nil { + return err + } + return syserror.ENOTSUP +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. 
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) +} diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go new file mode 100644 index 000000000..30636cf66 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -0,0 +1,242 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// inode represents an ext inode. +// +// inode uses the same inheritance pattern that pkg/sentry/vfs structures use. +// This has been done to increase memory locality. +// +// Implementations: +// inode -- +// |-- dir +// |-- symlink +// |-- regular-- +// |-- extent file +// |-- block map file +type inode struct { + // refs is a reference count. refs is accessed using atomic memory operations. + refs int64 + + // fs is the containing filesystem. + fs *filesystem + + // inodeNum is the inode number of this inode on disk. This is used to + // identify inodes within the ext filesystem. + inodeNum uint32 + + // blkSize is the fs data block size. Same as filesystem.sb.BlockSize(). 
+ blkSize uint64 + + // diskInode gives us access to the inode struct on disk. Immutable. + diskInode disklayout.Inode + + locks vfs.FileLocks + + // This is immutable. The first field of the implementations must have inode + // as the first field to ensure temporality. + impl interface{} +} + +// incRef increments the inode ref count. +func (in *inode) incRef() { + atomic.AddInt64(&in.refs, 1) +} + +// tryIncRef tries to increment the ref count. Returns true if successful. +func (in *inode) tryIncRef() bool { + for { + refs := atomic.LoadInt64(&in.refs) + if refs == 0 { + return false + } + if atomic.CompareAndSwapInt64(&in.refs, refs, refs+1) { + return true + } + } +} + +// decRef decrements the inode ref count and releases the inode resources if +// the ref count hits 0. +// +// Precondition: Must have locked filesystem.mu. +func (in *inode) decRef() { + if refs := atomic.AddInt64(&in.refs, -1); refs == 0 { + delete(in.fs.inodeCache, in.inodeNum) + } else if refs < 0 { + panic("ext.inode.decRef() called without holding a reference") + } +} + +// newInode is the inode constructor. Reads the inode off disk. Identifies +// inodes based on the absolute inode number on disk. +func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { + if inodeNum == 0 { + panic("inode number 0 on ext filesystems is not possible") + } + + inodeRecordSize := fs.sb.InodeSize() + var diskInode disklayout.Inode + if inodeRecordSize == disklayout.OldInodeSize { + diskInode = &disklayout.InodeOld{} + } else { + diskInode = &disklayout.InodeNew{} + } + + // Calculate where the inode is actually placed. 
+ inodesPerGrp := fs.sb.InodesPerGroup() + blkSize := fs.sb.BlockSize() + inodeTableOff := fs.bgs[getBGNum(inodeNum, inodesPerGrp)].InodeTable() * blkSize + inodeOff := inodeTableOff + uint64(uint32(inodeRecordSize)*getBGOff(inodeNum, inodesPerGrp)) + + if err := readFromDisk(fs.dev, int64(inodeOff), diskInode); err != nil { + return nil, err + } + + // Build the inode based on its type. + args := inodeArgs{ + fs: fs, + inodeNum: inodeNum, + blkSize: blkSize, + diskInode: diskInode, + } + + switch diskInode.Mode().FileType() { + case linux.ModeSymlink: + f, err := newSymlink(args) + if err != nil { + return nil, err + } + return &f.inode, nil + case linux.ModeRegular: + f, err := newRegularFile(args) + if err != nil { + return nil, err + } + return &f.inode, nil + case linux.ModeDirectory: + f, err := newDirectory(args, fs.sb.IncompatibleFeatures().DirentFileType) + if err != nil { + return nil, err + } + return &f.inode, nil + default: + // TODO(b/134676337): Return appropriate errors for sockets, pipes and devices. + return nil, syserror.EINVAL + } +} + +type inodeArgs struct { + fs *filesystem + inodeNum uint32 + blkSize uint64 + diskInode disklayout.Inode +} + +func (in *inode) init(args inodeArgs, impl interface{}) { + in.fs = args.fs + in.inodeNum = args.inodeNum + in.blkSize = args.blkSize + in.diskInode = args.diskInode + in.impl = impl +} + +// open creates and returns a file description for the dentry passed in. 
+func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(opts) + if err := in.checkPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + mnt := rp.Mount() + switch in.impl.(type) { + case *regularFile: + var fd regularFileFD + fd.LockFD.Init(&in.locks) + if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return &fd.vfsfd, nil + case *directory: + // Can't open directories writably. This check is not necessary for a read + // only filesystem but will be required when write is implemented. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EISDIR + } + var fd directoryFD + fd.LockFD.Init(&in.locks) + if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return &fd.vfsfd, nil + case *symlink: + if opts.Flags&linux.O_PATH == 0 { + // Can't open symlinks without O_PATH. + return nil, syserror.ELOOP + } + var fd symlinkFD + fd.LockFD.Init(&in.locks) + fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) + return &fd.vfsfd, nil + default: + panic(fmt.Sprintf("unknown inode type: %T", in.impl)) + } +} + +func (in *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, in.diskInode.Mode(), in.diskInode.UID(), in.diskInode.GID()) +} + +// statTo writes the statx fields to the output parameter. 
+func (in *inode) statTo(stat *linux.Statx) { + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | + linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | + linux.STATX_ATIME | linux.STATX_CTIME | linux.STATX_MTIME + stat.Blksize = uint32(in.blkSize) + stat.Mode = uint16(in.diskInode.Mode()) + stat.Nlink = uint32(in.diskInode.LinksCount()) + stat.UID = uint32(in.diskInode.UID()) + stat.GID = uint32(in.diskInode.GID()) + stat.Ino = uint64(in.inodeNum) + stat.Size = in.diskInode.Size() + stat.Atime = in.diskInode.AccessTime().StatxTimestamp() + stat.Ctime = in.diskInode.ChangeTime().StatxTimestamp() + stat.Mtime = in.diskInode.ModificationTime().StatxTimestamp() + stat.DevMajor = linux.UNNAMED_MAJOR + stat.DevMinor = in.fs.devMinor + // TODO(b/134676337): Set stat.Blocks which is the number of 512 byte blocks + // (including metadata blocks) required to represent this file. +} + +// getBGNum returns the block group number that a given inode belongs to. +func getBGNum(inodeNum uint32, inodesPerGrp uint32) uint32 { + return (inodeNum - 1) / inodesPerGrp +} + +// getBGOff returns the offset at which the given inode lives in the block +// group's inode table, i.e. the index of the inode in the inode table. +func getBGOff(inodeNum uint32, inodesPerGrp uint32) uint32 { + return (inodeNum - 1) % inodesPerGrp +} diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go new file mode 100644 index 000000000..66d14bb95 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -0,0 +1,162 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "io" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// regularFile represents a regular file's inode. This too follows the +// inheritance pattern prevelant in the vfs layer described in +// pkg/sentry/vfs/README.md. +type regularFile struct { + inode inode + + // This is immutable. The first field of fileReader implementations must be + // regularFile to ensure temporality. + // io.ReaderAt is more strict than io.Reader in the sense that a partial read + // is always accompanied by an error. If a read spans past the end of file, a + // partial read (within file range) is done and io.EOF is returned. + impl io.ReaderAt +} + +// newRegularFile is the regularFile constructor. It figures out what kind of +// file this is and initializes the fileReader. +func newRegularFile(args inodeArgs) (*regularFile, error) { + if args.diskInode.Flags().Extents { + file, err := newExtentFile(args) + if err != nil { + return nil, err + } + return &file.regFile, nil + } + + file, err := newBlockMapFile(args) + if err != nil { + return nil, err + } + return &file.regFile, nil +} + +func (in *inode) isRegular() bool { + _, ok := in.impl.(*regularFile) + return ok +} + +// directoryFD represents a directory file description. 
It implements +// vfs.FileDescriptionImpl. +type regularFileFD struct { + fileDescription + vfs.LockFD + + // off is the file offset. off is accessed using atomic memory operations. + off int64 + + // offMu serializes operations that may mutate off. + offMu sync.Mutex +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *regularFileFD) Release() {} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + safeReader := safemem.FromIOReaderAt{ + ReaderAt: fd.inode().impl.(*regularFile).impl, + Offset: offset, + } + + // Copies data from disk directly into usermem without any intermediate + // allocations (if dst is converted into BlockSeq such that it does not need + // safe copying). + return dst.CopyOutFrom(ctx, safeReader) +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.offMu.Lock() + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + // write(2) specifies that EBADF must be returned if the fd is not open for + // writing. + return 0, syserror.EBADF +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + n, err := fd.PWrite(ctx, src, fd.off, opts) + fd.offMu.Lock() + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. 
+func (fd *regularFileFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + return syserror.ENOTDIR +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.offMu.Lock() + defer fd.offMu.Unlock() + switch whence { + case linux.SEEK_SET: + // Use offset as specified. + case linux.SEEK_CUR: + offset += fd.off + case linux.SEEK_END: + offset += int64(fd.inode().diskInode.Size()) + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + // TODO(b/134676337): Implement mmap(2). + return syserror.ENODEV +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go new file mode 100644 index 000000000..62efd4095 --- /dev/null +++ b/pkg/sentry/fsimpl/ext/symlink.go @@ -0,0 +1,111 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// symlink represents a symlink inode. +type symlink struct { + inode inode + target string // immutable +} + +// newSymlink is the symlink constructor. It reads out the symlink target from +// the inode (however it might have been stored). +func newSymlink(args inodeArgs) (*symlink, error) { + var link []byte + + // If the symlink target is lesser than 60 bytes, its stores in inode.Data(). + // Otherwise either extents or block maps will be used to store the link. + size := args.diskInode.Size() + if size < 60 { + link = args.diskInode.Data()[:size] + } else { + // Create a regular file out of this inode and read out the target. + regFile, err := newRegularFile(args) + if err != nil { + return nil, err + } + + link = make([]byte, size) + if n, err := regFile.impl.ReadAt(link, 0); uint64(n) < size { + return nil, err + } + } + + file := &symlink{target: string(link)} + file.inode.init(args, file) + return file, nil +} + +func (in *inode) isSymlink() bool { + _, ok := in.impl.(*symlink) + return ok +} + +// symlinkFD represents a symlink file description and implements implements +// vfs.FileDescriptionImpl. which may only be used if open options contains +// O_PATH. For this reason most of the functions return EBADF. 
+type symlinkFD struct { + fileDescription + vfs.NoLockFD +} + +// Compiles only if symlinkFD implements vfs.FileDescriptionImpl. +var _ vfs.FileDescriptionImpl = (*symlinkFD)(nil) + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *symlinkFD) Release() {} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *symlinkFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.EBADF +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *symlinkFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.EBADF +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *symlinkFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EBADF +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *symlinkFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EBADF +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *symlinkFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + return syserror.ENOTDIR +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *symlinkFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, syserror.EBADF +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *symlinkFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return syserror.EBADF +} diff --git a/pkg/sentry/fsimpl/ext/utils.go b/pkg/sentry/fsimpl/ext/utils.go new file mode 100644 index 000000000..d8b728f8c --- /dev/null +++ b/pkg/sentry/fsimpl/ext/utils.go @@ -0,0 +1,94 @@ +// Copyright 2019 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "io" + + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" + "gvisor.dev/gvisor/pkg/syserror" +) + +// readFromDisk performs a binary read from disk into the given struct from +// the absolute offset provided. +func readFromDisk(dev io.ReaderAt, abOff int64, v interface{}) error { + n := binary.Size(v) + buf := make([]byte, n) + if read, _ := dev.ReadAt(buf, abOff); read < int(n) { + return syserror.EIO + } + + binary.Unmarshal(buf, binary.LittleEndian, v) + return nil +} + +// readSuperBlock reads the SuperBlock from block group 0 in the underlying +// device. There are three versions of the superblock. This function identifies +// and returns the correct version. 
+func readSuperBlock(dev io.ReaderAt) (disklayout.SuperBlock, error) { + var sb disklayout.SuperBlock = &disklayout.SuperBlockOld{} + if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { + return nil, err + } + if sb.Revision() == disklayout.OldRev { + return sb, nil + } + + sb = &disklayout.SuperBlock32Bit{} + if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { + return nil, err + } + if !sb.IncompatibleFeatures().Is64Bit { + return sb, nil + } + + sb = &disklayout.SuperBlock64Bit{} + if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { + return nil, err + } + return sb, nil +} + +// blockGroupsCount returns the number of block groups in the ext fs. +func blockGroupsCount(sb disklayout.SuperBlock) uint64 { + blocksCount := sb.BlocksCount() + blocksPerGroup := uint64(sb.BlocksPerGroup()) + + // Round up the result. float64 can compromise precision so do it manually. + return (blocksCount + blocksPerGroup - 1) / blocksPerGroup +} + +// readBlockGroups reads the block group descriptor table from block group 0 in +// the underlying device. 
+func readBlockGroups(dev io.ReaderAt, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) { + bgCount := blockGroupsCount(sb) + bgdSize := uint64(sb.BgDescSize()) + is64Bit := sb.IncompatibleFeatures().Is64Bit + bgds := make([]disklayout.BlockGroup, bgCount) + + for i, off := uint64(0), uint64(sb.FirstDataBlock()+1)*sb.BlockSize(); i < bgCount; i, off = i+1, off+bgdSize { + if is64Bit { + bgds[i] = &disklayout.BlockGroup64Bit{} + } else { + bgds[i] = &disklayout.BlockGroup32Bit{} + } + + if err := readFromDisk(dev, int64(off), bgds[i]); err != nil { + return nil, err + } + } + return bgds, nil +} diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD new file mode 100644 index 000000000..41567967d --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -0,0 +1,19 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "fuse", + srcs = [ + "dev.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go new file mode 100644 index 000000000..f6a67d005 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -0,0 +1,100 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fuse + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +const fuseDevMinor = 229 + +// fuseDevice implements vfs.Device for /dev/fuse. +type fuseDevice struct{} + +// Open implements vfs.Device.Open. +func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + var fd DeviceFD + if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse. +type DeviceFD struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD + + // TODO(gvisor.dev/issue/2987): Add all the data structures needed to enqueue + // and deque requests, control synchronization and establish communication + // between the FUSE kernel module and the /dev/fuse character device. +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *DeviceFD) Release() {} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ENOSYS +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ENOSYS +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ENOSYS +} + +// Write implements vfs.FileDescriptionImpl.Write. 
+func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ENOSYS +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, syserror.ENOSYS +} + +// Register registers the FUSE device with vfsObj. +func Register(vfsObj *vfs.VirtualFilesystem) error { + if err := vfsObj.RegisterDevice(vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, fuseDevice{}, &vfs.RegisterDeviceOptions{ + GroupName: "misc", + }); err != nil { + return err + } + + return nil +} + +// CreateDevtmpfsFile creates a device special file in devtmpfs. +func CreateDevtmpfsFile(ctx context.Context, dev *devtmpfs.Accessor) error { + if err := dev.CreateDeviceFile(ctx, "fuse", vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, 0666 /* mode */); err != nil { + return err + } + + return nil +} diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD new file mode 100644 index 000000000..4a800dcf9 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -0,0 +1,89 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +licenses(["notice"]) + +go_template_instance( + name = "dentry_list", + out = "dentry_list.go", + package = "gofer", + prefix = "dentry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*dentry", + "Linker": "*dentry", + }, +) + +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "gofer", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "dentry", + }, +) + +go_library( + name = "gofer", + srcs = [ + "dentry_list.go", + "directory.go", + "filesystem.go", + "fstree.go", + "gofer.go", + "handle.go", + "host_named_pipe.go", + "p9file.go", + "regular_file.go", + "socket.go", + "special_file.go", + "symlink.go", + "time.go", + ], + visibility = 
["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fd", + "//pkg/fdnotifier", + "//pkg/fspath", + "//pkg/log", + "//pkg/p9", + "//pkg/safemem", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", + "//pkg/sentry/fsimpl/host", + "//pkg/sentry/hostfd", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/socket/control", + "//pkg/sentry/socket/unix", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/usage", + "//pkg/sentry/vfs", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/unet", + "//pkg/usermem", + "//pkg/waiter", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "gofer_test", + srcs = ["gofer_test.go"], + library = ":gofer", + deps = [ + "//pkg/p9", + "//pkg/sentry/contexttest", + ], +) diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go new file mode 100644 index 000000000..8c7c8e1b3 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -0,0 +1,308 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package gofer + +import ( + "fmt" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +func (d *dentry) isDir() bool { + return d.fileType() == linux.S_IFDIR +} + +// Preconditions: filesystem.renameMu must be locked. d.dirMu must be locked. +// d.isDir(). child must be a newly-created dentry that has never had a parent. +func (d *dentry) cacheNewChildLocked(child *dentry, name string) { + d.IncRef() // reference held by child on its parent + child.parent = d + child.name = name + if d.children == nil { + d.children = make(map[string]*dentry) + } + d.children[name] = child +} + +// Preconditions: d.dirMu must be locked. d.isDir(). +func (d *dentry) cacheNegativeLookupLocked(name string) { + // Don't cache negative lookups if InteropModeShared is in effect (since + // this makes remote lookup unavoidable), or if d.isSynthetic() (in which + // case the only files in the directory are those for which a dentry exists + // in d.children). Instead, just delete any previously-cached dentry. + if d.fs.opts.interop == InteropModeShared || d.isSynthetic() { + delete(d.children, name) + return + } + if d.children == nil { + d.children = make(map[string]*dentry) + } + d.children[name] = nil +} + +type createSyntheticOpts struct { + name string + mode linux.FileMode + kuid auth.KUID + kgid auth.KGID + + // The endpoint for a synthetic socket. endpoint should be nil if the file + // being created is not a socket. + endpoint transport.BoundEndpoint + + // pipe should be nil if the file being created is not a pipe. + pipe *pipe.VFSPipe +} + +// createSyntheticChildLocked creates a synthetic file with the given name +// in d. 
+// +// Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain +// a child with the given name. +func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) { + d2 := &dentry{ + refs: 1, // held by d + fs: d.fs, + ino: d.fs.nextSyntheticIno(), + mode: uint32(opts.mode), + uid: uint32(opts.kuid), + gid: uint32(opts.kgid), + blockSize: usermem.PageSize, // arbitrary + handle: handle{ + fd: -1, + }, + nlink: uint32(2), + } + switch opts.mode.FileType() { + case linux.S_IFDIR: + // Nothing else needs to be done. + case linux.S_IFSOCK: + d2.endpoint = opts.endpoint + case linux.S_IFIFO: + d2.pipe = opts.pipe + default: + panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType())) + } + d2.pf.dentry = d2 + d2.vfsd.Init(d2) + + d.cacheNewChildLocked(d2, opts.name) + d.syntheticChildren++ +} + +type directoryFD struct { + fileDescription + vfs.DirectoryFileDescriptionDefaultImpl + + mu sync.Mutex + off int64 + dirents []vfs.Dirent +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *directoryFD) Release() { +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + fd.mu.Lock() + defer fd.mu.Unlock() + + d := fd.dentry() + if fd.dirents == nil { + ds, err := d.getDirents(ctx) + if err != nil { + return err + } + fd.dirents = ds + } + + d.InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + if d.cachedMetadataAuthoritative() { + d.touchAtime(fd.vfsfd.Mount()) + } + + for fd.off < int64(len(fd.dirents)) { + if err := cb.Handle(fd.dirents[fd.off]); err != nil { + return err + } + fd.off++ + } + return nil +} + +// Preconditions: d.isDir(). There exists at least one directoryFD representing d. 
func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
	// getDirents returns the full list of directory entries for d: "." and
	// "..", then the entries reported by the remote readdir (unless d is
	// synthetic), then d's synthetic children. The result is cached in
	// d.dirents when d.cachedMetadataAuthoritative().

	// NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the
	// presence of concurrent mutation of an iterated directory, so
	// implementations may duplicate or omit entries in this case, which
	// violates POSIX semantics. Thus we read all directory entries while
	// holding d.dirMu to exclude directory mutations. (Note that it is
	// impossible for the client to exclude concurrent mutation from other
	// remote filesystem users. Since there is no way to detect if the server
	// has incorrectly omitted directory entries, we simply assume that the
	// server is well-behaved under InteropModeShared.) This is inconsistent
	// with Linux (which appears to assume that directory fids have the correct
	// semantics, and translates struct file_operations::readdir calls directly
	// to readdir RPCs), but is consistent with VFS1.

	// filesystem.renameMu is needed for d.parent, and must be locked before
	// dentry.dirMu.
	d.fs.renameMu.RLock()
	defer d.fs.renameMu.RUnlock()
	d.dirMu.Lock()
	defer d.dirMu.Unlock()
	// Fast path: reuse a previously cached listing.
	if d.dirents != nil {
		return d.dirents, nil
	}

	// It's not clear if 9P2000.L's readdir is expected to return "." and "..",
	// so we generate them here.
	parent := genericParentOrSelf(d)
	dirents := []vfs.Dirent{
		{
			Name:    ".",
			Type:    linux.DT_DIR,
			Ino:     uint64(d.ino),
			NextOff: 1,
		},
		{
			Name:    "..",
			Type:    uint8(atomic.LoadUint32(&parent.mode) >> 12),
			Ino:     uint64(parent.ino),
			NextOff: 2,
		},
	}
	// realChildren stays nil unless duplicates between remote entries and
	// synthetic children are possible; lookups on a nil map below simply
	// report "not present".
	var realChildren map[string]struct{}
	if !d.isSynthetic() {
		if d.syntheticChildren != 0 && d.fs.opts.interop == InteropModeShared {
			// Record the set of children d actually has so that we don't emit
			// duplicate entries for synthetic children.
			realChildren = make(map[string]struct{})
		}
		off := uint64(0)
		const count = 64 * 1024 // for consistency with the vfs1 client
		// d.handleMu is held for reading across the whole readdir loop and
		// is released explicitly on every exit path below.
		d.handleMu.RLock()
		if !d.handleReadable {
			// This should not be possible because a readable handle should
			// have been opened when the calling directoryFD was opened.
			d.handleMu.RUnlock()
			panic("gofer.dentry.getDirents called without a readable handle")
		}
		for {
			p9ds, err := d.handle.file.readdir(ctx, off, count)
			if err != nil {
				d.handleMu.RUnlock()
				return nil, err
			}
			// An empty batch marks the end of the directory.
			if len(p9ds) == 0 {
				d.handleMu.RUnlock()
				break
			}
			for _, p9d := range p9ds {
				// "." and ".." were already generated above.
				if p9d.Name == "." || p9d.Name == ".." {
					continue
				}
				dirent := vfs.Dirent{
					Name:    p9d.Name,
					Ino:     uint64(inoFromPath(p9d.QID.Path)),
					NextOff: int64(len(dirents) + 1),
				}
				// p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
				// DMSOCKET.
				switch p9d.Type {
				case p9.TypeSymlink:
					dirent.Type = linux.DT_LNK
				case p9.TypeDir:
					dirent.Type = linux.DT_DIR
				default:
					dirent.Type = linux.DT_REG
				}
				dirents = append(dirents, dirent)
				if realChildren != nil {
					realChildren[p9d.Name] = struct{}{}
				}
			}
			// Continue the next readdir from the offset of the last entry
			// received.
			off = p9ds[len(p9ds)-1].Offset
		}
	}
	// Emit entries for synthetic children.
	if d.syntheticChildren != 0 {
		for _, child := range d.children {
			// Skip cached negative lookups (nil) and non-synthetic children.
			if child == nil || !child.isSynthetic() {
				continue
			}
			// Skip names the remote readdir already reported.
			if _, ok := realChildren[child.name]; ok {
				continue
			}
			dirents = append(dirents, vfs.Dirent{
				Name:    child.name,
				Type:    uint8(atomic.LoadUint32(&child.mode) >> 12),
				Ino:     uint64(child.ino),
				NextOff: int64(len(dirents) + 1),
			})
		}
	}
	// Cache dirents for future directoryFDs if permitted.
	if d.cachedMetadataAuthoritative() {
		d.dirents = dirents
	}
	return dirents, nil
}

// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + + switch whence { + case linux.SEEK_SET: + if offset < 0 { + return 0, syserror.EINVAL + } + if offset == 0 { + // Ensure that the next call to fd.IterDirents() calls + // fd.dentry().getDirents(). + fd.dirents = nil + } + fd.off = offset + return fd.off, nil + case linux.SEEK_CUR: + offset += fd.off + if offset < 0 { + return 0, syserror.EINVAL + } + // Don't clear fd.dirents in this case, even if offset == 0. + fd.off = offset + return fd.off, nil + default: + return 0, syserror.EINVAL + } +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *directoryFD) Sync(ctx context.Context) error { + return fd.dentry().handle.sync(ctx) +} diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go new file mode 100644 index 000000000..cd5f5049e --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -0,0 +1,1504 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package gofer + +import ( + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *filesystem) Sync(ctx context.Context) error { + // Snapshot current syncable dentries and special files. + fs.syncMu.Lock() + ds := make([]*dentry, 0, len(fs.syncableDentries)) + for d := range fs.syncableDentries { + d.IncRef() + ds = append(ds, d) + } + sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs)) + for sffd := range fs.specialFileFDs { + sffd.vfsfd.IncRef() + sffds = append(sffds, sffd) + } + fs.syncMu.Unlock() + + // Return the first error we encounter, but sync everything we can + // regardless. + var retErr error + + // Sync regular files. + for _, d := range ds { + err := d.syncSharedHandle(ctx) + d.DecRef() + if err != nil && retErr == nil { + retErr = err + } + } + + // Sync special files, which may be writable but do not use dentry shared + // handles (so they won't be synced by the above). + for _, sffd := range sffds { + err := sffd.Sync(ctx) + sffd.vfsfd.DecRef() + if err != nil && retErr == nil { + retErr = err + } + } + + return retErr +} + +// maxFilenameLen is the maximum length of a filename. This is dictated by 9P's +// encoding of strings, which uses 2 bytes for the length prefix. +const maxFilenameLen = (1 << 16) - 1 + +// dentrySlicePool is a pool of *[]*dentry used to store dentries for which +// dentry.checkCachingLocked() must be called. 
// The pool holds pointers to
// slices because Go lacks generics, so sync.Pool operates on interface{}, so
// every call to (what should be) sync.Pool<[]*dentry>.Put() allocates a copy
// of the slice header on the heap.
var dentrySlicePool = sync.Pool{
	New: func() interface{} {
		ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity
		return &ds
	},
}

// appendDentry appends d to *ds and returns ds. If ds is nil, a slice is
// first obtained from dentrySlicePool, so callers must use the returned
// pointer.
func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
	if ds == nil {
		ds = dentrySlicePool.Get().(*[]*dentry)
	}
	*ds = append(*ds, d)
	return ds
}

// putDentrySlice clears *ds, truncates it to length 0 and returns it to
// dentrySlicePool.
//
// Preconditions: ds != nil.
func putDentrySlice(ds *[]*dentry) {
	// Allow dentries to be GC'd.
	for i := range *ds {
		(*ds)[i] = nil
	}
	*ds = (*ds)[:0]
	dentrySlicePool.Put(ds)
}

// stepLocked resolves rp.Component() to an existing file, starting from the
// given directory.
//
// Dentries which may become cached as a result of the traversal are appended
// to *ds.
//
// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
// !rp.Done(). If !d.cachedMetadataAuthoritative(), then d's cached metadata
// must be up to date.
//
// Postconditions: The returned dentry's cached metadata is up to date.
func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
	if !d.isDir() {
		return nil, syserror.ENOTDIR
	}
	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
		return nil, err
	}
afterSymlink:
	name := rp.Component()
	if name == "." {
		rp.Advance()
		return d, nil
	}
	if name == ".." {
		// At the resolution root, or at a dentry without a parent, ".."
		// resolves to d itself.
		if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil {
			return nil, err
		} else if isRoot || d.parent == nil {
			rp.Advance()
			return d, nil
		}
		// We must assume that d.parent is correct, because if d has been moved
		// elsewhere in the remote filesystem so that its parent has changed,
		// we have no way of determining its new parent's location in the
		// filesystem.
		//
		// Call rp.CheckMount() before updating d.parent's metadata, since if
		// we traverse to another mount then d.parent's metadata is irrelevant.
		if err := rp.CheckMount(&d.parent.vfsd); err != nil {
			return nil, err
		}
		if d != d.parent && !d.cachedMetadataAuthoritative() {
			_, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask())
			if err != nil {
				return nil, err
			}
			d.parent.updateFromP9Attrs(attrMask, &attr)
		}
		rp.Advance()
		return d.parent, nil
	}
	child, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), d, name, ds)
	if err != nil {
		return nil, err
	}
	if child == nil {
		return nil, syserror.ENOENT
	}
	if err := rp.CheckMount(&child.vfsd); err != nil {
		return nil, err
	}
	if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
		target, err := child.readlink(ctx, rp.Mount())
		if err != nil {
			return nil, err
		}
		if err := rp.HandleSymlink(target); err != nil {
			return nil, err
		}
		goto afterSymlink // don't check the current directory again
	}
	rp.Advance()
	return child, nil
}

// getChildLocked returns a dentry representing the child of parent with the
// given name. If no such child exists, getChildLocked returns (nil, nil).
//
// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
// parent.isDir(). name is not "." or "..".
//
// Postconditions: If getChildLocked returns a non-nil dentry, its cached
// metadata is up to date.
func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
	if len(name) > maxFilenameLen {
		return nil, syserror.ENAMETOOLONG
	}
	child, ok := parent.children[name]
	if (ok && fs.opts.interop != InteropModeShared) || parent.isSynthetic() {
		// Whether child is nil or not, it is cached information that is
		// assumed to be correct.
		return child, nil
	}
	// We either don't have cached information or need to verify that it's
	// still correct, either of which requires a remote lookup. Check if this
	// name is valid before performing the lookup.
	return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds)
}

// revalidateChildLocked looks name up remotely in parent (via
// walkGetAttrOne) and reconciles the result with child, the currently cached
// dentry for name (which may be nil). It returns the dentry that now
// represents name, or (nil, nil) if no file exists there.
//
// Preconditions: As for getChildLocked. !parent.isSynthetic().
func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) {
	qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name)
	// ENOENT is not a failure here; it is handled through file.isNil() below.
	if err != nil && err != syserror.ENOENT {
		return nil, err
	}
	if child != nil {
		if !file.isNil() && inoFromPath(qid.Path) == child.ino {
			// The file at this path hasn't changed. Just update cached metadata.
			file.close(ctx)
			child.updateFromP9Attrs(attrMask, &attr)
			return child, nil
		}
		if file.isNil() && child.isSynthetic() {
			// We have a synthetic file, and no remote file has arisen to
			// replace it.
			return child, nil
		}
		// The file at this path has changed or no longer exists. Mark the
		// dentry invalidated, and re-evaluate its caching status (i.e. if it
		// has 0 references, drop it). Wait to update parent.children until we
		// know what to replace the existing dentry with (i.e. one of the
		// returns below), to avoid a redundant map access.
		vfsObj.InvalidateDentry(&child.vfsd)
		if child.isSynthetic() {
			// Normally we don't mark invalidated dentries as deleted since
			// they may still exist (but at a different path), and also for
			// consistency with Linux. However, synthetic files are guaranteed
			// to become unreachable if their dentries are invalidated, so
			// treat their invalidation as deletion.
			child.setDeleted()
			parent.syntheticChildren--
			child.decRefLocked()
			parent.dirents = nil
		}
		*ds = appendDentry(*ds, child)
	}
	if file.isNil() {
		// No file exists at this path now. Cache the negative lookup if
		// allowed.
		parent.cacheNegativeLookupLocked(name)
		return nil, nil
	}
	// Create a new dentry representing the file.
	child, err = fs.newDentry(ctx, file, qid, attrMask, &attr)
	if err != nil {
		file.close(ctx)
		delete(parent.children, name)
		return nil, err
	}
	parent.cacheNewChildLocked(child, name)
	// For now, child has 0 references, so our caller should call
	// child.checkCachingLocked().
	*ds = appendDentry(*ds, child)
	return child, nil
}

// walkParentDirLocked resolves all but the last path component of rp to an
// existing directory, starting from the given directory (which is usually
// rp.Start().Impl().(*dentry)). It does not check that the returned directory
// is searchable by the provider of rp.
//
// Preconditions: fs.renameMu must be locked. !rp.Done(). If
// !d.cachedMetadataAuthoritative(), then d's cached metadata must be up to
// date.
func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
	for !rp.Final() {
		// d.dirMu is only held for the duration of each individual step.
		d.dirMu.Lock()
		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
		d.dirMu.Unlock()
		if err != nil {
			return nil, err
		}
		d = next
	}
	if !d.isDir() {
		return nil, syserror.ENOTDIR
	}
	return d, nil
}

// resolveLocked resolves rp to an existing file.
//
// Preconditions: fs.renameMu must be locked.
func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
	d := rp.Start().Impl().(*dentry)
	if !d.cachedMetadataAuthoritative() {
		// Get updated metadata for rp.Start() as required by fs.stepLocked().
		if err := d.updateFromGetattr(ctx); err != nil {
			return nil, err
		}
	}
	for !rp.Done() {
		d.dirMu.Lock()
		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
		d.dirMu.Unlock()
		if err != nil {
			return nil, err
		}
		d = next
	}
	if rp.MustBeDir() && !d.isDir() {
		return nil, syserror.ENOTDIR
	}
	return d, nil
}

// doCreateAt checks that creating a file at rp is permitted, then invokes
// createInRemoteDir (if the parent directory is a real remote directory) or
// createInSyntheticDir (if the parent directory is synthetic) to do so.
//
// dir indicates that the file being created is a directory; it controls the
// trailing-slash check and adds IN_ISDIR to the inotify event.
// createInSyntheticDir may be nil, in which case creation inside a synthetic
// directory fails with EPERM.
//
// Preconditions: !rp.Done(). For the final path component in rp,
// !rp.ShouldFollowSymlink().
func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckCaching(&ds)
	start := rp.Start().Impl().(*dentry)
	if !start.cachedMetadataAuthoritative() {
		// Get updated metadata for start as required by
		// fs.walkParentDirLocked().
		if err := start.updateFromGetattr(ctx); err != nil {
			return err
		}
	}
	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
	if err != nil {
		return err
	}
	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	name := rp.Component()
	if name == "." || name == ".." {
		return syserror.EEXIST
	}
	if len(name) > maxFilenameLen {
		return syserror.ENAMETOOLONG
	}
	if !dir && rp.MustBeDir() {
		return syserror.ENOENT
	}
	if parent.isDeleted() {
		return syserror.ENOENT
	}
	mnt := rp.Mount()
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()
	parent.dirMu.Lock()
	defer parent.dirMu.Unlock()
	// Case 1: synthetic parent. d.children is the complete list of files, so
	// a cached dentry is conclusive and no RPC is involved.
	if parent.isSynthetic() {
		if child := parent.children[name]; child != nil {
			return syserror.EEXIST
		}
		if createInSyntheticDir == nil {
			return syserror.EPERM
		}
		if err := createInSyntheticDir(parent, name); err != nil {
			return err
		}
		parent.touchCMtime()
		parent.dirents = nil
		ev := linux.IN_CREATE
		if dir {
			ev |= linux.IN_ISDIR
		}
		parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
		return nil
	}
	// Case 2: remote parent under InteropModeShared. Only a synthetic cached
	// child is conclusive.
	if fs.opts.interop == InteropModeShared {
		if child := parent.children[name]; child != nil && child.isSynthetic() {
			return syserror.EEXIST
		}
		// The existence of a non-synthetic dentry at name would be inconclusive
		// because the file it represents may have been deleted from the remote
		// filesystem, so we would need to make an RPC to revalidate the dentry.
		// Just attempt the file creation RPC instead. If a file does exist, the
		// RPC will fail with EEXIST like we would have. If the RPC succeeds, and a
		// stale dentry exists, the dentry will fail revalidation next time it's
		// used.
		if err := createInRemoteDir(parent, name); err != nil {
			return err
		}
		ev := linux.IN_CREATE
		if dir {
			ev |= linux.IN_ISDIR
		}
		parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
		return nil
	}
	// Case 3: remote parent in the other interop modes. Any cached positive
	// dentry is trusted.
	if child := parent.children[name]; child != nil {
		return syserror.EEXIST
	}
	// No cached dentry exists; however, there might still be an existing file
	// at name. As above, we attempt the file creation RPC anyway.
	if err := createInRemoteDir(parent, name); err != nil {
		return err
	}
	if child, ok := parent.children[name]; ok && child == nil {
		// Delete the now-stale negative dentry.
		delete(parent.children, name)
	}
	parent.touchCMtime()
	parent.dirents = nil
	ev := linux.IN_CREATE
	if dir {
		ev |= linux.IN_ISDIR
	}
	parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
	return nil
}

// unlinkAt removes the file named by the final component of rp from its
// parent directory. If dir is true the target is removed as a directory
// (AT_REMOVEDIR is passed to the remote unlink); otherwise it must be a
// non-directory.
//
// Preconditions: !rp.Done().
func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckCaching(&ds)
	start := rp.Start().Impl().(*dentry)
	if !start.cachedMetadataAuthoritative() {
		// Get updated metadata for start as required by
		// fs.walkParentDirLocked().
		if err := start.updateFromGetattr(ctx); err != nil {
			return err
		}
	}
	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
	if err != nil {
		return err
	}
	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	if err := rp.Mount().CheckBeginWrite(); err != nil {
		return err
	}
	defer rp.Mount().EndWrite()

	name := rp.Component()
	if dir {
		if name == "." {
			return syserror.EINVAL
		}
		if name == ".." {
			return syserror.ENOTEMPTY
		}
	} else {
		if name == "." || name == ".." {
			return syserror.EISDIR
		}
	}
	vfsObj := rp.VirtualFilesystem()
	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef()
	parent.dirMu.Lock()
	defer parent.dirMu.Unlock()

	// A cached nil entry is a negative lookup: the name is known not to
	// exist.
	child, ok := parent.children[name]
	if ok && child == nil {
		return syserror.ENOENT
	}

	sticky := atomic.LoadUint32(&parent.mode)&linux.ModeSticky != 0
	if sticky {
		if !ok {
			// If the sticky bit is set, we need to retrieve the child to determine
			// whether removing it is allowed.
			child, err = fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
			if err != nil {
				return err
			}
		} else if child != nil && !child.cachedMetadataAuthoritative() {
			// Make sure the dentry representing the file at name is up to date
			// before examining its metadata.
			child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
			if err != nil {
				return err
			}
		}
		if err := parent.mayDelete(rp.Credentials(), child); err != nil {
			return err
		}
	}

	// If a child dentry exists, prepare to delete it. This should fail if it is
	// a mount point. We detect mount points by speculatively calling
	// PrepareDeleteDentry, which fails if child is a mount point. However, we
	// may need to revalidate the file in this case to make sure that it has not
	// been deleted or replaced on the remote fs, in which case the mount point
	// will have disappeared. If calling PrepareDeleteDentry fails again on the
	// up-to-date dentry, we can be sure that it is a mount point.
	//
	// Also note that if child is nil, then it can't be a mount point.
	if child != nil {
		// Hold child.dirMu so we can check child.children and
		// child.syntheticChildren. We don't access these fields until a bit later,
		// but locking child.dirMu after calling vfs.PrepareDeleteDentry() would
		// create an inconsistent lock ordering between dentry.dirMu and
		// vfs.Dentry.mu (in the VFS lock order, it would make dentry.dirMu both "a
		// FilesystemImpl lock" and "a lock acquired by a FilesystemImpl between
		// PrepareDeleteDentry and CommitDeleteDentry"). To avoid this, lock
		// child.dirMu before calling PrepareDeleteDentry.
		child.dirMu.Lock()
		defer child.dirMu.Unlock()
		if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
			// We can skip revalidation in several cases:
			// - We are not in InteropModeShared
			// - The parent directory is synthetic, in which case the child must also
			//   be synthetic
			// - We already updated the child during the sticky bit check above
			if parent.cachedMetadataAuthoritative() || sticky {
				return err
			}
			child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
			if err != nil {
				return err
			}
			if child != nil {
				if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
					return err
				}
			}
		}
	}
	// From here on, every early return with child != nil must call
	// AbortDeleteDentry to undo the PrepareDeleteDentry above.
	flags := uint32(0)
	// If a dentry exists, use it for best-effort checks on its deletability.
	if dir {
		if child != nil {
			// child must be an empty directory.
			if child.syntheticChildren != 0 {
				// This is definitely not an empty directory, irrespective of
				// fs.opts.interop.
				vfsObj.AbortDeleteDentry(&child.vfsd)
				return syserror.ENOTEMPTY
			}
			// If InteropModeShared is in effect and the first call to
			// PrepareDeleteDentry above succeeded, then child wasn't
			// revalidated (so we can't expect its file type to be correct) and
			// individually revalidating its children (to confirm that they
			// still exist) would be a waste of time.
			if child.cachedMetadataAuthoritative() {
				if !child.isDir() {
					vfsObj.AbortDeleteDentry(&child.vfsd)
					return syserror.ENOTDIR
				}
				for _, grandchild := range child.children {
					if grandchild != nil {
						vfsObj.AbortDeleteDentry(&child.vfsd)
						return syserror.ENOTEMPTY
					}
				}
			}
		}
		flags = linux.AT_REMOVEDIR
	} else {
		// child must be a non-directory file.
		if child != nil && child.isDir() {
			vfsObj.AbortDeleteDentry(&child.vfsd)
			return syserror.EISDIR
		}
		if rp.MustBeDir() {
			if child != nil {
				vfsObj.AbortDeleteDentry(&child.vfsd)
			}
			return syserror.ENOTDIR
		}
	}
	// Perform the removal itself. Synthetic parents and synthetic children
	// need no remote unlink.
	if parent.isSynthetic() {
		if child == nil {
			return syserror.ENOENT
		}
	} else if child == nil || !child.isSynthetic() {
		err = parent.file.unlinkAt(ctx, name, flags)
		if err != nil {
			if child != nil {
				vfsObj.AbortDeleteDentry(&child.vfsd)
			}
			return err
		}
	}

	// Generate inotify events for rmdir or unlink.
	if dir {
		parent.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
	} else {
		var cw *vfs.Watches
		if child != nil {
			cw = &child.watches
		}
		vfs.InotifyRemoveChild(cw, &parent.watches, name)
	}

	if child != nil {
		vfsObj.CommitDeleteDentry(&child.vfsd)
		child.setDeleted()
		if child.isSynthetic() {
			parent.syntheticChildren--
			child.decRefLocked()
		}
		ds = appendDentry(ds, child)
	}
	parent.cacheNegativeLookupLocked(name)
	if parent.cachedMetadataAuthoritative() {
		parent.dirents = nil
		parent.touchCMtime()
		if dir {
			parent.decLinks()
		}
	}
	return nil
}

// renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls
// dentry.checkCachingLocked on all dentries in *ds with fs.renameMu locked for
// writing.
//
// ds is a pointer-to-pointer since defer evaluates its arguments immediately,
// but dentry slices are allocated lazily, and it's much easier to say "defer
// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() {
// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this.
func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) {
	fs.renameMu.RUnlock()
	if *ds == nil {
		// No dentry slice was ever allocated, so there is nothing to check
		// or release.
		return
	}
	if len(**ds) != 0 {
		// checkCachingLocked is called with renameMu held for writing, so
		// the write lock is only taken when at least one dentry was queued.
		fs.renameMu.Lock()
		for _, d := range **ds {
			d.checkCachingLocked()
		}
		fs.renameMu.Unlock()
	}
	putDentrySlice(*ds)
}

// renameMuUnlockAndCheckCaching calls dentry.checkCachingLocked on all
// dentries in *ds while fs.renameMu is still held, then calls
// fs.renameMu.Unlock(). It is the counterpart of
// renameMuRUnlockAndCheckCaching for callers that hold fs.renameMu for
// writing.
//
// ds is a pointer-to-pointer for the same reason as in
// renameMuRUnlockAndCheckCaching: it lets callers defer this call before the
// slice has been allocated.
func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) {
	if *ds == nil {
		// Nothing was queued; just drop the lock.
		fs.renameMu.Unlock()
		return
	}
	for _, d := range **ds {
		d.checkCachingLocked()
	}
	fs.renameMu.Unlock()
	putDentrySlice(*ds)
}

// AccessAt implements vfs.FilesystemImpl.AccessAt.
//
// It resolves rp with fs.renameMu held for reading and checks the access
// types ats against the resolved dentry using creds.
func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckCaching(&ds)
	d, err := fs.resolveLocked(ctx, rp, &ds)
	if err != nil {
		return err
	}
	return d.checkPermissions(creds, ats)
}

// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
//
// On success the returned dentry carries a reference taken on behalf of the
// caller. If opts.CheckSearchable is set, the dentry must be a directory on
// which rp's credentials have execute permission; otherwise ENOTDIR or the
// permission error is returned.
func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckCaching(&ds)
	d, err := fs.resolveLocked(ctx, rp, &ds)
	if err != nil {
		return nil, err
	}
	if opts.CheckSearchable {
		if !d.isDir() {
			return nil, syserror.ENOTDIR
		}
		if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
			return nil, err
		}
	}
	// The reference is taken before the deferred unlock runs.
	d.IncRef()
	return &d.vfsd, nil
}

// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
//
// On success the returned parent dentry carries a reference taken on behalf
// of the caller.
func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckCaching(&ds)
	start := rp.Start().Impl().(*dentry)
	if !start.cachedMetadataAuthoritative() {
		// Get updated metadata for start as required by
		// fs.walkParentDirLocked().
		if err := start.updateFromGetattr(ctx); err != nil {
			return nil, err
		}
	}
	d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
	if err != nil {
		return nil, err
	}
	d.IncRef()
	return &d.vfsd, nil
}

// LinkAt implements vfs.FilesystemImpl.LinkAt.
//
// Hard links are not supported: the creation callback always fails, with
// EXDEV when rp and vd are on different mounts and EPERM otherwise.
func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string) error {
		// The cross-mount check comes first, so EXDEV takes precedence over
		// EPERM.
		if rp.Mount() != vd.Mount() {
			return syserror.EXDEV
		}
		// 9P2000.L supports hard links, but we don't.
		return syserror.EPERM
	}, nil)
}

// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
//
// Two callbacks are handed to fs.doCreateAt. The first creates the directory
// through parent.file; the second only ever creates a synthetic directory
// (NOTE(review): presumably the second is the one doCreateAt invokes when
// the parent is itself synthetic -- confirm against doCreateAt).
func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
	creds := rp.Credentials()
	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error {
		if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil {
			// A remote failure is only tolerated for synthetic mount
			// points, and never when the name already exists.
			if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
				return err
			}
			ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err)
			parent.createSyntheticChildLocked(&createSyntheticOpts{
				name: name,
				mode: linux.S_IFDIR | opts.Mode,
				kuid: creds.EffectiveKUID,
				kgid: creds.EffectiveKGID,
			})
		}
		// The cached link count is only adjusted locally when not in
		// InteropModeShared.
		if fs.opts.interop != InteropModeShared {
			parent.incLinks()
		}
		return nil
	}, func(parent *dentry, name string) error {
		if !opts.ForSyntheticMountpoint {
			// Can't create non-synthetic files in synthetic directories.
			return syserror.EPERM
		}
		parent.createSyntheticChildLocked(&createSyntheticOpts{
			name: name,
			mode: linux.S_IFDIR | opts.Mode,
			kuid: creds.EffectiveKUID,
			kgid: creds.EffectiveKGID,
		})
		parent.incLinks()
		return nil
	})
}

// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error { + creds := rp.Credentials() + _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + // If the gofer does not allow creating a socket or pipe, create a + // synthetic one, i.e. one that is kept entirely in memory. + if err == syserror.EPERM { + switch opts.Mode.FileType() { + case linux.S_IFSOCK: + parent.createSyntheticChildLocked(&createSyntheticOpts{ + name: name, + mode: opts.Mode, + kuid: creds.EffectiveKUID, + kgid: creds.EffectiveKGID, + endpoint: opts.Endpoint, + }) + return nil + case linux.S_IFIFO: + parent.createSyntheticChildLocked(&createSyntheticOpts{ + name: name, + mode: opts.Mode, + kuid: creds.EffectiveKUID, + kgid: creds.EffectiveKGID, + pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize), + }) + return nil + } + } + return err + }, nil) +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // Reject O_TMPFILE, which is not supported; supporting it correctly in the + // presence of other remote filesystem users requires remote filesystem + // support, and it isn't clear that there's any way to implement this in + // 9P. + if opts.Flags&linux.O_TMPFILE != 0 { + return nil, syserror.EOPNOTSUPP + } + mayCreate := opts.Flags&linux.O_CREAT != 0 + mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL) + + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + + start := rp.Start().Impl().(*dentry) + if !start.cachedMetadataAuthoritative() { + // Get updated metadata for start as required by fs.stepLocked(). 
+ if err := start.updateFromGetattr(ctx); err != nil { + return nil, err + } + } + if rp.Done() { + return start.openLocked(ctx, rp, &opts) + } + +afterTrailingSymlink: + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + // Check for search permission in the parent directory. + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + // Determine whether or not we need to create a file. + parent.dirMu.Lock() + child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds) + if err == syserror.ENOENT && mayCreate { + if parent.isSynthetic() { + parent.dirMu.Unlock() + return nil, syserror.EPERM + } + fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts, &ds) + parent.dirMu.Unlock() + return fd, err + } + parent.dirMu.Unlock() + if err != nil { + return nil, err + } + if mustCreate { + return nil, syserror.EEXIST + } + if !child.isDir() && rp.MustBeDir() { + return nil, syserror.ENOTDIR + } + // Open existing child or follow symlink. + if child.isSymlink() && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx, rp.Mount()) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + start = parent + goto afterTrailingSymlink + } + return child.openLocked(ctx, rp, &opts) +} + +// Preconditions: fs.renameMu must be locked. +func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(opts) + if err := d.checkPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + + trunc := opts.Flags&linux.O_TRUNC != 0 && d.fileType() == linux.S_IFREG + if trunc { + // Lock metadataMu *while* we open a regular file with O_TRUNC because + // open(2) will change the file size on server. 
+ d.metadataMu.Lock() + defer d.metadataMu.Unlock() + } + + var vfd *vfs.FileDescription + var err error + mnt := rp.Mount() + switch d.fileType() { + case linux.S_IFREG: + if !d.fs.opts.regularFilesUseSpecialFileFD { + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, trunc); err != nil { + return nil, err + } + fd := ®ularFileFD{} + fd.LockFD.Init(&d.locks) + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + AllowDirectIO: true, + }); err != nil { + return nil, err + } + vfd = &fd.vfsfd + } + case linux.S_IFDIR: + // Can't open directories with O_CREAT. + if opts.Flags&linux.O_CREAT != 0 { + return nil, syserror.EISDIR + } + // Can't open directories writably. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EISDIR + } + if opts.Flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + if !d.isSynthetic() { + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { + return nil, err + } + } + fd := &directoryFD{} + fd.LockFD.Init(&d.locks) + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return &fd.vfsfd, nil + case linux.S_IFLNK: + // Can't open symlinks without O_PATH (which is unimplemented). + return nil, syserror.ELOOP + case linux.S_IFSOCK: + if d.isSynthetic() { + return nil, syserror.ENXIO + } + if d.fs.iopts.OpenSocketsByConnecting { + return d.connectSocketLocked(ctx, opts) + } + case linux.S_IFIFO: + if d.isSynthetic() { + return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks) + } + } + + if vfd == nil { + if vfd, err = d.openSpecialFileLocked(ctx, mnt, opts); err != nil { + return nil, err + } + } + + if trunc { + // If no errors occured so far then update file size in memory. This + // step is required even if !d.cachedMetadataAuthoritative() because + // d.mappings has to be updated. + // d.metadataMu has already been acquired if trunc == true. 
+ d.updateFileSizeLocked(0) + + if d.cachedMetadataAuthoritative() { + d.touchCMtimeLocked() + } + } + return vfd, err +} + +func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + if opts.Flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + fdObj, err := d.file.connect(ctx, p9.AnonymousSocket) + if err != nil { + return nil, err + } + fd, err := host.NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fdObj.FD(), &host.NewFDOptions{ + HaveFlags: true, + Flags: opts.Flags, + }) + if err != nil { + fdObj.Close() + return nil, err + } + fdObj.Release() + return fd, nil +} + +func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(opts) + if opts.Flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + // We assume that the server silently inserts O_NONBLOCK in the open flags + // for all named pipes (because all existing gofers do this). + // + // NOTE(b/133875563): This makes named pipe opens racy, because the + // mechanisms for translating nonblocking to blocking opens can only detect + // the instantaneous presence of a peer holding the other end of the pipe + // open, not whether the pipe was *previously* opened by a peer that has + // since closed its end. + isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0 +retry: + h, err := openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0) + if err != nil { + if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && err == syserror.ENXIO { + // An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails + // with ENXIO if opening the same named pipe with O_WRONLY would + // block because there are no readers of the pipe. 
+ if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil { + return nil, err + } + goto retry + } + return nil, err + } + if isBlockingOpenOfNamedPipe && ats == vfs.MayRead && h.fd >= 0 { + if err := blockUntilNonblockingPipeHasWriter(ctx, h.fd); err != nil { + h.close(ctx) + return nil, err + } + } + fd, err := newSpecialFileFD(h, mnt, d, &d.locks, opts.Flags) + if err != nil { + h.close(ctx) + return nil, err + } + return &fd.vfsfd, nil +} + +// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked. +// !d.isSynthetic(). +func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) { + if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } + if d.isDeleted() { + return nil, syserror.ENOENT + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return nil, err + } + defer mnt.EndWrite() + + // 9P2000.L's lcreate takes a fid representing the parent directory, and + // converts it into an open fid representing the created file, so we need + // to duplicate the directory fid first. + _, dirfile, err := d.file.walk(ctx, nil) + if err != nil { + return nil, err + } + creds := rp.Credentials() + name := rp.Component() + // Filter file creation flags and O_LARGEFILE out; the create RPC already + // has the semantics of O_CREAT|O_EXCL, while some servers will choke on + // O_LARGEFILE. + createFlags := p9.OpenFlags(opts.Flags &^ (vfs.FileCreationFlags | linux.O_LARGEFILE)) + fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + if err != nil { + dirfile.close(ctx) + return nil, err + } + // Then we need to walk to the file we just created to get a non-open fid + // representing it, and to get its metadata. 
This must use d.file since, as + // explained above, dirfile was invalidated by dirfile.Create(). + _, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) + if err != nil { + openFile.close(ctx) + if fdobj != nil { + fdobj.Close() + } + return nil, err + } + + // Construct the new dentry. + child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr) + if err != nil { + nonOpenFile.close(ctx) + openFile.close(ctx) + if fdobj != nil { + fdobj.Close() + } + return nil, err + } + *ds = appendDentry(*ds, child) + // Incorporate the fid that was opened by lcreate. + useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD + if useRegularFileFD { + child.handleMu.Lock() + child.handle.file = openFile + if fdobj != nil { + child.handle.fd = int32(fdobj.Release()) + } + child.handleReadable = vfs.MayReadFileWithOpenFlags(opts.Flags) + child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags) + child.handleMu.Unlock() + } + // Insert the dentry into the tree. + d.cacheNewChildLocked(child, name) + if d.cachedMetadataAuthoritative() { + d.touchCMtime() + d.dirents = nil + } + + // Finally, construct a file description representing the created file. + var childVFSFD *vfs.FileDescription + if useRegularFileFD { + fd := ®ularFileFD{} + fd.LockFD.Init(&child.locks) + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ + AllowDirectIO: true, + }); err != nil { + return nil, err + } + childVFSFD = &fd.vfsfd + } else { + h := handle{ + file: openFile, + fd: -1, + } + if fdobj != nil { + h.fd = int32(fdobj.Release()) + } + fd, err := newSpecialFileFD(h, mnt, child, &d.locks, opts.Flags) + if err != nil { + h.close(ctx) + return nil, err + } + childVFSFD = &fd.vfsfd + } + d.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) + return childVFSFD, nil +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. 
func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckCaching(&ds)
	d, err := fs.resolveLocked(ctx, rp, &ds)
	if err != nil {
		return "", err
	}
	// Only symlinks have a target to read.
	if !d.isSymlink() {
		return "", syserror.EINVAL
	}
	return d.readlink(ctx, rp.Mount())
}

// RenameAt implements vfs.FilesystemImpl.RenameAt.
//
// The file named oldName in oldParentVD is moved to the location named by
// rp. Rename flags are not supported (EINVAL). The whole operation runs with
// fs.renameMu held for writing. Broadly, it:
//
//  1. resolves the new parent and validates names, mounts and permissions;
//  2. looks up the renamed file and any file it would replace;
//  3. brackets the remote update with PrepareRenameDentry and
//     AbortRenameDentry / CommitRenameReplaceDentry;
//  4. updates the dentry tree, cached metadata and inotify watches.
func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
	if opts.Flags != 0 {
		// Requires 9P support.
		return syserror.EINVAL
	}

	var ds *[]*dentry
	fs.renameMu.Lock()
	defer fs.renameMuUnlockAndCheckCaching(&ds)
	newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds)
	if err != nil {
		return err
	}
	newName := rp.Component()
	if newName == "." || newName == ".." {
		return syserror.EBUSY
	}
	mnt := rp.Mount()
	if mnt != oldParentVD.Mount() {
		return syserror.EXDEV
	}
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()

	oldParent := oldParentVD.Dentry().Impl().(*dentry)
	if !oldParent.cachedMetadataAuthoritative() {
		// Refresh oldParent's metadata before checking permissions on it.
		if err := oldParent.updateFromGetattr(ctx); err != nil {
			return err
		}
	}
	creds := rp.Credentials()
	if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	vfsObj := rp.VirtualFilesystem()
	// We need a dentry representing the renamed file since, if it's a
	// directory, we need to check for write permission on it.
	oldParent.dirMu.Lock()
	defer oldParent.dirMu.Unlock()
	renamed, err := fs.getChildLocked(ctx, vfsObj, oldParent, oldName, &ds)
	if err != nil {
		return err
	}
	if renamed == nil {
		return syserror.ENOENT
	}
	if err := oldParent.mayDelete(creds, renamed); err != nil {
		return err
	}
	if renamed.isDir() {
		// A directory cannot be moved into itself or one of its own
		// descendants.
		if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
			return syserror.EINVAL
		}
		if oldParent != newParent {
			if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil {
				return err
			}
		}
	} else {
		if opts.MustBeDir || rp.MustBeDir() {
			return syserror.ENOTDIR
		}
	}

	if oldParent != newParent {
		if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
			return err
		}
		// Only lock newParent.dirMu when it is a different dentry;
		// oldParent.dirMu is already held.
		newParent.dirMu.Lock()
		defer newParent.dirMu.Unlock()
	}
	if newParent.isDeleted() {
		return syserror.ENOENT
	}
	replaced, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds)
	if err != nil {
		return err
	}
	// replacedVFSD stays nil when no file exists at the destination.
	var replacedVFSD *vfs.Dentry
	if replaced != nil {
		replacedVFSD = &replaced.vfsd
		if replaced.isDir() {
			if !renamed.isDir() {
				return syserror.EISDIR
			}
		} else {
			if rp.MustBeDir() || renamed.isDir() {
				return syserror.ENOTDIR
			}
		}
	}

	// Renaming a file onto itself is a no-op.
	if oldParent == newParent && oldName == newName {
		return nil
	}
	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef()
	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
		return err
	}

	// Update the remote filesystem.
	if !renamed.isSynthetic() {
		if err := renamed.file.rename(ctx, newParent.file, newName); err != nil {
			vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
			return err
		}
	} else if replaced != nil && !replaced.isSynthetic() {
		// We are replacing an existing real file with a synthetic one, so we
		// need to unlink the former.
		flags := uint32(0)
		if replaced.isDir() {
			flags = linux.AT_REMOVEDIR
		}
		if err := newParent.file.unlinkAt(ctx, newName, flags); err != nil {
			vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
			return err
		}
	}

	// Update the dentry tree.
	vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD)
	if replaced != nil {
		replaced.setDeleted()
		if replaced.isSynthetic() {
			newParent.syntheticChildren--
			replaced.decRefLocked()
		}
		ds = appendDentry(ds, replaced)
	}
	oldParent.cacheNegativeLookupLocked(oldName)
	// We don't use newParent.cacheNewChildLocked() since we don't want to mess
	// with reference counts and queue oldParent for checkCachingLocked if the
	// parent isn't actually changing.
	if oldParent != newParent {
		ds = appendDentry(ds, oldParent)
		newParent.IncRef()
		if renamed.isSynthetic() {
			oldParent.syntheticChildren--
			newParent.syntheticChildren++
		}
	}
	renamed.parent = newParent
	renamed.name = newName
	if newParent.children == nil {
		newParent.children = make(map[string]*dentry)
	}
	newParent.children[newName] = renamed

	// Update metadata.
	if renamed.cachedMetadataAuthoritative() {
		renamed.touchCtime()
	}
	if oldParent.cachedMetadataAuthoritative() {
		oldParent.dirents = nil
		oldParent.touchCMtime()
		if renamed.isDir() {
			oldParent.decLinks()
		}
	}
	if newParent.cachedMetadataAuthoritative() {
		newParent.dirents = nil
		newParent.touchCMtime()
		if renamed.isDir() && (replaced == nil || !replaced.isDir()) {
			// Increase the link count if we did not replace another directory.
			newParent.incLinks()
		}
	}
	vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir())
	return nil
}

// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
//
// It delegates to fs.unlinkAt with dir set to true.
func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
	return fs.unlinkAt(ctx, rp, true /* dir */)
}

// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + var ds *[]*dentry + fs.renameMu.RLock() + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + fs.renameMuRUnlockAndCheckCaching(&ds) + return err + } + if err := d.setStat(ctx, rp.Credentials(), &opts.Stat, rp.Mount()); err != nil { + fs.renameMuRUnlockAndCheckCaching(&ds) + return err + } + fs.renameMuRUnlockAndCheckCaching(&ds) + + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + d.InotifyWithParent(ev, 0, vfs.InodeEvent) + } + return nil +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return linux.Statx{}, err + } + // Since walking updates metadata for all traversed dentries under + // InteropModeShared, including the returned one, we can return cached + // metadata here regardless of fs.opts.interop. + var stat linux.Statx + d.statTo(&stat) + return stat, nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return linux.Statfs{}, err + } + // If d is synthetic, invoke statfs on the first ancestor of d that isn't. + for d.isSynthetic() { + d = d.parent + } + fsstat, err := d.file.statFS(ctx) + if err != nil { + return linux.Statfs{}, err + } + nameLen := uint64(fsstat.NameLength) + if nameLen > maxFilenameLen { + nameLen = maxFilenameLen + } + return linux.Statfs{ + // This is primarily for distinguishing a gofer file system in + // tests. 
Testing is important, so instead of defining + // something completely random, use a standard value. + Type: linux.V9FS_MAGIC, + BlockSize: int64(fsstat.BlockSize), + Blocks: fsstat.Blocks, + BlocksFree: fsstat.BlocksFree, + BlocksAvailable: fsstat.BlocksAvailable, + Files: fsstat.Files, + FilesFree: fsstat.FilesFree, + NameLength: nameLen, + }, nil +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error { + creds := rp.Credentials() + _, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + return err + }, nil) +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + return fs.unlinkAt(ctx, rp, false /* dir */) +} + +// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } + if d.isSocket() { + if !d.isSynthetic() { + d.IncRef() + return &endpoint{ + dentry: d, + file: d.file.file, + path: opts.Addr, + }, nil + } + return d.endpoint, nil + } + return nil, syserror.ECONNREFUSED +} + +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. 
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + return d.listxattr(ctx, rp.Credentials(), size) +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return "", err + } + return d.getxattr(ctx, rp.Credentials(), &opts) +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + var ds *[]*dentry + fs.renameMu.RLock() + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + fs.renameMuRUnlockAndCheckCaching(&ds) + return err + } + if err := d.setxattr(ctx, rp.Credentials(), &opts); err != nil { + fs.renameMuRUnlockAndCheckCaching(&ds) + return err + } + fs.renameMuRUnlockAndCheckCaching(&ds) + + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. +func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + var ds *[]*dentry + fs.renameMu.RLock() + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + fs.renameMuRUnlockAndCheckCaching(&ds) + return err + } + if err := d.removexattr(ctx, rp.Credentials(), name); err != nil { + fs.renameMuRUnlockAndCheckCaching(&ds) + return err + } + fs.renameMuRUnlockAndCheckCaching(&ds) + + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. 
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.renameMu.RLock() + defer fs.renameMu.RUnlock() + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) +} + +func (fs *filesystem) nextSyntheticIno() inodeNumber { + return inodeNumber(atomic.AddUint64(&fs.syntheticSeq, 1) | syntheticInoMask) +} diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go new file mode 100644 index 000000000..2b83094cd --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -0,0 +1,1550 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package gofer provides a filesystem implementation that is backed by a 9p +// server, interchangably referred to as "gofers" throughout this package. +// +// Lock order: +// regularFileFD/directoryFD.mu +// filesystem.renameMu +// dentry.dirMu +// filesystem.syncMu +// dentry.metadataMu +// *** "memmap.Mappable locks" below this point +// dentry.mapsMu +// *** "memmap.Mappable locks taken by Translate" below this point +// dentry.handleMu +// dentry.dataMu +// +// Locking dentry.dirMu in multiple dentries requires that either ancestor +// dentries are locked before descendant dentries, or that filesystem.renameMu +// is locked for writing. 
+package gofer + +import ( + "fmt" + "strconv" + "strings" + "sync" + "sync/atomic" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/unet" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Name is the default filesystem name. +const Name = "9p" + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + vfsfs vfs.Filesystem + + // mfp is used to allocate memory that caches regular file contents. mfp is + // immutable. + mfp pgalloc.MemoryFileProvider + + // Immutable options. + opts filesystemOptions + iopts InternalFilesystemOptions + + // client is the client used by this filesystem. client is immutable. + client *p9.Client + + // clock is a realtime clock used to set timestamps in file operations. + clock ktime.Clock + + // devMinor is the filesystem's minor device number. devMinor is immutable. + devMinor uint32 + + // renameMu serves two purposes: + // + // - It synchronizes path resolution with renaming initiated by this + // client. + // + // - It is held by path resolution to ensure that reachable dentries remain + // valid. A dentry is reachable by path resolution if it has a non-zero + // reference count (such that it is usable as vfs.ResolvingPath.Start() or + // is reachable from its children), or if it is a child dentry (such that + // it is reachable from its parent). 
+ renameMu sync.RWMutex + + // cachedDentries contains all dentries with 0 references. (Due to race + // conditions, it may also contain dentries with non-zero references.) + // cachedDentriesLen is the number of dentries in cachedDentries. These + // fields are protected by renameMu. + cachedDentries dentryList + cachedDentriesLen uint64 + + // syncableDentries contains all dentries in this filesystem for which + // !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs. + // These fields are protected by syncMu. + syncMu sync.Mutex + syncableDentries map[*dentry]struct{} + specialFileFDs map[*specialFileFD]struct{} + + // syntheticSeq stores a counter used to generate unique inodeNumbers for + // synthetic dentries. + syntheticSeq uint64 +} + +// inodeNumber represents the inode number reported in Dirent.Ino. For regular +// dentries, it comes from QID.Path from the 9P server. Synthetic dentries +// have their inodeNumber generated sequentially, with the MSB reserved to +// prevent conflicts with regular dentries. +type inodeNumber uint64 + +// syntheticInoMask is the most significant bit of an inode number, which is +// reserved for synthetic dentries. +const syntheticInoMask = uint64(1) << 63 + +// inoFromPath converts a QID path into an inodeNumber by clearing the bit +// reserved by syntheticInoMask. If that bit was set in path, a warning is +// logged because distinct paths may then map to the same inodeNumber. +func inoFromPath(path uint64) inodeNumber { + if path&syntheticInoMask != 0 { + log.Warningf("Dropping MSB from ino, collision is possible. Original: %d, new: %d", path, path&^syntheticInoMask) + } + return inodeNumber(path &^ syntheticInoMask) +} + +type filesystemOptions struct { + // "Standard" 9P options. + fd int + aname string + interop InteropMode // derived from the "cache" mount option + dfltuid auth.KUID + dfltgid auth.KGID + msize uint32 + version string + + // maxCachedDentries is the maximum number of dentries with 0 references + // retained by the client. + maxCachedDentries uint64 + + // If forcePageCache is true, host FDs may not be used for application + // memory mappings even if available; instead, the client must perform its + // own caching of regular file pages. This is primarily useful for testing. 
+ forcePageCache bool + + // If limitHostFDTranslation is true, apply maxFillRange() constraints to + // host FD mappings returned by dentry.(memmap.Mappable).Translate(). This + // makes memory accounting behavior more consistent between cases where + // host FDs are / are not available, but may increase the frequency of + // sentry-handled page faults on files for which a host FD is available. + limitHostFDTranslation bool + + // If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote + // filesystem may not be coherent with writable host FDs opened later, so + // all uses of the former must be replaced by uses of the latter. This is + // usually only the case when the remote filesystem is a Linux overlayfs + // mount. (Prior to Linux 4.18, patch series centered on commit + // d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were + // incoherent between pre-copy-up and post-copy-up FDs; after that patch + // series, only memory mappings are incoherent.) + overlayfsStaleRead bool + + // If regularFilesUseSpecialFileFD is true, application FDs representing + // regular files will use distinct file handles for each FD, in the same + // way that application FDs representing "special files" such as sockets + // do. Note that this disables client caching and mmap for regular files. + regularFilesUseSpecialFileFD bool +} + +// InteropMode controls the client's interaction with other remote filesystem +// users. +type InteropMode uint32 + +const ( + // InteropModeExclusive is appropriate when the filesystem client is the + // only user of the remote filesystem. + // + // - The client may cache arbitrary filesystem state (file data, metadata, + // filesystem structure, etc.). + // + // - Client changes to filesystem state may be sent to the remote + // filesystem asynchronously, except when server permission checks are + // necessary. + // + // - File timestamps are based on client clocks. 
This ensures that users of + // the client observe timestamps that are coherent with their own clocks + // and consistent with Linux's semantics. However, since it is not always + // possible for clients to set arbitrary atimes and mtimes, and never + // possible for clients to set arbitrary ctimes, file timestamp changes are + // stored in the client only and never sent to the remote filesystem. + InteropModeExclusive InteropMode = iota + + // InteropModeWritethrough is appropriate when there are read-only users of + // the remote filesystem that expect to observe changes made by the + // filesystem client. + // + // - The client may cache arbitrary filesystem state. + // + // - Client changes to filesystem state must be sent to the remote + // filesystem synchronously. + // + // - File timestamps are based on client clocks. As a corollary, access + // timestamp changes from other remote filesystem users will not be visible + // to the client. + InteropModeWritethrough + + // InteropModeShared is appropriate when there are users of the remote + // filesystem that may mutate its state other than the client. + // + // - The client must verify ("revalidate") cached filesystem state before + // using it. + // + // - Client changes to filesystem state must be sent to the remote + // filesystem synchronously. + // + // - File timestamps are based on server clocks. This is necessary to + // ensure that timestamp changes are synchronized between remote filesystem + // users. + // + // Note that the correctness of InteropModeShared depends on the server + // correctly implementing 9P fids (i.e. each fid immutably represents a + // single filesystem object), even in the presence of remote filesystem + // mutations from other users. If this is violated, the behavior of the + // client is undefined. + InteropModeShared +) + +// InternalFilesystemOptions may be passed as +// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem. 
+type InternalFilesystemOptions struct { + // If LeakConnection is true, do not close the connection to the server + // when the Filesystem is released. This is necessary for deployments in + // which servers can handle only a single client and report failure if that + // client disconnects. + LeakConnection bool + + // If OpenSocketsByConnecting is true, silently translate attempts to open + // files identifying as sockets to connect RPCs. + OpenSocketsByConnecting bool +} + +// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default +// UIDs and GIDs used for files that do not provide a specific owner or group +// respectively. +const ( + // uint32(-2) doesn't work in Go. + _V9FS_DEFUID = auth.KUID(4294967294) + _V9FS_DEFGID = auth.KGID(4294967294) +) + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + if mfp == nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: context does not provide a pgalloc.MemoryFileProvider") + return nil, nil, syserror.EINVAL + } + + mopts := vfs.GenericParseMountOptions(opts.Data) + var fsopts filesystemOptions + + // Check that the transport is "fd". + trans, ok := mopts["trans"] + if !ok { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: transport must be specified as 'trans=fd'") + return nil, nil, syserror.EINVAL + } + delete(mopts, "trans") + if trans != "fd" { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: unsupported transport: trans=%s", trans) + return nil, nil, syserror.EINVAL + } + + // Check that read and write FDs are provided and identical. 
+ rfdstr, ok := mopts["rfdno"] + if !ok { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD must be specified as 'rfdno=<file descriptor>") + return nil, nil, syserror.EINVAL + } + delete(mopts, "rfdno") + rfd, err := strconv.Atoi(rfdstr) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid read FD: rfdno=%s", rfdstr) + return nil, nil, syserror.EINVAL + } + wfdstr, ok := mopts["wfdno"] + if !ok { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: write FD must be specified as 'wfdno=<file descriptor>") + return nil, nil, syserror.EINVAL + } + delete(mopts, "wfdno") + wfd, err := strconv.Atoi(wfdstr) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid write FD: wfdno=%s", wfdstr) + return nil, nil, syserror.EINVAL + } + if rfd != wfd { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD (%d) and write FD (%d) must be equal", rfd, wfd) + return nil, nil, syserror.EINVAL + } + fsopts.fd = rfd + + // Get the attach name. + fsopts.aname = "/" + if aname, ok := mopts["aname"]; ok { + delete(mopts, "aname") + fsopts.aname = aname + } + + // Parse the cache policy. For historical reasons, this defaults to the + // least generally-applicable option, InteropModeExclusive. + fsopts.interop = InteropModeExclusive + if cache, ok := mopts["cache"]; ok { + delete(mopts, "cache") + switch cache { + case "fscache": + fsopts.interop = InteropModeExclusive + case "fscache_writethrough": + fsopts.interop = InteropModeWritethrough + case "none": + fsopts.regularFilesUseSpecialFileFD = true + fallthrough + case "remote_revalidating": + fsopts.interop = InteropModeShared + default: + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: cache=%s", cache) + return nil, nil, syserror.EINVAL + } + } + + // Parse the default UID and GID. 
+ fsopts.dfltuid = _V9FS_DEFUID + if dfltuidstr, ok := mopts["dfltuid"]; ok { + delete(mopts, "dfltuid") + dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltuid=%s", dfltuidstr) + return nil, nil, syserror.EINVAL + } + // In Linux, dfltuid is interpreted as a UID and is converted to a KUID + // in the caller's user namespace, but goferfs isn't + // application-mountable. + fsopts.dfltuid = auth.KUID(dfltuid) + } + fsopts.dfltgid = _V9FS_DEFGID + if dfltgidstr, ok := mopts["dfltgid"]; ok { + delete(mopts, "dfltgid") + dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default GID: dfltgid=%s", dfltgidstr) + return nil, nil, syserror.EINVAL + } + // As with dfltuid, the parsed value is used directly as a KGID. + fsopts.dfltgid = auth.KGID(dfltgid) + } + + // Parse the 9P message size. + fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M + if msizestr, ok := mopts["msize"]; ok { + delete(mopts, "msize") + msize, err := strconv.ParseUint(msizestr, 10, 32) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid message size: msize=%s", msizestr) + return nil, nil, syserror.EINVAL + } + fsopts.msize = uint32(msize) + } + + // Parse the 9P protocol version. + fsopts.version = p9.HighestVersionString() + if version, ok := mopts["version"]; ok { + delete(mopts, "version") + fsopts.version = version + } + + // Parse the dentry cache limit. + fsopts.maxCachedDentries = 1000 + if str, ok := mopts["dentry_cache_limit"]; ok { + delete(mopts, "dentry_cache_limit") + maxCachedDentries, err := strconv.ParseUint(str, 10, 64) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) + return nil, nil, syserror.EINVAL + } + fsopts.maxCachedDentries = maxCachedDentries + } + + // Handle simple flags. 
+ if _, ok := mopts["force_page_cache"]; ok { + delete(mopts, "force_page_cache") + fsopts.forcePageCache = true + } + if _, ok := mopts["limit_host_fd_translation"]; ok { + delete(mopts, "limit_host_fd_translation") + fsopts.limitHostFDTranslation = true + } + if _, ok := mopts["overlayfs_stale_read"]; ok { + delete(mopts, "overlayfs_stale_read") + fsopts.overlayfsStaleRead = true + } + // fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying + // "cache=none". + + // Check for unparsed options. + if len(mopts) != 0 { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts) + return nil, nil, syserror.EINVAL + } + + // Handle internal options. + iopts, ok := opts.InternalData.(InternalFilesystemOptions) + if opts.InternalData != nil && !ok { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData) + return nil, nil, syserror.EINVAL + } + // If !ok, iopts being the zero value is correct. + + // Establish a connection with the server. + conn, err := unet.NewSocket(fsopts.fd) + if err != nil { + return nil, nil, err + } + + // Perform version negotiation with the server. + ctx.UninterruptibleSleepStart(false) + client, err := p9.NewClient(conn, fsopts.msize, fsopts.version) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + conn.Close() + return nil, nil, err + } + // Ownership of conn has been transferred to client. + + // Perform attach to obtain the filesystem root. + ctx.UninterruptibleSleepStart(false) + attached, err := client.Attach(fsopts.aname) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + client.Close() + return nil, nil, err + } + attachFile := p9file{attached} + qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask()) + if err != nil { + attachFile.close(ctx) + client.Close() + return nil, nil, err + } + + // Construct the filesystem object. 
+ devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + attachFile.close(ctx) + client.Close() + return nil, nil, err + } + fs := &filesystem{ + mfp: mfp, + opts: fsopts, + iopts: iopts, + client: client, + clock: ktime.RealtimeClockFromContext(ctx), + devMinor: devMinor, + syncableDentries: make(map[*dentry]struct{}), + specialFileFDs: make(map[*specialFileFD]struct{}), + } + fs.vfsfs.Init(vfsObj, &fstype, fs) + + // Construct the root dentry. + root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr) + if err != nil { + attachFile.close(ctx) + fs.vfsfs.DecRef() + return nil, nil, err + } + // Set the root's reference count to 2. One reference is returned to the + // caller, and the other is deliberately leaked to prevent the root from + // being "cached" and subsequently evicted. Its resources will still be + // cleaned up by fs.Release(). + root.refs = 2 + + return &fs.vfsfs, &root.vfsd, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + ctx := context.Background() + mf := fs.mfp.MemoryFile() + + fs.syncMu.Lock() + for d := range fs.syncableDentries { + d.handleMu.Lock() + d.dataMu.Lock() + if d.handleWritable { + // Write dirty cached data to the remote file. + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt); err != nil { + log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err) + } + // TODO(jamieliu): Do we need to flushf/fsync d? + } + // Discard cached pages. + d.cache.DropAll(mf) + d.dirty.RemoveAll() + d.dataMu.Unlock() + // Close the host fd if one exists. + if d.handle.fd >= 0 { + syscall.Close(int(d.handle.fd)) + d.handle.fd = -1 + } + d.handleMu.Unlock() + } + // There can't be any specialFileFDs still using fs, since each such + // FileDescription would hold a reference on a Mount holding a reference on + // fs. 
+ fs.syncMu.Unlock() + + if !fs.iopts.LeakConnection { + // Close the connection to the server. This implicitly clunks all fids. + fs.client.Close() + } + + fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) +} + +// dentry implements vfs.DentryImpl. +type dentry struct { + vfsd vfs.Dentry + + // refs is the reference count. Each dentry holds a reference on its + // parent, even if disowned. An additional reference is held on all + // synthetic dentries until they are unlinked or invalidated. When refs + // reaches 0, the dentry may be added to the cache or destroyed. If refs == + // -1, the dentry has already been destroyed. refs is accessed using atomic + // memory operations. + refs int64 + + // fs is the owning filesystem. fs is immutable. + fs *filesystem + + // parent is this dentry's parent directory. Each dentry holds a reference + // on its parent. If this dentry is a filesystem root, parent is nil. + // parent is protected by filesystem.renameMu. + parent *dentry + + // name is the name of this dentry in its parent. If this dentry is a + // filesystem root, name is the empty string. name is protected by + // filesystem.renameMu. + name string + + // We don't support hard links, so each dentry maps 1:1 to an inode. + + // file is the unopened p9.File that backs this dentry. file is immutable. + // + // If file.isNil(), this dentry represents a synthetic file, i.e. a file + // that does not exist on the remote filesystem. As of this writing, the + // only files that can be synthetic are sockets, pipes, and directories. + file p9file + + // If deleted is non-zero, the file represented by this dentry has been + // deleted. deleted is accessed using atomic memory operations. + deleted uint32 + + // If cached is true, dentryEntry links dentry into + // filesystem.cachedDentries. cached and dentryEntry are protected by + // filesystem.renameMu. 
+ cached bool + dentryEntry + + dirMu sync.Mutex + + // If this dentry represents a directory, children contains: + // + // - Mappings of child filenames to dentries representing those children. + // + // - Mappings of child filenames that are known not to exist to nil + // dentries (only if InteropModeShared is not in effect and the directory + // is not synthetic). + // + // children is protected by dirMu. + children map[string]*dentry + + // If this dentry represents a directory, syntheticChildren is the number + // of child dentries for which dentry.isSynthetic() == true. + // syntheticChildren is protected by dirMu. + syntheticChildren int + + // If this dentry represents a directory, + // dentry.cachedMetadataAuthoritative() == true, and dirents is not nil, it + // is a cache of all entries in the directory, in the order they were + // returned by the server. dirents is protected by dirMu. + dirents []vfs.Dirent + + // Cached metadata; protected by metadataMu and accessed using atomic + // memory operations unless otherwise specified. + metadataMu sync.Mutex + ino inodeNumber // immutable + mode uint32 // type is immutable, perms are mutable + uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic + gid uint32 // auth.KGID, but ... + blockSize uint32 // 0 if unknown + // Timestamps, all nsecs from the Unix epoch. + atime int64 + mtime int64 + ctime int64 + btime int64 + // File size, protected by both metadataMu and dataMu (i.e. both must be + // locked to mutate it). + size uint64 + + // nlink counts the number of hard links to this dentry. It's updated and + // accessed using atomic operations. It's not protected by metadataMu like the + // other metadata fields. + nlink uint32 + + mapsMu sync.Mutex + + // If this dentry represents a regular file, mappings tracks mappings of + // the file into memmap.MappingSpaces. mappings is protected by mapsMu. 
+ mappings memmap.MappingSet + + // If this dentry represents a regular file or directory: + // + // - handle is the I/O handle used by all regularFileFDs/directoryFDs + // representing this dentry. + // + // - handleReadable is true if handle is readable. + // + // - handleWritable is true if handle is writable. + // + // Invariants: + // + // - If handleReadable == handleWritable == false, then handle.file == nil + // (i.e. there is no open handle). Conversely, if handleReadable || + // handleWritable == true, then handle.file != nil (i.e. there is an open + // handle). + // + // - handleReadable and handleWritable cannot transition from true to false + // (i.e. handles may not be downgraded). + // + // These fields are protected by handleMu. + handleMu sync.RWMutex + handle handle + handleReadable bool + handleWritable bool + + dataMu sync.RWMutex + + // If this dentry represents a regular file that is client-cached, cache + // maps offsets into the cached file to offsets into + // filesystem.mfp.MemoryFile() that store the file's data. cache is + // protected by dataMu. + cache fsutil.FileRangeSet + + // If this dentry represents a regular file that is client-cached, dirty + // tracks dirty segments in cache. dirty is protected by dataMu. + dirty fsutil.DirtySet + + // pf implements platform.File for mappings of handle.fd. + pf dentryPlatformFile + + // If this dentry represents a symbolic link, InteropModeShared is not in + // effect, and haveTarget is true, target is the symlink target. haveTarget + // and target are protected by dataMu. + haveTarget bool + target string + + // If this dentry represents a synthetic socket file, endpoint is the + // transport endpoint bound to this file. + endpoint transport.BoundEndpoint + + // If this dentry represents a synthetic named pipe, pipe is the pipe + // endpoint bound to this file. + pipe *pipe.VFSPipe + + locks vfs.FileLocks + + // Inotify watches for this dentry. 
+ watches vfs.Watches +} + +// dentryAttrMask returns a p9.AttrMask enabling all attributes used by the +// gofer client. +func dentryAttrMask() p9.AttrMask { + return p9.AttrMask{ + Mode: true, + UID: true, + GID: true, + ATime: true, + MTime: true, + CTime: true, + Size: true, + BTime: true, + } +} + +// newDentry creates a new dentry representing the given file. The dentry +// initially has no references, but is not cached; it is the caller's +// responsibility to set the dentry's reference count and/or call +// dentry.checkCachingLocked() as appropriate. +// +// Preconditions: !file.isNil(). +func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) { + if !mask.Mode { + ctx.Warningf("can't create gofer.dentry without file type") + return nil, syserror.EIO + } + if attr.Mode.FileType() == p9.ModeRegular && !mask.Size { + ctx.Warningf("can't create regular file gofer.dentry without file size") + return nil, syserror.EIO + } + + d := &dentry{ + fs: fs, + file: file, + ino: inoFromPath(qid.Path), + mode: uint32(attr.Mode), + uid: uint32(fs.opts.dfltuid), + gid: uint32(fs.opts.dfltgid), + blockSize: usermem.PageSize, + handle: handle{ + fd: -1, + }, + } + d.pf.dentry = d + if mask.UID { + d.uid = dentryUIDFromP9UID(attr.UID) + } + if mask.GID { + d.gid = dentryGIDFromP9GID(attr.GID) + } + if mask.Size { + d.size = attr.Size + } + if attr.BlockSize != 0 { + d.blockSize = uint32(attr.BlockSize) + } + if mask.ATime { + d.atime = dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds) + } + if mask.MTime { + d.mtime = dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds) + } + if mask.CTime { + d.ctime = dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds) + } + if mask.BTime { + d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds) + } + if mask.NLink { + d.nlink = uint32(attr.NLink) + } + d.vfsd.Init(d) + + fs.syncMu.Lock() + fs.syncableDentries[d] = 
struct{}{} + fs.syncMu.Unlock() + return d, nil +} + +func (d *dentry) isSynthetic() bool { + return d.file.isNil() +} + +func (d *dentry) cachedMetadataAuthoritative() bool { + return d.fs.opts.interop != InteropModeShared || d.isSynthetic() +} + +// updateFromP9Attrs is called to update d's metadata after an update from the +// remote filesystem. +func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { + d.metadataMu.Lock() + if mask.Mode { + if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want { + d.metadataMu.Unlock() + panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got)) + } + atomic.StoreUint32(&d.mode, uint32(attr.Mode)) + } + if mask.UID { + atomic.StoreUint32(&d.uid, dentryUIDFromP9UID(attr.UID)) + } + if mask.GID { + atomic.StoreUint32(&d.gid, dentryGIDFromP9GID(attr.GID)) + } + // There is no P9_GETATTR_* bit for I/O block size. + if attr.BlockSize != 0 { + atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize)) + } + if mask.ATime { + atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds)) + } + if mask.MTime { + atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds)) + } + if mask.CTime { + atomic.StoreInt64(&d.ctime, dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds)) + } + if mask.BTime { + atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)) + } + if mask.NLink { + atomic.StoreUint32(&d.nlink, uint32(attr.NLink)) + } + if mask.Size { + d.updateFileSizeLocked(attr.Size) + } + d.metadataMu.Unlock() +} + +// Preconditions: !d.isSynthetic() +func (d *dentry) updateFromGetattr(ctx context.Context) error { + // Use d.handle.file, which represents a 9P fid that has been opened, in + // preference to d.file, which represents a 9P fid that has not. This may + // be significantly more efficient in some implementations. 
+ var ( + file p9file + handleMuRLocked bool + ) + d.handleMu.RLock() + if !d.handle.file.isNil() { + file = d.handle.file + handleMuRLocked = true + } else { + file = d.file + d.handleMu.RUnlock() + } + _, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask()) + if handleMuRLocked { + d.handleMu.RUnlock() + } + if err != nil { + return err + } + d.updateFromP9Attrs(attrMask, &attr) + return nil +} + +func (d *dentry) fileType() uint32 { + return atomic.LoadUint32(&d.mode) & linux.S_IFMT +} + +func (d *dentry) statTo(stat *linux.Statx) { + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME + stat.Blksize = atomic.LoadUint32(&d.blockSize) + stat.Nlink = atomic.LoadUint32(&d.nlink) + if stat.Nlink == 0 { + // The remote filesystem doesn't support link count; just make + // something up. This is consistent with Linux, where + // fs/inode.c:inode_init_always() initializes link count to 1, and + // fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if + // it's not provided by the remote filesystem. + stat.Nlink = 1 + } + stat.UID = atomic.LoadUint32(&d.uid) + stat.GID = atomic.LoadUint32(&d.gid) + stat.Mode = uint16(atomic.LoadUint32(&d.mode)) + stat.Ino = uint64(d.ino) + stat.Size = atomic.LoadUint64(&d.size) + // This is consistent with regularFileFD.Seek(), which treats regular files + // as having no holes. 
+ stat.Blocks = (stat.Size + 511) / 512 + stat.Atime = statxTimestampFromDentry(atomic.LoadInt64(&d.atime)) + stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime)) + stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime)) + stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime)) + stat.DevMajor = linux.UNNAMED_MAJOR + stat.DevMinor = d.fs.devMinor +} + +// setStat changes the attributes of d selected by stat.Mask. Only mode, UID, +// GID, atime, mtime and size may be changed; any other mask bit results in +// EPERM. An empty mask is a no-op. +func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error { + if stat.Mask == 0 { + return nil + } + if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { + return syserror.EPERM + } + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + return err + } + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + setLocalAtime := false + setLocalMtime := false + if d.cachedMetadataAuthoritative() { + // Timestamp updates will be handled locally. + setLocalAtime = stat.Mask&linux.STATX_ATIME != 0 + setLocalMtime = stat.Mask&linux.STATX_MTIME != 0 + stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME + + // Prepare for truncate. + if stat.Mask&linux.STATX_SIZE != 0 { + // d.metadataMu is not held yet and d.mode is written with atomic + // stores, so read the file type through the atomic accessor. + switch d.fileType() { + case linux.S_IFREG: + if !setLocalMtime { + // Truncate updates mtime. + setLocalMtime = true + stat.Mtime.Nsec = linux.UTIME_NOW + } + case linux.S_IFDIR: + return syserror.EISDIR + default: + return syserror.EINVAL + } + } + } + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + if stat.Mask&linux.STATX_SIZE != 0 { + // The size needs to be changed even when + // !d.cachedMetadataAuthoritative() because d.mappings has to be + // updated. 
+ d.updateFileSizeLocked(stat.Size) + } + if !d.isSynthetic() { + if stat.Mask != 0 { + if err := d.file.setAttr(ctx, p9.SetAttrMask{ + Permissions: stat.Mask&linux.STATX_MODE != 0, + UID: stat.Mask&linux.STATX_UID != 0, + GID: stat.Mask&linux.STATX_GID != 0, + Size: stat.Mask&linux.STATX_SIZE != 0, + ATime: stat.Mask&linux.STATX_ATIME != 0, + MTime: stat.Mask&linux.STATX_MTIME != 0, + ATimeNotSystemTime: stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW, + MTimeNotSystemTime: stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW, + }, p9.SetAttr{ + Permissions: p9.FileMode(stat.Mode), + UID: p9.UID(stat.UID), + GID: p9.GID(stat.GID), + Size: stat.Size, + ATimeSeconds: uint64(stat.Atime.Sec), + ATimeNanoSeconds: uint64(stat.Atime.Nsec), + MTimeSeconds: uint64(stat.Mtime.Sec), + MTimeNanoSeconds: uint64(stat.Mtime.Nsec), + }); err != nil { + return err + } + } + if d.fs.opts.interop == InteropModeShared { + // There's no point to updating d's metadata in this case since + // it'll be overwritten by revalidation before the next time it's + // used anyway. (InteropModeShared inhibits client caching of + // regular file data, so there's no cache to truncate either.) + return nil + } + } + now := d.fs.clock.Now().Nanoseconds() + if stat.Mask&linux.STATX_MODE != 0 { + atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode)) + } + if stat.Mask&linux.STATX_UID != 0 { + atomic.StoreUint32(&d.uid, stat.UID) + } + if stat.Mask&linux.STATX_GID != 0 { + atomic.StoreUint32(&d.gid, stat.GID) + } + if setLocalAtime { + if stat.Atime.Nsec == linux.UTIME_NOW { + atomic.StoreInt64(&d.atime, now) + } else { + atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime)) + } + // Restore mask bits that we cleared earlier. 
+ stat.Mask |= linux.STATX_ATIME + } + if setLocalMtime { + if stat.Mtime.Nsec == linux.UTIME_NOW { + atomic.StoreInt64(&d.mtime, now) + } else { + atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime)) + } + // Restore mask bits that we cleared earlier. + stat.Mask |= linux.STATX_MTIME + } + atomic.StoreInt64(&d.ctime, now) + return nil +} + +// Preconditions: d.metadataMu must be locked. +func (d *dentry) updateFileSizeLocked(newSize uint64) { + d.dataMu.Lock() + oldSize := d.size + d.size = newSize + // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings + // below. This allows concurrent calls to Read/Translate/etc. These + // functions synchronize with truncation by refusing to use cache + // contents beyond the new d.size. (We are still holding d.metadataMu, + // so we can't race with Write or another truncate.) + d.dataMu.Unlock() + if d.size < oldSize { + oldpgend, _ := usermem.PageRoundUp(oldSize) + newpgend, _ := usermem.PageRoundUp(d.size) + if oldpgend != newpgend { + d.mapsMu.Lock() + d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/truncate.c:truncate_setsize() => + // truncate_pagecache() => + // mm/memory.c:unmap_mapping_range(evencows=1). + InvalidatePrivate: true, + }) + d.mapsMu.Unlock() + } + // We are now guaranteed that there are no translations of + // truncated pages, and can remove them from the cache. Since + // truncated pages have been removed from the remote file, they + // should be dropped without being written back. 
+ d.dataMu.Lock() + d.cache.Truncate(d.size, d.fs.mfp.MemoryFile()) + d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend}) + d.dataMu.Unlock() + } +} + +func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) +} + +func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { + return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&child.uid))) +} + +func dentryUIDFromP9UID(uid p9.UID) uint32 { + if !uid.Ok() { + return uint32(auth.OverflowUID) + } + return uint32(uid) +} + +func dentryGIDFromP9GID(gid p9.GID) uint32 { + if !gid.Ok() { + return uint32(auth.OverflowGID) + } + return uint32(gid) +} + +// IncRef implements vfs.DentryImpl.IncRef. +func (d *dentry) IncRef() { + // d.refs may be 0 if d.fs.renameMu is locked, which serializes against + // d.checkCachingLocked(). + atomic.AddInt64(&d.refs, 1) +} + +// TryIncRef implements vfs.DentryImpl.TryIncRef. +func (d *dentry) TryIncRef() bool { + for { + refs := atomic.LoadInt64(&d.refs) + if refs <= 0 { + return false + } + if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) { + return true + } + } +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *dentry) DecRef() { + if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { + d.fs.renameMu.Lock() + d.checkCachingLocked() + d.fs.renameMu.Unlock() + } else if refs < 0 { + panic("gofer.dentry.DecRef() called without holding a reference") + } +} + +// decRefLocked decrements d's reference count without calling +// d.checkCachingLocked, even if d's reference count reaches 0; callers are +// responsible for ensuring that d.checkCachingLocked will be called later. 
+func (d *dentry) decRefLocked() { + if refs := atomic.AddInt64(&d.refs, -1); refs < 0 { + panic("gofer.dentry.decRefLocked() called without holding a reference") + } +} + +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { + if d.isDir() { + events |= linux.IN_ISDIR + } + + d.fs.renameMu.RLock() + // The ordering below is important, Linux always notifies the parent first. + if d.parent != nil { + d.parent.watches.Notify(d.name, events, cookie, et, d.isDeleted()) + } + d.watches.Notify("", events, cookie, et, d.isDeleted()) + d.fs.renameMu.RUnlock() +} + +// Watches implements vfs.DentryImpl.Watches. +func (d *dentry) Watches() *vfs.Watches { + return &d.watches +} + +// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. +// +// If no watches are left on this dentry and it has no references, cache it. +func (d *dentry) OnZeroWatches() { + if atomic.LoadInt64(&d.refs) == 0 { + d.fs.renameMu.Lock() + d.checkCachingLocked() + d.fs.renameMu.Unlock() + } +} + +// checkCachingLocked should be called after d's reference count becomes 0 or it +// becomes disowned. +// +// It may be called on a destroyed dentry. For example, +// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times +// for the same dentry when the dentry is visited more than once in the same +// operation. One of the calls may destroy the dentry, so subsequent calls will +// do nothing. +// +// Preconditions: d.fs.renameMu must be locked for writing. +func (d *dentry) checkCachingLocked() { + // Dentries with a non-zero reference count must be retained. (The only way + // to obtain a reference on a dentry with zero references is via path + // resolution, which requires renameMu, so if d.refs is zero then it will + // remain zero while we hold renameMu for writing.) 
+ refs := atomic.LoadInt64(&d.refs) + if refs > 0 { + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentriesLen-- + d.cached = false + } + return + } + if refs == -1 { + // Dentry has already been destroyed. + return + } + // Deleted and invalidated dentries with zero references are no longer + // reachable by path resolution and should be dropped immediately. + if d.vfsd.IsDead() { + if d.isDeleted() { + d.watches.HandleDeletion() + } + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentriesLen-- + d.cached = false + } + d.destroyLocked() + return + } + // If d still has inotify watches and it is not deleted or invalidated, we + // cannot cache it and allow it to be evicted. Otherwise, we will lose its + // watches, even if a new dentry is created for the same file in the future. + // Note that the size of d.watches cannot concurrently transition from zero + // to non-zero, because adding a watch requires holding a reference on d. + if d.watches.Size() > 0 { + return + } + // If d is already cached, just move it to the front of the LRU. + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentries.PushFront(d) + return + } + // Cache the dentry, then evict the least recently used cached dentry if + // the cache becomes over-full. + d.fs.cachedDentries.PushFront(d) + d.fs.cachedDentriesLen++ + d.cached = true + if d.fs.cachedDentriesLen > d.fs.opts.maxCachedDentries { + victim := d.fs.cachedDentries.Back() + d.fs.cachedDentries.Remove(victim) + d.fs.cachedDentriesLen-- + victim.cached = false + // victim.refs may have become non-zero from an earlier path resolution + // since it was inserted into fs.cachedDentries. + if atomic.LoadInt64(&victim.refs) == 0 { + if victim.parent != nil { + victim.parent.dirMu.Lock() + if !victim.vfsd.IsDead() { + // Note that victim can't be a mount point (in any mount + // namespace), since VFS holds references on mount points. 
// destroyLocked destroys the dentry. If d has an open handle, dirty cached
// pages are written back (when the handle is writable), cached data is
// discarded and the handle is closed. It then closes d's p9 file, removes d
// from the set of syncable dentries, and drops the reference d holds on its
// parent.
//
// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is
// not a child dentry.
func (d *dentry) destroyLocked() {
	switch atomic.LoadInt64(&d.refs) {
	case 0:
		// Mark the dentry destroyed.
		atomic.StoreInt64(&d.refs, -1)
	case -1:
		panic("dentry.destroyLocked() called on already destroyed dentry")
	default:
		panic("dentry.destroyLocked() called with references on the dentry")
	}

	// No caller context is available here, so a background context is used
	// for the remote operations below.
	ctx := context.Background()
	d.handleMu.Lock()
	if !d.handle.file.isNil() {
		mf := d.fs.mfp.MemoryFile()
		d.dataMu.Lock()
		// Write dirty pages back to the remote filesystem.
		if d.handleWritable {
			// A writeback failure is logged but does not stop destruction.
			if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
				log.Warningf("gofer.dentry.DecRef: failed to write dirty data back: %v", err)
			}
		}
		// Discard cached data.
		d.cache.DropAll(mf)
		d.dirty.RemoveAll()
		d.dataMu.Unlock()
		// Clunk open fids and close open host FDs.
		d.handle.close(ctx)
	}
	d.handleMu.Unlock()

	if !d.file.isNil() {
		d.file.close(ctx)
		d.file = p9file{}
		// Remove d from the set of syncable dentries.
		d.fs.syncMu.Lock()
		delete(d.fs.syncableDentries, d)
		d.fs.syncMu.Unlock()
	}
	// Drop the reference held by d on its parent without recursively locking
	// d.fs.renameMu.
	if d.parent != nil {
		if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
			// The parent just lost its last reference; since renameMu is
			// already held, run its caching check directly.
			d.parent.checkCachingLocked()
		} else if refs < 0 {
			panic("gofer.dentry.DecRef() called without holding a reference")
		}
	}
}
// ensureSharedHandle makes sure that d.handle is open with at least the
// requested access: readable if read is true, writable if write is true. If
// trunc is true, a new handle is always opened (with truncation requested).
// When the existing handle must be replaced, the new handle keeps whatever
// access the old one had in addition to what was requested.
//
// It returns an error if the new handle cannot be opened, if the old and new
// handles disagree on whether a host file descriptor is available
// (syserror.EIO), or if replacing the host file descriptor or its mappings
// fails.
//
// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDir().
func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
	// O_TRUNC unconditionally requires us to obtain a new handle (opened with
	// O_TRUNC).
	if !trunc {
		// Fast path: check under the read lock whether the existing handle
		// already grants the requested access.
		d.handleMu.RLock()
		if (!read || d.handleReadable) && (!write || d.handleWritable) {
			// The current handle is sufficient.
			d.handleMu.RUnlock()
			return nil
		}
		d.handleMu.RUnlock()
	}

	haveOldFD := false
	d.handleMu.Lock()
	// Re-check under the write lock, since another caller may have upgraded
	// the handle between the RUnlock above and this Lock.
	if (read && !d.handleReadable) || (write && !d.handleWritable) || trunc {
		// Get a new handle.
		wantReadable := d.handleReadable || read
		wantWritable := d.handleWritable || write
		h, err := openHandle(ctx, d.file, wantReadable, wantWritable, trunc)
		if err != nil {
			d.handleMu.Unlock()
			return err
		}
		if !d.handle.file.isNil() {
			// Check that old and new handles are compatible: If the old handle
			// includes a host file descriptor but the new one does not, or
			// vice versa, old and new memory mappings may be incoherent.
			haveOldFD = d.handle.fd >= 0
			haveNewFD := h.fd >= 0
			if haveOldFD != haveNewFD {
				d.handleMu.Unlock()
				ctx.Warningf("gofer.dentry.ensureSharedHandle: can't change host FD availability from %v to %v across dentry handle upgrade", haveOldFD, haveNewFD)
				h.close(ctx)
				return syserror.EIO
			}
			if haveOldFD {
				// We may have raced with callers of d.pf.FD() that are now
				// using the old file descriptor, preventing us from safely
				// closing it. We could handle this by invalidating existing
				// memmap.Translations, but this is expensive. Instead, use
				// dup3 to make the old file descriptor refer to the new file
				// description, then close the new file descriptor (which is no
				// longer needed). Racing callers may use the old or new file
				// description, but this doesn't matter since they refer to the
				// same file (unless d.fs.opts.overlayfsStaleRead is true,
				// which we handle separately).
				if err := syscall.Dup3(int(h.fd), int(d.handle.fd), syscall.O_CLOEXEC); err != nil {
					d.handleMu.Unlock()
					ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err)
					h.close(ctx)
					return err
				}
				syscall.Close(int(h.fd))
				h.fd = d.handle.fd
				if d.fs.opts.overlayfsStaleRead {
					// Replace sentry mappings of the old FD with mappings of
					// the new FD, since the two are not necessarily coherent.
					if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
						d.handleMu.Unlock()
						ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
						// NOTE(review): at this point h.fd == d.handle.fd, so
						// h.close() closes the FD number that d.handle still
						// records - confirm that this is intended.
						h.close(ctx)
						return err
					}
				}
				// Clunk the old fid before making the new handle visible (by
				// unlocking d.handleMu).
				// NOTE(review): the old fid is only clunked on this
				// haveOldFD path; when neither handle has a host FD the old
				// fid appears to be dropped without being closed - confirm.
				d.handle.file.close(ctx)
			}
		}
		// Switch to the new handle.
		d.handle = h
		d.handleReadable = wantReadable
		d.handleWritable = wantWritable
	}
	d.handleMu.Unlock()

	if d.fs.opts.overlayfsStaleRead && haveOldFD {
		// Invalidate application mappings that may be using the old FD; they
		// will be replaced with mappings using the new FD after future calls
		// to d.Translate(). This requires holding d.mapsMu, which precedes
		// d.handleMu in the lock order.
		d.mapsMu.Lock()
		d.mappings.InvalidateAll(memmap.InvalidateOpts{})
		d.mapsMu.Unlock()
	}

	return nil
}
+ if err := d.updateFromGetattr(ctx); err != nil { + return linux.Statx{}, err + } + } + var stat linux.Statx + d.statTo(&stat) + return stat, nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + if err := fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, fd.vfsfd.Mount()); err != nil { + return err + } + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + fd.dentry().InotifyWithParent(ev, 0, vfs.InodeEvent) + } + return nil +} + +// Listxattr implements vfs.FileDescriptionImpl.Listxattr. +func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { + return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size) +} + +// Getxattr implements vfs.FileDescriptionImpl.Getxattr. +func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) { + return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts) +} + +// Setxattr implements vfs.FileDescriptionImpl.Setxattr. +func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { + d := fd.dentry() + if err := d.setxattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil { + return err + } + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil +} + +// Removexattr implements vfs.FileDescriptionImpl.Removexattr. +func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { + d := fd.dentry() + if err := d.removexattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil { + return err + } + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil +} + +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. 
+func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + fd.lockLogging.Do(func() { + log.Infof("File lock using gofer file handled internally.") + }) + return fd.LockFD.LockBSD(ctx, uid, t, block) +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + fd.lockLogging.Do(func() { + log.Infof("Range lock using gofer file handled internally.") + }) + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go new file mode 100644 index 000000000..adff39490 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -0,0 +1,63 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package gofer + +import ( + "sync/atomic" + "testing" + + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/contexttest" +) + +func TestDestroyIdempotent(t *testing.T) { + fs := filesystem{ + syncableDentries: make(map[*dentry]struct{}), + opts: filesystemOptions{ + // Test relies on no dentry being held in the cache. + maxCachedDentries: 0, + }, + } + + ctx := contexttest.Context(t) + attr := &p9.Attr{ + Mode: p9.ModeRegular, + } + mask := p9.AttrMask{ + Mode: true, + Size: true, + } + parent, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr) + if err != nil { + t.Fatalf("fs.newDentry(): %v", err) + } + + child, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr) + if err != nil { + t.Fatalf("fs.newDentry(): %v", err) + } + parent.cacheNewChildLocked(child, "child") + + child.checkCachingLocked() + if got := atomic.LoadInt64(&child.refs); got != -1 { + t.Fatalf("child.refs=%d, want: -1", got) + } + // Parent will also be destroyed when child reference is removed. + if got := atomic.LoadInt64(&parent.refs); got != -1 { + t.Fatalf("parent.refs=%d, want: -1", got) + } + child.checkCachingLocked() + child.checkCachingLocked() +} diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go new file mode 100644 index 000000000..8792ca4f2 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -0,0 +1,141 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package gofer + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/hostfd" +) + +// handle represents a remote "open file descriptor", consisting of an opened +// fid (p9.File) and optionally a host file descriptor. +type handle struct { + file p9file + fd int32 // -1 if unavailable +} + +// Preconditions: read || write. +func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (handle, error) { + _, newfile, err := file.walk(ctx, nil) + if err != nil { + return handle{fd: -1}, err + } + var flags p9.OpenFlags + switch { + case read && !write: + flags = p9.ReadOnly + case !read && write: + flags = p9.WriteOnly + case read && write: + flags = p9.ReadWrite + } + if trunc { + flags |= p9.OpenTruncate + } + fdobj, _, _, err := newfile.open(ctx, flags) + if err != nil { + newfile.close(ctx) + return handle{fd: -1}, err + } + fd := int32(-1) + if fdobj != nil { + fd = int32(fdobj.Release()) + } + return handle{ + file: newfile, + fd: fd, + }, nil +} + +func (h *handle) close(ctx context.Context) { + h.file.close(ctx) + h.file = p9file{} + if h.fd >= 0 { + syscall.Close(int(h.fd)) + h.fd = -1 + } +} + +func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { + if dsts.IsEmpty() { + return 0, nil + } + if h.fd >= 0 { + ctx.UninterruptibleSleepStart(false) + n, err := hostfd.Preadv2(h.fd, dsts, int64(offset), 0 /* flags */) + ctx.UninterruptibleSleepFinish(false) + return n, err + } + if dsts.NumBlocks() == 1 && !dsts.Head().NeedSafecopy() { + n, err := h.file.readAt(ctx, dsts.Head().ToSlice(), offset) + return uint64(n), err + } + // Buffer the read since p9.File.ReadAt() takes []byte. 
+ buf := make([]byte, dsts.NumBytes()) + n, err := h.file.readAt(ctx, buf, offset) + if n == 0 { + return 0, err + } + if cp, cperr := safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:n]))); cperr != nil { + return cp, cperr + } + return uint64(n), err +} + +func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { + if srcs.IsEmpty() { + return 0, nil + } + if h.fd >= 0 { + ctx.UninterruptibleSleepStart(false) + n, err := hostfd.Pwritev2(h.fd, srcs, int64(offset), 0 /* flags */) + ctx.UninterruptibleSleepFinish(false) + return n, err + } + if srcs.NumBlocks() == 1 && !srcs.Head().NeedSafecopy() { + n, err := h.file.writeAt(ctx, srcs.Head().ToSlice(), offset) + return uint64(n), err + } + // Buffer the write since p9.File.WriteAt() takes []byte. + buf := make([]byte, srcs.NumBytes()) + cp, cperr := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), srcs) + if cp == 0 { + return 0, cperr + } + n, err := h.file.writeAt(ctx, buf[:cp], offset) + if err != nil { + return uint64(n), err + } + return cp, cperr +} + +func (h *handle) sync(ctx context.Context) error { + // Handle most common case first. + if h.fd >= 0 { + ctx.UninterruptibleSleepStart(false) + err := syscall.Fsync(int(h.fd)) + ctx.UninterruptibleSleepFinish(false) + return err + } + if h.file.isNil() { + // File hasn't been touched, there is nothing to sync. + return nil + } + return h.file.fsync(ctx) +} diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go new file mode 100644 index 000000000..7294de7d6 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go @@ -0,0 +1,97 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "fmt" + "sync" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Global pipe used by blockUntilNonblockingPipeHasWriter since we can't create +// pipes after sentry initialization due to syscall filters. +var ( + tempPipeMu sync.Mutex + tempPipeReadFD int + tempPipeWriteFD int + tempPipeBuf [1]byte +) + +func init() { + var pipeFDs [2]int + if err := unix.Pipe(pipeFDs[:]); err != nil { + panic(fmt.Sprintf("failed to create pipe for gofer.blockUntilNonblockingPipeHasWriter: %v", err)) + } + tempPipeReadFD = pipeFDs[0] + tempPipeWriteFD = pipeFDs[1] +} + +func blockUntilNonblockingPipeHasWriter(ctx context.Context, fd int32) error { + for { + ok, err := nonblockingPipeHasWriter(fd) + if err != nil { + return err + } + if ok { + return nil + } + if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil { + return err + } + } +} + +func nonblockingPipeHasWriter(fd int32) (bool, error) { + tempPipeMu.Lock() + defer tempPipeMu.Unlock() + // Copy 1 byte from fd into the temporary pipe. + n, err := unix.Tee(int(fd), tempPipeWriteFD, 1, unix.SPLICE_F_NONBLOCK) + if err == syserror.EAGAIN { + // The pipe represented by fd is empty, but has a writer. + return true, nil + } + if err != nil { + return false, err + } + if n == 0 { + // The pipe represented by fd is empty and has no writer. + return false, nil + } + // The pipe represented by fd is non-empty, so it either has, or has + // previously had, a writer. 
Remove the byte copied to the temporary pipe + // before returning. + if n, err := unix.Read(tempPipeReadFD, tempPipeBuf[:]); err != nil || n != 1 { + panic(fmt.Sprintf("failed to drain pipe for gofer.blockUntilNonblockingPipeHasWriter: got (%d, %v), wanted (1, nil)", n, err)) + } + return true, nil +} + +func sleepBetweenNamedPipeOpenChecks(ctx context.Context) error { + t := time.NewTimer(100 * time.Millisecond) + defer t.Stop() + cancel := ctx.SleepStart() + select { + case <-t.C: + ctx.SleepFinish(true) + return nil + case <-cancel: + ctx.SleepFinish(false) + return syserror.ErrInterrupted + } +} diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go new file mode 100644 index 000000000..87f0b877f --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/p9file.go @@ -0,0 +1,233 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fd" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/syserror" +) + +// p9file is a wrapper around p9.File that provides methods that are +// Context-aware. 
+type p9file struct { + file p9.File +} + +func (f p9file) isNil() bool { + return f.file == nil +} + +func (f p9file) walk(ctx context.Context, names []string) ([]p9.QID, p9file, error) { + ctx.UninterruptibleSleepStart(false) + qids, newfile, err := f.file.Walk(names) + ctx.UninterruptibleSleepFinish(false) + return qids, p9file{newfile}, err +} + +func (f p9file) walkGetAttr(ctx context.Context, names []string) ([]p9.QID, p9file, p9.AttrMask, p9.Attr, error) { + ctx.UninterruptibleSleepStart(false) + qids, newfile, attrMask, attr, err := f.file.WalkGetAttr(names) + ctx.UninterruptibleSleepFinish(false) + return qids, p9file{newfile}, attrMask, attr, err +} + +// walkGetAttrOne is a wrapper around p9.File.WalkGetAttr that takes a single +// path component and returns a single qid. +func (f p9file) walkGetAttrOne(ctx context.Context, name string) (p9.QID, p9file, p9.AttrMask, p9.Attr, error) { + ctx.UninterruptibleSleepStart(false) + qids, newfile, attrMask, attr, err := f.file.WalkGetAttr([]string{name}) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, err + } + if len(qids) != 1 { + ctx.Warningf("p9.File.WalkGetAttr returned %d qids (%v), wanted 1", len(qids), qids) + if newfile != nil { + p9file{newfile}.close(ctx) + } + return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, syserror.EIO + } + return qids[0], p9file{newfile}, attrMask, attr, nil +} + +func (f p9file) statFS(ctx context.Context) (p9.FSStat, error) { + ctx.UninterruptibleSleepStart(false) + fsstat, err := f.file.StatFS() + ctx.UninterruptibleSleepFinish(false) + return fsstat, err +} + +func (f p9file) getAttr(ctx context.Context, req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { + ctx.UninterruptibleSleepStart(false) + qid, attrMask, attr, err := f.file.GetAttr(req) + ctx.UninterruptibleSleepFinish(false) + return qid, attrMask, attr, err +} + +func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) 
error { + ctx.UninterruptibleSleepStart(false) + err := f.file.SetAttr(valid, attr) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) { + ctx.UninterruptibleSleepStart(false) + xattrs, err := f.file.ListXattr(size) + ctx.UninterruptibleSleepFinish(false) + return xattrs, err +} + +func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) { + ctx.UninterruptibleSleepStart(false) + val, err := f.file.GetXattr(name, size) + ctx.UninterruptibleSleepFinish(false) + return val, err +} + +func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.SetXattr(name, value, flags) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) removeXattr(ctx context.Context, name string) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.RemoveXattr(name) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Allocate(mode, offset, length) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) close(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Close() + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { + ctx.UninterruptibleSleepStart(false) + fdobj, qid, iounit, err := f.file.Open(flags) + ctx.UninterruptibleSleepFinish(false) + return fdobj, qid, iounit, err +} + +func (f p9file) readAt(ctx context.Context, p []byte, offset uint64) (int, error) { + ctx.UninterruptibleSleepStart(false) + n, err := f.file.ReadAt(p, offset) + ctx.UninterruptibleSleepFinish(false) + return n, err +} + +func (f p9file) writeAt(ctx context.Context, p 
[]byte, offset uint64) (int, error) { + ctx.UninterruptibleSleepStart(false) + n, err := f.file.WriteAt(p, offset) + ctx.UninterruptibleSleepFinish(false) + return n, err +} + +func (f p9file) fsync(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.FSync() + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) create(ctx context.Context, name string, flags p9.OpenFlags, permissions p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9file, p9.QID, uint32, error) { + ctx.UninterruptibleSleepStart(false) + fdobj, newfile, qid, iounit, err := f.file.Create(name, flags, permissions, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return fdobj, p9file{newfile}, qid, iounit, err +} + +func (f p9file) mkdir(ctx context.Context, name string, permissions p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + qid, err := f.file.Mkdir(name, permissions, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return qid, err +} + +func (f p9file) symlink(ctx context.Context, oldName string, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + qid, err := f.file.Symlink(oldName, newName, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return qid, err +} + +func (f p9file) link(ctx context.Context, target p9file, newName string) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Link(target.file, newName) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) mknod(ctx context.Context, name string, mode p9.FileMode, major uint32, minor uint32, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + qid, err := f.file.Mknod(name, mode, major, minor, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return qid, err +} + +func (f p9file) rename(ctx context.Context, newDir p9file, newName string) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Rename(newDir.file, 
newName) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) unlinkAt(ctx context.Context, name string, flags uint32) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.UnlinkAt(name, flags) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) readdir(ctx context.Context, offset uint64, count uint32) ([]p9.Dirent, error) { + ctx.UninterruptibleSleepStart(false) + dirents, err := f.file.Readdir(offset, count) + ctx.UninterruptibleSleepFinish(false) + return dirents, err +} + +func (f p9file) readlink(ctx context.Context) (string, error) { + ctx.UninterruptibleSleepStart(false) + target, err := f.file.Readlink() + ctx.UninterruptibleSleepFinish(false) + return target, err +} + +func (f p9file) flush(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Flush() + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) connect(ctx context.Context, flags p9.ConnectFlags) (*fd.FD, error) { + ctx.UninterruptibleSleepStart(false) + fdobj, err := f.file.Connect(flags) + ctx.UninterruptibleSleepFinish(false) + return fdobj, err +} diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go new file mode 100644 index 000000000..a2f02d9c7 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -0,0 +1,892 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package gofer + +import ( + "fmt" + "io" + "math" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +func (d *dentry) isRegularFile() bool { + return d.fileType() == linux.S_IFREG +} + +type regularFileFD struct { + fileDescription + + // off is the file offset. off is protected by mu. + mu sync.Mutex + off int64 +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *regularFileFD) Release() { +} + +// OnClose implements vfs.FileDescriptionImpl.OnClose. +func (fd *regularFileFD) OnClose(ctx context.Context) error { + if !fd.vfsfd.IsWritable() { + return nil + } + // Skip flushing if writes may be buffered by the client, since (as with + // the VFS1 client) we don't flush buffered writes on close anyway. + d := fd.dentry() + if d.fs.opts.interop == InteropModeExclusive { + return nil + } + d.handleMu.RLock() + defer d.handleMu.RUnlock() + return d.handle.file.flush(ctx) +} + +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + + d := fd.dentry() + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + + size := offset + length + + // Allocating a smaller size is a noop. 
+ if size <= d.size { + return nil + } + + d.handleMu.Lock() + defer d.handleMu.Unlock() + + err := d.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length) + if err != nil { + return err + } + d.size = size + if !d.cachedMetadataAuthoritative() { + d.touchCMtimeLocked() + } + return nil +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^linux.RWF_HIPRI != 0 { + return 0, syserror.EOPNOTSUPP + } + + // Check for reading at EOF before calling into MM (but not under + // InteropModeShared, which makes d.size unreliable). + d := fd.dentry() + if d.fs.opts.interop != InteropModeShared && uint64(offset) >= atomic.LoadUint64(&d.size) { + return 0, io.EOF + } + + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + // Lock d.metadataMu for the rest of the read to prevent d.size from + // changing. + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + // Write dirty cached pages that will be touched by the read back to + // the remote file. + if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil { + return 0, err + } + } + + rw := getDentryReadWriter(ctx, d, offset) + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + // Require the read to go to the remote file. + rw.direct = true + } + n, err := dst.CopyOutFrom(ctx, rw) + putDentryReadWriter(rw) + if d.fs.opts.interop != InteropModeShared { + // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). + d.touchAtime(fd.vfsfd.Mount()) + } + return n, err +} + +// Read implements vfs.FileDescriptionImpl.Read. 
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. + if opts.Flags&^linux.RWF_HIPRI != 0 { + return 0, syserror.EOPNOTSUPP + } + + limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) + if err != nil { + return 0, err + } + src = src.TakeFirst64(limit) + + d := fd.dentry() + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + if d.fs.opts.interop != InteropModeShared { + // Compare Linux's mm/filemap.c:__generic_file_write_iter() => + // file_update_time(). This is d.touchCMtime(), but without locking + // d.metadataMu (recursively). + d.touchCMtimeLocked() + } + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + // Write dirty cached pages that will be touched by the write back to + // the remote file. + if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { + return 0, err + } + // Remove touched pages from the cache. + pgstart := usermem.PageRoundDown(uint64(offset)) + pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes())) + if !ok { + return 0, syserror.EINVAL + } + mr := memmap.MappableRange{pgstart, pgend} + var freed []platform.FileRange + d.dataMu.Lock() + cseg := d.cache.LowerBoundSegment(mr.Start) + for cseg.Ok() && cseg.Start() < mr.End { + cseg = d.cache.Isolate(cseg, mr) + freed = append(freed, platform.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) + cseg = d.cache.Remove(cseg).NextSegment() + } + d.dataMu.Unlock() + // Invalidate mappings of removed pages. 
+ d.mapsMu.Lock() + d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) + d.mapsMu.Unlock() + // Finally free pages removed from the cache. + mf := d.fs.mfp.MemoryFile() + for _, freedFR := range freed { + mf.DecRef(freedFR) + } + } + rw := getDentryReadWriter(ctx, d, offset) + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + // Require the write to go to the remote file. + rw.direct = true + } + n, err := src.CopyInTo(ctx, rw) + putDentryReadWriter(rw) + if n != 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { + // Write dirty cached pages touched by the write back to the remote + // file. + if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { + return 0, err + } + // Request the remote filesystem to sync the remote file. + if err := d.handle.file.fsync(ctx); err != nil { + return 0, err + } + } + return n, err +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.PWrite(ctx, src, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +type dentryReadWriter struct { + ctx context.Context + d *dentry + off uint64 + direct bool +} + +var dentryReadWriterPool = sync.Pool{ + New: func() interface{} { + return &dentryReadWriter{} + }, +} + +func getDentryReadWriter(ctx context.Context, d *dentry, offset int64) *dentryReadWriter { + rw := dentryReadWriterPool.Get().(*dentryReadWriter) + rw.ctx = ctx + rw.d = d + rw.off = uint64(offset) + rw.direct = false + return rw +} + +func putDentryReadWriter(rw *dentryReadWriter) { + rw.ctx = nil + rw.d = nil + dentryReadWriterPool.Put(rw) +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. 
func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
	if dsts.IsEmpty() {
		return 0, nil
	}

	// If we have a mmappable host FD (which must be used here to ensure
	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
	// (which prevents us from caching file contents and makes dentry.size
	// unreliable), or if the file was opened O_DIRECT, read directly from
	// dentry.handle without locking dentry.dataMu.
	rw.d.handleMu.RLock()
	if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
		n, err := rw.d.handle.readToBlocksAt(rw.ctx, dsts, rw.off)
		rw.d.handleMu.RUnlock()
		rw.off += n
		return n, err
	}

	// Otherwise read from/through the cache.
	mf := rw.d.fs.mfp.MemoryFile()
	fillCache := mf.ShouldCacheEvictable()
	// Filling the cache mutates it, so dataMu is taken exclusively in that
	// case and shared otherwise; dataMuUnlock releases whichever was taken.
	var dataMuUnlock func()
	if fillCache {
		rw.d.dataMu.Lock()
		dataMuUnlock = rw.d.dataMu.Unlock
	} else {
		rw.d.dataMu.RLock()
		dataMuUnlock = rw.d.dataMu.RUnlock
	}

	// Compute the range to read (limited by file size and overflow-checked).
	if rw.off >= rw.d.size {
		dataMuUnlock()
		rw.d.handleMu.RUnlock()
		return 0, io.EOF
	}
	end := rw.d.size
	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
		end = rend
	}

	// done counts the bytes copied into dsts so far.
	var done uint64
	seg, gap := rw.d.cache.Find(rw.off)
	for rw.off < end {
		mr := memmap.MappableRange{rw.off, end}
		switch {
		case seg.Ok():
			// Get internal mappings from the cache.
			ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
			if err != nil {
				dataMuUnlock()
				rw.d.handleMu.RUnlock()
				return done, err
			}

			// Copy from internal mappings.
			n, err := safemem.CopySeq(dsts, ims)
			done += n
			rw.off += n
			dsts = dsts.DropFirst64(n)
			if err != nil {
				dataMuUnlock()
				rw.d.handleMu.RUnlock()
				return done, err
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			gapMR := gap.Range().Intersect(mr)
			if fillCache {
				// Read into the cache, then re-enter the loop to read from the
				// cache.
				gapEnd, _ := usermem.PageRoundUp(gapMR.End)
				reqMR := memmap.MappableRange{
					Start: usermem.PageRoundDown(gapMR.Start),
					End:   gapEnd,
				}
				optMR := gap.Range()
				err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt)
				mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End})
				seg, gap = rw.d.cache.Find(rw.off)
				if !seg.Ok() {
					dataMuUnlock()
					rw.d.handleMu.RUnlock()
					return done, err
				}
				// err might have occurred in part of gap.Range() outside
				// gapMR. Forget about it for now; if the error matters and
				// persists, we'll run into it again in a later iteration of
				// this loop.
			} else {
				// Read directly from the file.
				gapDsts := dsts.TakeFirst64(gapMR.Length())
				n, err := rw.d.handle.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start)
				done += n
				rw.off += n
				dsts = dsts.DropFirst64(n)
				// Partial reads are fine. But we must stop reading.
				if n != gapDsts.NumBytes() || err != nil {
					dataMuUnlock()
					rw.d.handleMu.RUnlock()
					return done, err
				}

				// Continue.
				seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
			}
		}
	}
	dataMuUnlock()
	rw.d.handleMu.RUnlock()
	return done, nil
}

// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
//
// Preconditions: rw.d.metadataMu must be locked.
func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
	if srcs.IsEmpty() {
		return 0, nil
	}

	// If we have a mmappable host FD (which must be used here to ensure
	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
	// (which prevents us from caching file contents), or if the file was
	// opened with O_DIRECT, write directly to dentry.handle without locking
	// dentry.dataMu.
	rw.d.handleMu.RLock()
	if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
		n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off)
		rw.off += n
		// dataMu is taken only to publish the (possibly larger) file size.
		rw.d.dataMu.Lock()
		if rw.off > rw.d.size {
			atomic.StoreUint64(&rw.d.size, rw.off)
			// The remote file's size will implicitly be extended to the correct
			// value when we write back to it.
		}
		rw.d.dataMu.Unlock()
		rw.d.handleMu.RUnlock()
		return n, err
	}

	// Otherwise write to/through the cache.
	mf := rw.d.fs.mfp.MemoryFile()
	rw.d.dataMu.Lock()

	// Compute the range to write (overflow-checked).
	start := rw.off
	end := rw.off + srcs.NumBytes()
	if end <= rw.off {
		end = math.MaxInt64
	}

	var (
		// done counts the bytes consumed from srcs so far.
		done uint64
		// retErr is the error to return once the loop has been left; the
		// size update and writethrough flush below run regardless.
		retErr error
	)
	seg, gap := rw.d.cache.Find(rw.off)
	for rw.off < end {
		mr := memmap.MappableRange{rw.off, end}
		switch {
		case seg.Ok():
			// Get internal mappings from the cache.
			segMR := seg.Range().Intersect(mr)
			ims, err := mf.MapInternal(seg.FileRangeOf(segMR), usermem.Write)
			if err != nil {
				retErr = err
				goto exitLoop
			}

			// Copy to internal mappings.
			n, err := safemem.CopySeq(ims, srcs)
			done += n
			rw.off += n
			srcs = srcs.DropFirst64(n)
			rw.d.dirty.MarkDirty(segMR)
			if err != nil {
				retErr = err
				goto exitLoop
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			// Write directly to the file. At present, we never fill the cache
			// when writing, since doing so can convert small writes into
			// inefficient read-modify-write cycles, and we have no mechanism
			// for detecting or avoiding this.
			gapMR := gap.Range().Intersect(mr)
			gapSrcs := srcs.TakeFirst64(gapMR.Length())
			n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start)
			done += n
			rw.off += n
			srcs = srcs.DropFirst64(n)
			// Partial writes are fine. But we must stop writing.
			if n != gapSrcs.NumBytes() || err != nil {
				retErr = err
				goto exitLoop
			}

			// Continue.
			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
		}
	}
exitLoop:
	if rw.off > rw.d.size {
		atomic.StoreUint64(&rw.d.size, rw.off)
		// The remote file's size will implicitly be extended to the correct
		// value when we write back to it.
	}
	// If InteropModeWritethrough is in effect, flush written data back to the
	// remote filesystem.
	if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 {
		if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{
			Start: start,
			End:   rw.off,
		}, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, rw.d.handle.writeFromBlocksAt); err != nil {
			// We have no idea how many bytes were actually flushed.
			rw.off = start
			done = 0
			retErr = err
		}
	}
	rw.d.dataMu.Unlock()
	rw.d.handleMu.RUnlock()
	return done, retErr
}

// writeback writes dirty cached data in [offset, offset+size), clamped to the
// current file size, back to the remote file via d.handle. It is a no-op when
// size is 0 or when offset is at or beyond the file size.
//
// writeback acquires d.handleMu (for reading) and d.dataMu itself, so callers
// must not already hold them.
func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
	if size == 0 {
		return nil
	}
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	d.dataMu.Lock()
	defer d.dataMu.Unlock()
	// Compute the range of valid bytes (overflow-checked).
	if uint64(offset) >= d.size {
		return nil
	}
	end := int64(d.size)
	if rend := offset + size; rend > offset && rend < end {
		end = rend
	}
	return fsutil.SyncDirty(ctx, memmap.MappableRange{
		Start: uint64(offset),
		End:   uint64(end),
	}, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
}

// Seek implements vfs.FileDescriptionImpl.Seek.
//
// The new offset is computed by regularFileSeekLocked and stored in fd.off
// only on success; fd.mu is held throughout.
func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
	fd.mu.Lock()
	defer fd.mu.Unlock()
	newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
	if err != nil {
		return 0, err
	}
	fd.off = newOffset
	return newOffset, nil
}

// Calculate the new offset for a seek operation on a regular file.
+func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int64, whence int32) (int64, error) { + switch whence { + case linux.SEEK_SET: + // Use offset as specified. + case linux.SEEK_CUR: + offset += fdOffset + case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE: + // Ensure file size is up to date. + if !d.cachedMetadataAuthoritative() { + if err := d.updateFromGetattr(ctx); err != nil { + return 0, err + } + } + size := int64(atomic.LoadUint64(&d.size)) + // For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous + // block of data. + switch whence { + case linux.SEEK_END: + offset += size + case linux.SEEK_DATA: + if offset > size { + return 0, syserror.ENXIO + } + // Use offset as specified. + case linux.SEEK_HOLE: + if offset > size { + return 0, syserror.ENXIO + } + offset = size + } + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + return offset, nil +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *regularFileFD) Sync(ctx context.Context) error { + return fd.dentry().syncSharedHandle(ctx) +} + +func (d *dentry) syncSharedHandle(ctx context.Context) error { + d.handleMu.RLock() + defer d.handleMu.RUnlock() + + if d.handleWritable { + d.dataMu.Lock() + // Write dirty cached data to the remote file. + err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt) + d.dataMu.Unlock() + if err != nil { + return err + } + } + // Sync the remote file. + return d.handle.sync(ctx) +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + d := fd.dentry() + switch d.fs.opts.interop { + case InteropModeExclusive: + // Any mapping is fine. + case InteropModeWritethrough: + // Shared writable mappings require a host FD, since otherwise we can't + // synchronously flush memory-mapped writes to the remote file. 
+ if opts.Private || !opts.MaxPerms.Write { + break + } + fallthrough + case InteropModeShared: + // All mappings require a host FD to be coherent with other filesystem + // users. + if d.fs.opts.forcePageCache { + // Whether or not we have a host FD, we're not allowed to use it. + return syserror.ENODEV + } + d.handleMu.RLock() + haveFD := d.handle.fd >= 0 + d.handleMu.RUnlock() + if !haveFD { + return syserror.ENODEV + } + default: + panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop)) + } + // After this point, d may be used as a memmap.Mappable. + d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init) + return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts) +} + +func (d *dentry) mayCachePages() bool { + if d.fs.opts.interop == InteropModeShared { + return false + } + if d.fs.opts.forcePageCache { + return true + } + d.handleMu.RLock() + haveFD := d.handle.fd >= 0 + d.handleMu.RUnlock() + return haveFD +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + d.mapsMu.Lock() + mapped := d.mappings.AddMapping(ms, ar, offset, writable) + // Do this unconditionally since whether we have a host FD can change + // across save/restore. + for _, r := range mapped { + d.pf.hostFileMapper.IncRefOn(r) + } + if d.mayCachePages() { + // d.Evict() will refuse to evict memory-mapped pages, so tell the + // MemoryFile to not bother trying. + mf := d.fs.mfp.MemoryFile() + for _, r := range mapped { + mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End}) + } + } + d.mapsMu.Unlock() + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. 
+func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + d.mapsMu.Lock() + unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable) + for _, r := range unmapped { + d.pf.hostFileMapper.DecRefOn(r) + } + if d.mayCachePages() { + // Pages that are no longer referenced by any application memory + // mappings are now considered unused; allow MemoryFile to evict them + // when necessary. + mf := d.fs.mfp.MemoryFile() + d.dataMu.Lock() + for _, r := range unmapped { + // Since these pages are no longer mapped, they are no longer + // concurrently dirtyable by a writable memory mapping. + d.dirty.AllowClean(r) + mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End}) + } + d.dataMu.Unlock() + } + d.mapsMu.Unlock() +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + return d.AddMapping(ctx, ms, dstAR, offset, writable) +} + +// Translate implements memmap.Mappable.Translate. +func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + d.handleMu.RLock() + if d.handle.fd >= 0 && !d.fs.opts.forcePageCache { + d.handleMu.RUnlock() + mr := optional + if d.fs.opts.limitHostFDTranslation { + mr = maxFillRange(required, optional) + } + return []memmap.Translation{ + { + Source: mr, + File: &d.pf, + Offset: mr.Start, + Perms: usermem.AnyAccess, + }, + }, nil + } + + d.dataMu.Lock() + + // Constrain translations to d.size (rounded up) to prevent translation to + // pages that may be concurrently truncated. 
+ pgend, _ := usermem.PageRoundUp(d.size) + var beyondEOF bool + if required.End > pgend { + if required.Start >= pgend { + d.dataMu.Unlock() + d.handleMu.RUnlock() + return nil, &memmap.BusError{io.EOF} + } + beyondEOF = true + required.End = pgend + } + if optional.End > pgend { + optional.End = pgend + } + + mf := d.fs.mfp.MemoryFile() + cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, d.handle.readToBlocksAt) + + var ts []memmap.Translation + var translatedEnd uint64 + for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { + segMR := seg.Range().Intersect(optional) + // TODO(jamieliu): Make Translations writable even if writability is + // not required if already kept-dirty by another writable translation. + perms := usermem.AccessType{ + Read: true, + Execute: true, + } + if at.Write { + // From this point forward, this memory can be dirtied through the + // mapping at any time. + d.dirty.KeepDirty(segMR) + perms.Write = true + } + ts = append(ts, memmap.Translation{ + Source: segMR, + File: mf, + Offset: seg.FileRangeOf(segMR).Start, + Perms: perms, + }) + translatedEnd = segMR.End + } + + d.dataMu.Unlock() + d.handleMu.RUnlock() + + // Don't return the error returned by c.cache.Fill if it occurred outside + // of required. 
+ if translatedEnd < required.End && cerr != nil { + return ts, &memmap.BusError{cerr} + } + if beyondEOF { + return ts, &memmap.BusError{io.EOF} + } + return ts, nil +} + +func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange { + const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily + if required.Length() >= maxReadahead { + return required + } + if optional.Length() <= maxReadahead { + return optional + } + optional.Start = required.Start + if optional.Length() <= maxReadahead { + return optional + } + optional.End = optional.Start + maxReadahead + return optional +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (d *dentry) InvalidateUnsavable(ctx context.Context) error { + // Whether we have a host fd (and consequently what platform.File is + // mapped) can change across save/restore, so invalidate all translations + // unconditionally. + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + d.mappings.InvalidateAll(memmap.InvalidateOpts{}) + + // Write the cache's contents back to the remote file so that if we have a + // host fd after restore, the remote file's contents are coherent. + mf := d.fs.mfp.MemoryFile() + d.dataMu.Lock() + defer d.dataMu.Unlock() + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { + return err + } + + // Discard the cache so that it's not stored in saved state. This is safe + // because per InvalidateUnsavable invariants, no new translations can have + // been returned after we invalidated all existing translations above. + d.cache.DropAll(mf) + d.dirty.RemoveAll() + + return nil +} + +// Evict implements pgalloc.EvictableMemoryUser.Evict. 
+func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + d.dataMu.Lock() + defer d.dataMu.Unlock() + + mr := memmap.MappableRange{er.Start, er.End} + mf := d.fs.mfp.MemoryFile() + // Only allow pages that are no longer memory-mapped to be evicted. + for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { + mgapMR := mgap.Range().Intersect(mr) + if mgapMR.Length() == 0 { + continue + } + if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { + log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err) + } + d.cache.Drop(mgapMR, mf) + d.dirty.KeepClean(mgapMR) + } +} + +// dentryPlatformFile implements platform.File. It exists solely because dentry +// cannot implement both vfs.DentryImpl.IncRef and platform.File.IncRef. +// +// dentryPlatformFile is only used when a host FD representing the remote file +// is available (i.e. dentry.handle.fd >= 0), and that FD is used for +// application memory mappings (i.e. !filesystem.opts.forcePageCache). +type dentryPlatformFile struct { + *dentry + + // fdRefs counts references on platform.File offsets. fdRefs is protected + // by dentry.dataMu. + fdRefs fsutil.FrameRefSet + + // If this dentry represents a regular file, and handle.fd >= 0, + // hostFileMapper caches mappings of handle.fd. + hostFileMapper fsutil.HostFileMapper + + // hostFileMapperInitOnce is used to lazily initialize hostFileMapper. + hostFileMapperInitOnce sync.Once +} + +// IncRef implements platform.File.IncRef. +func (d *dentryPlatformFile) IncRef(fr platform.FileRange) { + d.dataMu.Lock() + d.fdRefs.IncRefAndAccount(fr) + d.dataMu.Unlock() +} + +// DecRef implements platform.File.DecRef. 
+func (d *dentryPlatformFile) DecRef(fr platform.FileRange) { + d.dataMu.Lock() + d.fdRefs.DecRefAndAccount(fr) + d.dataMu.Unlock() +} + +// MapInternal implements platform.File.MapInternal. +func (d *dentryPlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + d.handleMu.RLock() + bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write) + d.handleMu.RUnlock() + return bs, err +} + +// FD implements platform.File.FD. +func (d *dentryPlatformFile) FD() int { + d.handleMu.RLock() + fd := d.handle.fd + d.handleMu.RUnlock() + return int(fd) +} diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go new file mode 100644 index 000000000..d6dbe9092 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/socket.go @@ -0,0 +1,146 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/waiter" +) + +func (d *dentry) isSocket() bool { + return d.fileType() == linux.S_IFSOCK +} + +// endpoint is a Gofer-backed transport.BoundEndpoint. 
+// +// An endpoint's lifetime is the time between when filesystem.BoundEndpointAt() +// is called and either BoundEndpoint.BidirectionalConnect or +// BoundEndpoint.UnidirectionalConnect is called. +type endpoint struct { + // dentry is the filesystem dentry which produced this endpoint. + dentry *dentry + + // file is the p9 file that contains a single unopened fid. + file p9.File + + // path is the sentry path where this endpoint is bound. + path string +} + +func sockTypeToP9(t linux.SockType) (p9.ConnectFlags, bool) { + switch t { + case linux.SOCK_STREAM: + return p9.StreamSocket, true + case linux.SOCK_SEQPACKET: + return p9.SeqpacketSocket, true + case linux.SOCK_DGRAM: + return p9.DgramSocket, true + } + return 0, false +} + +// BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect. +func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *syserr.Error { + cf, ok := sockTypeToP9(ce.Type()) + if !ok { + return syserr.ErrConnectionRefused + } + + // No lock ordering required as only the ConnectingEndpoint has a mutex. + ce.Lock() + + // Check connecting state. + if ce.Connected() { + ce.Unlock() + return syserr.ErrAlreadyConnected + } + if ce.Listening() { + ce.Unlock() + return syserr.ErrInvalidEndpointState + } + + c, err := e.newConnectedEndpoint(ctx, cf, ce.WaiterQueue()) + if err != nil { + ce.Unlock() + return err + } + + returnConnect(c, c) + ce.Unlock() + if err := c.Init(); err != nil { + return syserr.FromError(err) + } + + return nil +} + +// UnidirectionalConnect implements +// transport.BoundEndpoint.UnidirectionalConnect. 
+func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.ConnectedEndpoint, *syserr.Error) { + c, err := e.newConnectedEndpoint(ctx, p9.DgramSocket, &waiter.Queue{}) + if err != nil { + return nil, err + } + + if err := c.Init(); err != nil { + return nil, syserr.FromError(err) + } + + // We don't need the receiver. + c.CloseRecv() + c.Release() + + return c, nil +} + +func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFlags, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) { + hostFile, err := e.file.Connect(flags) + if err != nil { + return nil, syserr.ErrConnectionRefused + } + // Dup the fd so that the new endpoint can manage its lifetime. + hostFD, err := syscall.Dup(hostFile.FD()) + if err != nil { + log.Warningf("Could not dup host socket fd %d: %v", hostFile.FD(), err) + return nil, syserr.FromError(err) + } + // After duplicating, we no longer need hostFile. + hostFile.Close() + + c, serr := host.NewSCMEndpoint(ctx, hostFD, queue, e.path) + if serr != nil { + log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.file, flags, serr) + return nil, serr + } + return c, nil +} + +// Release implements transport.BoundEndpoint.Release. +func (e *endpoint) Release() { + e.dentry.DecRef() +} + +// Passcred implements transport.BoundEndpoint.Passcred. +func (e *endpoint) Passcred() bool { + return false +} diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go new file mode 100644 index 000000000..c1e6b13e5 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -0,0 +1,245 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package gofer

import (
	"sync"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/fdnotifier"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

// specialFileFD implements vfs.FileDescriptionImpl for pipes, sockets, device
// special files, and (when filesystemOptions.specialRegularFiles is in effect)
// regular files. specialFileFD differs from regularFileFD by using per-FD
// handles instead of shared per-dentry handles, and never buffering I/O.
type specialFileFD struct {
	fileDescription

	// handle is used for file I/O. handle is immutable.
	handle handle

	// seekable is true if this file description represents a file for which
	// file offset is significant, i.e. a regular file. seekable is immutable.
	seekable bool

	// haveQueue is true if this file description represents a file for which
	// queue may send I/O readiness events. haveQueue is immutable.
	haveQueue bool
	queue     waiter.Queue

	// If seekable is true, off is the file offset. off is protected by mu.
	mu  sync.Mutex
	off int64
}

// newSpecialFileFD creates a specialFileFD for dentry d that performs I/O
// through h.
//
// The FD is seekable only for regular files. For FIFOs and sockets whose
// handle has a host FD (h.fd >= 0), the host FD is registered with fdnotifier
// so that readiness events are delivered to fd.queue; that registration is
// undone if initializing the vfs file description fails.
func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) {
	ftype := d.fileType()
	seekable := ftype == linux.S_IFREG
	haveQueue := (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && h.fd >= 0
	fd := &specialFileFD{
		handle:    h,
		seekable:  seekable,
		haveQueue: haveQueue,
	}
	fd.LockFD.Init(locks)
	if haveQueue {
		if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil {
			return nil, err
		}
	}
	// Positional reads and writes are denied on non-seekable files.
	if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
		DenyPRead:  !seekable,
		DenyPWrite: !seekable,
	}); err != nil {
		if haveQueue {
			fdnotifier.RemoveFD(h.fd)
		}
		return nil, err
	}
	return fd, nil
}

// Release implements vfs.FileDescriptionImpl.Release.
//
// It unregisters the host FD from fdnotifier (if it was registered), closes
// the handle, and removes fd from the filesystem's specialFileFDs set under
// fs.syncMu.
func (fd *specialFileFD) Release() {
	if fd.haveQueue {
		fdnotifier.RemoveFD(fd.handle.fd)
	}
	fd.handle.close(context.Background())
	fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
	fs.syncMu.Lock()
	delete(fs.specialFileFDs, fd)
	fs.syncMu.Unlock()
}

// OnClose implements vfs.FileDescriptionImpl.OnClose.
//
// It flushes the handle's file for writable file descriptions and does
// nothing otherwise.
func (fd *specialFileFD) OnClose(ctx context.Context) error {
	if !fd.vfsfd.IsWritable() {
		return nil
	}
	return fd.handle.file.flush(ctx)
}

// Readiness implements waiter.Waitable.Readiness.
//
// When the host FD is registered with fdnotifier it is polled directly;
// otherwise the embedded fileDescription's Readiness is used.
func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask {
	if fd.haveQueue {
		return fdnotifier.NonBlockingPoll(fd.handle.fd, mask)
	}
	return fd.fileDescription.Readiness(mask)
}

// EventRegister implements waiter.Waitable.EventRegister.
//
// When the host FD is registered with fdnotifier, e is added to fd.queue and
// fdnotifier is told to update its state for the host FD; otherwise the call
// is forwarded to the embedded fileDescription.
func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
	if fd.haveQueue {
		fd.queue.EventRegister(e, mask)
		fdnotifier.UpdateFD(fd.handle.fd)
		return
	}
	fd.fileDescription.EventRegister(e, mask)
}

// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *specialFileFD) EventUnregister(e *waiter.Entry) { + if fd.haveQueue { + fd.queue.EventUnregister(e) + fdnotifier.UpdateFD(fd.handle.fd) + return + } + fd.fileDescription.EventUnregister(e) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + if fd.seekable && offset < 0 { + return 0, syserror.EINVAL + } + + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^linux.RWF_HIPRI != 0 { + return 0, syserror.EOPNOTSUPP + } + + // Going through dst.CopyOutFrom() holds MM locks around file operations of + // unknown duration. For regularFileFD, doing so is necessary to support + // mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't + // hold here since specialFileFD doesn't client-cache data. Just buffer the + // read instead. + if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { + d.touchAtime(fd.vfsfd.Mount()) + } + buf := make([]byte, dst.NumBytes()) + n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + if err == syserror.EAGAIN { + err = syserror.ErrWouldBlock + } + if n == 0 { + return 0, err + } + if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil { + return int64(cp), cperr + } + return int64(n), err +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + if !fd.seekable { + return fd.PRead(ctx, dst, -1, opts) + } + + fd.mu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. 
+func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + if fd.seekable && offset < 0 { + return 0, syserror.EINVAL + } + + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. + if opts.Flags&^linux.RWF_HIPRI != 0 { + return 0, syserror.EOPNOTSUPP + } + + if fd.seekable { + limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) + if err != nil { + return 0, err + } + src = src.TakeFirst64(limit) + } + + // Do a buffered write. See rationale in PRead. + if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { + d.touchCMtime() + } + buf := make([]byte, src.NumBytes()) + // Don't do partial writes if we get a partial read from src. + if _, err := src.CopyIn(ctx, buf); err != nil { + return 0, err + } + n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + if err == syserror.EAGAIN { + err = syserror.ErrWouldBlock + } + return int64(n), err +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + if !fd.seekable { + return fd.PWrite(ctx, src, -1, opts) + } + + fd.mu.Lock() + n, err := fd.PWrite(ctx, src, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + if !fd.seekable { + return 0, syserror.ESPIPE + } + fd.mu.Lock() + defer fd.mu.Unlock() + newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence) + if err != nil { + return 0, err + } + fd.off = newOffset + return newOffset, nil +} + +// Sync implements vfs.FileDescriptionImpl.Sync. 
+func (fd *specialFileFD) Sync(ctx context.Context) error { + return fd.dentry().syncSharedHandle(ctx) +} diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go new file mode 100644 index 000000000..2ec819f86 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/symlink.go @@ -0,0 +1,47 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func (d *dentry) isSymlink() bool { + return d.fileType() == linux.S_IFLNK +} + +// Precondition: d.isSymlink(). +func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { + if d.fs.opts.interop != InteropModeShared { + d.touchAtime(mnt) + d.dataMu.Lock() + if d.haveTarget { + target := d.target + d.dataMu.Unlock() + return target, nil + } + } + target, err := d.file.readlink(ctx) + if d.fs.opts.interop != InteropModeShared { + if err == nil { + d.haveTarget = true + d.target = target + } + d.dataMu.Unlock() + } + return target, err +} diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go new file mode 100644 index 000000000..0eef4e16e --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -0,0 +1,79 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func dentryTimestampFromP9(s, ns uint64) int64 { + return int64(s*1e9 + ns) +} + +func dentryTimestampFromStatx(ts linux.StatxTimestamp) int64 { + return ts.Sec*1e9 + int64(ts.Nsec) +} + +func statxTimestampFromDentry(ns int64) linux.StatxTimestamp { + return linux.StatxTimestamp{ + Sec: ns / 1e9, + Nsec: uint32(ns % 1e9), + } +} + +// Preconditions: d.cachedMetadataAuthoritative() == true. +func (d *dentry) touchAtime(mnt *vfs.Mount) { + if mnt.Flags.NoATime { + return + } + if err := mnt.CheckBeginWrite(); err != nil { + return + } + now := d.fs.clock.Now().Nanoseconds() + d.metadataMu.Lock() + atomic.StoreInt64(&d.atime, now) + d.metadataMu.Unlock() + mnt.EndWrite() +} + +// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has +// successfully called vfs.Mount.CheckBeginWrite(). +func (d *dentry) touchCtime() { + now := d.fs.clock.Now().Nanoseconds() + d.metadataMu.Lock() + atomic.StoreInt64(&d.ctime, now) + d.metadataMu.Unlock() +} + +// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has +// successfully called vfs.Mount.CheckBeginWrite(). +func (d *dentry) touchCMtime() { + now := d.fs.clock.Now().Nanoseconds() + d.metadataMu.Lock() + atomic.StoreInt64(&d.mtime, now) + atomic.StoreInt64(&d.ctime, now) + d.metadataMu.Unlock() +} + +// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has +// locked d.metadataMu. 
+func (d *dentry) touchCMtimeLocked() { + now := d.fs.clock.Now().Nanoseconds() + atomic.StoreInt64(&d.mtime, now) + atomic.StoreInt64(&d.ctime, now) +} diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD new file mode 100644 index 000000000..44a09d87a --- /dev/null +++ b/pkg/sentry/fsimpl/host/BUILD @@ -0,0 +1,52 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "host", + srcs = [ + "control.go", + "host.go", + "ioctl_unsafe.go", + "mmap.go", + "socket.go", + "socket_iovec.go", + "socket_unsafe.go", + "tty.go", + "util.go", + "util_unsafe.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fdnotifier", + "//pkg/fspath", + "//pkg/log", + "//pkg/refs", + "//pkg/safemem", + "//pkg/sentry/arch", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/hostfd", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/socket/control", + "//pkg/sentry/socket/unix", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/unimpl", + "//pkg/sentry/uniqueid", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/unet", + "//pkg/usermem", + "//pkg/waiter", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/pkg/sentry/fsimpl/host/control.go b/pkg/sentry/fsimpl/host/control.go new file mode 100644 index 000000000..b9082a20f --- /dev/null +++ b/pkg/sentry/fsimpl/host/control.go @@ -0,0 +1,96 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/control" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +type scmRights struct { + fds []int +} + +func newSCMRights(fds []int) control.SCMRightsVFS2 { + return &scmRights{fds} +} + +// Files implements control.SCMRights.Files. +func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFilesVFS2, bool) { + n := max + var trunc bool + if l := len(c.fds); n > l { + n = l + } else if n < l { + trunc = true + } + + rf := control.RightsFilesVFS2(fdsToFiles(ctx, c.fds[:n])) + + // Only consume converted FDs (fdsToFiles may convert fewer than n FDs). + c.fds = c.fds[len(rf):] + return rf, trunc +} + +// Clone implements transport.RightsControlMessage.Clone. +func (c *scmRights) Clone() transport.RightsControlMessage { + // Host rights never need to be cloned. + return nil +} + +// Release implements transport.RightsControlMessage.Release. +func (c *scmRights) Release() { + for _, fd := range c.fds { + syscall.Close(fd) + } + c.fds = nil +} + +// If an error is encountered, only files created before the error will be +// returned. This is what Linux does. +func fdsToFiles(ctx context.Context, fds []int) []*vfs.FileDescription { + files := make([]*vfs.FileDescription, 0, len(fds)) + for _, fd := range fds { + // Get flags. 
We do it here because they may be modified + // by subsequent functions. + fileFlags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0) + if errno != 0 { + ctx.Warningf("Error retrieving host FD flags: %v", error(errno)) + break + } + + // Create the file backed by hostFD. + file, err := ImportFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fd, false /* isTTY */) + if err != nil { + ctx.Warningf("Error creating file from host FD: %v", err) + break + } + + if err := file.SetStatusFlags(ctx, auth.CredentialsFromContext(ctx), uint32(fileFlags&linux.O_NONBLOCK)); err != nil { + ctx.Warningf("Error setting flags on host FD file: %v", err) + break + } + + files = append(files, file) + } + return files +} diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go new file mode 100644 index 000000000..1cd2982cb --- /dev/null +++ b/pkg/sentry/fsimpl/host/host.go @@ -0,0 +1,766 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package host provides a filesystem implementation for host files imported as +// file descriptors. 
+package host + +import ( + "fmt" + "math" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/refs" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/hostfd" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// NewFDOptions contains options to NewFD. +type NewFDOptions struct { + // If IsTTY is true, the file descriptor is a TTY. + IsTTY bool + + // If HaveFlags is true, use Flags for the new file description. Otherwise, + // the new file description will inherit flags from hostFD. + HaveFlags bool + Flags uint32 +} + +// NewFD returns a vfs.FileDescription representing the given host file +// descriptor. mnt must be Kernel.HostMount(). +func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) (*vfs.FileDescription, error) { + fs, ok := mnt.Filesystem().Impl().(*filesystem) + if !ok { + return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) + } + + // Retrieve metadata. + var s unix.Stat_t + if err := unix.Fstat(hostFD, &s); err != nil { + return nil, err + } + + flags := opts.Flags + if !opts.HaveFlags { + // Get flags for the imported FD. + flagsInt, err := unix.FcntlInt(uintptr(hostFD), syscall.F_GETFL, 0) + if err != nil { + return nil, err + } + flags = uint32(flagsInt) + } + + fileMode := linux.FileMode(s.Mode) + fileType := fileMode.FileType() + + // Determine if hostFD is seekable. If not, this syscall will return ESPIPE + // (see fs/read_write.c:llseek), e.g. 
for pipes, sockets, and some character + // devices. + _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR) + seekable := err != syserror.ESPIPE + + i := &inode{ + hostFD: hostFD, + ino: fs.NextIno(), + isTTY: opts.IsTTY, + wouldBlock: wouldBlock(uint32(fileType)), + seekable: seekable, + // NOTE(b/38213152): Technically, some obscure char devices can be memory + // mapped, but we only allow regular files. + canMap: fileType == linux.S_IFREG, + } + i.pf.inode = i + + // Non-seekable files can't be memory mapped, assert this. + if !i.seekable && i.canMap { + panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") + } + + // If the hostFD would block, we must set it to non-blocking and handle + // blocking behavior in the sentry. + if i.wouldBlock { + if err := syscall.SetNonblock(i.hostFD, true); err != nil { + return nil, err + } + if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil { + return nil, err + } + } + + d := &kernfs.Dentry{} + d.Init(i) + + // i.open will take a reference on d. + defer d.DecRef() + + // For simplicity, fileDescription.offset is set to 0. Technically, we + // should only set to 0 on files that are not seekable (sockets, pipes, + // etc.), and use the offset from the host fd otherwise when importing. + return i.open(ctx, d.VFSDentry(), mnt, flags) +} + +// ImportFD sets up and returns a vfs.FileDescription from a donated fd. +func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) { + return NewFD(ctx, mnt, hostFD, &NewFDOptions{ + IsTTY: isTTY, + }) +} + +// filesystemType implements vfs.FilesystemType. +type filesystemType struct{} + +// GetFilesystem implements FilesystemType.GetFilesystem. 
+func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + panic("host.filesystemType.GetFilesystem should never be called") +} + +// Name implements FilesystemType.Name. +func (filesystemType) Name() string { + return "none" +} + +// NewFilesystem sets up and returns a new hostfs filesystem. +// +// Note that there should only ever be one instance of host.filesystem, +// a global mount for host fds. +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, err + } + fs := &filesystem{ + devMinor: devMinor, + } + fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) + return fs.VFSFilesystem(), nil +} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + kernfs.Filesystem + + devMinor uint32 +} + +func (fs *filesystem) Release() { + fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + d := vd.Dentry().Impl().(*kernfs.Dentry) + inode := d.Inode().(*inode) + b.PrependComponent(fmt.Sprintf("host:[%d]", inode.ino)) + return vfs.PrependPathSyntheticError{} +} + +// inode implements kernfs.Inode. +type inode struct { + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + + locks vfs.FileLocks + + // When the reference count reaches zero, the host fd is closed. + refs.AtomicRefCount + + // hostFD contains the host fd that this file was originally created from, + // which must be available at time of restore. + // + // This field is initialized at creation time and is immutable. + hostFD int + + // ino is an inode number unique within this filesystem. + // + // This field is initialized at creation time and is immutable. 
+ ino uint64 + + // isTTY is true if this file represents a TTY. + // + // This field is initialized at creation time and is immutable. + isTTY bool + + // seekable is false if the host fd points to a file representing a stream, + // e.g. a socket or a pipe. Such files are not seekable and can return + // EWOULDBLOCK for I/O operations. + // + // This field is initialized at creation time and is immutable. + seekable bool + + // wouldBlock is true if the host FD would return EWOULDBLOCK for + // operations that would block. + // + // This field is initialized at creation time and is immutable. + wouldBlock bool + + // Event queue for blocking operations. + queue waiter.Queue + + // canMap specifies whether we allow the file to be memory mapped. + // + // This field is initialized at creation time and is immutable. + canMap bool + + // mapsMu protects mappings. + mapsMu sync.Mutex + + // If canMap is true, mappings tracks mappings of hostFD into + // memmap.MappingSpaces. + mappings memmap.MappingSet + + // pf implements platform.File for mappings of hostFD. + pf inodePlatformFile +} + +// CheckPermissions implements kernfs.Inode. +func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { + var s syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &s); err != nil { + return err + } + return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid)) +} + +// Mode implements kernfs.Inode. +func (i *inode) Mode() linux.FileMode { + var s syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &s); err != nil { + // Retrieving the mode from the host fd using fstat(2) should not fail. + // If the syscall does not succeed, something is fundamentally wrong. + panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err)) + } + return linux.FileMode(s.Mode) +} + +// Stat implements kernfs.Inode. 
+func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + if opts.Mask&linux.STATX__RESERVED != 0 { + return linux.Statx{}, syserror.EINVAL + } + if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE { + return linux.Statx{}, syserror.EINVAL + } + + fs := vfsfs.Impl().(*filesystem) + + // Limit our host call only to known flags. + mask := opts.Mask & linux.STATX_ALL + var s unix.Statx_t + err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s) + if err == syserror.ENOSYS { + // Fallback to fstat(2), if statx(2) is not supported on the host. + // + // TODO(b/151263641): Remove fallback. + return i.fstat(fs) + } + if err != nil { + return linux.Statx{}, err + } + + // Unconditionally fill blksize, attributes, and device numbers, as + // indicated by /include/uapi/linux/stat.h. Inode number is always + // available, since we use our own rather than the host's. + ls := linux.Statx{ + Mask: linux.STATX_INO, + Blksize: s.Blksize, + Attributes: s.Attributes, + Ino: i.ino, + AttributesMask: s.Attributes_mask, + DevMajor: linux.UNNAMED_MAJOR, + DevMinor: fs.devMinor, + } + + // Copy other fields that were returned by the host. RdevMajor/RdevMinor + // are never copied (and therefore left as zero), so as not to expose host + // device numbers. 
+ ls.Mask |= s.Mask & linux.STATX_ALL + if s.Mask&linux.STATX_TYPE != 0 { + ls.Mode |= s.Mode & linux.S_IFMT + } + if s.Mask&linux.STATX_MODE != 0 { + ls.Mode |= s.Mode &^ linux.S_IFMT + } + if s.Mask&linux.STATX_NLINK != 0 { + ls.Nlink = s.Nlink + } + if s.Mask&linux.STATX_UID != 0 { + ls.UID = s.Uid + } + if s.Mask&linux.STATX_GID != 0 { + ls.GID = s.Gid + } + if s.Mask&linux.STATX_ATIME != 0 { + ls.Atime = unixToLinuxStatxTimestamp(s.Atime) + } + if s.Mask&linux.STATX_BTIME != 0 { + ls.Btime = unixToLinuxStatxTimestamp(s.Btime) + } + if s.Mask&linux.STATX_CTIME != 0 { + ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime) + } + if s.Mask&linux.STATX_MTIME != 0 { + ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime) + } + if s.Mask&linux.STATX_SIZE != 0 { + ls.Size = s.Size + } + if s.Mask&linux.STATX_BLOCKS != 0 { + ls.Blocks = s.Blocks + } + + return ls, nil +} + +// fstat is a best-effort fallback for inode.Stat() if the host does not +// support statx(2). +// +// We ignore the mask and sync flags in opts and simply supply +// STATX_BASIC_STATS, as fstat(2) itself does not allow the specification +// of a mask or sync flags. fstat(2) does not provide any metadata +// equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so +// those fields remain empty. +func (i *inode) fstat(fs *filesystem) (linux.Statx, error) { + var s unix.Stat_t + if err := unix.Fstat(i.hostFD, &s); err != nil { + return linux.Statx{}, err + } + + // As with inode.Stat(), we always use internal device and inode numbers, + // and never expose the host's represented device numbers. 
+ return linux.Statx{ + Mask: linux.STATX_BASIC_STATS, + Blksize: uint32(s.Blksize), + Nlink: uint32(s.Nlink), + UID: s.Uid, + GID: s.Gid, + Mode: uint16(s.Mode), + Ino: i.ino, + Size: uint64(s.Size), + Blocks: uint64(s.Blocks), + Atime: timespecToStatxTimestamp(s.Atim), + Ctime: timespecToStatxTimestamp(s.Ctim), + Mtime: timespecToStatxTimestamp(s.Mtim), + DevMajor: linux.UNNAMED_MAJOR, + DevMinor: fs.devMinor, + }, nil +} + +// SetStat implements kernfs.Inode. +func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + s := opts.Stat + + m := s.Mask + if m == 0 { + return nil + } + if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 { + return syserror.EPERM + } + var hostStat syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &hostStat); err != nil { + return err + } + if err := vfs.CheckSetStat(ctx, creds, &s, linux.FileMode(hostStat.Mode&linux.PermissionsMask), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil { + return err + } + + if m&linux.STATX_MODE != 0 { + if err := syscall.Fchmod(i.hostFD, uint32(s.Mode)); err != nil { + return err + } + } + if m&linux.STATX_SIZE != 0 { + if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil { + return err + } + oldSize := uint64(hostStat.Size) + if s.Size < oldSize { + oldpgend, _ := usermem.PageRoundUp(oldSize) + newpgend, _ := usermem.PageRoundUp(s.Size) + if oldpgend != newpgend { + i.mapsMu.Lock() + i.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/truncate.c:truncate_setsize() => + // truncate_pagecache() => + // mm/memory.c:unmap_mapping_range(evencows=1). 
+ InvalidatePrivate: true, + }) + i.mapsMu.Unlock() + } + } + } + if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 { + ts := [2]syscall.Timespec{ + toTimespec(s.Atime, m&linux.STATX_ATIME == 0), + toTimespec(s.Mtime, m&linux.STATX_MTIME == 0), + } + if err := setTimestamps(i.hostFD, &ts); err != nil { + return err + } + } + return nil +} + +// DecRef implements kernfs.Inode. +func (i *inode) DecRef() { + i.AtomicRefCount.DecRefWithDestructor(i.Destroy) +} + +// Destroy implements kernfs.Inode. +func (i *inode) Destroy() { + if i.wouldBlock { + fdnotifier.RemoveFD(int32(i.hostFD)) + } + if err := unix.Close(i.hostFD); err != nil { + log.Warningf("failed to close host fd %d: %v", i.hostFD, err) + } +} + +// Open implements kernfs.Inode. +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // Once created, we cannot re-open a socket fd through /proc/[pid]/fd/. + if i.Mode().FileType() == linux.S_IFSOCK { + return nil, syserror.ENXIO + } + return i.open(ctx, vfsd, rp.Mount(), opts.Flags) +} + +func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) { + var s syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &s); err != nil { + return nil, err + } + fileType := s.Mode & linux.FileTypeMask + + // Constrain flags to a subset we can handle. + // + // TODO(gvisor.dev/issue/2601): Support O_NONBLOCK by adding RWF_NOWAIT to pread/pwrite calls. + flags &= syscall.O_ACCMODE | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND + + switch fileType { + case syscall.S_IFSOCK: + if i.isTTY { + log.Warningf("cannot use host socket fd %d as TTY", i.hostFD) + return nil, syserror.ENOTTY + } + + ep, err := newEndpoint(ctx, i.hostFD, &i.queue) + if err != nil { + return nil, err + } + // Currently, we only allow Unix sockets to be imported. 
+ return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d, &i.locks) + + case syscall.S_IFREG, syscall.S_IFIFO, syscall.S_IFCHR: + if i.isTTY { + fd := &TTYFileDescription{ + fileDescription: fileDescription{inode: i}, + termios: linux.DefaultSlaveTermios, + } + fd.LockFD.Init(&i.locks) + vfsfd := &fd.vfsfd + if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return vfsfd, nil + } + + fd := &fileDescription{inode: i} + fd.LockFD.Init(&i.locks) + vfsfd := &fd.vfsfd + if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return vfsfd, nil + + default: + log.Warningf("cannot import host fd %d with file type %o", i.hostFD, fileType) + return nil, syserror.EPERM + } +} + +// fileDescription is embedded by host fd implementations of FileDescriptionImpl. +type fileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.LockFD + + // inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but + // cached to reduce indirections and casting. fileDescription does not hold + // a reference on the inode through the inode field (since one is already + // held via the Dentry). + // + // inode is immutable after fileDescription creation. + inode *inode + + // offsetMu protects offset. + offsetMu sync.Mutex + + // offset specifies the current file offset. It is only meaningful when + // inode.seekable is true. + offset int64 +} + +// SetStat implements vfs.FileDescriptionImpl. +func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + creds := auth.CredentialsFromContext(ctx) + return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts) +} + +// Stat implements vfs.FileDescriptionImpl. 
+func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.Statx, error) { + return f.inode.Stat(f.vfsfd.Mount().Filesystem(), opts) +} + +// Release implements vfs.FileDescriptionImpl. +func (f *fileDescription) Release() { + // noop +} + +// Allocate implements vfs.FileDescriptionImpl. +func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { + if !f.inode.seekable { + return syserror.ESPIPE + } + + // TODO(gvisor.dev/issue/2923): Implement Allocate for non-pipe hostfds. + return syserror.EOPNOTSUPP +} + +// PRead implements FileDescriptionImpl. +func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + i := f.inode + if !i.seekable { + return 0, syserror.ESPIPE + } + + return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags) +} + +// Read implements FileDescriptionImpl. +func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + i := f.inode + if !i.seekable { + n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags) + if isBlockError(err) { + // If we got any data at all, return it as a "completed" partial read + // rather than retrying until complete. + if n != 0 { + err = nil + } else { + err = syserror.ErrWouldBlock + } + } + return n, err + } + + f.offsetMu.Lock() + n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags) + f.offset += n + f.offsetMu.Unlock() + return n, err +} + +func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) { + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. 
+ if flags&^linux.RWF_HIPRI != 0 { + return 0, syserror.EOPNOTSUPP + } + reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) + n, err := dst.CopyOutFrom(ctx, reader) + hostfd.PutReadWriterAt(reader) + return int64(n), err +} + +// PWrite implements FileDescriptionImpl. +func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + if !f.inode.seekable { + return 0, syserror.ESPIPE + } + + return f.writeToHostFD(ctx, src, offset, opts.Flags) +} + +// Write implements FileDescriptionImpl. +func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + i := f.inode + if !i.seekable { + n, err := f.writeToHostFD(ctx, src, -1, opts.Flags) + if isBlockError(err) { + err = syserror.ErrWouldBlock + } + return n, err + } + + f.offsetMu.Lock() + // NOTE(gvisor.dev/issue/2983): O_APPEND may cause memory corruption if + // another process modifies the host file between retrieving the file size + // and writing to the host fd. This is an unavoidable race condition because + // we cannot enforce synchronization on the host. + if f.vfsfd.StatusFlags()&linux.O_APPEND != 0 { + var s syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &s); err != nil { + f.offsetMu.Unlock() + return 0, err + } + f.offset = s.Size + } + n, err := f.writeToHostFD(ctx, src, f.offset, opts.Flags) + f.offset += n + f.offsetMu.Unlock() + return n, err +} + +func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSequence, offset int64, flags uint32) (int64, error) { + hostFD := f.inode.hostFD + // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. + if flags != 0 { + return 0, syserror.EOPNOTSUPP + } + writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) + n, err := src.CopyInTo(ctx, writer) + hostfd.PutReadWriterAt(writer) + // NOTE(gvisor.dev/issue/2979): We always sync everything, even for O_DSYNC. 
+ if n > 0 && f.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { + if syncErr := unix.Fsync(hostFD); syncErr != nil { + return int64(n), syncErr + } + } + return int64(n), err +} + +// Seek implements FileDescriptionImpl. +// +// Note that we do not support seeking on directories, since we do not even +// allow directory fds to be imported at all. +func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) { + i := f.inode + if !i.seekable { + return 0, syserror.ESPIPE + } + + f.offsetMu.Lock() + defer f.offsetMu.Unlock() + + switch whence { + case linux.SEEK_SET: + if offset < 0 { + return f.offset, syserror.EINVAL + } + f.offset = offset + + case linux.SEEK_CUR: + // Check for overflow. Note that underflow cannot occur, since f.offset >= 0. + if offset > math.MaxInt64-f.offset { + return f.offset, syserror.EOVERFLOW + } + if f.offset+offset < 0 { + return f.offset, syserror.EINVAL + } + f.offset += offset + + case linux.SEEK_END: + var s syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &s); err != nil { + return f.offset, err + } + size := s.Size + + // Check for overflow. Note that underflow cannot occur, since size >= 0. + if offset > math.MaxInt64-size { + return f.offset, syserror.EOVERFLOW + } + if size+offset < 0 { + return f.offset, syserror.EINVAL + } + f.offset = size + offset + + case linux.SEEK_DATA, linux.SEEK_HOLE: + // Modifying the offset in the host file table should not matter, since + // this is the only place where we use it. + // + // For reading and writing, we always rely on our internal offset. + n, err := unix.Seek(i.hostFD, offset, int(whence)) + if err != nil { + return f.offset, err + } + f.offset = n + + default: + // Invalid whence. + return f.offset, syserror.EINVAL + } + + return f.offset, nil +} + +// Sync implements FileDescriptionImpl. +func (f *fileDescription) Sync(context.Context) error { + // TODO(gvisor.dev/issue/1897): Currently, we always sync everything. 
+ return unix.Fsync(f.inode.hostFD) +} + +// ConfigureMMap implements FileDescriptionImpl. +func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error { + if !f.inode.canMap { + return syserror.ENODEV + } + i := f.inode + i.pf.fileMapperInitOnce.Do(i.pf.fileMapper.Init) + return vfs.GenericConfigureMMap(&f.vfsfd, i, opts) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (f *fileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + f.inode.queue.EventRegister(e, mask) + fdnotifier.UpdateFD(int32(f.inode.hostFD)) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (f *fileDescription) EventUnregister(e *waiter.Entry) { + f.inode.queue.EventUnregister(e) + fdnotifier.UpdateFD(int32(f.inode.hostFD)) +} + +// Readiness uses the poll() syscall to check the status of the underlying FD. +func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask) +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (f *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return f.Locks().LockPOSIX(ctx, &f.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (f *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return f.Locks().UnlockPOSIX(ctx, &f.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/host/ioctl_unsafe.go b/pkg/sentry/fsimpl/host/ioctl_unsafe.go new file mode 100644 index 000000000..0983bf7d8 --- /dev/null +++ b/pkg/sentry/fsimpl/host/ioctl_unsafe.go @@ -0,0 +1,56 @@ +// Copyright 2020 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/abi/linux" +) + +func ioctlGetTermios(fd int) (*linux.Termios, error) { + var t linux.Termios + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t))) + if errno != 0 { + return nil, errno + } + return &t, nil +} + +func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error { + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(unsafe.Pointer(t))) + if errno != 0 { + return errno + } + return nil +} + +func ioctlGetWinsize(fd int) (*linux.Winsize, error) { + var w linux.Winsize + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCGWINSZ, uintptr(unsafe.Pointer(&w))) + if errno != 0 { + return nil, errno + } + return &w, nil +} + +func ioctlSetWinsize(fd int, w *linux.Winsize) error { + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCSWINSZ, uintptr(unsafe.Pointer(w))) + if errno != 0 { + return errno + } + return nil +} diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go new file mode 100644 index 000000000..8545a82f0 --- /dev/null +++ b/pkg/sentry/fsimpl/host/mmap.go @@ -0,0 +1,132 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" +) + +// inodePlatformFile implements platform.File. It exists solely because inode +// cannot implement both kernfs.Inode.IncRef and platform.File.IncRef. +// +// inodePlatformFile should only be used if inode.canMap is true. +type inodePlatformFile struct { + *inode + + // fdRefsMu protects fdRefs. + fdRefsMu sync.Mutex + + // fdRefs counts references on platform.File offsets. It is used solely for + // memory accounting. + fdRefs fsutil.FrameRefSet + + // fileMapper caches mappings of the host file represented by this inode. + fileMapper fsutil.HostFileMapper + + // fileMapperInitOnce is used to lazily initialize fileMapper. + fileMapperInitOnce sync.Once +} + +// IncRef implements platform.File.IncRef. +// +// Precondition: i.inode.canMap must be true. +func (i *inodePlatformFile) IncRef(fr platform.FileRange) { + i.fdRefsMu.Lock() + i.fdRefs.IncRefAndAccount(fr) + i.fdRefsMu.Unlock() +} + +// DecRef implements platform.File.DecRef. +// +// Precondition: i.inode.canMap must be true. +func (i *inodePlatformFile) DecRef(fr platform.FileRange) { + i.fdRefsMu.Lock() + i.fdRefs.DecRefAndAccount(fr) + i.fdRefsMu.Unlock() +} + +// MapInternal implements platform.File.MapInternal. +// +// Precondition: i.inode.canMap must be true. 
+func (i *inodePlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + return i.fileMapper.MapInternal(fr, i.hostFD, at.Write) +} + +// FD implements platform.File.FD. +func (i *inodePlatformFile) FD() int { + return i.hostFD +} + +// AddMapping implements memmap.Mappable.AddMapping. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + i.mapsMu.Lock() + mapped := i.mappings.AddMapping(ms, ar, offset, writable) + for _, r := range mapped { + i.pf.fileMapper.IncRefOn(r) + } + i.mapsMu.Unlock() + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + i.mapsMu.Lock() + unmapped := i.mappings.RemoveMapping(ms, ar, offset, writable) + for _, r := range unmapped { + i.pf.fileMapper.DecRefOn(r) + } + i.mapsMu.Unlock() +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + return i.AddMapping(ctx, ms, dstAR, offset, writable) +} + +// Translate implements memmap.Mappable.Translate. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + mr := optional + return []memmap.Translation{ + { + Source: mr, + File: &i.pf, + Offset: mr.Start, + Perms: usermem.AnyAccess, + }, + }, nil +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +// +// Precondition: i.inode.canMap must be true. 
+func (i *inode) InvalidateUnsavable(ctx context.Context) error { + // We expect the same host fd across save/restore, so all translations + // should be valid. + return nil +} diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go new file mode 100644 index 000000000..fd16bd92d --- /dev/null +++ b/pkg/sentry/fsimpl/host/socket.go @@ -0,0 +1,385 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/socket/control" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/unet" + "gvisor.dev/gvisor/pkg/waiter" +) + +// Create a new host-backed endpoint from the given fd and its corresponding +// notification queue. +func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transport.Endpoint, error) { + // Set up an external transport.Endpoint using the host fd. 
+ addr := fmt.Sprintf("hostfd:[%d]", hostFD) + e, err := NewConnectedEndpoint(ctx, hostFD, addr, true /* saveable */) + if err != nil { + return nil, err.ToError() + } + ep := transport.NewExternal(ctx, e.stype, uniqueid.GlobalProviderFromContext(ctx), queue, e, e) + return ep, nil +} + +// ConnectedEndpoint is an implementation of transport.ConnectedEndpoint and +// transport.Receiver. It is backed by a host fd that was imported at sentry +// startup. This fd is shared with a hostfs inode, which retains ownership of +// it. +// +// ConnectedEndpoint is saveable, since we expect that the host will provide +// the same fd upon restore. +// +// As of this writing, we only allow Unix sockets to be imported. +// +// +stateify savable +type ConnectedEndpoint struct { + // ref keeps track of references to a ConnectedEndpoint. + ref refs.AtomicRefCount + + // mu protects fd below. + mu sync.RWMutex `state:"nosave"` + + // fd is the host fd backing this endpoint. + fd int + + // addr is the address at which this endpoint is bound. + addr string + + // sndbuf is the size of the send buffer. + // + // N.B. When this is smaller than the host size, we present it via + // GetSockOpt and message splitting/rejection in SendMsg, but do not + // prevent lots of small messages from filling the real send buffer + // size on the host. + sndbuf int64 `state:"nosave"` + + // stype is the type of Unix socket. + stype linux.SockType +} + +// init performs initialization required for creating new ConnectedEndpoints and +// for restoring them. +func (c *ConnectedEndpoint) init() *syserr.Error { + family, err := syscall.GetsockoptInt(c.fd, syscall.SOL_SOCKET, syscall.SO_DOMAIN) + if err != nil { + return syserr.FromError(err) + } + + if family != syscall.AF_UNIX { + // We only allow Unix sockets. 
+ return syserr.ErrInvalidEndpointState + } + + stype, err := syscall.GetsockoptInt(c.fd, syscall.SOL_SOCKET, syscall.SO_TYPE) + if err != nil { + return syserr.FromError(err) + } + + if err := syscall.SetNonblock(c.fd, true); err != nil { + return syserr.FromError(err) + } + + sndbuf, err := syscall.GetsockoptInt(c.fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF) + if err != nil { + return syserr.FromError(err) + } + + c.stype = linux.SockType(stype) + c.sndbuf = int64(sndbuf) + + return nil +} + +// NewConnectedEndpoint creates a new ConnectedEndpoint backed by a host fd +// imported at sentry startup, +// +// The caller is responsible for calling Init(). Additionaly, Release needs to +// be called twice because ConnectedEndpoint is both a transport.Receiver and +// transport.ConnectedEndpoint. +func NewConnectedEndpoint(ctx context.Context, hostFD int, addr string, saveable bool) (*ConnectedEndpoint, *syserr.Error) { + e := ConnectedEndpoint{ + fd: hostFD, + addr: addr, + } + + if err := e.init(); err != nil { + return nil, err + } + + // AtomicRefCounters start off with a single reference. We need two. + e.ref.IncRef() + e.ref.EnableLeakCheck("host.ConnectedEndpoint") + return &e, nil +} + +// Send implements transport.ConnectedEndpoint.Send. +func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { + c.mu.RLock() + defer c.mu.RUnlock() + + if !controlMessages.Empty() { + return 0, false, syserr.ErrInvalidEndpointState + } + + // Since stream sockets don't preserve message boundaries, we can write + // only as much of the message as fits in the send buffer. + truncate := c.stype == linux.SOCK_STREAM + + n, totalLen, err := fdWriteVec(c.fd, data, c.sndbuf, truncate) + if n < totalLen && err == nil { + // The host only returns a short write if it would otherwise + // block (and only for stream sockets). 
+ err = syserror.EAGAIN + } + if n > 0 && err != syserror.EAGAIN { + // The caller may need to block to send more data, but + // otherwise there isn't anything that can be done about an + // error with a partial write. + err = nil + } + + // There is no need for the callee to call SendNotify because fdWriteVec + // uses the host's sendmsg(2) and the host kernel's queue. + return n, false, syserr.FromError(err) +} + +// SendNotify implements transport.ConnectedEndpoint.SendNotify. +func (c *ConnectedEndpoint) SendNotify() {} + +// CloseSend implements transport.ConnectedEndpoint.CloseSend. +func (c *ConnectedEndpoint) CloseSend() { + c.mu.Lock() + defer c.mu.Unlock() + + if err := syscall.Shutdown(c.fd, syscall.SHUT_WR); err != nil { + // A well-formed UDS shutdown can't fail. See + // net/unix/af_unix.c:unix_shutdown. + panic(fmt.Sprintf("failed write shutdown on host socket %+v: %v", c, err)) + } +} + +// CloseNotify implements transport.ConnectedEndpoint.CloseNotify. +func (c *ConnectedEndpoint) CloseNotify() {} + +// Writable implements transport.ConnectedEndpoint.Writable. +func (c *ConnectedEndpoint) Writable() bool { + c.mu.RLock() + defer c.mu.RUnlock() + + return fdnotifier.NonBlockingPoll(int32(c.fd), waiter.EventOut)&waiter.EventOut != 0 +} + +// Passcred implements transport.ConnectedEndpoint.Passcred. +func (c *ConnectedEndpoint) Passcred() bool { + // We don't support credential passing for host sockets. + return false +} + +// GetLocalAddress implements transport.ConnectedEndpoint.GetLocalAddress. +func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + return tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, nil +} + +// EventUpdate implements transport.ConnectedEndpoint.EventUpdate. +func (c *ConnectedEndpoint) EventUpdate() { + c.mu.RLock() + defer c.mu.RUnlock() + if c.fd != -1 { + fdnotifier.UpdateFD(int32(c.fd)) + } +} + +// Recv implements transport.Receiver.Recv. 
+func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { + c.mu.RLock() + defer c.mu.RUnlock() + + var cm unet.ControlMessage + if numRights > 0 { + cm.EnableFDs(int(numRights)) + } + + // N.B. Unix sockets don't have a receive buffer, the send buffer + // serves both purposes. + rl, ml, cl, cTrunc, err := fdReadVec(c.fd, data, []byte(cm), peek, c.sndbuf) + if rl > 0 && err != nil { + // We got some data, so all we need to do on error is return + // the data that we got. Short reads are fine, no need to + // block. + err = nil + } + if err != nil { + return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err) + } + + // There is no need for the callee to call RecvNotify because fdReadVec uses + // the host's recvmsg(2) and the host kernel's queue. + + // Trim the control data if we received less than the full amount. + if cl < uint64(len(cm)) { + cm = cm[:cl] + } + + // Avoid extra allocations in the case where there isn't any control data. + if len(cm) == 0 { + return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, false, nil + } + + fds, err := cm.ExtractFDs() + if err != nil { + return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err) + } + + if len(fds) == 0 { + return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, false, nil + } + return rl, ml, control.NewVFS2(nil, nil, newSCMRights(fds)), cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, false, nil +} + +// RecvNotify implements transport.Receiver.RecvNotify. +func (c *ConnectedEndpoint) RecvNotify() {} + +// CloseRecv implements transport.Receiver.CloseRecv. 
+func (c *ConnectedEndpoint) CloseRecv() { + c.mu.Lock() + defer c.mu.Unlock() + + if err := syscall.Shutdown(c.fd, syscall.SHUT_RD); err != nil { + // A well-formed UDS shutdown can't fail. See + // net/unix/af_unix.c:unix_shutdown. + panic(fmt.Sprintf("failed read shutdown on host socket %+v: %v", c, err)) + } +} + +// Readable implements transport.Receiver.Readable. +func (c *ConnectedEndpoint) Readable() bool { + c.mu.RLock() + defer c.mu.RUnlock() + + return fdnotifier.NonBlockingPoll(int32(c.fd), waiter.EventIn)&waiter.EventIn != 0 +} + +// SendQueuedSize implements transport.Receiver.SendQueuedSize. +func (c *ConnectedEndpoint) SendQueuedSize() int64 { + // TODO(gvisor.dev/issue/273): SendQueuedSize isn't supported for host + // sockets because we don't allow the sentry to call ioctl(2). + return -1 +} + +// RecvQueuedSize implements transport.Receiver.RecvQueuedSize. +func (c *ConnectedEndpoint) RecvQueuedSize() int64 { + // TODO(gvisor.dev/issue/273): RecvQueuedSize isn't supported for host + // sockets because we don't allow the sentry to call ioctl(2). + return -1 +} + +// SendMaxQueueSize implements transport.Receiver.SendMaxQueueSize. +func (c *ConnectedEndpoint) SendMaxQueueSize() int64 { + return int64(c.sndbuf) +} + +// RecvMaxQueueSize implements transport.Receiver.RecvMaxQueueSize. +func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 { + // N.B. Unix sockets don't use the receive buffer. We'll claim it is + // the same size as the send buffer. + return int64(c.sndbuf) +} + +func (c *ConnectedEndpoint) destroyLocked() { + c.fd = -1 +} + +// Release implements transport.ConnectedEndpoint.Release and +// transport.Receiver.Release. +func (c *ConnectedEndpoint) Release() { + c.ref.DecRefWithDestructor(func() { + c.mu.Lock() + c.destroyLocked() + c.mu.Unlock() + }) +} + +// CloseUnread implements transport.ConnectedEndpoint.CloseUnread. 
+func (c *ConnectedEndpoint) CloseUnread() {} + +// SCMConnectedEndpoint represents an endpoint backed by a host fd that was +// passed through a gofer Unix socket. It resembles ConnectedEndpoint, with the +// following differences: +// - SCMConnectedEndpoint is not saveable, because the host cannot guarantee +// the same descriptor number across S/R. +// - SCMConnectedEndpoint holds ownership of its fd and notification queue. +type SCMConnectedEndpoint struct { + ConnectedEndpoint + + queue *waiter.Queue +} + +// Init will do the initialization required without holding other locks. +func (e *SCMConnectedEndpoint) Init() error { + return fdnotifier.AddFD(int32(e.fd), e.queue) +} + +// Release implements transport.ConnectedEndpoint.Release and +// transport.Receiver.Release. +func (e *SCMConnectedEndpoint) Release() { + e.ref.DecRefWithDestructor(func() { + e.mu.Lock() + if err := syscall.Close(e.fd); err != nil { + log.Warningf("Failed to close host fd %d: %v", err) + } + fdnotifier.RemoveFD(int32(e.fd)) + e.destroyLocked() + e.mu.Unlock() + }) +} + +// NewSCMEndpoint creates a new SCMConnectedEndpoint backed by a host fd that +// was passed through a Unix socket. +// +// The caller is responsible for calling Init(). Additionaly, Release needs to +// be called twice because ConnectedEndpoint is both a transport.Receiver and +// transport.ConnectedEndpoint. +func NewSCMEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr string) (*SCMConnectedEndpoint, *syserr.Error) { + e := SCMConnectedEndpoint{ + ConnectedEndpoint: ConnectedEndpoint{ + fd: hostFD, + addr: addr, + }, + queue: queue, + } + + if err := e.init(); err != nil { + return nil, err + } + + // AtomicRefCounters start off with a single reference. We need two. 
+ e.ref.IncRef() + e.ref.EnableLeakCheck("host.SCMConnectedEndpoint") + return &e, nil +} diff --git a/pkg/sentry/fsimpl/host/socket_iovec.go b/pkg/sentry/fsimpl/host/socket_iovec.go new file mode 100644 index 000000000..584c247d2 --- /dev/null +++ b/pkg/sentry/fsimpl/host/socket_iovec.go @@ -0,0 +1,113 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/syserror" +) + +// maxIovs is the maximum number of iovecs to pass to the host. +var maxIovs = linux.UIO_MAXIOV + +// copyToMulti copies as many bytes from src to dst as possible. +func copyToMulti(dst [][]byte, src []byte) { + for _, d := range dst { + done := copy(d, src) + src = src[done:] + if len(src) == 0 { + break + } + } +} + +// copyFromMulti copies as many bytes from src to dst as possible. +func copyFromMulti(dst []byte, src [][]byte) { + for _, s := range src { + done := copy(dst, s) + dst = dst[done:] + if len(dst) == 0 { + break + } + } +} + +// buildIovec builds an iovec slice from the given []byte slice. +// +// If truncate, truncate bufs > maxlen. Otherwise, immediately return an error. +// +// If length < the total length of bufs, err indicates why, even when returning +// a truncated iovec. +// +// If intermediate != nil, iovecs references intermediate rather than bufs and +// the caller must copy to/from bufs as necessary. 
+func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovecs []syscall.Iovec, intermediate []byte, err error) { + var iovsRequired int + for _, b := range bufs { + length += int64(len(b)) + if len(b) > 0 { + iovsRequired++ + } + } + + stopLen := length + if length > maxlen { + if truncate { + stopLen = maxlen + err = syserror.EAGAIN + } else { + return 0, nil, nil, syserror.EMSGSIZE + } + } + + if iovsRequired > maxIovs { + // The kernel will reject our call if we pass this many iovs. + // Use a single intermediate buffer instead. + b := make([]byte, stopLen) + + return stopLen, []syscall.Iovec{{ + Base: &b[0], + Len: uint64(stopLen), + }}, b, err + } + + var total int64 + iovecs = make([]syscall.Iovec, 0, iovsRequired) + for i := range bufs { + l := len(bufs[i]) + if l == 0 { + continue + } + + stop := int64(l) + if total+stop > stopLen { + stop = stopLen - total + } + + iovecs = append(iovecs, syscall.Iovec{ + Base: &bufs[i][0], + Len: uint64(stop), + }) + + total += stop + if total >= stopLen { + break + } + } + + return total, iovecs, nil, err +} diff --git a/pkg/sentry/fsimpl/host/socket_unsafe.go b/pkg/sentry/fsimpl/host/socket_unsafe.go new file mode 100644 index 000000000..35ded24bc --- /dev/null +++ b/pkg/sentry/fsimpl/host/socket_unsafe.go @@ -0,0 +1,101 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package host + +import ( + "syscall" + "unsafe" +) + +// fdReadVec receives from fd to bufs. +// +// If the total length of bufs is > maxlen, fdReadVec will do a partial read +// and err will indicate why the message was truncated. +func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (readLen int64, msgLen int64, controlLen uint64, controlTrunc bool, err error) { + flags := uintptr(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC) + if peek { + flags |= syscall.MSG_PEEK + } + + // Always truncate the receive buffer. All socket types will truncate + // received messages. + length, iovecs, intermediate, err := buildIovec(bufs, maxlen, true) + if err != nil && len(iovecs) == 0 { + // No partial write to do, return error immediately. + return 0, 0, 0, false, err + } + + var msg syscall.Msghdr + if len(control) != 0 { + msg.Control = &control[0] + msg.Controllen = uint64(len(control)) + } + + if len(iovecs) != 0 { + msg.Iov = &iovecs[0] + msg.Iovlen = uint64(len(iovecs)) + } + + rawN, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags) + if e != 0 { + // N.B. prioritize the syscall error over the buildIovec error. + return 0, 0, 0, false, e + } + n := int64(rawN) + + // Copy data back to bufs. + if intermediate != nil { + copyToMulti(bufs, intermediate) + } + + controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC + + if n > length { + return length, n, msg.Controllen, controlTrunc, err + } + + return n, n, msg.Controllen, controlTrunc, err +} + +// fdWriteVec sends from bufs to fd. +// +// If the total length of bufs is > maxlen && truncate, fdWriteVec will do a +// partial write and err will indicate why the message was truncated. +func fdWriteVec(fd int, bufs [][]byte, maxlen int64, truncate bool) (int64, int64, error) { + length, iovecs, intermediate, err := buildIovec(bufs, maxlen, truncate) + if err != nil && len(iovecs) == 0 { + // No partial write to do, return error immediately. 
+ return 0, length, err + } + + // Copy data to intermediate buf. + if intermediate != nil { + copyFromMulti(intermediate, bufs) + } + + var msg syscall.Msghdr + if len(iovecs) > 0 { + msg.Iov = &iovecs[0] + msg.Iovlen = uint64(len(iovecs)) + } + + n, _, e := syscall.RawSyscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), syscall.MSG_DONTWAIT|syscall.MSG_NOSIGNAL) + if e != 0 { + // N.B. prioritize the syscall error over the buildIovec error. + return 0, length, e + } + + return int64(n), length, err +} diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go new file mode 100644 index 000000000..4ee9270cc --- /dev/null +++ b/pkg/sentry/fsimpl/host/tty.go @@ -0,0 +1,390 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/unimpl" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// TTYFileDescription implements vfs.FileDescriptionImpl for a host file +// descriptor that wraps a TTY FD. +type TTYFileDescription struct { + fileDescription + + // mu protects the fields below. 
+ mu sync.Mutex `state:"nosave"` + + // session is the session attached to this TTYFileDescription. + session *kernel.Session + + // fgProcessGroup is the foreground process group that is currently + // connected to this TTY. + fgProcessGroup *kernel.ProcessGroup + + // termios contains the terminal attributes for this TTY. + termios linux.KernelTermios +} + +// InitForegroundProcessGroup sets the foreground process group and session for +// the TTY. This should only be called once, after the foreground process group +// has been created, but before it has started running. +func (t *TTYFileDescription) InitForegroundProcessGroup(pg *kernel.ProcessGroup) { + t.mu.Lock() + defer t.mu.Unlock() + if t.fgProcessGroup != nil { + panic("foreground process group is already set") + } + t.fgProcessGroup = pg + t.session = pg.Session() +} + +// ForegroundProcessGroup returns the foreground process for the TTY. +func (t *TTYFileDescription) ForegroundProcessGroup() *kernel.ProcessGroup { + t.mu.Lock() + defer t.mu.Unlock() + return t.fgProcessGroup +} + +// Release implements fs.FileOperations.Release. +func (t *TTYFileDescription) Release() { + t.mu.Lock() + t.fgProcessGroup = nil + t.mu.Unlock() + + t.fileDescription.Release() +} + +// PRead implements vfs.FileDescriptionImpl. +// +// Reading from a TTY is only allowed for foreground process groups. Background +// process groups will either get EIO or a SIGTTIN. +func (t *TTYFileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Are we allowed to do the read? + // drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change(). + if err := t.checkChange(ctx, linux.SIGTTIN); err != nil { + return 0, err + } + + // Do the read. + return t.fileDescription.PRead(ctx, dst, offset, opts) +} + +// Read implements vfs.FileDescriptionImpl. +// +// Reading from a TTY is only allowed for foreground process groups. 
Background +// process groups will either get EIO or a SIGTTIN. +func (t *TTYFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Are we allowed to do the read? + // drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change(). + if err := t.checkChange(ctx, linux.SIGTTIN); err != nil { + return 0, err + } + + // Do the read. + return t.fileDescription.Read(ctx, dst, opts) +} + +// PWrite implements vfs.FileDescriptionImpl. +func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Check whether TOSTOP is enabled. This corresponds to the check in + // drivers/tty/n_tty.c:n_tty_write(). + if t.termios.LEnabled(linux.TOSTOP) { + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + } + return t.fileDescription.PWrite(ctx, src, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl. +func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Check whether TOSTOP is enabled. This corresponds to the check in + // drivers/tty/n_tty.c:n_tty_write(). + if t.termios.LEnabled(linux.TOSTOP) { + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + } + return t.fileDescription.Write(ctx, src, opts) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Ignore arg[0]. 
This is the real FD: + fd := t.inode.hostFD + ioctl := args[1].Uint64() + switch ioctl { + case linux.TCGETS: + termios, err := ioctlGetTermios(fd) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: + t.mu.Lock() + defer t.mu.Unlock() + + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + + var termios linux.Termios + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := ioctlSetTermios(fd, ioctl, &termios) + if err == nil { + t.termios.FromTermios(termios) + } + return 0, err + + case linux.TIOCGPGRP: + // Args: pid_t *argp + // When successful, equivalent to *argp = tcgetpgrp(fd). + // Get the process group ID of the foreground process group on this + // terminal. + + pidns := kernel.PIDNamespaceFromContext(ctx) + if pidns == nil { + return 0, syserror.ENOTTY + } + + t.mu.Lock() + defer t.mu.Unlock() + + // Map the ProcessGroup into a ProcessGroupID in the task's PID namespace. + pgID := pidns.IDOfProcessGroup(t.fgProcessGroup) + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TIOCSPGRP: + // Args: const pid_t *argp + // Equivalent to tcsetpgrp(fd, *argp). + // Set the foreground process group ID of this terminal. + + task := kernel.TaskFromContext(ctx) + if task == nil { + return 0, syserror.ENOTTY + } + + t.mu.Lock() + defer t.mu.Unlock() + + // Check that we are allowed to set the process group. + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + // drivers/tty/tty_io.c:tiocspgrp() converts -EIO from tty_check_change() + // to -ENOTTY. 
+ if err == syserror.EIO { + return 0, syserror.ENOTTY + } + return 0, err + } + + // Check that calling task's process group is in the TTY session. + if task.ThreadGroup().Session() != t.session { + return 0, syserror.ENOTTY + } + + var pgID kernel.ProcessGroupID + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + + // pgID must be non-negative. + if pgID < 0 { + return 0, syserror.EINVAL + } + + // Process group with pgID must exist in this PID namespace. + pidns := task.PIDNamespace() + pg := pidns.ProcessGroupWithID(pgID) + if pg == nil { + return 0, syserror.ESRCH + } + + // Check that new process group is in the TTY session. + if pg.Session() != t.session { + return 0, syserror.EPERM + } + + t.fgProcessGroup = pg + return 0, nil + + case linux.TIOCGWINSZ: + // Args: struct winsize *argp + // Get window size. + winsize, err := ioctlGetWinsize(fd) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TIOCSWINSZ: + // Args: const struct winsize *argp + // Set window size. + + // Unlike setting the termios, any process group (even background ones) can + // set the winsize. + + var winsize linux.Winsize + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := ioctlSetWinsize(fd, &winsize) + return 0, err + + // Unimplemented commands. 
+ case linux.TIOCSETD, + linux.TIOCSBRK, + linux.TIOCCBRK, + linux.TCSBRK, + linux.TCSBRKP, + linux.TIOCSTI, + linux.TIOCCONS, + linux.FIONBIO, + linux.TIOCEXCL, + linux.TIOCNXCL, + linux.TIOCGEXCL, + linux.TIOCNOTTY, + linux.TIOCSCTTY, + linux.TIOCGSID, + linux.TIOCGETD, + linux.TIOCVHANGUP, + linux.TIOCGDEV, + linux.TIOCMGET, + linux.TIOCMSET, + linux.TIOCMBIC, + linux.TIOCMBIS, + linux.TIOCGICOUNT, + linux.TCFLSH, + linux.TIOCSSERIAL, + linux.TIOCGPTPEER: + + unimpl.EmitUnimplementedEvent(ctx) + fallthrough + default: + return 0, syserror.ENOTTY + } +} + +// checkChange checks that the process group is allowed to read, write, or +// change the state of the TTY. +// +// This corresponds to Linux drivers/tty/tty_io.c:tty_check_change(). The logic +// is a bit convoluted, but documented inline. +// +// Preconditions: t.mu must be held. +func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) error { + task := kernel.TaskFromContext(ctx) + if task == nil { + // No task? Linux does not have an analog for this case, but + // tty_check_change only blocks specific cases and is + // surprisingly permissive. Allowing the change seems + // appropriate. + return nil + } + + tg := task.ThreadGroup() + pg := tg.ProcessGroup() + + // If the session for the task is different than the session for the + // controlling TTY, then the change is allowed. Seems like a bad idea, + // but that's exactly what linux does. + if tg.Session() != t.fgProcessGroup.Session() { + return nil + } + + // If we are the foreground process group, then the change is allowed. + if pg == t.fgProcessGroup { + return nil + } + + // We are not the foreground process group. + + // Is the provided signal blocked or ignored? + if (task.SignalMask()&linux.SignalSetOf(sig) != 0) || tg.SignalHandlers().IsIgnored(sig) { + // If the signal is SIGTTIN, then we are attempting to read + // from the TTY. Don't send the signal and return EIO. 
+ if sig == linux.SIGTTIN { + return syserror.EIO + } + + // Otherwise, we are writing or changing terminal state. This is allowed. + return nil + } + + // If the process group is an orphan, return EIO. + if pg.IsOrphan() { + return syserror.EIO + } + + // Otherwise, send the signal to the process group and return ERESTARTSYS. + // + // Note that Linux also unconditionally sets TIF_SIGPENDING on current, + // but this isn't necessary in gVisor because the rationale given in + // 040b6362d58f "tty: fix leakage of -ERESTARTSYS to userland" doesn't + // apply: the sentry will handle -ERESTARTSYS in + // kernel.runApp.execute() even if the kernel.Task isn't interrupted. + // + // Linux ignores the result of kill_pgrp(). + _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) + return kernel.ERESTARTSYS +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (t *TTYFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, typ fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return t.Locks().LockPOSIX(ctx, &t.vfsfd, uid, typ, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (t *TTYFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return t.Locks().UnlockPOSIX(ctx, &t.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go new file mode 100644 index 000000000..412bdb2eb --- /dev/null +++ b/pkg/sentry/fsimpl/host/util.go @@ -0,0 +1,56 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// wouldBlock reports whether fileType is one of the file types that can
// return EWOULDBLOCK for blocking operations: FIFOs (pipes), character
// devices, and sockets. fileType is compared for exact equality against the
// S_IFIFO, S_IFCHR and S_IFSOCK constants.
func wouldBlock(fileType uint32) bool {
	switch fileType {
	case syscall.S_IFIFO, syscall.S_IFCHR, syscall.S_IFSOCK:
		return true
	default:
		return false
	}
}
// setTimestamps invokes the utimensat(2) system call on fd with a null path
// and no flags, passing ts as the timestamp array. It returns the errno
// reported by the system call, or nil on success.
func setTimestamps(fd int, ts *[2]syscall.Timespec) error {
	const (
		nullPath = 0 // path argument
		noFlags  = 0 // flags argument
	)
	if _, _, errno := syscall.Syscall6(syscall.SYS_UTIMENSAT, uintptr(fd), nullPath, uintptr(unsafe.Pointer(ts)), noFlags, 0, 0); errno != 0 {
		return errno
	}
	return nil
}
"//pkg/sentry/fs/lock", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + ], +) + +go_test( + name = "kernfs_test", + size = "small", + srcs = ["kernfs_test.go"], + deps = [ + ":kernfs", + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/contexttest", + "//pkg/sentry/fsimpl/testutil", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + "@com_github_google_go-cmp//cmp:go_default_library", + ], +) diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go new file mode 100644 index 000000000..6886b0876 --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -0,0 +1,147 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// DynamicBytesFile implements kernfs.Inode and represents a read-only +// file whose contents are backed by a vfs.DynamicBytesSource. +// +// Must be instantiated with NewDynamicBytesFile or initialized with Init +// before first use. 
+// +// +stateify savable +type DynamicBytesFile struct { + InodeAttrs + InodeNoopRefCount + InodeNotDirectory + InodeNotSymlink + + locks vfs.FileLocks + data vfs.DynamicBytesSource +} + +var _ Inode = (*DynamicBytesFile)(nil) + +// Init initializes a dynamic bytes file. +func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) { + if perm&^linux.PermissionsMask != 0 { + panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) + } + f.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm) + f.data = data +} + +// Open implements Inode.Open. +func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &DynamicBytesFD{} + if err := fd.Init(rp.Mount(), vfsd, f.data, &f.locks, opts.Flags); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// SetStat implements Inode.SetStat. By default DynamicBytesFile doesn't allow +// inode attributes to be changed. Override SetStat() making it call +// f.InodeAttrs to allow it. +func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return syserror.EPERM +} + +// DynamicBytesFD implements vfs.FileDescriptionImpl for an FD backed by a +// DynamicBytesFile. +// +// Must be initialized with Init before first use. +// +// +stateify savable +type DynamicBytesFD struct { + vfs.FileDescriptionDefaultImpl + vfs.DynamicBytesFileDescriptionImpl + vfs.LockFD + + vfsfd vfs.FileDescription + inode Inode +} + +// Init initializes a DynamicBytesFD. 
+func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error { + fd.LockFD.Init(locks) + if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + return err + } + fd.inode = d.Impl().(*Dentry).inode + fd.SetDataSource(data) + return nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *DynamicBytesFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence) +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *DynamicBytesFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *DynamicBytesFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.Write(ctx, src, opts) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.PWrite(ctx, src, offset, opts) +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *DynamicBytesFD) Release() {} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() + return fd.inode.Stat(fs, opts) +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. 
+func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error { + // DynamicBytesFiles are immutable. + return syserror.EPERM +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *DynamicBytesFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *DynamicBytesFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go new file mode 100644 index 000000000..ca8b8c63b --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -0,0 +1,252 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package kernfs + +import ( + "math" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory +// inode that uses OrderChildren to track child nodes. GenericDirectoryFD is not +// compatible with dynamic directories. +// +// Note that GenericDirectoryFD holds a lock over OrderedChildren while calling +// IterDirents callback. The IterDirents callback therefore cannot hash or +// unhash children, or recursively call IterDirents on the same underlying +// inode. +// +// Must be initialize with Init before first use. +// +// Lock ordering: mu => children.mu. +type GenericDirectoryFD struct { + vfs.FileDescriptionDefaultImpl + vfs.DirectoryFileDescriptionDefaultImpl + vfs.LockFD + + vfsfd vfs.FileDescription + children *OrderedChildren + + // mu protects the fields below. + mu sync.Mutex + + // off is the current directory offset. Protected by "mu". + off int64 +} + +// NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its +// dentry. +func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) { + fd := &GenericDirectoryFD{} + if err := fd.Init(children, locks, opts); err != nil { + return nil, err + } + if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return fd, nil +} + +// Init initializes a GenericDirectoryFD. Use it when overriding +// GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the +// correct implementation. 
+func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) error { + if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { + // Can't open directories for writing. + return syserror.EISDIR + } + fd.LockFD.Init(locks) + fd.children = children + return nil +} + +// VFSFileDescription returns a pointer to the vfs.FileDescription representing +// this object. +func (fd *GenericDirectoryFD) VFSFileDescription() *vfs.FileDescription { + return &fd.vfsfd +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *GenericDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return fd.FileDescriptionDefaultImpl.ConfigureMMap(ctx, opts) +} + +// Read implmenets vfs.FileDescriptionImpl.Read. +func (fd *GenericDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return fd.DirectoryFileDescriptionDefaultImpl.Read(ctx, dst, opts) +} + +// PRead implmenets vfs.FileDescriptionImpl.PRead. +func (fd *GenericDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return fd.DirectoryFileDescriptionDefaultImpl.PRead(ctx, dst, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *GenericDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return fd.DirectoryFileDescriptionDefaultImpl.Write(ctx, src, opts) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts) +} + +// Release implements vfs.FileDecriptionImpl.Release. 
+func (fd *GenericDirectoryFD) Release() {} + +func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem { + return fd.vfsfd.VirtualDentry().Mount().Filesystem() +} + +func (fd *GenericDirectoryFD) inode() Inode { + return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode +} + +// IterDirents implements vfs.FileDecriptionImpl.IterDirents. IterDirents holds +// o.mu when calling cb. +func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + fd.mu.Lock() + defer fd.mu.Unlock() + + opts := vfs.StatOptions{Mask: linux.STATX_INO} + // Handle ".". + if fd.off == 0 { + stat, err := fd.inode().Stat(fd.filesystem(), opts) + if err != nil { + return err + } + dirent := vfs.Dirent{ + Name: ".", + Type: linux.DT_DIR, + Ino: stat.Ino, + NextOff: 1, + } + if err := cb.Handle(dirent); err != nil { + return err + } + fd.off++ + } + + // Handle "..". + if fd.off == 1 { + vfsd := fd.vfsfd.VirtualDentry().Dentry() + parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode + stat, err := parentInode.Stat(fd.filesystem(), opts) + if err != nil { + return err + } + dirent := vfs.Dirent{ + Name: "..", + Type: linux.FileMode(stat.Mode).DirentType(), + Ino: stat.Ino, + NextOff: 2, + } + if err := cb.Handle(dirent); err != nil { + return err + } + fd.off++ + } + + // Handle static children. + fd.children.mu.RLock() + defer fd.children.mu.RUnlock() + // fd.off accounts for "." and "..", but fd.children do not track + // these. 
+ childIdx := fd.off - 2 + for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() { + inode := it.Dentry.Impl().(*Dentry).inode + stat, err := inode.Stat(fd.filesystem(), opts) + if err != nil { + return err + } + dirent := vfs.Dirent{ + Name: it.Name, + Type: linux.FileMode(stat.Mode).DirentType(), + Ino: stat.Ino, + NextOff: fd.off + 1, + } + if err := cb.Handle(dirent); err != nil { + return err + } + fd.off++ + } + + var err error + relOffset := fd.off - int64(len(fd.children.set)) - 2 + fd.off, err = fd.inode().IterDirents(ctx, cb, fd.off, relOffset) + return err +} + +// Seek implements vfs.FileDecriptionImpl.Seek. +func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + + switch whence { + case linux.SEEK_SET: + // Use offset as given. + case linux.SEEK_CUR: + offset += fd.off + case linux.SEEK_END: + // TODO(gvisor.dev/issue/1193): This can prevent new files from showing up + // if they are added after SEEK_END. + offset = math.MaxInt64 + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := fd.filesystem() + inode := fd.inode() + return inode.Stat(fs, opts) +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + creds := auth.CredentialsFromContext(ctx) + inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode + return inode.SetStat(ctx, fd.filesystem(), creds, opts) +} + +// Allocate implements vfs.FileDescriptionImpl.Allocate. 
+func (fd *GenericDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return fd.DirectoryFileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length) +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *GenericDirectoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *GenericDirectoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go new file mode 100644 index 000000000..8939871c1 --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -0,0 +1,840 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +// This file implements vfs.FilesystemImpl for kernfs. 
+ +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// stepExistingLocked resolves rp.Component() in parent directory vfsd. +// +// stepExistingLocked is loosely analogous to fs/namei.c:walk_component(). +// +// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done(). +// +// Postcondition: Caller must call fs.processDeferredDecRefs*. +func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, mayFollowSymlinks bool) (*vfs.Dentry, error) { + d := vfsd.Impl().(*Dentry) + if !d.isDir() { + return nil, syserror.ENOTDIR + } + // Directory searchable? + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } +afterSymlink: + name := rp.Component() + // Revalidation must be skipped if name is "." or ".."; d or its parent + // respectively can't be expected to transition from invalidated back to + // valid, so detecting invalidation and retrying would loop forever. This + // is consistent with Linux: fs/namei.c:walk_component() => lookup_fast() + // calls d_revalidate(), but walk_component() => handle_dots() does not. + if name == "." { + rp.Advance() + return vfsd, nil + } + if name == ".." 
{ + if isRoot, err := rp.CheckRoot(vfsd); err != nil { + return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return vfsd, nil + } + if err := rp.CheckMount(&d.parent.vfsd); err != nil { + return nil, err + } + rp.Advance() + return &d.parent.vfsd, nil + } + if len(name) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG + } + d.dirMu.Lock() + next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, d.children[name]) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + if err := rp.CheckMount(&next.vfsd); err != nil { + return nil, err + } + // Resolve any symlink at current path component. + if mayFollowSymlinks && rp.ShouldFollowSymlink() && next.isSymlink() { + targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount()) + if err != nil { + return nil, err + } + if targetVD.Ok() { + err := rp.HandleJump(targetVD) + targetVD.DecRef() + if err != nil { + return nil, err + } + } else { + if err := rp.HandleSymlink(targetPathname); err != nil { + return nil, err + } + } + goto afterSymlink + } + rp.Advance() + return &next.vfsd, nil +} + +// revalidateChildLocked must be called after a call to parent.vfsd.Child(name) +// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be +// nil) to verify that the returned child (or lack thereof) is correct. +// +// Preconditions: Filesystem.mu must be locked for at least reading. +// parent.dirMu must be locked. parent.isDir(). name is not "." or "..". +// +// Postconditions: Caller must call fs.processDeferredDecRefs*. +func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) { + if child != nil { + // Cached dentry exists, revalidate. + if !child.inode.Valid(ctx) { + delete(parent.children, name) + vfsObj.InvalidateDentry(&child.vfsd) + fs.deferDecRef(&child.vfsd) // Reference from Lookup. 
+ child = nil + } + } + if child == nil { + // Dentry isn't cached; it either doesn't exist or failed + // revalidation. Attempt to resolve it via Lookup. + // + // FIXME(gvisor.dev/issue/1193): Inode.Lookup() should return + // *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes + // that all dentries in the filesystem are (kernfs.)Dentry and performs + // vfs.DentryImpl casts accordingly. + childVFSD, err := parent.inode.Lookup(ctx, name) + if err != nil { + return nil, err + } + // Reference on childVFSD dropped by a corresponding Valid. + child = childVFSD.Impl().(*Dentry) + parent.insertChildLocked(name, child) + } + return child, nil +} + +// walkExistingLocked resolves rp to an existing file. +// +// walkExistingLocked is loosely analogous to Linux's +// fs/namei.c:path_lookupat(). +// +// Preconditions: Filesystem.mu must be locked for at least reading. +// +// Postconditions: Caller must call fs.processDeferredDecRefs*. +func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) { + vfsd := rp.Start() + for !rp.Done() { + var err error + vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */) + if err != nil { + return nil, nil, err + } + } + d := vfsd.Impl().(*Dentry) + if rp.MustBeDir() && !d.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, d.inode, nil +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory. It does not check that the returned directory is +// searchable by the provider of rp. +// +// walkParentDirLocked is loosely analogous to Linux's +// fs/namei.c:path_parentat(). +// +// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done(). +// +// Postconditions: Caller must call fs.processDeferredDecRefs*. 
+func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) { + vfsd := rp.Start() + for !rp.Final() { + var err error + vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */) + if err != nil { + return nil, nil, err + } + } + d := vfsd.Impl().(*Dentry) + if !d.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, d.inode, nil +} + +// checkCreateLocked checks that a file named rp.Component() may be created in +// directory parentVFSD, then returns rp.Component(). +// +// Preconditions: Filesystem.mu must be locked for at least reading. parentInode +// == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true. +func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) { + if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return "", err + } + pc := rp.Component() + if pc == "." || pc == ".." { + return "", syserror.EEXIST + } + if len(pc) > linux.NAME_MAX { + return "", syserror.ENAMETOOLONG + } + // FIXME(gvisor.dev/issue/1193): Data race due to not holding dirMu. + if _, ok := parentVFSD.Impl().(*Dentry).children[pc]; ok { + return "", syserror.EEXIST + } + if parentVFSD.IsDead() { + return "", syserror.ENOENT + } + return pc, nil +} + +// checkDeleteLocked checks that the file represented by vfsd may be deleted. +// +// Preconditions: Filesystem.mu must be locked for at least reading. +func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error { + parent := vfsd.Impl().(*Dentry).parent + if parent == nil { + return syserror.EBUSY + } + if parent.vfsd.IsDead() { + return syserror.ENOENT + } + if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + return nil +} + +// Release implements vfs.FilesystemImpl.Release. 
+func (fs *Filesystem) Release() { +} + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *Filesystem) Sync(ctx context.Context) error { + // All filesystem state is in-memory. + return nil +} + +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + fs.mu.RLock() + defer fs.processDeferredDecRefs() + defer fs.mu.RUnlock() + + _, inode, err := fs.walkExistingLocked(ctx, rp) + if err != nil { + return err + } + return inode.CheckPermissions(ctx, creds, ats) +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + fs.mu.RLock() + defer fs.processDeferredDecRefs() + defer fs.mu.RUnlock() + vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + if err != nil { + return nil, err + } + + if opts.CheckSearchable { + d := vfsd.Impl().(*Dentry) + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + } + vfsd.IncRef() // Ownership transferred to caller. + return vfsd, nil +} + +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + fs.mu.RLock() + defer fs.processDeferredDecRefs() + defer fs.mu.RUnlock() + vfsd, _, err := fs.walkParentDirLocked(ctx, rp) + if err != nil { + return nil, err + } + vfsd.IncRef() // Ownership transferred to caller. + return vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. 
+func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + if err != nil { + return err + } + if rp.Mount() != vd.Mount() { + return syserror.EXDEV + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + + d := vd.Dentry().Impl().(*Dentry) + if d.isDir() { + return syserror.EPERM + } + + childVFSD, err := parentInode.NewLink(ctx, pc, d.inode) + if err != nil { + return err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) + return nil +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + childVFSD, err := parentInode.NewDir(ctx, pc, opts) + if err != nil { + return err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) + return nil +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. 
+func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + newVFSD, err := parentInode.NewNode(ctx, pc, opts) + if err != nil { + return err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, newVFSD.Impl().(*Dentry)) + return nil +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // Filter out flags that are not supported by kernfs. O_DIRECTORY and + // O_NOFOLLOW have no effect here (they're handled by VFS by setting + // appropriate bits in rp), but are returned by + // FileDescriptionImpl.StatusFlags(). + opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK + ats := vfs.AccessTypesForOpenFlags(&opts) + + // Do not create new file. + if opts.Flags&linux.O_CREAT == 0 { + fs.mu.RLock() + defer fs.processDeferredDecRefs() + defer fs.mu.RUnlock() + vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + if err != nil { + return nil, err + } + if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { + return nil, err + } + return inode.Open(ctx, rp, vfsd, opts) + } + + // May create new file. 
+ mustCreate := opts.Flags&linux.O_EXCL != 0 + vfsd := rp.Start() + inode := vfsd.Impl().(*Dentry).inode + fs.mu.Lock() + defer fs.mu.Unlock() + if rp.Done() { + if rp.MustBeDir() { + return nil, syserror.EISDIR + } + if mustCreate { + return nil, syserror.EEXIST + } + if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { + return nil, err + } + return inode.Open(ctx, rp, vfsd, opts) + } +afterTrailingSymlink: + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return nil, err + } + // Check for search permission in the parent directory. + if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + // Reject attempts to open directories with O_CREAT. + if rp.MustBeDir() { + return nil, syserror.EISDIR + } + pc := rp.Component() + if pc == "." || pc == ".." { + return nil, syserror.EISDIR + } + if len(pc) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG + } + // Determine whether or not we need to create a file. + childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD, false /* mayFollowSymlinks */) + if err == syserror.ENOENT { + // Already checked for searchability above; now check for writability. + if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return nil, err + } + defer rp.Mount().EndWrite() + // Create and open the child. + childVFSD, err = parentInode.NewFile(ctx, pc, opts) + if err != nil { + return nil, err + } + child := childVFSD.Impl().(*Dentry) + parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + return child.inode.Open(ctx, rp, childVFSD, opts) + } + if err != nil { + return nil, err + } + // Open existing file or follow symlink. 
+ if mustCreate { + return nil, syserror.EEXIST + } + child := childVFSD.Impl().(*Dentry) + if rp.ShouldFollowSymlink() && child.isSymlink() { + targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount()) + if err != nil { + return nil, err + } + if targetVD.Ok() { + err := rp.HandleJump(targetVD) + targetVD.DecRef() + if err != nil { + return nil, err + } + } else { + if err := rp.HandleSymlink(targetPathname); err != nil { + return nil, err + } + } + // rp.Final() may no longer be true since we now need to resolve the + // symlink target. + goto afterTrailingSymlink + } + if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { + return nil, err + } + return child.inode.Open(ctx, rp, &child.vfsd, opts) +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + fs.mu.RLock() + d, inode, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return "", err + } + if !d.Impl().(*Dentry).isSymlink() { + return "", syserror.EINVAL + } + return inode.Readlink(ctx) +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { + // Only RENAME_NOREPLACE is supported. + if opts.Flags&^linux.RENAME_NOREPLACE != 0 { + return syserror.EINVAL + } + noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 + + fs.mu.Lock() + defer fs.processDeferredDecRefsLocked() + defer fs.mu.Unlock() + + // Resolve the destination directory first to verify that it's on this + // Mount. 
+ dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp) + if err != nil { + return err + } + dstDir := dstDirVFSD.Impl().(*Dentry) + mnt := rp.Mount() + if mnt != oldParentVD.Mount() { + return syserror.EXDEV + } + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + + srcDirVFSD := oldParentVD.Dentry() + srcDir := srcDirVFSD.Impl().(*Dentry) + srcDir.dirMu.Lock() + src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDir.children[oldName]) + srcDir.dirMu.Unlock() + if err != nil { + return err + } + srcVFSD := &src.vfsd + + // Can we remove the src dentry? + if err := checkDeleteLocked(ctx, rp, srcVFSD); err != nil { + return err + } + + // Can we create the dst dentry? + var dst *Dentry + pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode) + switch err { + case nil: + // Ok, continue with rename as replacement. + case syserror.EEXIST: + if noReplace { + // Won't overwrite existing node since RENAME_NOREPLACE was requested. + return syserror.EEXIST + } + dst = dstDir.children[pc] + if dst == nil { + panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD)) + } + default: + return err + } + var dstVFSD *vfs.Dentry + if dst != nil { + dstVFSD = &dst.vfsd + } + + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + virtfs := rp.VirtualFilesystem() + + // We can't deadlock here due to lock ordering because we're protected from + // concurrent renames by fs.mu held for writing. 
+ srcDir.dirMu.Lock() + defer srcDir.dirMu.Unlock() + if srcDir != dstDir { + dstDir.dirMu.Lock() + defer dstDir.dirMu.Unlock() + } + + if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil { + return err + } + replaced, err := srcDir.inode.Rename(ctx, src.name, pc, srcVFSD, dstDirVFSD) + if err != nil { + virtfs.AbortRenameDentry(srcVFSD, dstVFSD) + return err + } + delete(srcDir.children, src.name) + if srcDir != dstDir { + fs.deferDecRef(srcDirVFSD) + dstDir.IncRef() + } + src.parent = dstDir + src.name = pc + if dstDir.children == nil { + dstDir.children = make(map[string]*Dentry) + } + dstDir.children[pc] = src + virtfs.CommitRenameReplaceDentry(srcVFSD, replaced) + return nil +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { + return err + } + d := vfsd.Impl().(*Dentry) + if !d.isDir() { + return syserror.ENOTDIR + } + if inode.HasChildren() { + return syserror.ENOTEMPTY + } + virtfs := rp.VirtualFilesystem() + parentDentry := d.parent + parentDentry.dirMu.Lock() + defer parentDentry.dirMu.Unlock() + + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { + return err + } + if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil { + virtfs.AbortDeleteDentry(vfsd) + return err + } + virtfs.CommitDeleteDentry(vfsd) + return nil +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. 
+func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + fs.mu.RLock() + _, inode, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return err + } + if opts.Stat.Mask == 0 { + return nil + } + return inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts) +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + fs.mu.RLock() + _, inode, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return linux.Statx{}, err + } + return inode.Stat(fs.VFSFilesystem(), opts) +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return linux.Statfs{}, err + } + // TODO(gvisor.dev/issue/1193): actually implement statfs. + return linux.Statfs{}, syserror.ENOSYS +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. 
+func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + childVFSD, err := parentInode.NewSymlink(ctx, pc, target) + if err != nil { + return err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) + return nil +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + vfsd, _, err := fs.walkExistingLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { + return err + } + d := vfsd.Impl().(*Dentry) + if d.isDir() { + return syserror.EISDIR + } + virtfs := rp.VirtualFilesystem() + parentDentry := d.parent + parentDentry.dirMu.Lock() + defer parentDentry.dirMu.Unlock() + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { + return err + } + if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil { + virtfs.AbortDeleteDentry(vfsd) + return err + } + virtfs.CommitDeleteDentry(vfsd) + return nil +} + +// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. 
+func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { + fs.mu.RLock() + _, inode, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return nil, err + } + if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } + return nil, syserror.ECONNREFUSED +} + +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return nil, err + } + // kernfs currently does not support extended attributes. + return nil, syserror.ENOTSUP +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return "", err + } + // kernfs currently does not support extended attributes. + return "", syserror.ENOTSUP +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return err + } + // kernfs currently does not support extended attributes. + return syserror.ENOTSUP +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. 
+func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return err + } + // kernfs currently does not support extended attributes. + return syserror.ENOTSUP +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b) +} diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go new file mode 100644 index 000000000..4cb885d87 --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -0,0 +1,613 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// InodeNoopRefCount partially implements the Inode interface, specifically the +// inodeRefs sub interface. 
InodeNoopRefCount implements a simple reference +// count for inodes, performing no extra actions when references are obtained or +// released. This is suitable for simple file inodes that don't reference any +// resources. +type InodeNoopRefCount struct { +} + +// IncRef implements Inode.IncRef. +func (InodeNoopRefCount) IncRef() { +} + +// DecRef implements Inode.DecRef. +func (InodeNoopRefCount) DecRef() { +} + +// TryIncRef implements Inode.TryIncRef. +func (InodeNoopRefCount) TryIncRef() bool { + return true +} + +// Destroy implements Inode.Destroy. +func (InodeNoopRefCount) Destroy() { +} + +// InodeDirectoryNoNewChildren partially implements the Inode interface. +// InodeDirectoryNoNewChildren represents a directory inode which does not +// support creation of new children. +type InodeDirectoryNoNewChildren struct{} + +// NewFile implements Inode.NewFile. +func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// NewDir implements Inode.NewDir. +func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// NewLink implements Inode.NewLink. +func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// NewSymlink implements Inode.NewSymlink. +func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// NewNode implements Inode.NewNode. +func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// InodeNotDirectory partially implements the Inode interface, specifically the +// inodeDirectory and inodeDynamicDirectory sub interfaces. 
Inodes that do not +// represent directories can embed this to provide no-op implementations for +// directory-related functions. +type InodeNotDirectory struct { +} + +// HasChildren implements Inode.HasChildren. +func (InodeNotDirectory) HasChildren() bool { + return false +} + +// NewFile implements Inode.NewFile. +func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { + panic("NewFile called on non-directory inode") +} + +// NewDir implements Inode.NewDir. +func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { + panic("NewDir called on non-directory inode") +} + +// NewLink implements Inode.NewLinkink. +func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { + panic("NewLink called on non-directory inode") +} + +// NewSymlink implements Inode.NewSymlink. +func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { + panic("NewSymlink called on non-directory inode") +} + +// NewNode implements Inode.NewNode. +func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { + panic("NewNode called on non-directory inode") +} + +// Unlink implements Inode.Unlink. +func (InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error { + panic("Unlink called on non-directory inode") +} + +// RmDir implements Inode.RmDir. +func (InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error { + panic("RmDir called on non-directory inode") +} + +// Rename implements Inode.Rename. +func (InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) { + panic("Rename called on non-directory inode") +} + +// Lookup implements Inode.Lookup. +func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + panic("Lookup called on non-directory inode") +} + +// IterDirents implements Inode.IterDirents. 
+func (InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { + panic("IterDirents called on non-directory inode") +} + +// Valid implements Inode.Valid. +func (InodeNotDirectory) Valid(context.Context) bool { + return true +} + +// InodeNoDynamicLookup partially implements the Inode interface, specifically +// the inodeDynamicLookup sub interface. Directory inodes that do not support +// dymanic entries (i.e. entries that are not "hashed" into the +// vfs.Dentry.children) can embed this to provide no-op implementations for +// functions related to dynamic entries. +type InodeNoDynamicLookup struct{} + +// Lookup implements Inode.Lookup. +func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + return nil, syserror.ENOENT +} + +// IterDirents implements Inode.IterDirents. +func (InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + return offset, nil +} + +// Valid implements Inode.Valid. +func (InodeNoDynamicLookup) Valid(ctx context.Context) bool { + return true +} + +// InodeNotSymlink partially implements the Inode interface, specifically the +// inodeSymlink sub interface. All inodes that are not symlinks may embed this +// to return the appropriate errors from symlink-related functions. +type InodeNotSymlink struct{} + +// Readlink implements Inode.Readlink. +func (InodeNotSymlink) Readlink(context.Context) (string, error) { + return "", syserror.EINVAL +} + +// Getlink implements Inode.Getlink. +func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) { + return vfs.VirtualDentry{}, "", syserror.EINVAL +} + +// InodeAttrs partially implements the Inode interface, specifically the +// inodeMetadata sub interface. InodeAttrs provides functionality related to +// inode attributes. 
+// +// Must be initialized by Init prior to first use. +type InodeAttrs struct { + devMajor uint32 + devMinor uint32 + ino uint64 + mode uint32 + uid uint32 + gid uint32 + nlink uint32 +} + +// Init initializes this InodeAttrs. +func (a *InodeAttrs) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) { + if mode.FileType() == 0 { + panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode)) + } + + nlink := uint32(1) + if mode.FileType() == linux.ModeDirectory { + nlink = 2 + } + a.devMajor = devMajor + a.devMinor = devMinor + atomic.StoreUint64(&a.ino, ino) + atomic.StoreUint32(&a.mode, uint32(mode)) + atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID)) + atomic.StoreUint32(&a.gid, uint32(creds.EffectiveKGID)) + atomic.StoreUint32(&a.nlink, nlink) +} + +// DevMajor returns the device major number. +func (a *InodeAttrs) DevMajor() uint32 { + return a.devMajor +} + +// DevMinor returns the device minor number. +func (a *InodeAttrs) DevMinor() uint32 { + return a.devMinor +} + +// Ino returns the inode id. +func (a *InodeAttrs) Ino() uint64 { + return atomic.LoadUint64(&a.ino) +} + +// Mode implements Inode.Mode. +func (a *InodeAttrs) Mode() linux.FileMode { + return linux.FileMode(atomic.LoadUint32(&a.mode)) +} + +// Stat partially implements Inode.Stat. Note that this function doesn't provide +// all the stat fields, and the embedder should consider extending the result +// with filesystem-specific fields. 
+func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) { + var stat linux.Statx + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK + stat.DevMajor = a.devMajor + stat.DevMinor = a.devMinor + stat.Ino = atomic.LoadUint64(&a.ino) + stat.Mode = uint16(a.Mode()) + stat.UID = atomic.LoadUint32(&a.uid) + stat.GID = atomic.LoadUint32(&a.gid) + stat.Nlink = atomic.LoadUint32(&a.nlink) + + // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps. + + return stat, nil +} + +// SetStat implements Inode.SetStat. +func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + if opts.Stat.Mask == 0 { + return nil + } + if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 { + return syserror.EPERM + } + if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { + return err + } + + stat := opts.Stat + if stat.Mask&linux.STATX_MODE != 0 { + for { + old := atomic.LoadUint32(&a.mode) + new := old | uint32(stat.Mode & ^uint16(linux.S_IFMT)) + if swapped := atomic.CompareAndSwapUint32(&a.mode, old, new); swapped { + break + } + } + } + + if stat.Mask&linux.STATX_UID != 0 { + atomic.StoreUint32(&a.uid, stat.UID) + } + if stat.Mask&linux.STATX_GID != 0 { + atomic.StoreUint32(&a.gid, stat.GID) + } + + // Note that not all fields are modifiable. For example, the file type and + // inode numbers are immutable after node creation. + + // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps. + // Also, STATX_SIZE will need some special handling, because read-only static + // files should return EIO for truncate operations. + + return nil +} + +// CheckPermissions implements Inode.CheckPermissions. 
func (a *InodeAttrs) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
	// Delegate to the generic check using the current mode and owner ids.
	return vfs.GenericCheckPermissions(
		creds,
		ats,
		a.Mode(),
		auth.KUID(atomic.LoadUint32(&a.uid)),
		auth.KGID(atomic.LoadUint32(&a.gid)),
	)
}

// IncLinks implements Inode.IncLinks. It adds n to the link count and panics
// if the resulting count is not greater than n.
func (a *InodeAttrs) IncLinks(n uint32) {
	// The new count is at most n when the previous count was zero (or when the
	// addition wrapped around).
	if atomic.AddUint32(&a.nlink, n) <= n {
		panic("InodeLink.IncLinks called with no existing links")
	}
}

// DecLinks implements Inode.DecLinks. It decrements the link count by one and
// panics if the count was already zero.
func (a *InodeAttrs) DecLinks() {
	// Adding ^uint32(0) is an atomic decrement by one; a result of ^uint32(0)
	// means the counter wrapped from zero.
	if nlink := atomic.AddUint32(&a.nlink, ^uint32(0)); nlink == ^uint32(0) {
		// Negative overflow
		panic("Inode.DecLinks called at 0 links")
	}
}

// slot is one entry tracked by OrderedChildren: a child name and the dentry
// it refers to, linked into an ordered list through the embedded slotEntry.
type slot struct {
	Name   string
	Dentry *vfs.Dentry
	slotEntry
}

// OrderedChildrenOptions contains initialization options for OrderedChildren.
type OrderedChildrenOptions struct {
	// Writable indicates whether vfs.FilesystemImpl methods implemented by
	// OrderedChildren may modify the tracked children. This applies to
	// operations related to rename, unlink and rmdir. If an OrderedChildren is
	// not writable, these operations all fail with EPERM.
	Writable bool
}

// OrderedChildren partially implements the Inode interface. OrderedChildren can
// be embedded in directory inodes to keep track of the children in the
// directory, and can then be used to implement a generic directory FD -- see
// GenericDirectoryFD. OrderedChildren is not compatible with dynamic
// directories.
//
// Must be initialized with Init before first use.
type OrderedChildren struct {
	refs.AtomicRefCount

	// Can children be modified by user syscalls? If set to false, interface
	// methods that would modify the children return EPERM. Immutable.
	writable bool

	// mu protects order and set.
	mu sync.RWMutex
	// order lists the children in insertion order.
	order slotList
	// set indexes the same slots by child name.
	set map[string]*slot
}

// Init initializes an OrderedChildren.
+func (o *OrderedChildren) Init(opts OrderedChildrenOptions) { + o.writable = opts.Writable + o.set = make(map[string]*slot) +} + +// DecRef implements Inode.DecRef. +func (o *OrderedChildren) DecRef() { + o.AtomicRefCount.DecRefWithDestructor(o.Destroy) +} + +// Destroy cleans up resources referenced by this OrderedChildren. +func (o *OrderedChildren) Destroy() { + o.mu.Lock() + defer o.mu.Unlock() + o.order.Reset() + o.set = nil +} + +// Populate inserts children into this OrderedChildren, and d's dentry +// cache. Populate returns the number of directories inserted, which the caller +// may use to update the link count for the parent directory. +// +// Precondition: d must represent a directory inode. children must not contain +// any conflicting entries already in o. +func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 { + var links uint32 + for name, child := range children { + if child.isDir() { + links++ + } + if err := o.Insert(name, child.VFSDentry()); err != nil { + panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d)) + } + d.InsertChild(name, child) + } + return links +} + +// HasChildren implements Inode.HasChildren. +func (o *OrderedChildren) HasChildren() bool { + o.mu.RLock() + defer o.mu.RUnlock() + return len(o.set) > 0 +} + +// Insert inserts child into o. This ignores the writability of o, as this is +// not part of the vfs.FilesystemImpl interface, and is a lower-level operation. +func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error { + o.mu.Lock() + defer o.mu.Unlock() + if _, ok := o.set[name]; ok { + return syserror.EEXIST + } + s := &slot{ + Name: name, + Dentry: child, + } + o.order.PushBack(s) + o.set[name] = s + return nil +} + +// Precondition: caller must hold o.mu for writing. 
+func (o *OrderedChildren) removeLocked(name string) { + if s, ok := o.set[name]; ok { + delete(o.set, name) + o.order.Remove(s) + } +} + +// Precondition: caller must hold o.mu for writing. +func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry { + if s, ok := o.set[name]; ok { + // Existing slot with given name, simply replace the dentry. + var old *vfs.Dentry + old, s.Dentry = s.Dentry, new + return old + } + + // No existing slot with given name, create and hash new slot. + s := &slot{ + Name: name, + Dentry: new, + } + o.order.PushBack(s) + o.set[name] = s + return nil +} + +// Precondition: caller must hold o.mu for reading or writing. +func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error { + s, ok := o.set[name] + if !ok { + return syserror.ENOENT + } + if s.Dentry != child { + panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! OrderedChild: %+v, vfs: %+v", s.Dentry, child)) + } + return nil +} + +// Unlink implements Inode.Unlink. +func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error { + if !o.writable { + return syserror.EPERM + } + o.mu.Lock() + defer o.mu.Unlock() + if err := o.checkExistingLocked(name, child); err != nil { + return err + } + + // TODO(gvisor.dev/issue/3027): Check sticky bit before removing. + o.removeLocked(name) + return nil +} + +// Rmdir implements Inode.Rmdir. +func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error { + // We're not responsible for checking that child is a directory, that it's + // empty, or updating any link counts; so this is the same as unlink. + return o.Unlink(ctx, name, child) +} + +type renameAcrossDifferentImplementationsError struct{} + +func (renameAcrossDifferentImplementationsError) Error() string { + return "rename across inodes with different implementations" +} + +// Rename implements Inode.Rename. 
+// +// Precondition: Rename may only be called across two directory inodes with +// identical implementations of Rename. Practically, this means filesystems that +// implement Rename by embedding OrderedChildren for any directory +// implementation must use OrderedChildren for all directory implementations +// that will support Rename. +// +// Postcondition: reference on any replaced dentry transferred to caller. +func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) { + dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren) + if !ok { + return nil, renameAcrossDifferentImplementationsError{} + } + if !o.writable || !dst.writable { + return nil, syserror.EPERM + } + // Note: There's a potential deadlock below if concurrent calls to Rename + // refer to the same src and dst directories in reverse. We avoid any + // ordering issues because the caller is required to serialize concurrent + // calls to Rename in accordance with the interface declaration. + o.mu.Lock() + defer o.mu.Unlock() + if dst != o { + dst.mu.Lock() + defer dst.mu.Unlock() + } + if err := o.checkExistingLocked(oldname, child); err != nil { + return nil, err + } + + // TODO(gvisor.dev/issue/3027): Check sticky bit before removing. + replaced := dst.replaceChildLocked(newname, child) + return replaced, nil +} + +// nthLocked returns an iterator to the nth child tracked by this object. The +// iterator is valid until the caller releases o.mu. Returns nil if the +// requested index falls out of bounds. +// +// Preconditon: Caller must hold o.mu for reading. +func (o *OrderedChildren) nthLocked(i int64) *slot { + for it := o.order.Front(); it != nil && i >= 0; it = it.Next() { + if i == 0 { + return it + } + i-- + } + return nil +} + +// InodeSymlink partially implements Inode interface for symlinks. +type InodeSymlink struct { + InodeNotDirectory +} + +// Open implements Inode.Open. 
+func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + return nil, syserror.ELOOP +} + +// StaticDirectory is a standard implementation of a directory with static +// contents. +// +// +stateify savable +type StaticDirectory struct { + InodeNotSymlink + InodeDirectoryNoNewChildren + InodeAttrs + InodeNoDynamicLookup + OrderedChildren + + locks vfs.FileLocks +} + +var _ Inode = (*StaticDirectory)(nil) + +// NewStaticDir creates a new static directory and returns its dentry. +func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry { + inode := &StaticDirectory{} + inode.Init(creds, devMajor, devMinor, ino, perm) + + dentry := &Dentry{} + dentry.Init(inode) + + inode.OrderedChildren.Init(OrderedChildrenOptions{}) + links := inode.OrderedChildren.Populate(dentry, children) + inode.IncLinks(links) + + return dentry +} + +// Init initializes StaticDirectory. +func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { + if perm&^linux.PermissionsMask != 0 { + panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) + } + s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm) +} + +// Open implements kernfs.Inode. +func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts) + if err != nil { + return nil, err + } + return fd.VFSFileDescription(), nil +} + +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. 
func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
	// Attributes of a static directory are immutable.
	return syserror.EPERM
}

// AlwaysValid partially implements kernfs.inodeDynamicLookup.
type AlwaysValid struct{}

// Valid implements kernfs.inodeDynamicLookup. It unconditionally reports the
// inode as valid.
func (*AlwaysValid) Valid(context.Context) bool {
	return true
}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
new file mode 100644
index 000000000..596de1edf
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -0,0 +1,456 @@
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package kernfs provides the tools to implement inode-based filesystems.
// Kernfs has two main features:
//
// 1. The Inode interface, which maps VFS2's path-based filesystem operations to
//    specific filesystem nodes. Kernfs uses the Inode interface to provide a
//    blanket implementation for the vfs.FilesystemImpl. Kernfs also serves as
//    the synchronization mechanism for all filesystem operations by holding a
//    filesystem-wide lock across all operations.
//
// 2. Various utility types which provide generic implementations for various
//    parts of the Inode and vfs.FileDescription interfaces. Client filesystems
//    based on kernfs can embed the appropriate set of these to avoid having to
//    reimplement common filesystem operations. See inode_impl_util.go and
//    fd_impl_util.go.
//
// Reference Model:
//
// Kernfs dentries represent named pointers to inodes. Dentries and inodes have
// independent lifetimes and reference counts. A child dentry unconditionally
// holds a reference on its parent directory's dentry. A dentry also holds a
// reference on the inode it points to. Multiple dentries can point to the same
// inode (for example, in the case of hardlinks). File descriptors hold a
// reference to the dentry they're opened on.
//
// Dentries are guaranteed to exist while holding Filesystem.mu for
// reading. Dropping dentries requires holding Filesystem.mu for writing. To
// queue dentries for destruction from a read critical section, see
// Filesystem.deferDecRef.
//
// Lock ordering:
//
// kernfs.Filesystem.mu
// kernfs.Dentry.dirMu
// vfs.VirtualFilesystem.mountMu
// vfs.Dentry.mu
// kernfs.Filesystem.droppedDentriesMu
// (inode implementation locks, if any)
package kernfs

import (
	"fmt"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/refs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
)

// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
// filesystem. Concrete implementations are expected to embed this in their own
// Filesystem type.
type Filesystem struct {
	vfsfs vfs.Filesystem

	droppedDentriesMu sync.Mutex

	// droppedDentries is a list of dentries waiting to be DecRef()ed. This is
	// used to defer dentry destruction until mu can be acquired for
	// writing. Protected by droppedDentriesMu.
	droppedDentries []*vfs.Dentry

	// mu synchronizes the lifetime of Dentries on this filesystem. Holding it
	// for reading guarantees continued existence of any resolved dentries, but
	// the dentry tree may be modified.
	//
	// Kernfs dentries can only be DecRef()ed while holding mu for writing. For
	// example:
	//
	//   fs.mu.Lock()
	//   defer fs.mu.Unlock()
	//   ...
	//   dentry1.DecRef()
	//   defer dentry2.DecRef() // Ok, will run before Unlock.
	//
	// If discarding dentries in a read context, use Filesystem.deferDecRef.
	// processDeferredDecRefs acquires mu for writing, so it must only run
	// after the read lock has been released. For example:
	//
	//   fs.mu.RLock()
	//   defer fs.processDeferredDecRefs() // Runs after RUnlock below.
	//   defer fs.mu.RUnlock()
	//   ...
	//   fs.deferDecRef(dentry)
	mu sync.RWMutex

	// nextInoMinusOne is used to allocate inode numbers on this
	// filesystem. Must be accessed by atomic operations.
	nextInoMinusOne uint64
}

// deferDecRef defers dropping a dentry ref until the next call to
// processDeferredDecRefs{,Locked}. See comment on Filesystem.mu.
//
// Precondition: d must not already be pending destruction.
func (fs *Filesystem) deferDecRef(d *vfs.Dentry) {
	fs.droppedDentriesMu.Lock()
	fs.droppedDentries = append(fs.droppedDentries, d)
	fs.droppedDentriesMu.Unlock()
}

// processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the
// droppedDentries list. See comment on Filesystem.mu.
//
// It acquires fs.mu for writing itself, so the caller must not hold fs.mu.
func (fs *Filesystem) processDeferredDecRefs() {
	fs.mu.Lock()
	fs.processDeferredDecRefsLocked()
	fs.mu.Unlock()
}

// processDeferredDecRefsLocked drops the references queued by deferDecRef and
// empties the queue.
//
// Precondition: fs.mu must be held for writing.
func (fs *Filesystem) processDeferredDecRefsLocked() {
	fs.droppedDentriesMu.Lock()
	for _, d := range fs.droppedDentries {
		d.DecRef()
	}
	fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse.
	fs.droppedDentriesMu.Unlock()
}

// VFSFilesystem returns the generic vfs filesystem object.
func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem {
	return &fs.vfsfs
}

// NextIno allocates a new inode number on this filesystem. The first call on
// a zero-valued Filesystem returns 1.
func (fs *Filesystem) NextIno() uint64 {
	return atomic.AddUint64(&fs.nextInoMinusOne, 1)
}

// These consts are used in the Dentry.flags field.
const (
	// Dentry points to a directory inode.
	dflagsIsDir = 1 << iota

	// Dentry points to a symlink inode.
	dflagsIsSymlink
)

// Dentry implements vfs.DentryImpl.
//
// A kernfs dentry is similar to a dentry in a traditional filesystem: it's a
// named reference to an inode. A dentry generally lives as long as it's part of
// a mounted filesystem tree. Kernfs doesn't cache dentries once all references
// to them are removed. Dentries hold a single reference to the inode they point
// to, and child dentries hold a reference on their parent.
//
// Must be initialized by Init prior to first use.
type Dentry struct {
	vfsd vfs.Dentry

	refs.AtomicRefCount

	// flags caches useful information about the dentry from the inode. See the
	// dflags* consts above. Must be accessed by atomic ops.
	flags uint32

	parent *Dentry
	name   string

	// dirMu protects children and the names of child Dentries.
	dirMu    sync.Mutex
	children map[string]*Dentry

	inode Inode
}

// Init initializes this dentry.
//
// Precondition: Caller must hold a reference on inode.
//
// Postcondition: Caller's reference on inode is transferred to the dentry.
func (d *Dentry) Init(inode Inode) {
	d.vfsd.Init(d)
	d.inode = inode
	// Cache the file type in flags so that isDir and isSymlink do not need to
	// consult the inode.
	ftype := inode.Mode().FileType()
	if ftype == linux.ModeDirectory {
		d.flags |= dflagsIsDir
	}
	if ftype == linux.ModeSymlink {
		d.flags |= dflagsIsSymlink
	}
}

// VFSDentry returns the generic vfs dentry for this kernfs dentry.
func (d *Dentry) VFSDentry() *vfs.Dentry {
	return &d.vfsd
}

// isDir checks whether the dentry points to a directory inode.
func (d *Dentry) isDir() bool {
	return atomic.LoadUint32(&d.flags)&dflagsIsDir != 0
}

// isSymlink checks whether the dentry points to a symlink inode.
func (d *Dentry) isSymlink() bool {
	return atomic.LoadUint32(&d.flags)&dflagsIsSymlink != 0
}

// DecRef implements vfs.DentryImpl.DecRef. d.destroy runs once the last
// reference is dropped.
func (d *Dentry) DecRef() {
	d.AtomicRefCount.DecRefWithDestructor(d.destroy)
}

// Precondition: Dentry must be removed from VFS' dentry cache.
func (d *Dentry) destroy() {
	d.inode.DecRef() // IncRef from Init.
	d.inode = nil
	if d.parent != nil {
		d.parent.DecRef() // IncRef from Dentry.InsertChild.
	}
}

// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
//
// Although Linux technically supports inotify on pseudo filesystems (inotify
// is implemented at the vfs layer), it is not particularly useful. It is left
// unimplemented until someone actually needs it.
func (d *Dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {}

// Watches implements vfs.DentryImpl.Watches. It always returns nil, as
// inotify is not implemented for kernfs dentries.
func (d *Dentry) Watches() *vfs.Watches {
	return nil
}

// OnZeroWatches implements vfs.Dentry.OnZeroWatches. It is a no-op.
func (d *Dentry) OnZeroWatches() {}

// InsertChild inserts child into the vfs dentry cache with the given name under
// this dentry. This does not update the directory inode, so calling this on
// its own isn't sufficient to insert a child into a directory. InsertChild
// takes a reference on d on behalf of the child.
//
// Precondition: d must represent a directory inode.
func (d *Dentry) InsertChild(name string, child *Dentry) {
	d.dirMu.Lock()
	d.insertChildLocked(name, child)
	d.dirMu.Unlock()
}

// insertChildLocked is equivalent to InsertChild, with additional
// preconditions.
//
// Precondition: d.dirMu must be locked.
func (d *Dentry) insertChildLocked(name string, child *Dentry) {
	if !d.isDir() {
		panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d))
	}
	d.IncRef() // DecRef in child's Dentry.destroy.
	child.parent = d
	child.name = name
	// The children map is allocated lazily on first insertion.
	if d.children == nil {
		d.children = make(map[string]*Dentry)
	}
	d.children[name] = child
}

// Inode returns the dentry's inode.
func (d *Dentry) Inode() Inode {
	return d.inode
}

// The Inode interface maps filesystem-level operations that operate on paths to
// equivalent operations on specific filesystem nodes.
//
// The interface methods are grouped into logical categories as sub interfaces
// below. Generally, an implementation for each sub interface can be provided by
// embedding an appropriate type from inode_impl_util.go. The sub interfaces
// are purely organizational. Methods declared directly in the main interface
// have no generic implementations, and should be explicitly provided by the
// client filesystem.
//
// Generally, implementations are not responsible for tasks that are common to
// all filesystems. These include:
//
// - Checking that dentries passed to methods are of the appropriate file type.
// - Checking permissions.
// - Updating link and reference counts.
//
// Specific responsibilities of implementations are documented below.
type Inode interface {
	// Methods related to reference counting. A generic implementation is
	// provided by InodeNoopRefCount. These methods are generally called by the
	// equivalent Dentry methods.
	inodeRefs

	// Methods related to node metadata. A generic implementation is provided by
	// InodeAttrs.
	inodeMetadata

	// Method for inodes that represent symlink. InodeNotSymlink provides a
	// blanket implementation for all non-symlink inodes.
	inodeSymlink

	// Method for inodes that represent directories. InodeNotDirectory provides
	// a blanket implementation for all non-directory inodes.
	inodeDirectory

	// Method for inodes that represent dynamic directories and their
	// children. InodeNoDynamicLookup provides a blanket implementation for all
	// non-dynamic-directory inodes.
	inodeDynamicLookup

	// Open creates a file description for the filesystem object represented by
	// this inode. The returned file description should hold a reference on the
	// inode for its lifetime.
	//
	// Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing
	// the inode on which Open() is being called.
	Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
}

// inodeRefs groups the reference-counting methods of Inode.
type inodeRefs interface {
	IncRef()
	DecRef()
	TryIncRef() bool
	// Destroy is called when the inode reaches zero references. Destroy releases
	// all resources (references) on objects referenced by the inode, including
	// any child dentries.
	Destroy()
}

// inodeMetadata groups the metadata-related methods of Inode.
type inodeMetadata interface {
	// CheckPermissions checks that creds may access this inode for the
	// requested access type, per the rules of
	// fs/namei.c:generic_permission().
	CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error

	// Mode returns the (struct stat)::st_mode value for this inode. This is
	// separated from Stat for performance.
	Mode() linux.FileMode

	// Stat returns the metadata for this inode. This corresponds to
	// vfs.FilesystemImpl.StatAt.
	Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)

	// SetStat updates the metadata for this inode. This corresponds to
	// vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking
	// if the operation can be performed (see vfs.CheckSetStat() for common
	// checks).
	SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error
}

// Precondition: All methods in this interface may only be called on directory
// inodes.
type inodeDirectory interface {
	// The New{File,Dir,Node,Symlink} methods below should return a new inode
	// hashed into this inode.
	//
	// These inode constructors are inode-level operations rather than
	// filesystem-level operations to allow client filesystems to mix different
	// implementations based on the new node's location in the
	// filesystem.

	// HasChildren returns true if the directory inode has any children.
	HasChildren() bool

	// NewFile creates a new regular file inode.
	NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error)

	// NewDir creates a new directory inode.
	NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error)

	// NewLink creates a new hardlink to a specified inode in this
	// directory. Implementations should create a new kernfs Dentry pointing to
	// target, and update target's link count.
	NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error)

	// NewSymlink creates a new symbolic link inode.
	NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error)

	// NewNode creates a new filesystem node for a mknod syscall.
	NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error)

	// Unlink removes a child dentry from this directory inode.
	Unlink(ctx context.Context, name string, child *vfs.Dentry) error

	// RmDir removes an empty child directory from this directory
	// inode. Implementations must update the parent directory's link count,
	// if required. Implementations are not responsible for checking that child
	// is a directory or that it is empty.
	RmDir(ctx context.Context, name string, child *vfs.Dentry) error

	// Rename is called on the source directory containing an inode being
	// renamed. child should point to the resolved child in the source
	// directory. If Rename replaces a dentry in the destination directory, it
	// should return the replaced dentry or nil otherwise.
	//
	// Precondition: Caller must serialize concurrent calls to Rename.
	Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error)
}

// inodeDynamicLookup groups the Inode methods used by dynamic directories and
// their children.
type inodeDynamicLookup interface {
	// Lookup should return an appropriate dentry if name should resolve to a
	// child of this dynamic directory inode. This gives the directory an
	// opportunity on every lookup to resolve additional entries that aren't
	// hashed into the directory. This is only called when the inode is a
	// directory. If the inode is not a directory, or if the directory only
	// contains a static set of children, the implementer can unconditionally
	// return an appropriate error (ENOTDIR and ENOENT respectively).
	//
	// The child returned by Lookup will be hashed into the VFS dentry tree. Its
	// lifetime can be controlled by the filesystem implementation with an
	// appropriate implementation of Valid.
	//
	// Lookup returns the child with an extra reference and the caller owns this
	// reference.
	Lookup(ctx context.Context, name string) (*vfs.Dentry, error)

	// Valid should return true if this inode is still valid, or false if it
	// needs to be resolved again by a call to Lookup.
	Valid(ctx context.Context) bool

	// IterDirents is used to iterate over dynamically created entries. It invokes
	// callback on each entry in the directory represented by the FileDescription.
	// 'offset' is the offset for the entire IterDirents call, which may include
	// results from the caller (e.g. "." and ".."). 'relOffset' is the offset
	// inside the entries returned by this IterDirents invocation. In other words,
	// 'offset' should be used to calculate each vfs.Dirent.NextOff as well as
	// the return value, while 'relOffset' is the place to start iteration.
	IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
}

// inodeSymlink groups the symlink-related methods of Inode.
type inodeSymlink interface {
	// Readlink returns the target of a symbolic link. If an inode is not a
	// symlink, the implementation should return EINVAL.
	Readlink(ctx context.Context) (string, error)

	// Getlink returns the target of a symbolic link, as used by path
	// resolution:
	//
	// - If the inode is a "magic link" (a link whose target is most accurately
	// represented as a VirtualDentry), Getlink returns (ok VirtualDentry, "",
	// nil). A reference is taken on the returned VirtualDentry.
	//
	// - If the inode is an ordinary symlink, Getlink returns (zero-value
	// VirtualDentry, symlink target, nil).
	//
	// - If the inode is not a symlink, Getlink returns (zero-value
	// VirtualDentry, "", EINVAL).
	Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error)
}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
new file mode 100644
index 000000000..dc407eb1d
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -0,0 +1,330 @@
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernfs_test

import (
	"bytes"
	"fmt"
	"testing"

	"github.com/google/go-cmp/cmp"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/contexttest"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

// defaultMode is a permission mode with the sticky bit and all rwx bits set.
const defaultMode linux.FileMode = 01777

// staticFileContent is the content given to the test files created by the
// tests below.
const staticFileContent = "This is sample content for a static test file."

// RootDentryFn is a generator function for creating the root dentry of a test
// filesystem. See newTestSystem.
type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry

// newTestSystem sets up a minimal environment for running a test, including an
// instance of a test filesystem. Tests can control the contents of the
// filesystem by providing an appropriate rootFn, which should return a
// pre-populated root dentry.
+func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System { + ctx := contexttest.Context(t) + creds := auth.CredentialsFromContext(ctx) + v := &vfs.VirtualFilesystem{} + if err := v.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{}) + if err != nil { + t.Fatalf("Failed to create testfs root mount: %v", err) + } + return testutil.NewSystem(ctx, t, v, mns) +} + +type fsType struct { + rootFn RootDentryFn +} + +type filesystem struct { + kernfs.Filesystem +} + +type file struct { + kernfs.DynamicBytesFile + content string +} + +func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry { + f := &file{} + f.content = content + f.DynamicBytesFile.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777) + + d := &kernfs.Dentry{} + d.Init(f) + return d +} + +func (f *file) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%s", f.content) + return nil +} + +type attrs struct { + kernfs.InodeAttrs +} + +func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return syserror.EPERM +} + +type readonlyDir struct { + attrs + kernfs.InodeNotSymlink + kernfs.InodeNoDynamicLookup + kernfs.InodeDirectoryNoNewChildren + kernfs.OrderedChildren + + locks vfs.FileLocks + + dentry kernfs.Dentry +} + +func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { + dir := &readonlyDir{} + dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) + dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + dir.dentry.Init(dir) + + dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents)) + + return &dir.dentry +} + 
// Open returns a generic directory file description backed by the directory's
// OrderedChildren.
func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
	if err != nil {
		return nil, err
	}
	return fd.VFSFileDescription(), nil
}

// dir is a writable directory inode: it supports creating subdirectories and
// files, and rejects links, symlinks and device nodes with EPERM.
type dir struct {
	attrs
	kernfs.InodeNotSymlink
	kernfs.InodeNoDynamicLookup
	kernfs.OrderedChildren

	locks vfs.FileLocks

	fs     *filesystem
	dentry kernfs.Dentry
}

// newDir creates a writable directory with the given mode and children, and
// returns its dentry.
func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
	dir := &dir{}
	dir.fs = fs
	dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
	dir.dentry.Init(dir)

	dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))

	return &dir.dentry
}

// Open returns a generic directory file description backed by the directory's
// OrderedChildren.
func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
	if err != nil {
		return nil, err
	}
	return fd.VFSFileDescription(), nil
}

// NewDir creates an empty subdirectory called name and bumps d's link count.
func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
	creds := auth.CredentialsFromContext(ctx)
	dir := d.fs.newDir(creds, opts.Mode, nil)
	dirVFSD := dir.VFSDentry()
	if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil {
		// Drop the new directory again if the name is already taken.
		dir.DecRef()
		return nil, err
	}
	d.IncLinks(1)
	return dirVFSD, nil
}

// NewFile creates an empty file called name in d.
func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
	creds := auth.CredentialsFromContext(ctx)
	f := d.fs.newFile(creds, "")
	fVFSD := f.VFSDentry()
	if err := d.OrderedChildren.Insert(name, fVFSD); err != nil {
		// Drop the new file again if the name is already taken.
		f.DecRef()
		return nil, err
	}
	return fVFSD, nil
}

// NewLink is not supported and always fails with EPERM.
func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) {
	return nil, syserror.EPERM
}

// NewSymlink is not supported and always fails with EPERM.
func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
	return nil, syserror.EPERM
}

// NewNode is not supported and always fails with EPERM.
func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
	return nil, syserror.EPERM
}

// Name returns the name reported by this filesystem type.
func (fsType) Name() string {
	return "kernfs"
}

// GetFilesystem creates a new test filesystem whose root dentry is produced
// by fst.rootFn.
func (fst fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
	fs := &filesystem{}
	fs.VFSFilesystem().Init(vfsObj, &fst, fs)
	root := fst.rootFn(creds, fs)
	return fs.VFSFilesystem(), root.VFSDentry(), nil
}

// -------------------- Remainder of the file are test cases --------------------

// TestBasic checks that a file placed in a read-only root can be resolved.
func TestBasic(t *testing.T) {
	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
			"file1": fs.newFile(creds, staticFileContent),
		})
	})
	defer sys.Destroy()
	sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef()
}

// TestMkdirGetDentry checks that a directory created with MkdirAt inside a
// writable directory can be resolved afterwards.
func TestMkdirGetDentry(t *testing.T) {
	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
			"dir1": fs.newDir(creds, 0755, nil),
		})
	})
	defer sys.Destroy()

	pop := sys.PathOpAtRoot("dir1/a new directory")
	if err := sys.VFS.MkdirAt(sys.Ctx, sys.Creds, pop, &vfs.MkdirOptions{Mode: 0755}); err != nil {
		t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err)
	}
	sys.GetDentryOrDie(pop).DecRef()
}

func TestReadStaticFile(t *testing.T) {
	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
			"file1": fs.newFile(creds, staticFileContent),
		})
	})
	defer sys.Destroy()

	pop := sys.PathOpAtRoot("file1")
+ fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + if err != nil { + t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) + } + defer fd.DecRef() + + content, err := sys.ReadToEnd(fd) + if err != nil { + t.Fatalf("Read failed: %v", err) + } + if diff := cmp.Diff(staticFileContent, content); diff != "" { + t.Fatalf("Read returned unexpected data:\n--- want\n+++ got\n%v", diff) + } +} + +func TestCreateNewFileInStaticDir(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + "dir1": fs.newDir(creds, 0755, nil), + }) + }) + defer sys.Destroy() + + pop := sys.PathOpAtRoot("dir1/newfile") + opts := &vfs.OpenOptions{Flags: linux.O_CREAT | linux.O_EXCL, Mode: defaultMode} + fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, opts) + if err != nil { + t.Fatalf("OpenAt(pop:%+v, opts:%+v) failed: %v", pop, opts, err) + } + + // Close the file. The file should persist. + fd.DecRef() + + fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + if err != nil { + t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) + } + fd.DecRef() +} + +func TestDirFDReadWrite(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, nil) + }) + defer sys.Destroy() + + pop := sys.PathOpAtRoot("/") + fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + if err != nil { + t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) + } + defer fd.DecRef() + + // Read/Write should fail for directory FDs. 
+ if _, err := fd.Read(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR { + t.Fatalf("Read for directory FD failed with unexpected error: %v", err) + } + if _, err := fd.Write(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EBADF { + t.Fatalf("Write for directory FD failed with unexpected error: %v", err) + } +} + +func TestDirFDIterDirents(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + // Fill root with nodes backed by various inode implementations. + "dir1": fs.newReadonlyDir(creds, 0755, nil), + "dir2": fs.newDir(creds, 0755, map[string]*kernfs.Dentry{ + "dir3": fs.newDir(creds, 0755, nil), + }), + "file1": fs.newFile(creds, staticFileContent), + }) + }) + defer sys.Destroy() + + pop := sys.PathOpAtRoot("/") + sys.AssertAllDirentTypes(sys.ListDirents(pop), map[string]testutil.DirentType{ + "dir1": linux.DT_DIR, + "dir2": linux.DT_DIR, + "file1": linux.DT_REG, + }) +} diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go new file mode 100644 index 000000000..2ab3f53fd --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -0,0 +1,66 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package kernfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// StaticSymlink provides an Inode implementation for symlinks that point to +// a immutable target. +type StaticSymlink struct { + InodeAttrs + InodeNoopRefCount + InodeSymlink + + target string +} + +var _ Inode = (*StaticSymlink)(nil) + +// NewStaticSymlink creates a new symlink file pointing to 'target'. +func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) *Dentry { + inode := &StaticSymlink{} + inode.Init(creds, devMajor, devMinor, ino, target) + + d := &Dentry{} + d.Init(inode) + return d +} + +// Init initializes the instance. +func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor uint32, ino uint64, target string) { + s.target = target + s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777) +} + +// Readlink implements Inode. +func (s *StaticSymlink) Readlink(_ context.Context) (string, error) { + return s.target, nil +} + +// Getlink implements Inode.Getlink. +func (s *StaticSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) { + return vfs.VirtualDentry{}, s.target, nil +} + +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. 
+func (*StaticSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return syserror.EPERM +} diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD new file mode 100644 index 000000000..8cf5b35d3 --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/BUILD @@ -0,0 +1,41 @@ +load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +licenses(["notice"]) + +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "overlay", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "dentry", + }, +) + +go_library( + name = "overlay", + srcs = [ + "copy_up.go", + "directory.go", + "filesystem.go", + "fstree.go", + "non_directory.go", + "overlay.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go new file mode 100644 index 000000000..8f8dcfafe --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/copy_up.go @@ -0,0 +1,262 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package overlay + +import ( + "fmt" + "io" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +func (d *dentry) isCopiedUp() bool { + return atomic.LoadUint32(&d.copiedUp) != 0 +} + +// copyUpLocked ensures that d exists on the upper layer, i.e. d.upperVD.Ok(). +// +// Preconditions: filesystem.renameMu must be locked. +func (d *dentry) copyUpLocked(ctx context.Context) error { + // Fast path. + if d.isCopiedUp() { + return nil + } + + ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT + switch ftype { + case linux.S_IFREG, linux.S_IFDIR, linux.S_IFLNK, linux.S_IFBLK, linux.S_IFCHR: + // Can be copied-up. + default: + // Can't be copied-up. + return syserror.EPERM + } + + // Ensure that our parent directory is copied-up. + if d.parent == nil { + // d is a filesystem root with no upper layer. + return syserror.EROFS + } + if err := d.parent.copyUpLocked(ctx); err != nil { + return err + } + + d.copyMu.Lock() + defer d.copyMu.Unlock() + if d.upperVD.Ok() { + // Raced with another call to d.copyUpLocked(). + return nil + } + if d.vfsd.IsDead() { + // Raced with deletion of d. + return syserror.ENOENT + } + + // Perform copy-up. 
+ vfsObj := d.fs.vfsfs.VirtualFilesystem() + newpop := vfs.PathOperation{ + Root: d.parent.upperVD, + Start: d.parent.upperVD, + Path: fspath.Parse(d.name), + } + cleanupUndoCopyUp := func() { + var err error + if ftype == linux.S_IFDIR { + err = vfsObj.RmdirAt(ctx, d.fs.creds, &newpop) + } else { + err = vfsObj.UnlinkAt(ctx, d.fs.creds, &newpop) + } + if err != nil { + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err) + } + } + switch ftype { + case linux.S_IFREG: + oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVDs[0], + Start: d.lowerVDs[0], + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + if err != nil { + return err + } + defer oldFD.DecRef() + newFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &newpop, &vfs.OpenOptions{ + Flags: linux.O_WRONLY | linux.O_CREAT | linux.O_EXCL, + Mode: linux.FileMode(d.mode &^ linux.S_IFMT), + }) + if err != nil { + return err + } + defer newFD.DecRef() + bufIOSeq := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size + for { + readN, readErr := oldFD.Read(ctx, bufIOSeq, vfs.ReadOptions{}) + if readErr != nil && readErr != io.EOF { + cleanupUndoCopyUp() + return readErr + } + total := int64(0) + for total < readN { + writeN, writeErr := newFD.Write(ctx, bufIOSeq.DropFirst64(total), vfs.WriteOptions{}) + total += writeN + if writeErr != nil { + cleanupUndoCopyUp() + return writeErr + } + } + if readErr == io.EOF { + break + } + } + if err := newFD.SetStat(ctx, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: d.uid, + GID: d.gid, + }, + }); err != nil { + cleanupUndoCopyUp() + return err + } + d.upperVD = newFD.VirtualDentry() + d.upperVD.IncRef() + + case linux.S_IFDIR: + if err := vfsObj.MkdirAt(ctx, d.fs.creds, &newpop, &vfs.MkdirOptions{ + Mode: linux.FileMode(d.mode &^ linux.S_IFMT), + }); err != nil { + return err + } + if err := vfsObj.SetStatAt(ctx, d.fs.creds, 
&newpop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: d.uid, + GID: d.gid, + }, + }); err != nil { + cleanupUndoCopyUp() + return err + } + upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{}) + if err != nil { + cleanupUndoCopyUp() + return err + } + d.upperVD = upperVD + + case linux.S_IFLNK: + target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVDs[0], + Start: d.lowerVDs[0], + }) + if err != nil { + return err + } + if err := vfsObj.SymlinkAt(ctx, d.fs.creds, &newpop, target); err != nil { + return err + } + if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID, + Mode: uint16(d.mode), + UID: d.uid, + GID: d.gid, + }, + }); err != nil { + cleanupUndoCopyUp() + return err + } + upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{}) + if err != nil { + cleanupUndoCopyUp() + return err + } + d.upperVD = upperVD + + case linux.S_IFBLK, linux.S_IFCHR: + lowerStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVDs[0], + Start: d.lowerVDs[0], + }, &vfs.StatOptions{}) + if err != nil { + return err + } + if err := vfsObj.MknodAt(ctx, d.fs.creds, &newpop, &vfs.MknodOptions{ + Mode: linux.FileMode(d.mode), + DevMajor: lowerStat.RdevMajor, + DevMinor: lowerStat.RdevMinor, + }); err != nil { + return err + } + if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: d.uid, + GID: d.gid, + }, + }); err != nil { + cleanupUndoCopyUp() + return err + } + upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{}) + if err != nil { + cleanupUndoCopyUp() + return err + } + d.upperVD = upperVD + + default: + // Should have rejected this at the beginning of this function? 
+ panic(fmt.Sprintf("unexpected file type %o", ftype)) + } + + // TODO(gvisor.dev/issue/1199): copy up xattrs + + // Update the dentry's device and inode numbers (except for directories, + // for which these remain overlay-assigned). + if ftype != linux.S_IFDIR { + upperStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.upperVD, + Start: d.upperVD, + }, &vfs.StatOptions{ + Mask: linux.STATX_INO, + }) + if err != nil { + d.upperVD.DecRef() + d.upperVD = vfs.VirtualDentry{} + cleanupUndoCopyUp() + return err + } + if upperStat.Mask&linux.STATX_INO == 0 { + d.upperVD.DecRef() + d.upperVD = vfs.VirtualDentry{} + cleanupUndoCopyUp() + return syserror.EREMOTE + } + atomic.StoreUint32(&d.devMajor, upperStat.DevMajor) + atomic.StoreUint32(&d.devMinor, upperStat.DevMinor) + atomic.StoreUint64(&d.ino, upperStat.Ino) + } + + atomic.StoreUint32(&d.copiedUp, 1) + return nil +} diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go new file mode 100644 index 000000000..f5c2462a5 --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/directory.go @@ -0,0 +1,287 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package overlay + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +func (d *dentry) isDir() bool { + return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR +} + +// Preconditions: d.dirMu must be locked. d.isDir(). +func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string]bool, error) { + vfsObj := d.fs.vfsfs.VirtualFilesystem() + var readdirErr error + whiteouts := make(map[string]bool) + var maybeWhiteouts []string + d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool { + layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_DIRECTORY, + }) + if err != nil { + readdirErr = err + return false + } + defer layerFD.DecRef() + + // Reuse slice allocated for maybeWhiteouts from a previous layer to + // reduce allocations. + maybeWhiteouts = maybeWhiteouts[:0] + if err := layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { + if dirent.Name == "." || dirent.Name == ".." { + return nil + } + if _, ok := whiteouts[dirent.Name]; ok { + // This file has been whited-out in a previous layer. + return nil + } + if dirent.Type == linux.DT_CHR { + // We have to determine if this is a whiteout, which doesn't + // count against the directory's emptiness. However, we can't + // do so while holding locks held by layerFD.IterDirents(). + maybeWhiteouts = append(maybeWhiteouts, dirent.Name) + return nil + } + // Non-whiteout file in the directory prevents rmdir. 
+ return syserror.ENOTEMPTY + })); err != nil { + readdirErr = err + return false + } + + for _, maybeWhiteoutName := range maybeWhiteouts { + stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + Path: fspath.Parse(maybeWhiteoutName), + }, &vfs.StatOptions{}) + if err != nil { + readdirErr = err + return false + } + if stat.RdevMajor != 0 || stat.RdevMinor != 0 { + // This file is a real character device, not a whiteout. + readdirErr = syserror.ENOTEMPTY + return false + } + whiteouts[maybeWhiteoutName] = isUpper + } + // Continue iteration since we haven't found any non-whiteout files in + // this directory yet. + return true + }) + return whiteouts, readdirErr +} + +type directoryFD struct { + fileDescription + vfs.DirectoryFileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + mu sync.Mutex + off int64 + dirents []vfs.Dirent +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *directoryFD) Release() { +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + fd.mu.Lock() + defer fd.mu.Unlock() + + d := fd.dentry() + if fd.dirents == nil { + ds, err := d.getDirents(ctx) + if err != nil { + return err + } + fd.dirents = ds + } + + for fd.off < int64(len(fd.dirents)) { + if err := cb.Handle(fd.dirents[fd.off]); err != nil { + return err + } + fd.off++ + } + return nil +} + +// Preconditions: d.isDir(). 
+func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { + d.fs.renameMu.RLock() + defer d.fs.renameMu.RUnlock() + d.dirMu.Lock() + defer d.dirMu.Unlock() + + if d.dirents != nil { + return d.dirents, nil + } + + parent := genericParentOrSelf(d) + dirents := []vfs.Dirent{ + { + Name: ".", + Type: linux.DT_DIR, + Ino: d.ino, + NextOff: 1, + }, + { + Name: "..", + Type: uint8(atomic.LoadUint32(&parent.mode) >> 12), + Ino: parent.ino, + NextOff: 2, + }, + } + + // Merge dirents from all layers comprising this directory. + vfsObj := d.fs.vfsfs.VirtualFilesystem() + var readdirErr error + prevDirents := make(map[string]struct{}) + var maybeWhiteouts []vfs.Dirent + d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool { + layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_DIRECTORY, + }) + if err != nil { + readdirErr = err + return false + } + defer layerFD.DecRef() + + // Reuse slice allocated for maybeWhiteouts from a previous layer to + // reduce allocations. + maybeWhiteouts = maybeWhiteouts[:0] + if err := layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { + if dirent.Name == "." || dirent.Name == ".." { + return nil + } + if _, ok := prevDirents[dirent.Name]; ok { + // This file is hidden by, or merged with, another file with + // the same name in a previous layer. + return nil + } + prevDirents[dirent.Name] = struct{}{} + if dirent.Type == linux.DT_CHR { + // We can't determine if this file is a whiteout while holding + // locks held by layerFD.IterDirents(). 
+ maybeWhiteouts = append(maybeWhiteouts, dirent) + return nil + } + dirent.NextOff = int64(len(dirents) + 1) + dirents = append(dirents, dirent) + return nil + })); err != nil { + readdirErr = err + return false + } + + for _, dirent := range maybeWhiteouts { + stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + Path: fspath.Parse(dirent.Name), + }, &vfs.StatOptions{}) + if err != nil { + readdirErr = err + return false + } + if stat.RdevMajor == 0 && stat.RdevMinor == 0 { + // This file is a whiteout; don't emit a dirent for it. + continue + } + dirent.NextOff = int64(len(dirents) + 1) + dirents = append(dirents, dirent) + } + return true + }) + if readdirErr != nil { + return nil, readdirErr + } + + // Cache dirents for future directoryFDs. + d.dirents = dirents + return dirents, nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + + switch whence { + case linux.SEEK_SET: + if offset < 0 { + return 0, syserror.EINVAL + } + if offset == 0 { + // Ensure that the next call to fd.IterDirents() calls + // fd.dentry().getDirents(). + fd.dirents = nil + } + fd.off = offset + return fd.off, nil + case linux.SEEK_CUR: + offset += fd.off + if offset < 0 { + return 0, syserror.EINVAL + } + // Don't clear fd.dirents in this case, even if offset == 0. + fd.off = offset + return fd.off, nil + default: + return 0, syserror.EINVAL + } +} + +// Sync implements vfs.FileDescriptionImpl.Sync. Forwards sync to the upper +// layer, if there is one. The lower layer doesn't need to sync because it +// never changes. 
+func (fd *directoryFD) Sync(ctx context.Context) error { + d := fd.dentry() + if !d.isCopiedUp() { + return nil + } + vfsObj := d.fs.vfsfs.VirtualFilesystem() + pop := vfs.PathOperation{ + Root: d.upperVD, + Start: d.upperVD, + } + upperFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY}) + if err != nil { + return err + } + err = upperFD.Sync(ctx) + upperFD.DecRef() + return err +} diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go new file mode 100644 index 000000000..ff82e1f20 --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -0,0 +1,1364 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package overlay + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for +// opaque directories. 
+// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE +const _OVL_XATTR_OPAQUE = "trusted.overlay.opaque" + +func isWhiteout(stat *linux.Statx) bool { + return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0 +} + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *filesystem) Sync(ctx context.Context) error { + if fs.opts.UpperRoot.Ok() { + return fs.opts.UpperRoot.Mount().Filesystem().Impl().Sync(ctx) + } + return nil +} + +var dentrySlicePool = sync.Pool{ + New: func() interface{} { + ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity + return &ds + }, +} + +func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry { + if ds == nil { + ds = dentrySlicePool.Get().(*[]*dentry) + } + *ds = append(*ds, d) + return ds +} + +// Preconditions: ds != nil. +func putDentrySlice(ds *[]*dentry) { + // Allow dentries to be GC'd. + for i := range *ds { + (*ds)[i] = nil + } + *ds = (*ds)[:0] + dentrySlicePool.Put(ds) +} + +// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls +// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for +// writing. +// +// ds is a pointer-to-pointer since defer evaluates its arguments immediately, +// but dentry slices are allocated lazily, and it's much easier to say "defer +// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { +// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. 
+func (fs *filesystem) renameMuRUnlockAndCheckDrop(ds **[]*dentry) { + fs.renameMu.RUnlock() + if *ds == nil { + return + } + if len(**ds) != 0 { + fs.renameMu.Lock() + for _, d := range **ds { + d.checkDropLocked() + } + fs.renameMu.Unlock() + } + putDentrySlice(*ds) +} + +func (fs *filesystem) renameMuUnlockAndCheckDrop(ds **[]*dentry) { + if *ds == nil { + fs.renameMu.Unlock() + return + } + for _, d := range **ds { + d.checkDropLocked() + } + fs.renameMu.Unlock() + putDentrySlice(*ds) +} + +// stepLocked resolves rp.Component() to an existing file, starting from the +// given directory. +// +// Dentries which may have a reference count of zero, and which therefore +// should be dropped once traversal is complete, are appended to ds. +// +// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +// !rp.Done(). +func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } +afterSymlink: + name := rp.Component() + if name == "." { + rp.Advance() + return d, nil + } + if name == ".." 
{ + if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return d, nil + } + if err := rp.CheckMount(&d.parent.vfsd); err != nil { + return nil, err + } + rp.Advance() + return d.parent, nil + } + child, err := fs.getChildLocked(ctx, d, name, ds) + if err != nil { + return nil, err + } + if err := rp.CheckMount(&child.vfsd); err != nil { + return nil, err + } + if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + goto afterSymlink // don't check the current directory again + } + rp.Advance() + return child, nil +} + +// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { + if child, ok := parent.children[name]; ok { + return child, nil + } + child, err := fs.lookupLocked(ctx, parent, name) + if err != nil { + return nil, err + } + if parent.children == nil { + parent.children = make(map[string]*dentry) + } + parent.children[name] = child + // child's refcount is initially 0, so it may be dropped after traversal. + *ds = appendDentry(*ds, child) + return child, nil +} + +// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. 
+func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) { + childPath := fspath.Parse(name) + child := fs.newDentry() + existsOnAnyLayer := false + var lookupErr error + + vfsObj := fs.vfsfs.VirtualFilesystem() + parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool { + childVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{ + Root: parentVD, + Start: parentVD, + Path: childPath, + }, &vfs.GetDentryOptions{}) + if err == syserror.ENOENT || err == syserror.ENAMETOOLONG { + // The file doesn't exist on this layer. Proceed to the next one. + return true + } + if err != nil { + lookupErr = err + return false + } + + mask := uint32(linux.STATX_TYPE) + if !existsOnAnyLayer { + // Mode, UID, GID, and (for non-directories) inode number come from + // the topmost layer on which the file exists. + mask |= linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO + } + stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: childVD, + Start: childVD, + }, &vfs.StatOptions{ + Mask: mask, + }) + if err != nil { + lookupErr = err + return false + } + if stat.Mask&mask != mask { + lookupErr = syserror.EREMOTE + return false + } + + if isWhiteout(&stat) { + // This is a whiteout, so it "doesn't exist" on this layer, and + // layers below this one are ignored. + return false + } + isDir := stat.Mode&linux.S_IFMT == linux.S_IFDIR + if existsOnAnyLayer && !isDir { + // Directories are not merged with non-directory files from lower + // layers; instead, layers including and below the first + // non-directory file are ignored. (This file must be a directory + // on previous layers, since lower layers aren't searched for + // non-directory files.) + return false + } + + // Update child to include this layer. 
+ if isUpper { + child.upperVD = childVD + child.copiedUp = 1 + } else { + child.lowerVDs = append(child.lowerVDs, childVD) + } + if !existsOnAnyLayer { + existsOnAnyLayer = true + child.mode = uint32(stat.Mode) + child.uid = stat.UID + child.gid = stat.GID + child.devMajor = stat.DevMajor + child.devMinor = stat.DevMinor + child.ino = stat.Ino + } + + // For non-directory files, only the topmost layer that contains a file + // matters. + if !isDir { + return false + } + + // Directories are merged with directories from lower layers if they + // are not explicitly opaque. + opaqueVal, err := vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{ + Root: childVD, + Start: childVD, + }, &vfs.GetxattrOptions{ + Name: _OVL_XATTR_OPAQUE, + Size: 1, + }) + return !(err == nil && opaqueVal == "y") + }) + + if lookupErr != nil { + child.destroyLocked() + return nil, lookupErr + } + if !existsOnAnyLayer { + child.destroyLocked() + return nil, syserror.ENOENT + } + + // Device and inode numbers were copied from the topmost layer above; + // override them if necessary. + if child.isDir() { + child.devMajor = linux.UNNAMED_MAJOR + child.devMinor = fs.dirDevMinor + child.ino = fs.newDirIno() + } else if !child.upperVD.Ok() { + child.devMajor = linux.UNNAMED_MAJOR + child.devMinor = fs.lowerDevMinors[child.lowerVDs[0].Mount().Filesystem()] + } + + parent.IncRef() + child.parent = parent + child.name = name + return child, nil +} + +// lookupLayerLocked is similar to lookupLocked, but only returns information +// about the file rather than a dentry. +// +// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. 
+func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, name string) (lookupLayer, error) { + childPath := fspath.Parse(name) + lookupLayer := lookupLayerNone + var lookupErr error + + parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool { + stat, err := fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: parentVD, + Start: parentVD, + Path: childPath, + }, &vfs.StatOptions{ + Mask: linux.STATX_TYPE, + }) + if err == syserror.ENOENT || err == syserror.ENAMETOOLONG { + // The file doesn't exist on this layer. Proceed to the next + // one. + return true + } + if err != nil { + lookupErr = err + return false + } + if stat.Mask&linux.STATX_TYPE == 0 { + // Linux's overlayfs tends to return EREMOTE in cases where a file + // is unusable for reasons that are not better captured by another + // errno. + lookupErr = syserror.EREMOTE + return false + } + if isWhiteout(&stat) { + // This is a whiteout, so it "doesn't exist" on this layer, and + // layers below this one are ignored. + if isUpper { + lookupLayer = lookupLayerUpperWhiteout + } + return false + } + // The file exists; we can stop searching. + if isUpper { + lookupLayer = lookupLayerUpper + } else { + lookupLayer = lookupLayerLower + } + return false + }) + + return lookupLayer, lookupErr +} + +type lookupLayer int + +const ( + // lookupLayerNone indicates that no file exists at the given path on the + // upper layer, and is either whited out or does not exist on lower layers. + // Therefore, the file does not exist in the overlay filesystem, and file + // creation may proceed normally (if an upper layer exists). + lookupLayerNone lookupLayer = iota + + // lookupLayerLower indicates that no file exists at the given path on the + // upper layer, but exists on a lower layer. Therefore, the file exists in + // the overlay filesystem, but must be copied-up before mutation. 
+ lookupLayerLower + + // lookupLayerUpper indicates that a non-whiteout file exists at the given + // path on the upper layer. Therefore, the file exists in the overlay + // filesystem, and is already copied-up. + lookupLayerUpper + + // lookupLayerUpperWhiteout indicates that a whiteout exists at the given + // path on the upper layer. Therefore, the file does not exist in the + // overlay filesystem, and file creation must remove the whiteout before + // proceeding. + lookupLayerUpperWhiteout +) + +func (ll lookupLayer) existsInOverlay() bool { + return ll == lookupLayerLower || ll == lookupLayerUpper +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory, starting from the given directory (which is usually +// rp.Start().Impl().(*dentry)). It does not check that the returned directory +// is searchable by the provider of rp. +// +// Preconditions: fs.renameMu must be locked. !rp.Done(). +func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { + for !rp.Final() { + d.dirMu.Lock() + next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + d = next + } + if !d.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// resolveLocked resolves rp to an existing file. +// +// Preconditions: fs.renameMu must be locked. +func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { + d := rp.Start().Impl().(*dentry) + for !rp.Done() { + d.dirMu.Lock() + next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + d = next + } + if rp.MustBeDir() && !d.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// doCreateAt checks that creating a file at rp is permitted, then invokes +// create to do so. 
//
// Preconditions: !rp.Done(). For the final path component in rp,
// !rp.ShouldFollowSymlink().
//
// dir states whether the file being created is a directory; when it is false
// a path that must resolve to a directory is rejected with ENOENT. create is
// called with the (copied-up) parent directory, the new file's name, and
// whether a whiteout for that name currently exists on the upper layer.
// create runs with fs.renameMu read-locked and parent.dirMu locked.
func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckDrop(&ds)
	start := rp.Start().Impl().(*dentry)
	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
	if err != nil {
		return err
	}
	// Creating an entry requires write and search permission on the parent.
	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	name := rp.Component()
	if name == "." || name == ".." {
		return syserror.EEXIST
	}
	if !dir && rp.MustBeDir() {
		return syserror.ENOENT
	}
	// Nothing can be created inside a directory that has been deleted.
	if parent.vfsd.IsDead() {
		return syserror.ENOENT
	}
	mnt := rp.Mount()
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()
	parent.dirMu.Lock()
	defer parent.dirMu.Unlock()

	// Determine if a file already exists at name.
	if _, ok := parent.children[name]; ok {
		return syserror.EEXIST
	}
	childLayer, err := fs.lookupLayerLocked(ctx, parent, name)
	if err != nil {
		return err
	}
	if childLayer.existsInOverlay() {
		return syserror.EEXIST
	}

	// Ensure that the parent directory is copied-up so that we can create the
	// new file in the upper layer.
	if err := parent.copyUpLocked(ctx); err != nil {
		return err
	}

	// Finally create the new file.
	if err := create(parent, name, childLayer == lookupLayerUpperWhiteout); err != nil {
		return err
	}
	// The parent's cached directory entries no longer reflect its contents.
	parent.dirents = nil
	return nil
}

// Preconditions: pop's parent directory has been copied up.
+func (fs *filesystem) createWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) error { + return vfsObj.MknodAt(ctx, fs.creds, pop, &vfs.MknodOptions{ + Mode: linux.S_IFCHR, // permissions == include/linux/fs.h:WHITEOUT_MODE == 0 + // DevMajor == DevMinor == 0, from include/linux/fs.h:WHITEOUT_DEV + }) +} + +func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) { + if err := fs.createWhiteout(ctx, vfsObj, pop); err != nil { + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err) + } +} + +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.checkPermissions(creds, ats) +} + +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } + layerVD := d.topLayer() + return fs.vfsfs.VirtualFilesystem().BoundEndpointAt(ctx, fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }, &opts) +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. 
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + if opts.CheckSearchable { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + } + d.IncRef() + return &d.vfsd, nil +} + +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + start := rp.Start().Impl().(*dentry) + d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + d.IncRef() + return &d.vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. 
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + if rp.Mount() != vd.Mount() { + return syserror.EXDEV + } + old := vd.Dentry().Impl().(*dentry) + if old.isDir() { + return syserror.EPERM + } + if err := old.copyUpLocked(ctx); err != nil { + return err + } + vfsObj := fs.vfsfs.VirtualFilesystem() + newpop := vfs.PathOperation{ + Root: parent.upperVD, + Start: parent.upperVD, + Path: fspath.Parse(childName), + } + if haveUpperWhiteout { + if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil { + return err + } + } + if err := vfsObj.LinkAt(ctx, fs.creds, &vfs.PathOperation{ + Root: old.upperVD, + Start: old.upperVD, + }, &newpop); err != nil { + if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop) + } + return err + } + creds := rp.Credentials() + if err := vfsObj.SetStatAt(ctx, fs.creds, &newpop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + }, + }); err != nil { + if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil { + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr) + } else if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop) + } + return err + } + return nil + }) +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. 
func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
		vfsObj := fs.vfsfs.VirtualFilesystem()
		// All operations below act on the new name inside the parent's upper
		// layer directory.
		pop := vfs.PathOperation{
			Root:  parent.upperVD,
			Start: parent.upperVD,
			Path:  fspath.Parse(childName),
		}
		if haveUpperWhiteout {
			// The whiteout occupies the name the directory needs.
			if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
				return err
			}
		}
		if err := vfsObj.MkdirAt(ctx, fs.creds, &pop, &opts); err != nil {
			if haveUpperWhiteout {
				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
			}
			return err
		}
		// The directory was created with fs.creds; hand ownership to the
		// caller.
		creds := rp.Credentials()
		if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
			Stat: linux.Statx{
				Mask: linux.STATX_UID | linux.STATX_GID,
				UID:  uint32(creds.EffectiveKUID),
				GID:  uint32(creds.EffectiveKGID),
			},
		}); err != nil {
			// Undo the mkdir, and restore the whiteout only if the directory
			// was actually removed.
			if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr)
			} else if haveUpperWhiteout {
				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
			}
			return err
		}
		if haveUpperWhiteout {
			// There may be directories on lower layers (previously hidden by
			// the whiteout) that the new directory should not be merged with.
			// Mark it opaque to prevent merging.
			if err := vfsObj.SetxattrAt(ctx, fs.creds, &pop, &vfs.SetxattrOptions{
				Name:  _OVL_XATTR_OPAQUE,
				Value: "y",
			}); err != nil {
				// Undo the mkdir and, if that worked, restore the whiteout.
				if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
					ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr)
				} else {
					fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
				}
				return err
			}
		}
		return nil
	})
}

// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + // Disallow attempts to create whiteouts. + if opts.Mode&linux.S_IFMT == linux.S_IFCHR && opts.DevMajor == 0 && opts.DevMinor == 0 { + return syserror.EPERM + } + vfsObj := fs.vfsfs.VirtualFilesystem() + pop := vfs.PathOperation{ + Root: parent.upperVD, + Start: parent.upperVD, + Path: fspath.Parse(childName), + } + if haveUpperWhiteout { + if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil { + return err + } + } + if err := vfsObj.MknodAt(ctx, fs.creds, &pop, &opts); err != nil { + if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return err + } + creds := rp.Credentials() + if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + }, + }); err != nil { + if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr) + } else if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return err + } + return nil + }) +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. 
func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	// mayCreate: O_CREAT was given. mustCreate: both O_CREAT and O_EXCL were
	// given, so an existing file is an error.
	mayCreate := opts.Flags&linux.O_CREAT != 0
	mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL)

	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckDrop(&ds)

	start := rp.Start().Impl().(*dentry)
	if rp.Done() {
		// The path has no components left: the target is start itself, which
		// therefore already exists.
		if mustCreate {
			return nil, syserror.EEXIST
		}
		return start.openLocked(ctx, rp, &opts)
	}

	// Re-entered via goto after a trailing symlink has been handed to rp.
afterTrailingSymlink:
	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
	if err != nil {
		return nil, err
	}
	// Check for search permission in the parent directory.
	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
		return nil, err
	}
	// Determine whether or not we need to create a file.
	parent.dirMu.Lock()
	child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
	if err == syserror.ENOENT && mayCreate {
		// parent.dirMu stays held across the creation and is released here.
		fd, err := fs.createAndOpenLocked(ctx, rp, parent, &opts, &ds)
		parent.dirMu.Unlock()
		return fd, err
	}
	if err != nil {
		parent.dirMu.Unlock()
		return nil, err
	}
	// Open existing child or follow symlink.
	parent.dirMu.Unlock()
	if mustCreate {
		return nil, syserror.EEXIST
	}
	if child.isSymlink() && rp.ShouldFollowSymlink() {
		target, err := child.readlink(ctx)
		if err != nil {
			return nil, err
		}
		if err := rp.HandleSymlink(target); err != nil {
			return nil, err
		}
		// Continue resolution from the symlink's parent directory.
		start = parent
		goto afterTrailingSymlink
	}
	return child.openLocked(ctx, rp, &opts)
}

// Preconditions: fs.renameMu must be locked.
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(opts) + if err := d.checkPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + if ats.MayWrite() { + if err := d.copyUpLocked(ctx); err != nil { + return nil, err + } + } + mnt := rp.Mount() + + // Directory FDs open FDs from each layer when directory entries are read, + // so they don't require opening an FD from d.topLayer() up front. + ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT + if ftype == linux.S_IFDIR { + // Can't open directories with O_CREAT. + if opts.Flags&linux.O_CREAT != 0 { + return nil, syserror.EISDIR + } + // Can't open directories writably. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EISDIR + } + if opts.Flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + fd := &directoryFD{} + fd.LockFD.Init(&d.locks) + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil + } + + layerVD, isUpper := d.topLayerInfo() + layerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }, opts) + if err != nil { + return nil, err + } + layerFlags := layerFD.StatusFlags() + fd := &nonDirectoryFD{ + copiedUp: isUpper, + cachedFD: layerFD, + cachedFlags: layerFlags, + } + fd.LockFD.Init(&d.locks) + layerFDOpts := layerFD.Options() + if err := fd.vfsfd.Init(fd, layerFlags, mnt, &d.vfsd, &layerFDOpts); err != nil { + layerFD.DecRef() + return nil, err + } + return &fd.vfsfd, nil +} + +// Preconditions: parent.dirMu must be locked. parent does not already contain +// a child named rp.Component(). 
+func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) { + creds := rp.Credentials() + if err := parent.checkPermissions(creds, vfs.MayWrite); err != nil { + return nil, err + } + if parent.vfsd.IsDead() { + return nil, syserror.ENOENT + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return nil, err + } + defer mnt.EndWrite() + + if err := parent.copyUpLocked(ctx); err != nil { + return nil, err + } + + vfsObj := fs.vfsfs.VirtualFilesystem() + childName := rp.Component() + pop := vfs.PathOperation{ + Root: parent.upperVD, + Start: parent.upperVD, + Path: fspath.Parse(childName), + } + // We don't know if a whiteout exists on the upper layer; speculatively + // unlink it. + // + // TODO(gvisor.dev/issue/1199): Modify OpenAt => stepLocked so that we do + // know whether a whiteout exists. + var haveUpperWhiteout bool + switch err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err { + case nil: + haveUpperWhiteout = true + case syserror.ENOENT: + haveUpperWhiteout = false + default: + return nil, err + } + // Create the file on the upper layer, and get an FD representing it. + upperFD, err := vfsObj.OpenAt(ctx, fs.creds, &pop, &vfs.OpenOptions{ + Flags: opts.Flags&^vfs.FileCreationFlags | linux.O_CREAT | linux.O_EXCL, + Mode: opts.Mode, + }) + if err != nil { + if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return nil, err + } + // Change the file's owner to the caller. We can't use upperFD.SetStat() + // because it will pick up creds from ctx. 
+ if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + }, + }); err != nil { + if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr) + } else if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return nil, err + } + // Re-lookup to get a dentry representing the new file, which is needed for + // the returned FD. + child, err := fs.getChildLocked(ctx, parent, childName, ds) + if err != nil { + if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr) + } else if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return nil, err + } + // Finally construct the overlay FD. + upperFlags := upperFD.StatusFlags() + fd := &nonDirectoryFD{ + copiedUp: true, + cachedFD: upperFD, + cachedFlags: upperFlags, + } + fd.LockFD.Init(&child.locks) + upperFDOpts := upperFD.Options() + if err := fd.vfsfd.Init(fd, upperFlags, mnt, &child.vfsd, &upperFDOpts); err != nil { + upperFD.DecRef() + // Don't bother with cleanup; the file was created successfully, we + // just can't open it anymore for some reason. + return nil, err + } + return &fd.vfsfd, nil +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. 
func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckDrop(&ds)
	d, err := fs.resolveLocked(ctx, rp, &ds)
	if err != nil {
		return "", err
	}
	// Read the link target from the file's topmost layer.
	layerVD := d.topLayer()
	return fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
		Root:  layerVD,
		Start: layerVD,
	})
}

// RenameAt implements vfs.FilesystemImpl.RenameAt.
//
// Rename is not implemented yet: after validating its arguments this always
// fails with EXDEV (or an earlier validation error).
func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
	// No rename flags are supported.
	if opts.Flags != 0 {
		return syserror.EINVAL
	}

	var ds *[]*dentry
	// Unlike the other operations, rename takes renameMu for writing.
	fs.renameMu.Lock()
	defer fs.renameMuUnlockAndCheckDrop(&ds)
	newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds)
	if err != nil {
		return err
	}
	newName := rp.Component()
	if newName == "." || newName == ".." {
		return syserror.EBUSY
	}
	mnt := rp.Mount()
	if mnt != oldParentVD.Mount() {
		return syserror.EXDEV
	}
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()

	// FIXME(gvisor.dev/issue/1199): Actually implement rename.
	_ = newParent
	return syserror.EXDEV
}

// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckDrop(&ds)
	start := rp.Start().Impl().(*dentry)
	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
	if err != nil {
		return err
	}
	// Removing an entry requires write and search permission on the parent.
	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	if err := rp.Mount().CheckBeginWrite(); err != nil {
		return err
	}
	defer rp.Mount().EndWrite()
	name := rp.Component()
	if name == "." {
		return syserror.EINVAL
	}
	if name == ".." {
		return syserror.ENOTEMPTY
	}
	vfsObj := rp.VirtualFilesystem()
	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef()
	parent.dirMu.Lock()
	defer parent.dirMu.Unlock()

	// Ensure that parent is copied-up before potentially holding child.copyMu
	// below.
	if err := parent.copyUpLocked(ctx); err != nil {
		return err
	}

	// Unlike UnlinkAt, we need a dentry representing the child directory being
	// removed in order to verify that it's empty.
	child, err := fs.getChildLocked(ctx, parent, name, &ds)
	if err != nil {
		return err
	}
	if !child.isDir() {
		return syserror.ENOTDIR
	}
	child.dirMu.Lock()
	defer child.dirMu.Unlock()
	// whiteouts maps each whiteout's name to whether it is on the upper
	// layer, as used by the loops below.
	whiteouts, err := child.collectWhiteoutsForRmdirLocked(ctx)
	if err != nil {
		return err
	}
	child.copyMu.RLock()
	defer child.copyMu.RUnlock()
	// From here on, every return must either abort or commit the deletion.
	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
		return err
	}

	pop := vfs.PathOperation{
		Root:  parent.upperVD,
		Start: parent.upperVD,
		Path:  fspath.Parse(name),
	}
	if child.upperVD.Ok() {
		// cleanupRecreateWhiteouts restores the upper-layer whiteouts inside
		// child after a failure part-way through removing them. A whiteout
		// that was never removed yields EEXIST, which is ignored.
		cleanupRecreateWhiteouts := func() {
			if !child.upperVD.Ok() {
				return
			}
			for whiteoutName, whiteoutUpper := range whiteouts {
				if !whiteoutUpper {
					continue
				}
				if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{
					Root:  child.upperVD,
					Start: child.upperVD,
					Path:  fspath.Parse(whiteoutName),
				}); err != nil && err != syserror.EEXIST {
					ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err)
				}
			}
		}
		// Remove existing whiteouts on the upper layer.
		for whiteoutName, whiteoutUpper := range whiteouts {
			if !whiteoutUpper {
				continue
			}
			if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{
				Root:  child.upperVD,
				Start: child.upperVD,
				Path:  fspath.Parse(whiteoutName),
			}); err != nil {
				cleanupRecreateWhiteouts()
				vfsObj.AbortDeleteDentry(&child.vfsd)
				return err
			}
		}
		// Remove the existing directory on the upper layer.
		if err := vfsObj.RmdirAt(ctx, fs.creds, &pop); err != nil {
			cleanupRecreateWhiteouts()
			vfsObj.AbortDeleteDentry(&child.vfsd)
			return err
		}
	}
	// Put a whiteout at the removed directory's name on the upper layer.
	if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil {
		// Don't attempt to recover from this: the original directory is
		// already gone, so any dentries representing it are invalid, and
		// creating a new directory won't undo that.
		ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err)
		vfsObj.AbortDeleteDentry(&child.vfsd)
		return err
	}

	vfsObj.CommitDeleteDentry(&child.vfsd)
	delete(parent.children, name)
	// Queue child on ds; ds is consumed by the deferred
	// renameMuRUnlockAndCheckDrop call.
	ds = appendDentry(ds, child)
	// The parent's cached directory entries no longer reflect its contents.
	parent.dirents = nil
	return nil
}

// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckDrop(&ds)
	d, err := fs.resolveLocked(ctx, rp, &ds)
	if err != nil {
		return err
	}

	// Validate the request against the dentry's cached mode and ownership.
	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
	if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts.Stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
		return err
	}
	mnt := rp.Mount()
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()
	// Attributes are changed on the upper layer, so the file must live there.
	if err := d.copyUpLocked(ctx); err != nil {
		return err
	}
	// Changes to d's attributes are serialized by d.copyMu.
	d.copyMu.Lock()
	defer d.copyMu.Unlock()
	if err := d.fs.vfsfs.VirtualFilesystem().SetStatAt(ctx, d.fs.creds, &vfs.PathOperation{
		Root:  d.upperVD,
		Start: d.upperVD,
	}, &opts); err != nil {
		return err
	}
	d.updateAfterSetStatLocked(&opts)
	return nil
}

// StatAt implements vfs.FilesystemImpl.StatAt.
func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckDrop(&ds)
	d, err := fs.resolveLocked(ctx, rp, &ds)
	if err != nil {
		return linux.Statx{}, err
	}

	var stat linux.Statx
	// Only query the topmost layer if the caller asked for fields outside
	// statInternalMask; the remainder is filled in by d.statInternalTo.
	if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 {
		layerVD := d.topLayer()
		stat, err = fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{
			Root:  layerVD,
			Start: layerVD,
		}, &vfs.StatOptions{
			Mask: layerMask,
			Sync: opts.Sync,
		})
		if err != nil {
			return linux.Statx{}, err
		}
	}
	d.statInternalTo(ctx, &opts, &stat)
	return stat, nil
}

// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckDrop(&ds)
	// The path is resolved only to check that it exists; the result describes
	// the filesystem as a whole.
	_, err := fs.resolveLocked(ctx, rp, &ds)
	if err != nil {
		return linux.Statfs{}, err
	}
	return fs.statFS(ctx)
}

// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
		vfsObj := fs.vfsfs.VirtualFilesystem()
		// All operations below act on the new name inside the parent's upper
		// layer directory.
		pop := vfs.PathOperation{
			Root:  parent.upperVD,
			Start: parent.upperVD,
			Path:  fspath.Parse(childName),
		}
		if haveUpperWhiteout {
			// The whiteout occupies the name the symlink needs.
			if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
				return err
			}
		}
		if err := vfsObj.SymlinkAt(ctx, fs.creds, &pop, target); err != nil {
			if haveUpperWhiteout {
				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
			}
			return err
		}
		// The symlink was created with fs.creds; hand ownership to the caller.
		creds := rp.Credentials()
		if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
			Stat: linux.Statx{
				Mask: linux.STATX_UID | linux.STATX_GID,
				UID:  uint32(creds.EffectiveKUID),
				GID:  uint32(creds.EffectiveKGID),
			},
		}); err != nil {
			// Undo the creation, and restore the whiteout only if the symlink
			// was actually removed.
			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr)
			} else if haveUpperWhiteout {
				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
			}
			return err
		}
		return nil
	})
}

// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckDrop(&ds)
	start := rp.Start().Impl().(*dentry)
	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
	if err != nil {
		return err
	}
	// Removing an entry requires write and search permission on the parent.
	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	if err := rp.Mount().CheckBeginWrite(); err != nil {
		return err
	}
	defer rp.Mount().EndWrite()
	name := rp.Component()
	if name == "." || name == ".." {
		return syserror.EISDIR
	}
	if rp.MustBeDir() {
		return syserror.ENOTDIR
	}
	vfsObj := rp.VirtualFilesystem()
	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef()
	parent.dirMu.Lock()
	defer parent.dirMu.Unlock()

	// Ensure that parent is copied-up before potentially holding child.copyMu
	// below.
	if err := parent.copyUpLocked(ctx); err != nil {
		return err
	}

	// child is nil when no dentry for name is cached in parent.children.
	child := parent.children[name]
	var childLayer lookupLayer
	if child != nil {
		if child.isDir() {
			return syserror.EISDIR
		}
		// From here on, every return for a non-nil child must either abort
		// or commit the deletion.
		if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
			return err
		}
		// Hold child.copyMu to prevent it from being copied-up during
		// deletion.
		child.copyMu.RLock()
		defer child.copyMu.RUnlock()
		if child.upperVD.Ok() {
			childLayer = lookupLayerUpper
		} else {
			childLayer = lookupLayerLower
		}
	} else {
		// Determine if the file being unlinked actually exists. Holding
		// parent.dirMu prevents a dentry from being instantiated for the file,
		// which in turn prevents it from being copied-up, so this result is
		// stable.
		childLayer, err = fs.lookupLayerLocked(ctx, parent, name)
		if err != nil {
			return err
		}
		if !childLayer.existsInOverlay() {
			return syserror.ENOENT
		}
	}

	pop := vfs.PathOperation{
		Root:  parent.upperVD,
		Start: parent.upperVD,
		Path:  fspath.Parse(name),
	}
	if childLayer == lookupLayerUpper {
		// Remove the existing file on the upper layer.
		if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
			if child != nil {
				vfsObj.AbortDeleteDentry(&child.vfsd)
			}
			return err
		}
	}
	// Put a whiteout at the removed file's name on the upper layer.
	if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil {
		ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err)
		if child != nil {
			vfsObj.AbortDeleteDentry(&child.vfsd)
		}
		return err
	}

	if child != nil {
		vfsObj.CommitDeleteDentry(&child.vfsd)
		delete(parent.children, name)
		// Queue child on ds; ds is consumed by the deferred
		// renameMuRUnlockAndCheckDrop call.
		ds = appendDentry(ds, child)
	}
	// The parent's cached directory entries no longer reflect its contents.
	parent.dirents = nil
	return nil
}

// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
//
// Extended attributes are not supported: once the path resolves, this always
// fails with ENOTSUP.
func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckDrop(&ds)
	// Resolve first so that path errors take precedence over ENOTSUP.
	_, err := fs.resolveLocked(ctx, rp, &ds)
	if err != nil {
		return nil, err
	}
	// TODO(gvisor.dev/issue/1199): Linux overlayfs actually allows listxattr,
	// but not any other xattr syscalls. For now we just reject all of them.
	return nil, syserror.ENOTSUP
}

// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
//
// Extended attributes are not supported: once the path resolves, this always
// fails with ENOTSUP.
func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckDrop(&ds)
	// Resolve first so that path errors take precedence over ENOTSUP.
	_, err := fs.resolveLocked(ctx, rp, &ds)
	if err != nil {
		return "", err
	}
	return "", syserror.ENOTSUP
}

// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
//
// Extended attributes are not supported: once the path resolves, this always
// fails with ENOTSUP.
func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckDrop(&ds)
	// Resolve first so that path errors take precedence over ENOTSUP.
	_, err := fs.resolveLocked(ctx, rp, &ds)
	if err != nil {
		return err
	}
	return syserror.ENOTSUP
}

// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + _, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return syserror.ENOTSUP +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.renameMu.RLock() + defer fs.renameMu.RUnlock() + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) +} diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go new file mode 100644 index 000000000..a3c1f7a8d --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/non_directory.go @@ -0,0 +1,266 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package overlay + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" +) + +func (d *dentry) isSymlink() bool { + return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK +} + +func (d *dentry) readlink(ctx context.Context) (string, error) { + layerVD := d.topLayer() + return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }) +} + +type nonDirectoryFD struct { + fileDescription + + // If copiedUp is false, cachedFD represents + // fileDescription.dentry().lowerVDs[0]; otherwise, cachedFD represents + // fileDescription.dentry().upperVD. cachedFlags is the last known value of + // cachedFD.StatusFlags(). copiedUp, cachedFD, and cachedFlags are + // protected by mu. + mu sync.Mutex + copiedUp bool + cachedFD *vfs.FileDescription + cachedFlags uint32 +} + +func (fd *nonDirectoryFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return nil, err + } + wrappedFD.IncRef() + return wrappedFD, nil +} + +func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) { + d := fd.dentry() + statusFlags := fd.vfsfd.StatusFlags() + if !fd.copiedUp && d.isCopiedUp() { + // Switch to the copied-up file. 
+ upperVD := d.topLayer() + upperFD, err := fd.filesystem().vfsfs.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: upperVD, + Start: upperVD, + }, &vfs.OpenOptions{ + Flags: statusFlags, + }) + if err != nil { + return nil, err + } + oldOff, oldOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR) + if oldOffErr == nil { + if _, err := upperFD.Seek(ctx, oldOff, linux.SEEK_SET); err != nil { + upperFD.DecRef() + return nil, err + } + } + fd.cachedFD.DecRef() + fd.copiedUp = true + fd.cachedFD = upperFD + fd.cachedFlags = statusFlags + } else if fd.cachedFlags != statusFlags { + if err := fd.cachedFD.SetStatusFlags(ctx, d.fs.creds, statusFlags); err != nil { + return nil, err + } + fd.cachedFlags = statusFlags + } + return fd.cachedFD, nil +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *nonDirectoryFD) Release() { + fd.cachedFD.DecRef() + fd.cachedFD = nil +} + +// OnClose implements vfs.FileDescriptionImpl.OnClose. +func (fd *nonDirectoryFD) OnClose(ctx context.Context) error { + // Linux doesn't define ovl_file_operations.flush at all (i.e. its + // equivalent to OnClose is a no-op). We pass through to + // fd.cachedFD.OnClose() without upgrading if fd.dentry() has been + // copied-up, since OnClose is mostly used to define post-close writeback, + // and if fd.cachedFD hasn't been updated then it can't have been used to + // mutate fd.dentry() anyway. + fd.mu.Lock() + if statusFlags := fd.vfsfd.StatusFlags(); fd.cachedFlags != statusFlags { + if err := fd.cachedFD.SetStatusFlags(ctx, fd.filesystem().creds, statusFlags); err != nil { + fd.mu.Unlock() + return err + } + fd.cachedFlags = statusFlags + } + wrappedFD := fd.cachedFD + defer wrappedFD.IncRef() + fd.mu.Unlock() + return wrappedFD.OnClose(ctx) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. 
+func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + var stat linux.Statx + if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return linux.Statx{}, err + } + stat, err = wrappedFD.Stat(ctx, vfs.StatOptions{ + Mask: layerMask, + Sync: opts.Sync, + }) + wrappedFD.DecRef() + if err != nil { + return linux.Statx{}, err + } + } + fd.dentry().statInternalTo(ctx, &opts, &stat) + return stat, nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + d := fd.dentry() + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + return err + } + mnt := fd.vfsfd.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + if err := d.copyUpLocked(ctx); err != nil { + return err + } + // Changes to d's attributes are serialized by d.copyMu. + d.copyMu.Lock() + defer d.copyMu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return err + } + if err := wrappedFD.SetStat(ctx, opts); err != nil { + return err + } + d.updateAfterSetStatLocked(&opts) + return nil +} + +// StatFS implements vfs.FileDesciptionImpl.StatFS. +func (fd *nonDirectoryFD) StatFS(ctx context.Context) (linux.Statfs, error) { + return fd.filesystem().statFS(ctx) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *nonDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return 0, err + } + defer wrappedFD.DecRef() + return wrappedFD.PRead(ctx, dst, offset, opts) +} + +// Read implements vfs.FileDescriptionImpl.Read. 
+func (fd *nonDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // Hold fd.mu during the read to serialize the file offset. + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return 0, err + } + return wrappedFD.Read(ctx, dst, opts) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *nonDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return 0, err + } + defer wrappedFD.DecRef() + return wrappedFD.PWrite(ctx, src, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *nonDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // Hold fd.mu during the write to serialize the file offset. + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return 0, err + } + return wrappedFD.Write(ctx, src, opts) +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *nonDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + // Hold fd.mu during the seek to serialize the file offset. + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return 0, err + } + return wrappedFD.Seek(ctx, offset, whence) +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *nonDirectoryFD) Sync(ctx context.Context) error { + fd.mu.Lock() + if !fd.dentry().isCopiedUp() { + fd.mu.Unlock() + return nil + } + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + fd.mu.Unlock() + return err + } + wrappedFD.IncRef() + defer wrappedFD.DecRef() + fd.mu.Unlock() + return wrappedFD.Sync(ctx) +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. 
+func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return err + } + defer wrappedFD.DecRef() + return wrappedFD.ConfigureMMap(ctx, opts) +} diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go new file mode 100644 index 000000000..e720d4825 --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -0,0 +1,627 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package overlay provides an overlay filesystem implementation, which +// synthesizes a filesystem by composing one or more immutable filesystems +// ("lower layers") with an optional mutable filesystem ("upper layer"). +// +// Lock order: +// +// directoryFD.mu / nonDirectoryFD.mu +// filesystem.renameMu +// dentry.dirMu +// dentry.copyMu +// +// Locking dentry.dirMu in multiple dentries requires that parent dentries are +// locked before child dentries, and that filesystem.renameMu is locked to +// stabilize this relationship. 
+package overlay + +import ( + "strings" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Name is the default filesystem name. +const Name = "overlay" + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to +// FilesystemType.GetFilesystem. +type FilesystemOptions struct { + // Callers passing FilesystemOptions to + // overlay.FilesystemType.GetFilesystem() are responsible for ensuring that + // the vfs.Mounts comprising the layers of the overlay filesystem do not + // contain submounts. + + // If UpperRoot.Ok(), it is the root of the writable upper layer of the + // overlay. + UpperRoot vfs.VirtualDentry + + // LowerRoots contains the roots of the immutable lower layers of the + // overlay. LowerRoots is immutable. + LowerRoots []vfs.VirtualDentry +} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + vfsfs vfs.Filesystem + + // Immutable options. + opts FilesystemOptions + + // creds is a copy of the filesystem's creator's credentials, which are + // used for accesses to the filesystem's layers. creds is immutable. + creds *auth.Credentials + + // dirDevMinor is the device minor number used for directories. dirDevMinor + // is immutable. + dirDevMinor uint32 + + // lowerDevMinors maps lower layer filesystems to device minor numbers + // assigned to non-directory files originating from that filesystem. + // lowerDevMinors is immutable. 
+ lowerDevMinors map[*vfs.Filesystem]uint32 + + // renameMu synchronizes renaming with non-renaming operations in order to + // ensure consistent lock ordering between dentry.dirMu in different + // dentries. + renameMu sync.RWMutex + + // lastDirIno is the last inode number assigned to a directory. lastDirIno + // is accessed using atomic memory operations. + lastDirIno uint64 +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + mopts := vfs.GenericParseMountOptions(opts.Data) + fsoptsRaw := opts.InternalData + fsopts, haveFSOpts := fsoptsRaw.(FilesystemOptions) + if fsoptsRaw != nil && !haveFSOpts { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw) + return nil, nil, syserror.EINVAL + } + if haveFSOpts { + if len(fsopts.LowerRoots) == 0 { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty") + return nil, nil, syserror.EINVAL + } + if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified") + return nil, nil, syserror.EINVAL + } + // We don't enforce a maximum number of lower layers when not + // configured by applications; the sandbox owner can have an overlay + // filesystem with any number of lower layers. + } else { + vfsroot := vfs.RootFromContext(ctx) + defer vfsroot.DecRef() + upperPathname, ok := mopts["upperdir"] + if ok { + delete(mopts, "upperdir") + // Linux overlayfs also requires a workdir when upperdir is + // specified; we don't, so silently ignore this option. 
+ delete(mopts, "workdir") + upperPath := fspath.Parse(upperPathname) + if !upperPath.Absolute { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname) + return nil, nil, syserror.EINVAL + } + upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ + Root: vfsroot, + Start: vfsroot, + Path: upperPath, + FollowFinalSymlink: true, + }, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) + return nil, nil, err + } + defer upperRoot.DecRef() + privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */) + if err != nil { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) + return nil, nil, err + } + defer privateUpperRoot.DecRef() + fsopts.UpperRoot = privateUpperRoot + } + lowerPathnamesStr, ok := mopts["lowerdir"] + if !ok { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: missing required option lowerdir") + return nil, nil, syserror.EINVAL + } + delete(mopts, "lowerdir") + lowerPathnames := strings.Split(lowerPathnamesStr, ":") + const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK + if len(lowerPathnames) < 2 && !fsopts.UpperRoot.Ok() { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified") + return nil, nil, syserror.EINVAL + } + if len(lowerPathnames) > maxLowerLayers { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers) + return nil, nil, syserror.EINVAL + } + for _, lowerPathname := range lowerPathnames { + lowerPath := fspath.Parse(lowerPathname) + if !lowerPath.Absolute { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname) + return nil, nil, syserror.EINVAL + } + 
lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ + Root: vfsroot, + Start: vfsroot, + Path: lowerPath, + FollowFinalSymlink: true, + }, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) + return nil, nil, err + } + defer lowerRoot.DecRef() + privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */) + if err != nil { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) + return nil, nil, err + } + defer privateLowerRoot.DecRef() + fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot) + } + } + if len(mopts) != 0 { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts) + return nil, nil, syserror.EINVAL + } + + // Allocate device numbers. + dirDevMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + lowerDevMinors := make(map[*vfs.Filesystem]uint32) + for _, lowerRoot := range fsopts.LowerRoots { + lowerFS := lowerRoot.Mount().Filesystem() + if _, ok := lowerDevMinors[lowerFS]; !ok { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + vfsObj.PutAnonBlockDevMinor(dirDevMinor) + for _, lowerDevMinor := range lowerDevMinors { + vfsObj.PutAnonBlockDevMinor(lowerDevMinor) + } + return nil, nil, err + } + lowerDevMinors[lowerFS] = devMinor + } + } + + // Take extra references held by the filesystem. + if fsopts.UpperRoot.Ok() { + fsopts.UpperRoot.IncRef() + } + for _, lowerRoot := range fsopts.LowerRoots { + lowerRoot.IncRef() + } + + fs := &filesystem{ + opts: fsopts, + creds: creds.Fork(), + dirDevMinor: dirDevMinor, + lowerDevMinors: lowerDevMinors, + } + fs.vfsfs.Init(vfsObj, &fstype, fs) + + // Construct the root dentry. 
+ root := fs.newDentry() + root.refs = 1 + if fs.opts.UpperRoot.Ok() { + fs.opts.UpperRoot.IncRef() + root.copiedUp = 1 + root.upperVD = fs.opts.UpperRoot + } + for _, lowerRoot := range fs.opts.LowerRoots { + lowerRoot.IncRef() + root.lowerVDs = append(root.lowerVDs, lowerRoot) + } + rootTopVD := root.topLayer() + // Get metadata from the topmost layer. See fs.lookupLocked(). + const rootStatMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO + rootStat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: rootTopVD, + Start: rootTopVD, + }, &vfs.StatOptions{ + Mask: rootStatMask, + }) + if err != nil { + root.destroyLocked() + fs.vfsfs.DecRef() + return nil, nil, err + } + if rootStat.Mask&rootStatMask != rootStatMask { + root.destroyLocked() + fs.vfsfs.DecRef() + return nil, nil, syserror.EREMOTE + } + if isWhiteout(&rootStat) { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") + root.destroyLocked() + fs.vfsfs.DecRef() + return nil, nil, syserror.EINVAL + } + root.mode = uint32(rootStat.Mode) + root.uid = rootStat.UID + root.gid = rootStat.GID + if rootStat.Mode&linux.S_IFMT == linux.S_IFDIR { + root.devMajor = linux.UNNAMED_MAJOR + root.devMinor = fs.dirDevMinor + root.ino = fs.newDirIno() + } else if !root.upperVD.Ok() { + root.devMajor = linux.UNNAMED_MAJOR + root.devMinor = fs.lowerDevMinors[root.lowerVDs[0].Mount().Filesystem()] + root.ino = rootStat.Ino + } else { + root.devMajor = rootStat.DevMajor + root.devMinor = rootStat.DevMinor + root.ino = rootStat.Ino + } + + return &fs.vfsfs, &root.vfsd, nil +} + +// clonePrivateMount creates a non-recursive bind mount rooted at vd, not +// associated with any MountNamespace, and returns the root of the new mount. +// (This is required to ensure that each layer of an overlay comprises only a +// single mount, and therefore can't cross into e.g. the overlay filesystem +// itself, risking lock recursion.) 
A reference is held on the returned +// VirtualDentry. +func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forceReadOnly bool) (vfs.VirtualDentry, error) { + oldmnt := vd.Mount() + opts := oldmnt.Options() + if forceReadOnly { + opts.ReadOnly = true + } + newmnt, err := vfsObj.NewDisconnectedMount(oldmnt.Filesystem(), vd.Dentry(), &opts) + if err != nil { + return vfs.VirtualDentry{}, err + } + return vfs.MakeVirtualDentry(newmnt, vd.Dentry()), nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + vfsObj := fs.vfsfs.VirtualFilesystem() + vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor) + for _, lowerDevMinor := range fs.lowerDevMinors { + vfsObj.PutAnonBlockDevMinor(lowerDevMinor) + } + if fs.opts.UpperRoot.Ok() { + fs.opts.UpperRoot.DecRef() + } + for _, lowerRoot := range fs.opts.LowerRoots { + lowerRoot.DecRef() + } +} + +func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) { + // Always statfs the root of the topmost layer. Compare Linux's + // fs/overlayfs/super.c:ovl_statfs(). + var rootVD vfs.VirtualDentry + if fs.opts.UpperRoot.Ok() { + rootVD = fs.opts.UpperRoot + } else { + rootVD = fs.opts.LowerRoots[0] + } + fsstat, err := fs.vfsfs.VirtualFilesystem().StatFSAt(ctx, fs.creds, &vfs.PathOperation{ + Root: rootVD, + Start: rootVD, + }) + if err != nil { + return linux.Statfs{}, err + } + fsstat.Type = linux.OVERLAYFS_SUPER_MAGIC + return fsstat, nil +} + +func (fs *filesystem) newDirIno() uint64 { + return atomic.AddUint64(&fs.lastDirIno, 1) +} + +// dentry implements vfs.DentryImpl. +type dentry struct { + vfsd vfs.Dentry + + refs int64 + + // fs is the owning filesystem. fs is immutable. + fs *filesystem + + // mode, uid, and gid are the file mode, owner, and group of the file in + // the topmost layer (and therefore the overlay file as well), and are used + // for permission checks on this dentry. 
These fields are protected by + // copyMu and accessed using atomic memory operations. + mode uint32 + uid uint32 + gid uint32 + + // copiedUp is 1 if this dentry has been copied-up (i.e. upperVD.Ok()) and + // 0 otherwise. copiedUp is accessed using atomic memory operations. + copiedUp uint32 + + // parent is the dentry corresponding to this dentry's parent directory. + // name is this dentry's name in parent. If this dentry is a filesystem + // root, parent is nil and name is the empty string. parent and name are + // protected by fs.renameMu. + parent *dentry + name string + + // If this dentry represents a directory, children maps the names of + // children for which dentries have been instantiated to those dentries, + // and dirents (if not nil) is a cache of dirents as returned by + // directoryFDs representing this directory. children is protected by + // dirMu. + dirMu sync.Mutex + children map[string]*dentry + dirents []vfs.Dirent + + // upperVD and lowerVDs are the files from the overlay filesystem's layers + // that comprise the file on the overlay filesystem. + // + // If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e. + // be copied up) with copyMu locked for writing; otherwise, it is + // immutable. lowerVDs is always immutable. + copyMu sync.RWMutex + upperVD vfs.VirtualDentry + lowerVDs []vfs.VirtualDentry + + // inlineLowerVDs backs lowerVDs in the common case where len(lowerVDs) <= + // len(inlineLowerVDs). + inlineLowerVDs [1]vfs.VirtualDentry + + // devMajor, devMinor, and ino are the device major/minor and inode numbers + // used by this dentry. These fields are protected by copyMu and accessed + // using atomic memory operations. + devMajor uint32 + devMinor uint32 + ino uint64 + + locks vfs.FileLocks +} + +// newDentry creates a new dentry. The dentry initially has no references; it +// is the caller's responsibility to set the dentry's reference count and/or +// call dentry.destroy() as appropriate. 
The dentry is initially invalid in +// that it contains no layers; the caller is responsible for setting them. +func (fs *filesystem) newDentry() *dentry { + d := &dentry{ + fs: fs, + } + d.lowerVDs = d.inlineLowerVDs[:0] + d.vfsd.Init(d) + return d +} + +// IncRef implements vfs.DentryImpl.IncRef. +func (d *dentry) IncRef() { + // d.refs may be 0 if d.fs.renameMu is locked, which serializes against + // d.checkDropLocked(). + atomic.AddInt64(&d.refs, 1) +} + +// TryIncRef implements vfs.DentryImpl.TryIncRef. +func (d *dentry) TryIncRef() bool { + for { + refs := atomic.LoadInt64(&d.refs) + if refs <= 0 { + return false + } + if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) { + return true + } + } +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *dentry) DecRef() { + if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { + d.fs.renameMu.Lock() + d.checkDropLocked() + d.fs.renameMu.Unlock() + } else if refs < 0 { + panic("overlay.dentry.DecRef() called without holding a reference") + } +} + +// checkDropLocked should be called after d's reference count becomes 0 or it +// becomes deleted. +// +// Preconditions: d.fs.renameMu must be locked for writing. +func (d *dentry) checkDropLocked() { + // Dentries with a positive reference count must be retained. (The only way + // to obtain a reference on a dentry with zero references is via path + // resolution, which requires renameMu, so if d.refs is zero then it will + // remain zero while we hold renameMu for writing.) Dentries with a + // negative reference count have already been destroyed. + if atomic.LoadInt64(&d.refs) != 0 { + return + } + // Refs is still zero; destroy it. + d.destroyLocked() + return +} + +// destroyLocked destroys the dentry. +// +// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. +func (d *dentry) destroyLocked() { + switch atomic.LoadInt64(&d.refs) { + case 0: + // Mark the dentry destroyed. 
+ atomic.StoreInt64(&d.refs, -1) + case -1: + panic("overlay.dentry.destroyLocked() called on already destroyed dentry") + default: + panic("overlay.dentry.destroyLocked() called with references on the dentry") + } + + if d.upperVD.Ok() { + d.upperVD.DecRef() + } + for _, lowerVD := range d.lowerVDs { + lowerVD.DecRef() + } + + if d.parent != nil { + d.parent.dirMu.Lock() + if !d.vfsd.IsDead() { + delete(d.parent.children, d.name) + } + d.parent.dirMu.Unlock() + // Drop the reference held by d on its parent without recursively + // locking d.fs.renameMu. + if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 { + d.parent.checkDropLocked() + } else if refs < 0 { + panic("overlay.dentry.DecRef() called without holding a reference") + } + } +} + +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) { + // TODO(gvisor.dev/issue/1479): Implement inotify. +} + +// Watches implements vfs.DentryImpl.Watches. +func (d *dentry) Watches() *vfs.Watches { + // TODO(gvisor.dev/issue/1479): Implement inotify. + return nil +} + +// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *dentry) OnZeroWatches() {} + +// iterLayers invokes yield on each layer comprising d, from top to bottom. If +// any call to yield returns false, iterLayer stops iteration. 
+func (d *dentry) iterLayers(yield func(vd vfs.VirtualDentry, isUpper bool) bool) { + if d.isCopiedUp() { + if !yield(d.upperVD, true) { + return + } + } + for _, lowerVD := range d.lowerVDs { + if !yield(lowerVD, false) { + return + } + } +} + +func (d *dentry) topLayerInfo() (vd vfs.VirtualDentry, isUpper bool) { + if d.isCopiedUp() { + return d.upperVD, true + } + return d.lowerVDs[0], false +} + +func (d *dentry) topLayer() vfs.VirtualDentry { + vd, _ := d.topLayerInfo() + return vd +} + +func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) +} + +// statInternalMask is the set of stat fields that is set by +// dentry.statInternalTo(). +const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO + +// statInternalTo writes fields to stat that are stored in d, and therefore do +// not requiring invoking StatAt on the overlay's layers. +func (d *dentry) statInternalTo(ctx context.Context, opts *vfs.StatOptions, stat *linux.Statx) { + stat.Mask |= statInternalMask + if d.isDir() { + // Linux sets nlink to 1 for merged directories + // (fs/overlayfs/inode.c:ovl_getattr()); we set it to 2 because this is + // correct more often ("." and the directory's entry in its parent), + // and some of our tests expect this. + stat.Nlink = 2 + } + stat.UID = atomic.LoadUint32(&d.uid) + stat.GID = atomic.LoadUint32(&d.gid) + stat.Mode = uint16(atomic.LoadUint32(&d.mode)) + stat.Ino = atomic.LoadUint64(&d.ino) + stat.DevMajor = atomic.LoadUint32(&d.devMajor) + stat.DevMinor = atomic.LoadUint32(&d.devMinor) +} + +// Preconditions: d.copyMu must be locked for writing. 
+func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) { + if opts.Stat.Mask&linux.STATX_MODE != 0 { + atomic.StoreUint32(&d.mode, (d.mode&linux.S_IFMT)|uint32(opts.Stat.Mode&^linux.S_IFMT)) + } + if opts.Stat.Mask&linux.STATX_UID != 0 { + atomic.StoreUint32(&d.uid, opts.Stat.UID) + } + if opts.Stat.Mask&linux.STATX_GID != 0 { + atomic.StoreUint32(&d.gid, opts.Stat.GID) + } +} + +// fileDescription is embedded by overlay implementations of +// vfs.FileDescriptionImpl. +type fileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.LockFD +} + +func (fd *fileDescription) filesystem() *filesystem { + return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) +} + +func (fd *fileDescription) dentry() *dentry { + return fd.vfsfd.Dentry().Impl().(*dentry) +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD new file mode 100644 index 000000000..5950a2d59 --- /dev/null +++ b/pkg/sentry/fsimpl/pipefs/BUILD @@ -0,0 +1,21 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "pipefs", + srcs = ["pipefs.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", + "//pkg/sentry/kernel/time", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go new file mode 100644 index 000000000..dd7eaf4a8 --- /dev/null +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -0,0 +1,165 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pipefs provides the filesystem implementation backing +// Kernel.PipeMount. 
+package pipefs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +type filesystemType struct{} + +// Name implements vfs.FilesystemType.Name. +func (filesystemType) Name() string { + return "pipefs" +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + panic("pipefs.filesystemType.GetFilesystem should never be called") +} + +type filesystem struct { + kernfs.Filesystem + + devMinor uint32 +} + +// NewFilesystem sets up and returns a new vfs.Filesystem implemented by pipefs. +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, err + } + fs := &filesystem{ + devMinor: devMinor, + } + fs.Filesystem.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) + return fs.Filesystem.VFSFilesystem(), nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + inode := vd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode) + b.PrependComponent(fmt.Sprintf("pipe:[%d]", inode.ino)) + return vfs.PrependPathSyntheticError{} +} + +// inode implements kernfs.Inode. 
+type inode struct { + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + kernfs.InodeNoopRefCount + + locks vfs.FileLocks + pipe *pipe.VFSPipe + + ino uint64 + uid auth.KUID + gid auth.KGID + // We use the creation timestamp for all of atime, mtime, and ctime. + ctime ktime.Time +} + +func newInode(ctx context.Context, fs *filesystem) *inode { + creds := auth.CredentialsFromContext(ctx) + return &inode{ + pipe: pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize), + ino: fs.Filesystem.NextIno(), + uid: creds.EffectiveKUID, + gid: creds.EffectiveKGID, + ctime: ktime.NowFromContext(ctx), + } +} + +const pipeMode = 0600 | linux.S_IFIFO + +// CheckPermissions implements kernfs.Inode.CheckPermissions. +func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, pipeMode, i.uid, i.gid) +} + +// Mode implements kernfs.Inode.Mode. +func (i *inode) Mode() linux.FileMode { + return pipeMode +} + +// Stat implements kernfs.Inode.Stat. +func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds()) + return linux.Statx{ + Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, + Blksize: usermem.PageSize, + Nlink: 1, + UID: uint32(i.uid), + GID: uint32(i.gid), + Mode: pipeMode, + Ino: i.ino, + Size: 0, + Blocks: 0, + Atime: ts, + Ctime: ts, + Mtime: ts, + DevMajor: linux.UNNAMED_MAJOR, + DevMinor: vfsfs.Impl().(*filesystem).devMinor, + }, nil +} + +// SetStat implements kernfs.Inode.SetStat. 
+func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + if opts.Stat.Mask == 0 { + return nil + } + return syserror.EPERM +} + +// TODO(gvisor.dev/issue/1193): kernfs does not provide a way to implement +// statfs, from which we should indicate PIPEFS_MAGIC. + +// Open implements kernfs.Inode.Open. +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags, &i.locks) +} + +// NewConnectedPipeFDs returns a pair of FileDescriptions representing the read +// and write ends of a newly-created pipe, as for pipe(2) and pipe2(2). +// +// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). +func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, *vfs.FileDescription) { + fs := mnt.Filesystem().Impl().(*filesystem) + inode := newInode(ctx, fs) + var d kernfs.Dentry + d.Init(inode) + defer d.DecRef() + return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags) +} diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD new file mode 100644 index 000000000..6014138ff --- /dev/null +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -0,0 +1,67 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +licenses(["notice"]) + +go_library( + name = "proc", + srcs = [ + "filesystem.go", + "subtasks.go", + "task.go", + "task_fds.go", + "task_files.go", + "task_net.go", + "tasks.go", + "tasks_files.go", + "tasks_sys.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/log", + "//pkg/refs", + "//pkg/safemem", + "//pkg/sentry/fs/lock", + "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/mm", + 
"//pkg/sentry/socket", + "//pkg/sentry/socket/unix", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/usage", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/tcpip/header", + "//pkg/usermem", + ], +) + +go_test( + name = "proc_test", + size = "small", + srcs = [ + "tasks_sys_test.go", + "tasks_test.go", + ], + library = ":proc", + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/contexttest", + "//pkg/sentry/fsimpl/testutil", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go new file mode 100644 index 000000000..609210253 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -0,0 +1,117 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package proc implements a partial in-memory file system for procfs. +package proc + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// Name is the default filesystem name. +const Name = "proc" + +// FilesystemType is the factory class for procfs. 
+// +// +stateify savable +type FilesystemType struct{} + +var _ vfs.FilesystemType = (*FilesystemType)(nil) + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// filesystem is the procfs filesystem. It embeds kernfs.Filesystem and records +// the anonymous block device minor number obtained in GetFilesystem so that +// Release can hand it back. +type filesystem struct { + kernfs.Filesystem + + devMinor uint32 +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +// +// It fails if ctx carries no kernel or no PID namespace, or if no anonymous +// block device minor number can be allocated. +func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + k := kernel.KernelFromContext(ctx) + if k == nil { + return nil, nil, fmt.Errorf("procfs requires a kernel") + } + pidns := kernel.PIDNamespaceFromContext(ctx) + if pidns == nil { + return nil, nil, fmt.Errorf("procfs requires a PID namespace") + } + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + procfs := &filesystem{ + devMinor: devMinor, + } + procfs.VFSFilesystem().Init(vfsObj, &ft, procfs) + + var cgroups map[string]string + if opts.InternalData != nil { + // NOTE(review): the assertion is unchecked, so InternalData of any + // other type panics here. + data := opts.InternalData.(*InternalData) + cgroups = data.Cgroups + } + + _, dentry := procfs.newTasksInode(k, pidns, cgroups) + return procfs.VFSFilesystem(), dentry.VFSDentry(), nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + +// dynamicInode is an overfitted interface for common Inodes with +// vfs.DynamicBytesSource types used in procfs. 
+type dynamicInode interface { + kernfs.Inode + vfs.DynamicBytesSource + + Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) +} + +func (fs *filesystem) newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { + inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +type staticFile struct { + kernfs.DynamicBytesFile + vfs.StaticData +} + +var _ dynamicInode = (*staticFile)(nil) + +func newStaticFile(data string) *staticFile { + return &staticFile{StaticData: vfs.StaticData{Data: data}} +} + +// InternalData contains internal data passed in to the procfs mount via +// vfs.GetFilesystemOptions.InternalData. +type InternalData struct { + Cgroups map[string]string +} diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go new file mode 100644 index 000000000..36a89540c --- /dev/null +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -0,0 +1,182 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package proc + +import ( + "sort" + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// subtasksInode represents the inode for /proc/[pid]/task/ directory. +// +// +stateify savable +type subtasksInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeAttrs + kernfs.OrderedChildren + kernfs.AlwaysValid + + locks vfs.FileLocks + + fs *filesystem + task *kernel.Task + pidns *kernel.PIDNamespace + cgroupControllers map[string]string +} + +var _ kernfs.Inode = (*subtasksInode)(nil) + +func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *kernfs.Dentry { + subInode := &subtasksInode{ + fs: fs, + task: task, + pidns: pidns, + cgroupControllers: cgroupControllers, + } + // Note: credentials are overridden by taskOwnedInode. + subInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + + inode := &taskOwnedInode{Inode: subInode, owner: task} + dentry := &kernfs.Dentry{} + dentry.Init(inode) + + return dentry +} + +// Lookup implements kernfs.inodeDynamicLookup. 
+func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + tid, err := strconv.ParseUint(name, 10, 32) + if err != nil { + return nil, syserror.ENOENT + } + + subTask := i.pidns.TaskWithID(kernel.ThreadID(tid)) + if subTask == nil { + return nil, syserror.ENOENT + } + // Only threads that belong to this directory's thread group are visible. + if subTask.ThreadGroup() != i.task.ThreadGroup() { + return nil, syserror.ENOENT + } + + subTaskDentry := i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers) + return subTaskDentry.VFSDentry(), nil +} + +// IterDirents implements kernfs.inodeDynamicLookup. +// +// Thread IDs are emitted in ascending order; relOffset is used as an index +// into that sorted list, and offset advances by one per emitted entry. +func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + tasks := i.task.ThreadGroup().MemberIDs(i.pidns) + if len(tasks) == 0 { + return offset, syserror.ENOENT + } + if relOffset >= int64(len(tasks)) { + return offset, nil + } + + tids := make([]int, 0, len(tasks)) + for _, tid := range tasks { + tids = append(tids, int(tid)) + } + + sort.Ints(tids) + for _, tid := range tids[relOffset:] { + dirent := vfs.Dirent{ + Name: strconv.FormatUint(uint64(tid), 10), + Type: linux.DT_DIR, + Ino: i.fs.NextIno(), + NextOff: offset + 1, + } + if err := cb.Handle(dirent); err != nil { + return offset, err + } + offset++ + } + return offset, nil +} + +// subtasksFD is the directory file description used for subtasksInode. Its +// methods fail with ENOENT once the task's exit state is TaskExitZombie or +// later. +type subtasksFD struct { + kernfs.GenericDirectoryFD + + task *kernel.Task +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + if fd.task.ExitState() >= kernel.TaskExitZombie { + return syserror.ENOENT + } + return fd.GenericDirectoryFD.IterDirents(ctx, cb) +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + if fd.task.ExitState() >= kernel.TaskExitZombie { + return 0, syserror.ENOENT + } + return fd.GenericDirectoryFD.Seek(ctx, offset, whence) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. 
+func (fd *subtasksFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + if fd.task.ExitState() >= kernel.TaskExitZombie { + return linux.Statx{}, syserror.ENOENT + } + return fd.GenericDirectoryFD.Stat(ctx, opts) +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + if fd.task.ExitState() >= kernel.TaskExitZombie { + return syserror.ENOENT + } + return fd.GenericDirectoryFD.SetStat(ctx, opts) +} + +// Open implements kernfs.Inode. +func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &subtasksFD{task: i.task} + if err := fd.Init(&i.OrderedChildren, &i.locks, &opts); err != nil { + return nil, err + } + if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return fd.VFSFileDescription(), nil +} + +// Stat implements kernfs.Inode. +func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + stat, err := i.InodeAttrs.Stat(vsfs, opts) + if err != nil { + return linux.Statx{}, err + } + if opts.Mask&linux.STATX_NLINK != 0 { + stat.Nlink += uint32(i.task.ThreadGroup().Count()) + } + return stat, nil +} + +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return syserror.EPERM +} diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go new file mode 100644 index 000000000..8bb2b0ce1 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/task.go @@ -0,0 +1,239 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + "sort" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// taskInode represents the inode for /proc/PID/ directory. +// +// +stateify savable +type taskInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNoDynamicLookup + kernfs.InodeAttrs + kernfs.OrderedChildren + + locks vfs.FileLocks + + task *kernel.Task +} + +var _ kernfs.Inode = (*taskInode)(nil) + +func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry { + // TODO(gvisor.dev/issue/164): Fail with ESRCH if task exited. 
+ contents := map[string]*kernfs.Dentry{ + "auxv": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &auxvData{task: task}), + "cmdline": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), + "comm": fs.newComm(task, fs.NextIno(), 0444), + "environ": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), + "exe": fs.newExeSymlink(task, fs.NextIno()), + "fd": fs.newFDDirInode(task), + "fdinfo": fs.newFDInfoDirInode(task), + "gid_map": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}), + "io": fs.newTaskOwnedFile(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)), + "maps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mapsData{task: task}), + "mountinfo": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountInfoData{task: task}), + "mounts": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountsData{task: task}), + "net": fs.newTaskNetDir(task), + "ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]*kernfs.Dentry{ + "net": fs.newNamespaceSymlink(task, fs.NextIno(), "net"), + "pid": fs.newNamespaceSymlink(task, fs.NextIno(), "pid"), + "user": fs.newNamespaceSymlink(task, fs.NextIno(), "user"), + }), + "oom_score": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newStaticFile("0\n")), + "oom_score_adj": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}), + "smaps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &smapsData{task: task}), + "stat": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), + "statm": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statmData{task: task}), + "status": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}), + "uid_map": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}), + } + if isThreadGroup { + contents["task"] = fs.newSubtasks(task, pidns, cgroupControllers) + } + if 
len(cgroupControllers) > 0 { + contents["cgroup"] = fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers)) + } + + taskInode := &taskInode{task: task} + // Note: credentials are overridden by taskOwnedInode. + taskInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + + inode := &taskOwnedInode{Inode: taskInode, owner: task} + dentry := &kernfs.Dentry{} + dentry.Init(inode) + + taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + links := taskInode.OrderedChildren.Populate(dentry, contents) + taskInode.IncLinks(links) + + return dentry +} + +// Valid implements kernfs.inodeDynamicLookup. This inode remains valid until +// the task's exit state becomes TaskExitDead. Once it is dead, another task +// with the same PID could replace it. +func (i *taskInode) Valid(ctx context.Context) bool { + return i.task.ExitState() != kernel.TaskExitDead +} + +// Open implements kernfs.Inode. +func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) + if err != nil { + return nil, err + } + return fd.VFSFileDescription(), nil +} + +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return syserror.EPERM +} + +// taskOwnedInode implements kernfs.Inode and overrides inode owner with task +// effective user and group. +type taskOwnedInode struct { + kernfs.Inode + + // owner is the task that owns this inode. + owner *kernel.Task +} + +var _ kernfs.Inode = (*taskOwnedInode)(nil) + +// newTaskOwnedFile initializes inode with the given inode number and +// permissions, wraps it in a taskOwnedInode owned by task, and returns a new +// dentry for the wrapper. +func (fs *filesystem) newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { + // Note: credentials are overridden by taskOwnedInode. 
+ inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) + + taskInode := &taskOwnedInode{Inode: inode, owner: task} + d := &kernfs.Dentry{} + d.Init(taskInode) + return d +} + +func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry { + dir := &kernfs.StaticDirectory{} + + // Note: credentials are overridden by taskOwnedInode. + dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm) + + inode := &taskOwnedInode{Inode: dir, owner: task} + d := &kernfs.Dentry{} + d.Init(inode) + + dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + links := dir.OrderedChildren.Populate(d, children) + dir.IncLinks(links) + + return d +} + +// Stat implements kernfs.Inode. +func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + stat, err := i.Inode.Stat(fs, opts) + if err != nil { + return linux.Statx{}, err + } + if opts.Mask&(linux.STATX_UID|linux.STATX_GID) != 0 { + uid, gid := i.getOwner(linux.FileMode(stat.Mode)) + if opts.Mask&linux.STATX_UID != 0 { + stat.UID = uint32(uid) + } + if opts.Mask&linux.STATX_GID != 0 { + stat.GID = uint32(gid) + } + } + return stat, nil +} + +// CheckPermissions implements kernfs.Inode. +func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { + mode := i.Mode() + uid, gid := i.getOwner(mode) + return vfs.GenericCheckPermissions(creds, ats, mode, uid, gid) +} + +func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) { + // By default, set the task owner as the file owner. + creds := i.owner.Credentials() + uid := creds.EffectiveKUID + gid := creds.EffectiveKGID + + // Linux doesn't apply dumpability adjustments to world readable/executable + // directories so that applications can stat /proc/PID to determine the + // effective UID of a process. See fs/proc/base.c:task_dump_owner. 
+ if mode.FileType() == linux.ModeDirectory && mode.Permissions() == 0555 { + return uid, gid + } + + // If the task is not dumpable, then root (in the namespace preferred) + // owns the file. + m := getMM(i.owner) + if m == nil { + return auth.RootKUID, auth.RootKGID + } + if m.Dumpability() != mm.UserDumpable { + uid = auth.RootKUID + if kuid := creds.UserNamespace.MapToKUID(auth.RootUID); kuid.Ok() { + uid = kuid + } + gid = auth.RootKGID + if kgid := creds.UserNamespace.MapToKGID(auth.RootGID); kgid.Ok() { + gid = kgid + } + } + return uid, gid +} + +// newIO returns the ioData for t. When isThreadGroup is set the usage source +// is t's thread group, otherwise it is the task itself. +func newIO(t *kernel.Task, isThreadGroup bool) *ioData { + if isThreadGroup { + return &ioData{ioUsage: t.ThreadGroup()} + } + return &ioData{ioUsage: t} +} + +// newCgroupData creates inode that shows cgroup information. +// From man 7 cgroups: "For each cgroup hierarchy of which the process is a +// member, there is one entry containing three colon-separated fields: +// hierarchy-ID:controller-list:cgroup-path" +// +// controllers maps a controller name to its cgroup path. Entries are written +// in ascending controller-name order so that the generated contents, and the +// hierarchy id paired with each controller, are the same on every call. +func newCgroupData(controllers map[string]string) dynamicInode { + var buf bytes.Buffer + + // Go randomizes map iteration order, so collect and sort the names + // instead of ranging over the map while handing out ids. + names := make([]string, 0, len(controllers)) + for name := range controllers { + names = append(names, name) + } + sort.Strings(names) + + // The hierarchy ids must be positive integers (for cgroup v1), but the + // exact number does not matter, so long as they are unique. We can + // just use a counter, but since linux sorts this file in descending + // order, we must count down to preserve this behavior. + i := len(names) + for _, name := range names { + fmt.Fprintf(&buf, "%d:%s:%s\n", i, name, controllers[name]) + i-- + } + return newStaticFile(buf.String()) +} diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go new file mode 100644 index 000000000..fea29e5f0 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -0,0 +1,307 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + "sort" + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// getTaskFD looks up fd in t's FD table from inside t.WithMuLocked and returns +// the file together with its descriptor flags. The file is nil when t has no +// FD table; callers treat a nil file as "no such FD" and call DecRef on a +// non-nil one when they are done with it. +func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) { + var ( + file *vfs.FileDescription + flags kernel.FDFlags + ) + t.WithMuLocked(func(t *kernel.Task) { + if fdt := t.FDTable(); fdt != nil { + file, flags = fdt.GetVFS2(fd) + } + }) + return file, flags +} + +// taskFDExists reports whether getTaskFD finds a file for fd in t. The file is +// released again before returning. +func taskFDExists(t *kernel.Task, fd int32) bool { + file, _ := getTaskFD(t, fd) + if file == nil { + return false + } + file.DecRef() + return true +} + +// fdDir holds the state and the dirent iteration shared by the directory +// inodes that embed it (fdDirInode and fdInfoDirInode). +type fdDir struct { + locks vfs.FileLocks + + fs *filesystem + task *kernel.Task + + // When produceSymlink is set, dirents produced for the FDs are reported + // as symlinks. Otherwise, they are reported as regular files. + produceSymlink bool +} + +// IterDirents implements kernfs.inodeDynamicLookup. +func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + var fds []int32 + i.task.WithMuLocked(func(t *kernel.Task) { + if fdTable := t.FDTable(); fdTable != nil { + fds = fdTable.GetFDs() + } + }) + + typ := uint8(linux.DT_REG) + if i.produceSymlink { + typ = linux.DT_LNK + } + + // Find the appropriate starting point. 
+ idx := sort.Search(len(fds), func(i int) bool { return fds[i] >= int32(relOffset) }) + if idx >= len(fds) { + return offset, nil + } + + // relOffset is matched against FD numbers by the search above, so the + // offsets reported below must be FD-based as well. Counting emitted + // entries instead makes a resumed iteration repeat or skip FDs whenever + // the FD numbers are not contiguous from zero. base is the distance + // between the absolute and relative offsets supplied by the caller + // (presumably the entries emitted ahead of the dynamic ones). + base := offset - relOffset + for _, fd := range fds[idx:] { + dirent := vfs.Dirent{ + Name: strconv.FormatUint(uint64(fd), 10), + Type: typ, + Ino: i.fs.NextIno(), + NextOff: base + int64(fd) + 1, + } + if err := cb.Handle(dirent); err != nil { + // This FD was not consumed, so resume at it next time. + return base + int64(fd), err + } + } + // Everything was emitted; continue after the highest FD seen. + return base + int64(fds[len(fds)-1]) + 1, nil +} + +// fdDirInode represents the inode for /proc/[pid]/fd directory. +// +// +stateify savable +type fdDirInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeAttrs + kernfs.OrderedChildren + kernfs.AlwaysValid + fdDir +} + +var _ kernfs.Inode = (*fdDirInode)(nil) + +// newFDDirInode creates the /proc/[pid]/fd directory for task. Its dirents are +// reported as symlinks. +func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry { + inode := &fdDirInode{ + fdDir: fdDir{ + fs: fs, + task: task, + produceSymlink: true, + }, + } + inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + + dentry := &kernfs.Dentry{} + dentry.Init(inode) + inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + + return dentry +} + +// Lookup implements kernfs.inodeDynamicLookup. +func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + fdInt, err := strconv.ParseInt(name, 10, 32) + if err != nil { + return nil, syserror.ENOENT + } + fd := int32(fdInt) + if !taskFDExists(i.task, fd) { + return nil, syserror.ENOENT + } + taskDentry := i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()) + return taskDentry.VFSDentry(), nil +} + +// Open implements kernfs.Inode. +func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) + if err != nil { + return nil, err + } + return fd.VFSFileDescription(), nil +} + +// CheckPermissions implements kernfs.Inode. 
+// +// This is to match Linux, which uses a special permission handler to guarantee +// that a process can still access /proc/self/fd after it has executed +// setuid. See fs/proc/fd.c:proc_fd_permission. +func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { + err := i.InodeAttrs.CheckPermissions(ctx, creds, ats) + if err == nil { + // Access granted, no extra check needed. + return nil + } + if t := kernel.TaskFromContext(ctx); t != nil { + // Allow access if the task trying to access it is in the thread group + // corresponding to this directory. + if i.task.ThreadGroup() == t.ThreadGroup() { + // Access granted (overridden). + return nil + } + } + return err +} + +// fdSymlink is a symlink for the /proc/[pid]/fd/[fd] file. +// +// +stateify savable +type fdSymlink struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeSymlink + + task *kernel.Task + fd int32 +} + +var _ kernfs.Inode = (*fdSymlink)(nil) + +// newFDSymlink creates the symlink inode for fd of task with inode number ino +// and returns a new dentry for it. +func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry { + inode := &fdSymlink{ + task: task, + fd: fd, + } + inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +// Readlink returns the pathname of the file behind the FD, computed by +// PathnameWithDeleted relative to the root taken from ctx. It returns ENOENT +// if the task no longer has a file for the FD. +func (s *fdSymlink) Readlink(ctx context.Context) (string, error) { + file, _ := getTaskFD(s.task, s.fd) + if file == nil { + return "", syserror.ENOENT + } + defer file.DecRef() + root := vfs.RootFromContext(ctx) + defer root.DecRef() + return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry()) +} + +// Getlink returns the VirtualDentry of the file behind the FD, with a +// reference taken on it for the caller, and an empty path. It returns ENOENT +// if the task no longer has a file for the FD. +func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { + file, _ := getTaskFD(s.task, s.fd) + if file == nil { + return vfs.VirtualDentry{}, "", syserror.ENOENT + } + defer file.DecRef() + vd := file.VirtualDentry() + vd.IncRef() + return vd, "", nil +} + +// fdInfoDirInode represents the inode for /proc/[pid]/fdinfo 
directory. +// +// +stateify savable +type fdInfoDirInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeAttrs + kernfs.OrderedChildren + kernfs.AlwaysValid + fdDir +} + +var _ kernfs.Inode = (*fdInfoDirInode)(nil) + +func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry { + inode := &fdInfoDirInode{ + fdDir: fdDir{ + fs: fs, + task: task, + }, + } + inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + + dentry := &kernfs.Dentry{} + dentry.Init(inode) + inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + + return dentry +} + +// Lookup implements kernfs.inodeDynamicLookup. +func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + fdInt, err := strconv.ParseInt(name, 10, 32) + if err != nil { + return nil, syserror.ENOENT + } + fd := int32(fdInt) + if !taskFDExists(i.task, fd) { + return nil, syserror.ENOENT + } + data := &fdInfoData{ + task: i.task, + fd: fd, + } + dentry := i.fs.newTaskOwnedFile(i.task, i.fs.NextIno(), 0444, data) + return dentry.VFSDentry(), nil +} + +// Open implements kernfs.Inode. +func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) + if err != nil { + return nil, err + } + return fd.VFSFileDescription(), nil +} + +// fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd]. +// +// +stateify savable +type fdInfoData struct { + kernfs.DynamicBytesFile + refs.AtomicRefCount + + task *kernel.Task + fd int32 +} + +var _ dynamicInode = (*fdInfoData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. 
+func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { + file, descriptorFlags := getTaskFD(d.task, d.fd) + if file == nil { + return syserror.ENOENT + } + defer file.DecRef() + // TODO(b/121266871): Include pos, locks, and other data. For now we only + // have flags. + // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt + flags := uint(file.StatusFlags()) | descriptorFlags.ToLinuxFileFlags() + fmt.Fprintf(buf, "flags:\t0%o\n", flags) + return nil +} diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go new file mode 100644 index 000000000..9af43b859 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -0,0 +1,902 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + "io" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// "There is an (arbitrary) limit on the number of lines in the file. 
As at +// Linux 3.18, the limit is five lines." - user_namespaces(7) +const maxIDMapLines = 5 + +// mm gets the kernel task's MemoryManager. No additional reference is taken on +// mm here. This is safe because MemoryManager.destroy is required to leave the +// MemoryManager in a state where it's still usable as a DynamicBytesSource. +func getMM(task *kernel.Task) *mm.MemoryManager { + var tmm *mm.MemoryManager + task.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + tmm = mm + } + }) + return tmm +} + +// getMMIncRef returns t's MemoryManager. If getMMIncRef succeeds, the +// MemoryManager's users count is incremented, and must be decremented by the +// caller when it is no longer in use. +func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) { + if task.ExitState() == kernel.TaskExitDead { + return nil, syserror.ESRCH + } + var m *mm.MemoryManager + task.WithMuLocked(func(t *kernel.Task) { + m = t.MemoryManager() + }) + if m == nil || !m.IncUsers() { + return nil, io.EOF + } + return m, nil +} + +func checkTaskState(t *kernel.Task) error { + switch t.ExitState() { + case kernel.TaskExitZombie: + return syserror.EACCES + case kernel.TaskExitDead: + return syserror.ESRCH + } + return nil +} + +type bufferWriter struct { + buf *bytes.Buffer +} + +// WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns +// the number of bytes written. It may return a partial write without an +// error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not +// return a full write with an error (i.e. srcs.NumBytes(), err) where err +// != nil). +func (w *bufferWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + written := srcs.NumBytes() + for !srcs.IsEmpty() { + w.buf.Write(srcs.Head().ToSlice()) + srcs = srcs.Tail() + } + return written, nil +} + +// auxvData implements vfs.DynamicBytesSource for /proc/[pid]/auxv. 
+// +// +stateify savable +type auxvData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*auxvData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error { + m, err := getMMIncRef(d.task) + if err != nil { + return err + } + defer m.DecUsers(ctx) + + auxv := m.Auxv() + // Space for buffer with AT_NULL (0) terminator at the end. + buf.Grow((len(auxv) + 1) * 16) + for _, e := range auxv { + var tmp [16]byte + usermem.ByteOrder.PutUint64(tmp[:8], e.Key) + usermem.ByteOrder.PutUint64(tmp[8:], uint64(e.Value)) + buf.Write(tmp[:]) + } + var atNull [16]byte + buf.Write(atNull[:]) + + return nil +} + +// execArgType enumerates the types of exec arguments that are exposed through +// proc. +type execArgType int + +const ( + cmdlineDataArg execArgType = iota + environDataArg +) + +// cmdlineData implements vfs.DynamicBytesSource for /proc/[pid]/cmdline. +// +// +stateify savable +type cmdlineData struct { + kernfs.DynamicBytesFile + + task *kernel.Task + + // arg is the type of exec argument this file contains. + arg execArgType +} + +var _ dynamicInode = (*cmdlineData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error { + m, err := getMMIncRef(d.task) + if err != nil { + return err + } + defer m.DecUsers(ctx) + + // Figure out the bounds of the exec arg we are trying to read. + var ar usermem.AddrRange + switch d.arg { + case cmdlineDataArg: + ar = usermem.AddrRange{ + Start: m.ArgvStart(), + End: m.ArgvEnd(), + } + case environDataArg: + ar = usermem.AddrRange{ + Start: m.EnvvStart(), + End: m.EnvvEnd(), + } + default: + panic(fmt.Sprintf("unknown exec arg type %v", d.arg)) + } + if ar.Start == 0 || ar.End == 0 { + // Don't attempt to read before the start/end are set up. + return io.EOF + } + + // N.B. 
Technically this should be usermem.IOOpts.IgnorePermissions = true + // until Linux 4.9 (272ddc8b3735 "proc: don't use FOLL_FORCE for reading + // cmdline and environment"). + writer := &bufferWriter{buf: buf} + if n, err := m.CopyInTo(ctx, usermem.AddrRangeSeqOf(ar), writer, usermem.IOOpts{}); n == 0 || err != nil { + // Nothing to copy or something went wrong. + return err + } + + // On Linux, if the NULL byte at the end of the argument vector has been + // overwritten, it continues reading the environment vector as part of + // the argument vector. + if d.arg == cmdlineDataArg && buf.Bytes()[buf.Len()-1] != 0 { + if end := bytes.IndexByte(buf.Bytes(), 0); end != -1 { + // If we found a NULL character somewhere else in argv, truncate the + // return up to the NULL terminator (including it). + buf.Truncate(end) + return nil + } + + // There is no NULL terminator in the string, return into envp. + arEnvv := usermem.AddrRange{ + Start: m.EnvvStart(), + End: m.EnvvEnd(), + } + + // Upstream limits the returned amount to one page of slop. + // https://elixir.bootlin.com/linux/v4.20/source/fs/proc/base.c#L208 + // we'll return one page total between argv and envp because of the + // above page restrictions. + if buf.Len() >= usermem.PageSize { + // Returned at least one page already, nothing else to add. + return nil + } + remaining := usermem.PageSize - buf.Len() + if int(arEnvv.Length()) > remaining { + end, ok := arEnvv.Start.AddLength(uint64(remaining)) + if !ok { + return syserror.EFAULT + } + arEnvv.End = end + } + if _, err := m.CopyInTo(ctx, usermem.AddrRangeSeqOf(arEnvv), writer, usermem.IOOpts{}); err != nil { + return err + } + + // Linux will return envp up to and including the first NULL character, + // so find it. 
+ envStart := int(ar.Length()) + if nullIdx := bytes.IndexByte(buf.Bytes()[envStart:], 0); nullIdx != -1 { + buf.Truncate(envStart + nullIdx) + } + } + + return nil +} + +// +stateify savable +type commInode struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry { + inode := &commInode{task: task} + inode.DynamicBytesFile.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { + // This file can always be read or written by members of the same thread + // group. See fs/proc/base.c:proc_tid_comm_permission. + // + // N.B. This check is currently a no-op as we don't yet support writing and + // this file is world-readable anyways. + t := kernel.TaskFromContext(ctx) + if t != nil && t.ThreadGroup() == i.task.ThreadGroup() && !ats.MayExec() { + return nil + } + + return i.DynamicBytesFile.CheckPermissions(ctx, creds, ats) +} + +// commData implements vfs.DynamicBytesSource for /proc/[pid]/comm. +// +// +stateify savable +type commData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*commData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString(d.task.Name()) + buf.WriteString("\n") + return nil +} + +// idMapData implements vfs.WritableDynamicBytesSource for +// /proc/[pid]/{gid_map|uid_map}. +// +// +stateify savable +type idMapData struct { + kernfs.DynamicBytesFile + + task *kernel.Task + gids bool +} + +var _ dynamicInode = (*idMapData)(nil) + +// Generate implements vfs.WritableDynamicBytesSource.Generate. 
+func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error { + var entries []auth.IDMapEntry + if d.gids { + entries = d.task.UserNamespace().GIDMap() + } else { + entries = d.task.UserNamespace().UIDMap() + } + for _, e := range entries { + fmt.Fprintf(buf, "%10d %10d %10d\n", e.FirstID, e.FirstParentID, e.Length) + } + return nil +} + +// Write implements vfs.WritableDynamicBytesSource.Write. +func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + // "In addition, the number of bytes written to the file must be less than + // the system page size, and the write must be performed at the start of + // the file ..." - user_namespaces(7) + srclen := src.NumBytes() + if srclen >= usermem.PageSize || offset != 0 { + return 0, syserror.EINVAL + } + b := make([]byte, srclen) + if _, err := src.CopyIn(ctx, b); err != nil { + return 0, err + } + + // Truncate from the first NULL byte. + var nul int64 + nul = int64(bytes.IndexByte(b, 0)) + if nul == -1 { + nul = srclen + } + b = b[:nul] + // Remove the last \n. + if nul >= 1 && b[nul-1] == '\n' { + b = b[:nul-1] + } + lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1) + if len(lines) > maxIDMapLines { + return 0, syserror.EINVAL + } + + entries := make([]auth.IDMapEntry, len(lines)) + for i, l := range lines { + var e auth.IDMapEntry + _, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length) + if err != nil { + return 0, syserror.EINVAL + } + entries[i] = e + } + var err error + if d.gids { + err = d.task.UserNamespace().SetGIDMap(ctx, entries) + } else { + err = d.task.UserNamespace().SetUIDMap(ctx, entries) + } + if err != nil { + return 0, err + } + + // On success, Linux's kernel/user_namespace.c:map_write() always returns + // count, even if fewer bytes were used. + return int64(srclen), nil +} + +// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. 
+// +// +stateify savable +type mapsData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*mapsData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { + if mm := getMM(d.task); mm != nil { + mm.ReadMapsDataInto(ctx, buf) + } + return nil +} + +// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps. +// +// +stateify savable +type smapsData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*smapsData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { + if mm := getMM(d.task); mm != nil { + mm.ReadSmapsDataInto(ctx, buf) + } + return nil +} + +// +stateify savable +type taskStatData struct { + kernfs.DynamicBytesFile + + task *kernel.Task + + // If tgstats is true, accumulate fault stats (not implemented) and CPU + // time across all tasks in t's thread group. + tgstats bool + + // pidns is the PID namespace associated with the proc filesystem that + // includes the file using this statData. + pidns *kernel.PIDNamespace +} + +var _ dynamicInode = (*taskStatData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. 
+func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.task)) + fmt.Fprintf(buf, "(%s) ", s.task.Name()) + fmt.Fprintf(buf, "%c ", s.task.StateStatus()[0]) + ppid := kernel.ThreadID(0) + if parent := s.task.Parent(); parent != nil { + ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) + } + fmt.Fprintf(buf, "%d ", ppid) + fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.task.ThreadGroup().ProcessGroup())) + fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.task.ThreadGroup().Session())) + fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */) + fmt.Fprintf(buf, "0 " /* flags */) + fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */) + var cputime usage.CPUStats + if s.tgstats { + cputime = s.task.ThreadGroup().CPUStats() + } else { + cputime = s.task.CPUStats() + } + fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) + cputime = s.task.ThreadGroup().JoinedChildCPUStats() + fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) + fmt.Fprintf(buf, "%d %d ", s.task.Priority(), s.task.Niceness()) + fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Count()) + + // itrealvalue. Since kernel 2.6.17, this field is no longer + // maintained, and is hard coded as 0. + fmt.Fprintf(buf, "0 ") + + // Start time is relative to boot time, expressed in clock ticks. + fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.task.StartTime().Sub(s.task.Kernel().Timekeeper().BootTime()))) + + var vss, rss uint64 + s.task.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } + }) + fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize) + + // rsslim. 
+ fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Limits().Get(limits.Rss).Cur) + + fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */) + fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) + fmt.Fprintf(buf, "0 0 " /* nswap cnswap */) + terminationSignal := linux.Signal(0) + if s.task == s.task.ThreadGroup().Leader() { + terminationSignal = s.task.ThreadGroup().TerminationSignal() + } + fmt.Fprintf(buf, "%d ", terminationSignal) + fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */) + fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */) + fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */) + fmt.Fprintf(buf, "0\n" /* exit_code */) + + return nil +} + +// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm. +// +// +stateify savable +type statmData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*statmData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { + var vss, rss uint64 + s.task.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } + }) + + fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize) + return nil +} + +// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status. +// +// +stateify savable +type statusData struct { + kernfs.DynamicBytesFile + + task *kernel.Task + pidns *kernel.PIDNamespace +} + +var _ dynamicInode = (*statusData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. 
+func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "Name:\t%s\n", s.task.Name()) + fmt.Fprintf(buf, "State:\t%s\n", s.task.StateStatus()) + fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.task.ThreadGroup())) + fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.task)) + ppid := kernel.ThreadID(0) + if parent := s.task.Parent(); parent != nil { + ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) + } + fmt.Fprintf(buf, "PPid:\t%d\n", ppid) + tpid := kernel.ThreadID(0) + if tracer := s.task.Tracer(); tracer != nil { + tpid = s.pidns.IDOfTask(tracer) + } + fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid) + var fds int + var vss, rss, data uint64 + s.task.WithMuLocked(func(t *kernel.Task) { + if fdTable := t.FDTable(); fdTable != nil { + fds = fdTable.Size() + } + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + data = mm.VirtualDataSize() + } + }) + fmt.Fprintf(buf, "FDSize:\t%d\n", fds) + fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) + fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10) + fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10) + fmt.Fprintf(buf, "Threads:\t%d\n", s.task.ThreadGroup().Count()) + creds := s.task.Credentials() + fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps) + fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps) + fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) + fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps) + fmt.Fprintf(buf, "Seccomp:\t%d\n", s.task.SeccompMode()) + // We unconditionally report a single NUMA node. See + // pkg/sentry/syscalls/linux/sys_mempolicy.go. + fmt.Fprintf(buf, "Mems_allowed:\t1\n") + fmt.Fprintf(buf, "Mems_allowed_list:\t0\n") + return nil +} + +// ioUsage is the /proc/[pid]/io and /proc/[pid]/task/[tid]/io data provider. +type ioUsage interface { + // IOUsage returns the io usage data. 
+ IOUsage() *usage.IO +} + +// +stateify savable +type ioData struct { + kernfs.DynamicBytesFile + + ioUsage +} + +var _ dynamicInode = (*ioData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error { + io := usage.IO{} + io.Accumulate(i.IOUsage()) + + fmt.Fprintf(buf, "char: %d\n", io.CharsRead) + fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten) + fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls) + fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls) + fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead) + fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten) + fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled) + return nil +} + +// oomScoreAdj is a stub of the /proc/<pid>/oom_score_adj file. +// +// +stateify savable +type oomScoreAdj struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error { + if o.task.ExitState() == kernel.TaskExitDead { + return syserror.ESRCH + } + fmt.Fprintf(buf, "%d\n", o.task.OOMScoreAdj()) + return nil +} + +// Write implements vfs.WritableDynamicBytesSource.Write. +func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + + // Limit input size so as not to impact performance if input size is large. + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return 0, err + } + + if o.task.ExitState() == kernel.TaskExitDead { + return 0, syserror.ESRCH + } + if err := o.task.SetOOMScoreAdj(v); err != nil { + return 0, err + } + + return n, nil +} + +// exeSymlink is an symlink for the /proc/[pid]/exe file. 
+// +// +stateify savable +type exeSymlink struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeSymlink + + task *kernel.Task +} + +var _ kernfs.Inode = (*exeSymlink)(nil) + +func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry { + inode := &exeSymlink{task: task} + inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +// Readlink implements kernfs.Inode. +func (s *exeSymlink) Readlink(ctx context.Context) (string, error) { + if !kernel.ContextCanTrace(ctx, s.task, false) { + return "", syserror.EACCES + } + + // Pull out the executable for /proc/[pid]/exe. + exec, err := s.executable() + if err != nil { + return "", err + } + defer exec.DecRef() + + return exec.PathnameWithDeleted(ctx), nil +} + +// Getlink implements kernfs.Inode.Getlink. +func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { + if !kernel.ContextCanTrace(ctx, s.task, false) { + return vfs.VirtualDentry{}, "", syserror.EACCES + } + + exec, err := s.executable() + if err != nil { + return vfs.VirtualDentry{}, "", err + } + defer exec.DecRef() + + vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry() + vd.IncRef() + return vd, "", nil +} + +func (s *exeSymlink) executable() (file fsbridge.File, err error) { + if err := checkTaskState(s.task); err != nil { + return nil, err + } + + s.task.WithMuLocked(func(t *kernel.Task) { + mm := t.MemoryManager() + if mm == nil { + err = syserror.EACCES + return + } + + // The MemoryManager may be destroyed, in which case + // MemoryManager.destroy will simply set the executable to nil + // (with locks held). + file = mm.Executable() + if file == nil { + err = syserror.ESRCH + } + }) + return +} + +// mountInfoData is used to implement /proc/[pid]/mountinfo. 
+// +// +stateify savable +type mountInfoData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*mountInfoData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { + var fsctx *kernel.FSContext + i.task.WithMuLocked(func(t *kernel.Task) { + fsctx = t.FSContext() + }) + if fsctx == nil { + // The task has been destroyed. Nothing to show here. + return nil + } + rootDir := fsctx.RootDirectoryVFS2() + if !rootDir.Ok() { + // Root has been destroyed. Don't try to read mounts. + return nil + } + defer rootDir.DecRef() + i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf) + return nil +} + +// mountsData is used to implement /proc/[pid]/mounts. +// +// +stateify savable +type mountsData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*mountsData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { + var fsctx *kernel.FSContext + i.task.WithMuLocked(func(t *kernel.Task) { + fsctx = t.FSContext() + }) + if fsctx == nil { + // The task has been destroyed. Nothing to show here. + return nil + } + rootDir := fsctx.RootDirectoryVFS2() + if !rootDir.Ok() { + // Root has been destroyed. Don't try to read mounts. + return nil + } + defer rootDir.DecRef() + i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) + return nil +} + +type namespaceSymlink struct { + kernfs.StaticSymlink + + task *kernel.Task +} + +func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { + // Namespace symlinks should contain the namespace name and the inode number + // for the namespace instance, so for example user:[123456]. We currently fake + // the inode number by sticking the symlink inode in its place. 
+ target := fmt.Sprintf("%s:[%d]", ns, ino) + + inode := &namespaceSymlink{task: task} + // Note: credentials are overridden by taskOwnedInode. + inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) + + taskInode := &taskOwnedInode{Inode: inode, owner: task} + d := &kernfs.Dentry{} + d.Init(taskInode) + return d +} + +// Readlink implements Inode. +func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) { + if err := checkTaskState(s.task); err != nil { + return "", err + } + return s.StaticSymlink.Readlink(ctx) +} + +// Getlink implements Inode.Getlink. +func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { + if err := checkTaskState(s.task); err != nil { + return vfs.VirtualDentry{}, "", err + } + + // Create a synthetic inode to represent the namespace. + dentry := &kernfs.Dentry{} + dentry.Init(&namespaceInode{}) + vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry()) + vd.IncRef() + dentry.DecRef() + return vd, "", nil +} + +// namespaceInode is a synthetic inode created to represent a namespace in +// /proc/[pid]/ns/*. +type namespaceInode struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + + locks vfs.FileLocks +} + +var _ kernfs.Inode = (*namespaceInode)(nil) + +// Init initializes a namespace inode. +func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { + if perm&^linux.PermissionsMask != 0 { + panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) + } + i.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm) +} + +// Open implements Inode.Open. 
+func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &namespaceFD{inode: i} + i.IncRef() + fd.LockFD.Init(&i.locks) + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// namespace FD is a synthetic file that represents a namespace in +// /proc/[pid]/ns/*. +type namespaceFD struct { + vfs.FileDescriptionDefaultImpl + vfs.LockFD + + vfsfd vfs.FileDescription + inode *namespaceInode +} + +var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil) + +// Stat implements FileDescriptionImpl. +func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() + return fd.inode.Stat(vfs, opts) +} + +// SetStat implements FileDescriptionImpl. +func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() + creds := auth.CredentialsFromContext(ctx) + return fd.inode.SetStat(ctx, vfs, creds, opts) +} + +// Release implements FileDescriptionImpl. +func (fd *namespaceFD) Release() { + fd.inode.DecRef() +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *namespaceFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
+func (fd *namespaceFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go new file mode 100644 index 000000000..6bde27376 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -0,0 +1,810 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package proc + +import ( + "bytes" + "fmt" + "io" + "reflect" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/unix" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/usermem" +) + +func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry { + k := task.Kernel() + pidns := task.PIDNamespace() + root := auth.NewRootCredentials(pidns.UserNamespace()) + + var contents map[string]*kernfs.Dentry + if stack := task.NetworkNamespace().Stack(); stack != nil { + const ( + arp = "IP address HW type Flags HW address Mask Device\n" + netlink = "sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode\n" + packet = "sk RefCnt Type Proto Iface R Rmem User Inode\n" + protocols = "protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n" + ptype = "Type Device Function\n" + upd6 = " sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n" + ) + psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)) + + // TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task + // network namespace. 
+ contents = map[string]*kernfs.Dentry{ + "dev": fs.newDentry(root, fs.NextIno(), 0444, &netDevData{stack: stack}), + "snmp": fs.newDentry(root, fs.NextIno(), 0444, &netSnmpData{stack: stack}), + + // The following files are simple stubs until they are implemented in + // netstack, if the file contains a header the stub is just the header + // otherwise it is an empty file. + "arp": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(arp)), + "netlink": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(netlink)), + "netstat": fs.newDentry(root, fs.NextIno(), 0444, &netStatData{}), + "packet": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(packet)), + "protocols": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(protocols)), + + // Linux sets psched values to: nsec per usec, psched tick in ns, 1000000, + // high res timer ticks per sec (ClockGetres returns 1ns resolution). + "psched": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(psched)), + "ptype": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(ptype)), + "route": fs.newDentry(root, fs.NextIno(), 0444, &netRouteData{stack: stack}), + "tcp": fs.newDentry(root, fs.NextIno(), 0444, &netTCPData{kernel: k}), + "udp": fs.newDentry(root, fs.NextIno(), 0444, &netUDPData{kernel: k}), + "unix": fs.newDentry(root, fs.NextIno(), 0444, &netUnixData{kernel: k}), + } + + if stack.SupportsIPv6() { + contents["if_inet6"] = fs.newDentry(root, fs.NextIno(), 0444, &ifinet6{stack: stack}) + contents["ipv6_route"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")) + contents["tcp6"] = fs.newDentry(root, fs.NextIno(), 0444, &netTCP6Data{kernel: k}) + contents["udp6"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(upd6)) + } + } + + return fs.newTaskOwnedDir(task, fs.NextIno(), 0555, contents) +} + +// ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6. 
+// +// +stateify savable +type ifinet6 struct { + kernfs.DynamicBytesFile + + stack inet.Stack +} + +var _ dynamicInode = (*ifinet6)(nil) + +func (n *ifinet6) contents() []string { + var lines []string + nics := n.stack.Interfaces() + for id, naddrs := range n.stack.InterfaceAddrs() { + nic, ok := nics[id] + if !ok { + // NIC was added after NICNames was called. We'll just ignore it. + continue + } + + for _, a := range naddrs { + // IPv6 only. + if a.Family != linux.AF_INET6 { + continue + } + + // Fields: + // IPv6 address displayed in 32 hexadecimal chars without colons + // Netlink device number (interface index) in hexadecimal (use nic id) + // Prefix length in hexadecimal + // Scope value (use 0) + // Interface flags + // Device name + lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name)) + } + } + return lines +} + +// Generate implements vfs.DynamicBytesSource.Generate. +func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error { + for _, l := range n.contents() { + buf.WriteString(l) + } + return nil +} + +// netDevData implements vfs.DynamicBytesSource for /proc/net/dev. +// +// +stateify savable +type netDevData struct { + kernfs.DynamicBytesFile + + stack inet.Stack +} + +var _ dynamicInode = (*netDevData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (n *netDevData) Generate(ctx context.Context, buf *bytes.Buffer) error { + interfaces := n.stack.Interfaces() + buf.WriteString("Inter-| Receive | Transmit\n") + buf.WriteString(" face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n") + + for _, i := range interfaces { + // Implements the same format as + // net/core/net-procfs.c:dev_seq_printf_stats. 
+ var stats inet.StatDev + if err := n.stack.Statistics(&stats, i.Name); err != nil { + log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err) + continue + } + fmt.Fprintf( + buf, + "%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n", + i.Name, + // Received + stats[0], // bytes + stats[1], // packets + stats[2], // errors + stats[3], // dropped + stats[4], // fifo + stats[5], // frame + stats[6], // compressed + stats[7], // multicast + // Transmitted + stats[8], // bytes + stats[9], // packets + stats[10], // errors + stats[11], // dropped + stats[12], // fifo + stats[13], // frame + stats[14], // compressed + stats[15], // multicast + ) + } + + return nil +} + +// netUnixData implements vfs.DynamicBytesSource for /proc/net/unix. +// +// +stateify savable +type netUnixData struct { + kernfs.DynamicBytesFile + + kernel *kernel.Kernel +} + +var _ dynamicInode = (*netUnixData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString("Num RefCount Protocol Flags Type St Inode Path\n") + for _, se := range n.kernel.ListSockets() { + s := se.SockVFS2 + if !s.TryIncRef() { + log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s) + continue + } + if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX { + s.DecRef() + // Not a unix socket. + continue + } + sops := s.Impl().(*unix.SocketVFS2) + + addr, err := sops.Endpoint().GetLocalAddress() + if err != nil { + log.Warningf("Failed to retrieve socket name from %+v: %v", s, err) + addr.Addr = "<unknown>" + } + + sockFlags := 0 + if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok { + if ce.Listening() { + // For unix domain sockets, linux reports a single flag + // value if the socket is listening, of __SO_ACCEPTCON. + sockFlags = linux.SO_ACCEPTCON + } + } + + // Get inode number. 
+ var ino uint64 + stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_INO}) + if statErr != nil || stat.Mask&linux.STATX_INO == 0 { + log.Warningf("Failed to retrieve ino for socket file: %v", statErr) + } else { + ino = stat.Ino + } + + // In the socket entry below, the value for the 'Num' field requires + // some consideration. Linux prints the address to the struct + // unix_sock representing a socket in the kernel, but may redact the + // value for unprivileged users depending on the kptr_restrict + // sysctl. + // + // One use for this field is to allow a privileged user to + // introspect into the kernel memory to determine information about + // a socket not available through procfs, such as the socket's peer. + // + // In gvisor, returning a pointer to our internal structures would + // be pointless, as it wouldn't match the memory layout for struct + // unix_sock, making introspection difficult. We could populate a + // struct unix_sock with the appropriate data, but even that + // requires consideration for which kernel version to emulate, as + // the definition of this struct changes over time. + // + // For now, we always redact this pointer. + fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d", + (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. + s.Refs()-1, // RefCount, don't count our own ref. + 0, // Protocol, always 0 for UDS. + sockFlags, // Flags. + sops.Endpoint().Type(), // Type. + sops.State(), // State. + ino, // Inode. + ) + + // Path + if len(addr.Addr) != 0 { + if addr.Addr[0] == 0 { + // Abstract path. + fmt.Fprintf(buf, " @%s", string(addr.Addr[1:])) + } else { + fmt.Fprintf(buf, " %s", string(addr.Addr)) + } + } + fmt.Fprintf(buf, "\n") + + s.DecRef() + } + return nil +} + +func networkToHost16(n uint16) uint16 { + // n is in network byte order, so is big-endian. The most-significant byte + // should be stored in the lower address. 
+ // + // We manually inline binary.BigEndian.Uint16() because Go does not support + // non-primitive consts, so binary.BigEndian is a (mutable) var, so calls to + // binary.BigEndian.Uint16() require a read of binary.BigEndian and an + // interface method call, defeating inlining. + buf := [2]byte{byte(n >> 8 & 0xff), byte(n & 0xff)} + return usermem.ByteOrder.Uint16(buf[:]) +} + +func writeInetAddr(w io.Writer, family int, i linux.SockAddr) { + switch family { + case linux.AF_INET: + var a linux.SockAddrInet + if i != nil { + a = *i.(*linux.SockAddrInet) + } + + // linux.SockAddrInet.Port is stored in the network byte order and is + // printed like a number in host byte order. Note that all numbers in host + // byte order are printed with the most-significant byte first when + // formatted with %X. See get_tcp4_sock() and udp4_format_sock() in Linux. + port := networkToHost16(a.Port) + + // linux.SockAddrInet.Addr is stored as a byte slice in big-endian order + // (i.e. most-significant byte in index 0). Linux represents this as a + // __be32 which is a typedef for an unsigned int, and is printed with + // %X. This means that for a little-endian machine, Linux prints the + // least-significant byte of the address first. To emulate this, we first + // invert the byte order for the address using usermem.ByteOrder.Uint32, + // which makes it have the equivalent encoding to a __be32 on a little + // endian machine. Note that this operation is a no-op on a big endian + // machine. Then similar to Linux, we format it with %X, which will print + // the most-significant byte of the __be32 address first, which is now + // actually the least-significant byte of the original address in + // linux.SockAddrInet.Addr on little endian machines, due to the conversion. 
// commonGenerateTCP writes one line to buf for every socket in k's socket
// table whose family equals the given family and whose type is SOCK_STREAM.
// It is shared by the /proc/net/tcp and /proc/net/tcp6 generators, which write
// their own header line before calling it. It always returns nil.
//
// Each socket is pinned with TryIncRef for the duration of its line and
// released with DecRef afterwards; sockets that cannot be pinned are skipped.
func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, family int) error {
	// t may be nil here if our caller is not part of a task goroutine. This can
	// happen for example if we're here for "sentryctl cat". When t is nil,
	// degrade gracefully and retrieve what we can.
	t := kernel.TaskFromContext(ctx)

	for _, se := range k.ListSockets() {
		s := se.SockVFS2
		if !s.TryIncRef() {
			log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s)
			continue
		}
		sops, ok := s.Impl().(socket.SocketVFS2)
		if !ok {
			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s))
		}
		if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) {
			s.DecRef()
			// Not a stream socket of the requested family.
			continue
		}

		// Linux's documentation for the fields below can be found at
		// https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt.
		// For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock().
		// Note that the header doesn't contain labels for all the fields.

		// Field: sl; entry number.
		fmt.Fprintf(buf, "%4d: ", se.ID)

		// Field: local_address. Left as a nil address (printed as zeroes by
		// writeInetAddr) when there is no task or the lookup fails.
		var localAddr linux.SockAddr
		if t != nil {
			if local, _, err := sops.GetSockName(t); err == nil {
				localAddr = local
			}
		}
		writeInetAddr(buf, family, localAddr)

		// Field: rem_address. Same fallback as local_address.
		var remoteAddr linux.SockAddr
		if t != nil {
			if remote, _, err := sops.GetPeerName(t); err == nil {
				remoteAddr = remote
			}
		}
		writeInetAddr(buf, family, remoteAddr)

		// Field: state; socket state.
		fmt.Fprintf(buf, "%02X ", sops.State())

		// Field: tx_queue, rx_queue; number of packets in the transmit and
		// receive queue. Unimplemented.
		fmt.Fprintf(buf, "%08X:%08X ", 0, 0)

		// Field: tr, tm->when; timer active state and number of jiffies
		// until timer expires. Unimplemented.
		fmt.Fprintf(buf, "%02X:%08X ", 0, 0)

		// Field: retrnsmt; number of unrecovered RTO timeouts.
		// Unimplemented.
		fmt.Fprintf(buf, "%08X ", 0)

		// A single Stat call supplies both the uid and the inode fields below.
		stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO})

		// Field: uid. Reported relative to the reader's user namespace; 0 is
		// printed if the owner could not be retrieved.
		if statErr != nil || stat.Mask&linux.STATX_UID == 0 {
			log.Warningf("Failed to retrieve uid for socket file: %v", statErr)
			fmt.Fprintf(buf, "%5d ", 0)
		} else {
			creds := auth.CredentialsFromContext(ctx)
			fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow()))
		}

		// Field: timeout; number of unanswered 0-window probes.
		// Unimplemented.
		fmt.Fprintf(buf, "%8d ", 0)

		// Field: inode. 0 is printed if the inode number could not be
		// retrieved.
		if statErr != nil || stat.Mask&linux.STATX_INO == 0 {
			log.Warningf("Failed to retrieve inode for socket file: %v", statErr)
			fmt.Fprintf(buf, "%8d ", 0)
		} else {
			fmt.Fprintf(buf, "%8d ", stat.Ino)
		}

		// Field: refcount. Don't count the ref we obtain while dereferencing
		// the weakref to this socket.
		fmt.Fprintf(buf, "%d ", s.Refs()-1)

		// Field: Socket struct address. Redacted for the same reason as the
		// 'Num' field in /proc/net/unix, see netUnixData.Generate.
		fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil))

		// Field: retransmit timeout. Unimplemented.
		fmt.Fprintf(buf, "%d ", 0)

		// Field: predicted tick of soft clock (delayed ACK control data).
		// Unimplemented.
		fmt.Fprintf(buf, "%d ", 0)

		// Field: (ack.quick<<1)|ack.pingpong, Unimplemented.
		fmt.Fprintf(buf, "%d ", 0)

		// Field: sending congestion window, Unimplemented.
		fmt.Fprintf(buf, "%d ", 0)

		// Field: Slow start size threshold, -1 if threshold >= 0xFFFF.
		// Unimplemented, report as large threshold.
		fmt.Fprintf(buf, "%d", -1)

		fmt.Fprintf(buf, "\n")

		s.DecRef()
	}

	return nil
}
+ t := kernel.TaskFromContext(ctx) + + buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops \n") + + for _, se := range d.kernel.ListSockets() { + s := se.SockVFS2 + if !s.TryIncRef() { + log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s) + continue + } + sops, ok := s.Impl().(socket.SocketVFS2) + if !ok { + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) + } + if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { + s.DecRef() + // Not udp4 socket. + continue + } + + // For Linux's implementation, see net/ipv4/udp.c:udp4_format_sock(). + + // Field: sl; entry number. + fmt.Fprintf(buf, "%5d: ", se.ID) + + // Field: local_adddress. + var localAddr linux.SockAddrInet + if t != nil { + if local, _, err := sops.GetSockName(t); err == nil { + localAddr = *local.(*linux.SockAddrInet) + } + } + writeInetAddr(buf, linux.AF_INET, &localAddr) + + // Field: rem_address. + var remoteAddr linux.SockAddrInet + if t != nil { + if remote, _, err := sops.GetPeerName(t); err == nil { + remoteAddr = *remote.(*linux.SockAddrInet) + } + } + writeInetAddr(buf, linux.AF_INET, &remoteAddr) + + // Field: state; socket state. + fmt.Fprintf(buf, "%02X ", sops.State()) + + // Field: tx_queue, rx_queue; number of packets in the transmit and + // receive queue. Unimplemented. + fmt.Fprintf(buf, "%08X:%08X ", 0, 0) + + // Field: tr, tm->when. Always 0 for UDP. + fmt.Fprintf(buf, "%02X:%08X ", 0, 0) + + // Field: retrnsmt. Always 0 for UDP. + fmt.Fprintf(buf, "%08X ", 0) + + stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO}) + + // Field: uid. 
+ if statErr != nil || stat.Mask&linux.STATX_UID == 0 { + log.Warningf("Failed to retrieve uid for socket file: %v", statErr) + fmt.Fprintf(buf, "%5d ", 0) + } else { + creds := auth.CredentialsFromContext(ctx) + fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow())) + } + + // Field: timeout. Always 0 for UDP. + fmt.Fprintf(buf, "%8d ", 0) + + // Field: inode. + if statErr != nil || stat.Mask&linux.STATX_INO == 0 { + log.Warningf("Failed to retrieve inode for socket file: %v", statErr) + fmt.Fprintf(buf, "%8d ", 0) + } else { + fmt.Fprintf(buf, "%8d ", stat.Ino) + } + + // Field: ref; reference count on the socket inode. Don't count the ref + // we obtain while deferencing the weakref to this socket. + fmt.Fprintf(buf, "%d ", s.Refs()-1) + + // Field: Socket struct address. Redacted due to the same reason as + // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. + fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil)) + + // Field: drops; number of dropped packets. Unimplemented. + fmt.Fprintf(buf, "%d", 0) + + fmt.Fprintf(buf, "\n") + + s.DecRef() + } + return nil +} + +// netSnmpData implements vfs.DynamicBytesSource for /proc/net/snmp. 
// sprintSlice formats s as its elements in decimal separated by single
// spaces, with no surrounding brackets. An empty or nil slice yields "".
func sprintSlice(s []uint64) string {
	var out []byte
	for idx, v := range s {
		if idx != 0 {
			out = append(out, ' ')
		}
		out = append(out, fmt.Sprint(v)...)
	}
	return string(out)
}
+ &inet.StatSNMPTCP{}, + &inet.StatSNMPUDP{}, + &inet.StatSNMPUDPLite{}, + } + for i, stat := range types { + line := snmp[i] + if stat == nil { + fmt.Fprintf(buf, "%s:\n", line.prefix) + fmt.Fprintf(buf, "%s:\n", line.prefix) + continue + } + if err := d.stack.Statistics(stat, line.prefix); err != nil { + if err == syserror.EOPNOTSUPP { + log.Infof("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) + } else { + log.Warningf("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) + } + } + + fmt.Fprintf(buf, "%s: %s\n", line.prefix, line.header) + + if line.prefix == "Tcp" { + tcp := stat.(*inet.StatSNMPTCP) + // "Tcp" needs special processing because MaxConn is signed. RFC 2012. + fmt.Fprintf(buf, "%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:])) + } else { + fmt.Fprintf(buf, "%s: %s\n", line.prefix, sprintSlice(toSlice(stat))) + } + } + return nil +} + +// netRouteData implements vfs.DynamicBytesSource for /proc/net/route. +// +// +stateify savable +type netRouteData struct { + kernfs.DynamicBytesFile + + stack inet.Stack +} + +var _ dynamicInode = (*netRouteData)(nil) + +// Generate implements vfs.DynamicBytesSource. +// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show. +func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT") + + interfaces := d.stack.Interfaces() + for _, rt := range d.stack.RouteTable() { + // /proc/net/route only includes ipv4 routes. + if rt.Family != linux.AF_INET { + continue + } + + // /proc/net/route does not include broadcast or multicast routes. 
+ if rt.Type == linux.RTN_BROADCAST || rt.Type == linux.RTN_MULTICAST { + continue + } + + iface, ok := interfaces[rt.OutputInterface] + if !ok || iface.Name == "lo" { + continue + } + + var ( + gw uint32 + prefix uint32 + flags = linux.RTF_UP + ) + if len(rt.GatewayAddr) == header.IPv4AddressSize { + flags |= linux.RTF_GATEWAY + gw = usermem.ByteOrder.Uint32(rt.GatewayAddr) + } + if len(rt.DstAddr) == header.IPv4AddressSize { + prefix = usermem.ByteOrder.Uint32(rt.DstAddr) + } + l := fmt.Sprintf( + "%s\t%08X\t%08X\t%04X\t%d\t%d\t%d\t%08X\t%d\t%d\t%d", + iface.Name, + prefix, + gw, + flags, + 0, // RefCnt. + 0, // Use. + 0, // Metric. + (uint32(1)<<rt.DstLen)-1, + 0, // MTU. + 0, // Window. + 0, // RTT. + ) + fmt.Fprintf(buf, "%-127s\n", l) + } + return nil +} + +// netStatData implements vfs.DynamicBytesSource for /proc/net/netstat. +// +// +stateify savable +type netStatData struct { + kernfs.DynamicBytesFile + + stack inet.Stack +} + +var _ dynamicInode = (*netStatData)(nil) + +// Generate implements vfs.DynamicBytesSource. +// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show. 
// Generate implements vfs.DynamicBytesSource.Generate.
//
// It writes only the "TcpExt:" header line of /proc/net/netstat, i.e. the
// list of counter names. No counter values are written, and the stack held by
// d is not consulted. It always returns nil.
func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " +
		"EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps " +
		"LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive " +
		"PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost " +
		"ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog " +
		"TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser " +
		"TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging " +
		"TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo " +
		"TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit " +
		"TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans " +
		"TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes " +
		"TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail " +
		"TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent " +
		"TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose " +
		"TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed " +
		"TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld " +
		"TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected " +
		"TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback " +
		"TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter " +
		"TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail " +
		"TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK " +
		"TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail " +
		"TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow " +
		"TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets " +
		"TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv " +
		"TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect " +
		"TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd " +
		"TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq " +
		"TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge " +
		"TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")
	return nil
}
// newTasksInode builds the inode and dentry for the /proc root directory.
//
// The static entries (cpuinfo, meminfo, stat, ...) are created up front and
// handed to OrderedChildren. The "self" and "thread-self" symlinks are kept
// on the inode itself rather than in that set, and per-task directories are
// not created here at all. cgroupControllers is stored on the inode unchanged.
//
// It returns the new inode together with the dentry that wraps it.
func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) {
	// Every file below is owned by root of the PID namespace's user namespace.
	root := auth.NewRootCredentials(pidns.UserNamespace())
	contents := map[string]*kernfs.Dentry{
		"cpuinfo":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))),
		"filesystems": fs.newDentry(root, fs.NextIno(), 0444, &filesystemsData{}),
		"loadavg":     fs.newDentry(root, fs.NextIno(), 0444, &loadavgData{}),
		"sys":         fs.newSysDir(root, k),
		"meminfo":     fs.newDentry(root, fs.NextIno(), 0444, &meminfoData{}),
		// "mounts" and "net" are symlinks into the reading task's own
		// directory.
		"mounts":  kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"),
		"net":     kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"),
		"stat":    fs.newDentry(root, fs.NextIno(), 0444, &statData{}),
		"uptime":  fs.newDentry(root, fs.NextIno(), 0444, &uptimeData{}),
		"version": fs.newDentry(root, fs.NextIno(), 0444, &versionData{}),
	}

	inode := &tasksInode{
		pidns:             pidns,
		fs:                fs,
		selfSymlink:       fs.newSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(),
		threadSelfSymlink: fs.newThreadSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(),
		cgroupControllers: cgroupControllers,
	}
	inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)

	dentry := &kernfs.Dentry{}
	dentry.Init(inode)

	// Populate reports how many links the static children add; fold that into
	// the directory's own link count.
	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
	links := inode.OrderedChildren.Populate(dentry, contents)
	inode.IncLinks(links)

	return inode, dentry
}
+func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + // Try to lookup a corresponding task. + tid, err := strconv.ParseUint(name, 10, 64) + if err != nil { + // If it failed to parse, check if it's one of the special handled files. + switch name { + case selfName: + return i.selfSymlink, nil + case threadSelfName: + return i.threadSelfSymlink, nil + } + return nil, syserror.ENOENT + } + + task := i.pidns.TaskWithID(kernel.ThreadID(tid)) + if task == nil { + return nil, syserror.ENOENT + } + + taskDentry := i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers) + return taskDentry.VFSDentry(), nil +} + +// IterDirents implements kernfs.inodeDynamicLookup. +func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) { + // fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256 + const FIRST_PROCESS_ENTRY = 256 + + // Use maxTaskID to shortcut searches that will result in 0 entries. + const maxTaskID = kernel.TasksLimit + 1 + if offset >= maxTaskID { + return offset, nil + } + + // According to Linux (fs/proc/base.c:proc_pid_readdir()), process directories + // start at offset FIRST_PROCESS_ENTRY with '/proc/self', followed by + // '/proc/thread-self' and then '/proc/[pid]'. + if offset < FIRST_PROCESS_ENTRY { + offset = FIRST_PROCESS_ENTRY + } + + if offset == FIRST_PROCESS_ENTRY { + dirent := vfs.Dirent{ + Name: selfName, + Type: linux.DT_LNK, + Ino: i.fs.NextIno(), + NextOff: offset + 1, + } + if err := cb.Handle(dirent); err != nil { + return offset, err + } + offset++ + } + if offset == FIRST_PROCESS_ENTRY+1 { + dirent := vfs.Dirent{ + Name: threadSelfName, + Type: linux.DT_LNK, + Ino: i.fs.NextIno(), + NextOff: offset + 1, + } + if err := cb.Handle(dirent); err != nil { + return offset, err + } + offset++ + } + + // Collect all tasks that TGIDs are greater than the offset specified. Per + // Linux we only include in directory listings if it's the leader. 
But for + // whatever crazy reason, you can still walk to the given node. + var tids []int + startTid := offset - FIRST_PROCESS_ENTRY - 2 + for _, tg := range i.pidns.ThreadGroups() { + tid := i.pidns.IDOfThreadGroup(tg) + if int64(tid) < startTid { + continue + } + if leader := tg.Leader(); leader != nil { + tids = append(tids, int(tid)) + } + } + + if len(tids) == 0 { + return offset, nil + } + + sort.Ints(tids) + for _, tid := range tids { + dirent := vfs.Dirent{ + Name: strconv.FormatUint(uint64(tid), 10), + Type: linux.DT_DIR, + Ino: i.fs.NextIno(), + NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1, + } + if err := cb.Handle(dirent); err != nil { + return offset, err + } + offset++ + } + return maxTaskID, nil +} + +// Open implements kernfs.Inode. +func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) + if err != nil { + return nil, err + } + return fd.VFSFileDescription(), nil +} + +func (i *tasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + stat, err := i.InodeAttrs.Stat(vsfs, opts) + if err != nil { + return linux.Statx{}, err + } + + if opts.Mask&linux.STATX_NLINK != 0 { + // Add dynamic children to link count. + for _, tg := range i.pidns.ThreadGroups() { + if leader := tg.Leader(); leader != nil { + stat.Nlink++ + } + } + } + + return stat, nil +} + +// staticFileSetStat implements a special static file that allows inode +// attributes to be set. This is to support /proc files that are readonly, but +// allow attributes to be set. 
+type staticFileSetStat struct { + dynamicBytesFileSetAttr + vfs.StaticData +} + +var _ dynamicInode = (*staticFileSetStat)(nil) + +func newStaticFileSetStat(data string) *staticFileSetStat { + return &staticFileSetStat{StaticData: vfs.StaticData{Data: data}} +} + +func cpuInfoData(k *kernel.Kernel) string { + features := k.FeatureSet() + if features == nil { + // Kernel is always initialized with a FeatureSet. + panic("cpuinfo read with nil FeatureSet") + } + var buf bytes.Buffer + for i, max := uint(0), k.ApplicationCores(); i < max; i++ { + features.WriteCPUInfoTo(i, &buf) + } + return buf.String() +} + +func shmData(v uint64) dynamicInode { + return newStaticFile(strconv.FormatUint(v, 10)) +} diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go new file mode 100644 index 000000000..7d8983aa5 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -0,0 +1,384 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package proc + +import ( + "bytes" + "fmt" + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +type selfSymlink struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeSymlink + + pidns *kernel.PIDNamespace +} + +var _ kernfs.Inode = (*selfSymlink)(nil) + +func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { + inode := &selfSymlink{pidns: pidns} + inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // Who is reading this link? + return "", syserror.EINVAL + } + tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) + if tgid == 0 { + return "", syserror.ENOENT + } + return strconv.FormatUint(uint64(tgid), 10), nil +} + +func (s *selfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx) + return vfs.VirtualDentry{}, target, err +} + +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. 
+func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return syserror.EPERM +} + +type threadSelfSymlink struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeSymlink + + pidns *kernel.PIDNamespace +} + +var _ kernfs.Inode = (*threadSelfSymlink)(nil) + +func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { + inode := &threadSelfSymlink{pidns: pidns} + inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // Who is reading this link? + return "", syserror.EINVAL + } + tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) + tid := s.pidns.IDOfTask(t) + if tid == 0 || tgid == 0 { + return "", syserror.ENOENT + } + return fmt.Sprintf("%d/task/%d", tgid, tid), nil +} + +func (s *threadSelfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx) + return vfs.VirtualDentry{}, target, err +} + +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return syserror.EPERM +} + +// dynamicBytesFileSetAttr implements a special file that allows inode +// attributes to be set. This is to support /proc files that are readonly, but +// allow attributes to be set. +type dynamicBytesFileSetAttr struct { + kernfs.DynamicBytesFile +} + +// SetStat implements Inode.SetStat. 
// cpuStats holds the per-category CPU time counters that make up one "cpu"
// line of /proc/stat. Fields are declared in the order they are printed.
type cpuStats struct {
	// user: time in userspace tasks with non-positive niceness.
	user uint64

	// nice: time in userspace tasks with positive niceness.
	nice uint64

	// system: time in non-interrupt kernel context.
	system uint64

	// idle: time spent idle.
	idle uint64

	// ioWait: time spent waiting for IO.
	ioWait uint64

	// irq: time in interrupt context.
	irq uint64

	// softirq: time in software interrupt context.
	softirq uint64

	// steal: involuntary wait time.
	steal uint64

	// guest: time in guests with non-positive niceness.
	guest uint64

	// guestNice: time in guests with positive niceness.
	guestNice uint64
}

// String implements fmt.Stringer. It returns the ten counters in declaration
// order, in decimal, separated by single spaces.
func (c cpuStats) String() string {
	// fmt.Sprint puts a space between adjacent non-string operands, which is
	// exactly the separator wanted here.
	return fmt.Sprint(
		c.user, c.nice, c.system, c.idle, c.ioWait,
		c.irq, c.softirq, c.steal, c.guest, c.guestNice,
	)
}
+ // + // Since we don't report real interrupt stats, just choose an arbitrary + // value from a representative VM. + const numInterrupts = 256 + + // The Kernel doesn't handle real interrupts, so report all zeroes. + // TODO(b/37226836): We could count page faults as #PF. + fmt.Fprintf(buf, "intr 0") // total + for i := 0; i < numInterrupts; i++ { + fmt.Fprintf(buf, " 0") + } + fmt.Fprintf(buf, "\n") + + // Total number of context switches. + // TODO(b/37226836): Count this. + fmt.Fprintf(buf, "ctxt 0\n") + + // CLOCK_REALTIME timestamp from boot, in seconds. + fmt.Fprintf(buf, "btime %d\n", k.Timekeeper().BootTime().Seconds()) + + // Total number of clones. + // TODO(b/37226836): Count this. + fmt.Fprintf(buf, "processes 0\n") + + // Number of runnable tasks. + // TODO(b/37226836): Count this. + fmt.Fprintf(buf, "procs_running 0\n") + + // Number of tasks waiting on IO. + // TODO(b/37226836): Count this. + fmt.Fprintf(buf, "procs_blocked 0\n") + + // Number of each softirq handled. + fmt.Fprintf(buf, "softirq 0") // total + for i := 0; i < linux.NumSoftIRQ; i++ { + fmt.Fprintf(buf, " 0") + } + fmt.Fprintf(buf, "\n") + return nil +} + +// loadavgData backs /proc/loadavg. +// +// +stateify savable +type loadavgData struct { + dynamicBytesFileSetAttr +} + +var _ dynamicInode = (*loadavgData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (*loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { + // TODO(b/62345059): Include real data in fields. + // Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods. + // Column 4-5: currently running processes and the total number of processes. + // Column 6: the last process ID used. + fmt.Fprintf(buf, "%.2f %.2f %.2f %d/%d %d\n", 0.00, 0.00, 0.00, 0, 0, 0) + return nil +} + +// meminfoData implements vfs.DynamicBytesSource for /proc/meminfo. 
//
// +stateify savable
type meminfoData struct {
	dynamicBytesFileSetAttr
}

var _ dynamicInode = (*meminfoData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
//
// It refreshes the kernel MemoryFile's usage, takes a copy of the memory
// accounting counters, and prints them in kB. It always returns nil.
//
// NOTE(review): the column padding inside these format strings looks like it
// was collapsed to single spaces in this copy of the source — confirm against
// the intended /proc/meminfo layout before relying on the exact spacing.
func (*meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	k := kernel.KernelFromContext(ctx)
	mf := k.MemoryFile()
	mf.UpdateUsage()
	snapshot, totalUsage := usage.MemoryAccounting.Copy()
	totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage)
	anon := snapshot.Anonymous + snapshot.Tmpfs
	file := snapshot.PageCache + snapshot.Mapped
	// We don't actually have active/inactive LRUs, so just make up numbers.
	// activeFile is half of file, rounded down to a page boundary; the rest
	// is reported as inactive.
	activeFile := (file / 2) &^ (usermem.PageSize - 1)
	inactiveFile := file - activeFile

	fmt.Fprintf(buf, "MemTotal: %8d kB\n", totalSize/1024)
	memFree := totalSize - totalUsage
	if memFree > totalSize {
		// Underflow.
		memFree = 0
	}
	// We use MemFree as MemAvailable because we don't swap.
	// TODO(rahat): When reclaim is implemented the value of MemAvailable
	// should change.
	fmt.Fprintf(buf, "MemFree: %8d kB\n", memFree/1024)
	fmt.Fprintf(buf, "MemAvailable: %8d kB\n", memFree/1024)
	fmt.Fprintf(buf, "Buffers: 0 kB\n") // memory usage by block devices
	fmt.Fprintf(buf, "Cached: %8d kB\n", (file+snapshot.Tmpfs)/1024)
	// Emulate a system with no swap, which disables inactivation of anon pages.
	fmt.Fprintf(buf, "SwapCache: 0 kB\n")
	fmt.Fprintf(buf, "Active: %8d kB\n", (anon+activeFile)/1024)
	fmt.Fprintf(buf, "Inactive: %8d kB\n", inactiveFile/1024)
	fmt.Fprintf(buf, "Active(anon): %8d kB\n", anon/1024)
	fmt.Fprintf(buf, "Inactive(anon): 0 kB\n")
	fmt.Fprintf(buf, "Active(file): %8d kB\n", activeFile/1024)
	fmt.Fprintf(buf, "Inactive(file): %8d kB\n", inactiveFile/1024)
	fmt.Fprintf(buf, "Unevictable: 0 kB\n") // TODO(b/31823263)
	fmt.Fprintf(buf, "Mlocked: 0 kB\n") // TODO(b/31823263)
	fmt.Fprintf(buf, "SwapTotal: 0 kB\n")
	fmt.Fprintf(buf, "SwapFree: 0 kB\n")
	fmt.Fprintf(buf, "Dirty: 0 kB\n")
	fmt.Fprintf(buf, "Writeback: 0 kB\n")
	fmt.Fprintf(buf, "AnonPages: %8d kB\n", anon/1024)
	fmt.Fprintf(buf, "Mapped: %8d kB\n", file/1024) // doesn't count mapped tmpfs, which we don't know
	fmt.Fprintf(buf, "Shmem: %8d kB\n", snapshot.Tmpfs/1024)
	return nil
}

// uptimeData implements vfs.DynamicBytesSource for /proc/uptime.
//
// +stateify savable
type uptimeData struct {
	dynamicBytesFileSetAttr
}

var _ dynamicInode = (*uptimeData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
//
// The first number is the time elapsed since the Timekeeper's boot time, in
// seconds; the second is always 0.00. It always returns nil.
func (*uptimeData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	k := kernel.KernelFromContext(ctx)
	now := time.NowFromContext(ctx)

	// Pretend that we've spent zero time sleeping (second number).
	fmt.Fprintf(buf, "%.2f 0.00\n", now.Sub(k.Timekeeper().BootTime()).Seconds())
	return nil
}

// versionData implements vfs.DynamicBytesSource for /proc/version.
//
// +stateify savable
type versionData struct {
	dynamicBytesFileSetAttr
}

var _ dynamicInode = (*versionData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
//
// It panics if the kernel has no global init thread group yet; otherwise it
// returns nil.
func (*versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	k := kernel.KernelFromContext(ctx)
	init := k.GlobalInit()
	if init == nil {
		// Attempted to read before the init Task is created. This can
		// only occur during startup, which should never need to read
		// this file.
		panic("Attempted to read version before initial Task is available")
	}

	// /proc/version takes the form:
	//
	// "SYSNAME version RELEASE (COMPILE_USER@COMPILE_HOST)
	// (COMPILER_VERSION) VERSION"
	//
	// where:
	// - SYSNAME, RELEASE, and VERSION are the same as returned by
	//   sys_utsname
	// - COMPILE_USER is the user that built the kernel
	// - COMPILE_HOST is the hostname of the machine on which the kernel
	//   was built
	// - COMPILER_VERSION is the version reported by the building compiler
	//
	// Since we don't really want to expose build information to
	// applications, those fields are omitted.
	//
	// FIXME(mpratt): Using Version from the init task SyscallTable
	// disregards the different version a task may have (e.g., in a uts
	// namespace).
	ver := init.Leader().SyscallTable().Version
	fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version)
	return nil
}

// filesystemsData backs /proc/filesystems.
//
// +stateify savable
type filesystemsData struct {
	kernfs.DynamicBytesFile
}

var _ dynamicInode = (*filesystemsData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
//
// The content is produced entirely by the kernel VFS object's
// GenerateProcFilesystems. It always returns nil.
func (d *filesystemsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	k := kernel.KernelFromContext(ctx)
	k.VFS().GenerateProcFilesystems(buf)
	return nil
}
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go new file mode 100644 index 000000000..6dac2afa4 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -0,0 +1,209 @@
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package proc

import (
	"bytes"
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

// newSysDir returns the dentry corresponding to /proc/sys directory.
//
// The tree is static: "kernel" (hostname, shmall, shmmax, shmmni), "vm"
// (mmap_min_addr, overcommit_memory) and "net" (built by newSysNetDir).
// Directories are mode 0555 and files 0444.
//
// NOTE(review): inode numbers are taken from fs.NextIno() inline while the
// nested literals are evaluated, so reordering entries presumably changes
// which inode number each file receives — confirm before restructuring.
func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
	return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
		"kernel": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
			"hostname": fs.newDentry(root, fs.NextIno(), 0444, &hostnameData{}),
			"shmall": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMALL)),
			"shmmax": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMAX)),
			"shmmni": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMNI)),
		}),
		"vm": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
			"mmap_min_addr": fs.newDentry(root, fs.NextIno(), 0444, &mmapMinAddrData{k: k}),
			"overcommit_memory": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0\n")),
		}),
		"net": fs.newSysNetDir(root, k),
	})
}

// newSysNetDir returns the dentry corresponding to /proc/sys/net directory.
func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
	// contents stays nil when the root network namespace has no stack; the
	// resulting directory is then created from a nil map.
	var contents map[string]*kernfs.Dentry

	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
	// network namespace of the calling process.
	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
		contents = map[string]*kernfs.Dentry{
			"ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
				// tcp_sack is the only entry here backed by the stack and
				// created with a writable mode (0644).
				"tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}),

				// The following files are simple stubs until they are implemented in
				// netstack, most of these files are configuration related. We use the
				// value closest to the actual netstack behavior or any empty file, all
				// of these files will have mode 0444 (read-only for all users).
				"ip_local_port_range": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("16000 65535")),
				"ip_local_reserved_ports": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
				"ipfrag_time": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("30")),
				"ip_nonlocal_bind": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
				"ip_no_pmtu_disc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),

				// tcp_allowed_congestion_control tells the user what they are able to
				// do as an unprivileged process so we leave it empty.
				"tcp_allowed_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
				"tcp_available_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")),
				"tcp_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")),

				// Many of the following stub files are features netstack doesn't
				// support. The unsupported features return "0" to indicate they are
				// disabled.
				"tcp_base_mss": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1280")),
				"tcp_dsack": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
				"tcp_early_retrans": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
				"tcp_fack": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
				"tcp_fastopen": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
				"tcp_fastopen_key": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
				"tcp_invalid_ratelimit": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
				"tcp_keepalive_intvl": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
				"tcp_keepalive_probes": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
				"tcp_keepalive_time": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("7200")),
				"tcp_mtu_probing": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
				"tcp_no_metrics_save": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
				"tcp_probe_interval": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
				"tcp_probe_threshold": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
				"tcp_retries1": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")),
				"tcp_retries2": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("15")),
				"tcp_rfc1337": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
				"tcp_slow_start_after_idle": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
				"tcp_synack_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")),
				"tcp_syn_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")),
				"tcp_timestamps": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
			}),
			"core": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
				"default_qdisc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("pfifo_fast")),
				"message_burst": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("10")),
				"message_cost": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")),
				"optmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
				"rmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
				"rmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
				"somaxconn": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("128")),
				"wmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
				"wmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
			}),
		}
	}

	return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents)
}

// mmapMinAddrData implements vfs.DynamicBytesSource for
// /proc/sys/vm/mmap_min_addr.
//
// +stateify savable
type mmapMinAddrData struct {
	kernfs.DynamicBytesFile

	// k is the kernel whose platform supplies the minimum user address.
	k *kernel.Kernel
}

var _ dynamicInode = (*mmapMinAddrData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
//
// It prints the platform's MinUserAddress in decimal followed by a newline
// and always returns nil.
func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress())
	return nil
}

// hostnameData implements vfs.DynamicBytesSource for /proc/sys/kernel/hostname.
//
// +stateify savable
type hostnameData struct {
	kernfs.DynamicBytesFile
}

var _ dynamicInode = (*hostnameData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
//
// It writes the host name of the UTS namespace found in ctx, followed by a
// newline, and always returns nil.
func (*hostnameData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	utsns := kernel.UTSNamespaceFromContext(ctx)
	buf.WriteString(utsns.HostName())
	buf.WriteString("\n")
	return nil
}

// tcpSackData implements vfs.WritableDynamicBytesSource for
// /proc/sys/net/ipv4/tcp_sack.
//
// +stateify savable
type tcpSackData struct {
	kernfs.DynamicBytesFile

	// stack is the network stack that is queried and updated.
	stack inet.Stack `state:"wait"`

	// enabled caches the SACK setting; nil means it has not been read from
	// or written to the stack yet.
	enabled *bool
}

var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil)

// Generate implements vfs.DynamicBytesSource.
+func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error { + if d.enabled == nil { + sack, err := d.stack.TCPSACKEnabled() + if err != nil { + return err + } + d.enabled = &sack + } + + val := "0\n" + if *d.enabled { + // Technically, this is not quite compatible with Linux. Linux stores these + // as an integer, so if you write "2" into tcp_sack, you should get 2 back. + // Tough luck. + val = "1\n" + } + buf.WriteString(val) + return nil +} + +func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + // No need to handle partial writes thus far. + return 0, syserror.EINVAL + } + if src.NumBytes() == 0 { + return 0, nil + } + + // Limit the amount of memory allocated. + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return n, err + } + if d.enabled == nil { + d.enabled = new(bool) + } + *d.enabled = v != 0 + return n, d.stack.SetTCPSACKEnabled(*d.enabled) +} diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go new file mode 100644 index 000000000..be54897bb --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go @@ -0,0 +1,78 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package proc + +import ( + "bytes" + "reflect" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/inet" +) + +func newIPv6TestStack() *inet.TestStack { + s := inet.NewTestStack() + s.SupportsIPv6Flag = true + return s +} + +func TestIfinet6NoAddresses(t *testing.T) { + n := &ifinet6{stack: newIPv6TestStack()} + var buf bytes.Buffer + n.Generate(contexttest.Context(t), &buf) + if buf.Len() > 0 { + t.Errorf("n.Generate() generated = %v, want = %v", buf.Bytes(), []byte{}) + } +} + +func TestIfinet6(t *testing.T) { + s := newIPv6TestStack() + s.InterfacesMap[1] = inet.Interface{Name: "eth0"} + s.InterfaceAddrsMap[1] = []inet.InterfaceAddr{ + { + Family: linux.AF_INET6, + PrefixLen: 128, + Addr: []byte("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"), + }, + } + s.InterfacesMap[2] = inet.Interface{Name: "eth1"} + s.InterfaceAddrsMap[2] = []inet.InterfaceAddr{ + { + Family: linux.AF_INET6, + PrefixLen: 128, + Addr: []byte("\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"), + }, + } + want := map[string]struct{}{ + "000102030405060708090a0b0c0d0e0f 01 80 00 00 eth0\n": {}, + "101112131415161718191a1b1c1d1e1f 02 80 00 00 eth1\n": {}, + } + + n := &ifinet6{stack: s} + contents := n.contents() + if len(contents) != len(want) { + t.Errorf("Got len(n.contents()) = %d, want = %d", len(contents), len(want)) + } + got := map[string]struct{}{} + for _, l := range contents { + got[l] = struct{}{} + } + + if !reflect.DeepEqual(got, want) { + t.Errorf("Got n.contents() = %v, want = %v", got, want) + } +} diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go new file mode 100644 index 000000000..19abb5034 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -0,0 +1,505 @@ +// Copyright 2019 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + "math" + "path" + "strconv" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +var ( + // Next offset 256 by convention. Adds 1 for the next offset. + selfLink = vfs.Dirent{Type: linux.DT_LNK, NextOff: 256 + 0 + 1} + threadSelfLink = vfs.Dirent{Type: linux.DT_LNK, NextOff: 256 + 1 + 1} + + // /proc/[pid] next offset starts at 256+2 (files above), then adds the + // PID, and adds 1 for the next offset. 
+ proc1 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 1 + 1} + proc2 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 2 + 1} + proc3 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 3 + 1} +) + +var ( + tasksStaticFiles = map[string]testutil.DirentType{ + "cpuinfo": linux.DT_REG, + "filesystems": linux.DT_REG, + "loadavg": linux.DT_REG, + "meminfo": linux.DT_REG, + "mounts": linux.DT_LNK, + "net": linux.DT_LNK, + "self": linux.DT_LNK, + "stat": linux.DT_REG, + "sys": linux.DT_DIR, + "thread-self": linux.DT_LNK, + "uptime": linux.DT_REG, + "version": linux.DT_REG, + } + tasksStaticFilesNextOffs = map[string]int64{ + "self": selfLink.NextOff, + "thread-self": threadSelfLink.NextOff, + } + taskStaticFiles = map[string]testutil.DirentType{ + "auxv": linux.DT_REG, + "cgroup": linux.DT_REG, + "cmdline": linux.DT_REG, + "comm": linux.DT_REG, + "environ": linux.DT_REG, + "exe": linux.DT_LNK, + "fd": linux.DT_DIR, + "fdinfo": linux.DT_DIR, + "gid_map": linux.DT_REG, + "io": linux.DT_REG, + "maps": linux.DT_REG, + "mountinfo": linux.DT_REG, + "mounts": linux.DT_REG, + "net": linux.DT_DIR, + "ns": linux.DT_DIR, + "oom_score": linux.DT_REG, + "oom_score_adj": linux.DT_REG, + "smaps": linux.DT_REG, + "stat": linux.DT_REG, + "statm": linux.DT_REG, + "status": linux.DT_REG, + "task": linux.DT_DIR, + "uid_map": linux.DT_REG, + } +) + +func setup(t *testing.T) *testutil.System { + k, err := testutil.Boot() + if err != nil { + t.Fatalf("Error creating kernel: %v", err) + } + + ctx := k.SupervisorContext() + creds := auth.CredentialsFromContext(ctx) + + k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{}) + if err != nil { + t.Fatalf("NewMountNamespace(): %v", err) + } + pop := &vfs.PathOperation{ + Root: mntns.Root(), + Start: mntns.Root(), + Path: fspath.Parse("/proc"), + } + if err := k.VFS().MkdirAt(ctx, 
creds, pop, &vfs.MkdirOptions{Mode: 0777}); err != nil { + t.Fatalf("MkDir(/proc): %v", err) + } + + pop = &vfs.PathOperation{ + Root: mntns.Root(), + Start: mntns.Root(), + Path: fspath.Parse("/proc"), + } + mntOpts := &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: &InternalData{ + Cgroups: map[string]string{ + "cpuset": "/foo/cpuset", + "memory": "/foo/memory", + }, + }, + }, + } + if err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil { + t.Fatalf("MountAt(/proc): %v", err) + } + return testutil.NewSystem(ctx, t, k.VFS(), mntns) +} + +func TestTasksEmpty(t *testing.T) { + s := setup(t) + defer s.Destroy() + + collector := s.ListDirents(s.PathOpAtRoot("/proc")) + s.AssertAllDirentTypes(collector, tasksStaticFiles) + s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs) +} + +func TestTasks(t *testing.T) { + s := setup(t) + defer s.Destroy() + + expectedDirents := make(map[string]testutil.DirentType) + for n, d := range tasksStaticFiles { + expectedDirents[n] = d + } + + k := kernel.KernelFromContext(s.Ctx) + var tasks []*kernel.Task + for i := 0; i < 5; i++ { + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root) + if err != nil { + t.Fatalf("CreateTask(): %v", err) + } + tasks = append(tasks, task) + expectedDirents[fmt.Sprintf("%d", i+1)] = linux.DT_DIR + } + + collector := s.ListDirents(s.PathOpAtRoot("/proc")) + s.AssertAllDirentTypes(collector, expectedDirents) + s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs) + + lastPid := 0 + dirents := collector.OrderedDirents() + doneSkippingNonTaskDirs := false + for _, d := range dirents { + pid, err := strconv.Atoi(d.Name) + if err != nil { + if !doneSkippingNonTaskDirs { + // We haven't gotten to the task dirs yet. 
+ continue + } + t.Fatalf("Invalid process directory %q", d.Name) + } + doneSkippingNonTaskDirs = true + if lastPid > pid { + t.Errorf("pids not in order: %v", dirents) + } + found := false + for _, t := range tasks { + if k.TaskSet().Root.IDOfTask(t) == kernel.ThreadID(pid) { + found = true + } + } + if !found { + t.Errorf("Additional task ID %d listed: %v", pid, tasks) + } + // Next offset starts at 256+2 ('self' and 'thread-self'), then adds the + // PID, and adds 1 for the next offset. + if want := int64(256 + 2 + pid + 1); d.NextOff != want { + t.Errorf("Wrong dirent offset want: %d got: %d: %+v", want, d.NextOff, d) + } + } + if !doneSkippingNonTaskDirs { + t.Fatalf("Never found any process directories.") + } + + // Test lookup. + for _, path := range []string{"/proc/1", "/proc/2"} { + fd, err := s.VFS.OpenAt( + s.Ctx, + s.Creds, + s.PathOpAtRoot(path), + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err) + } + defer fd.DecRef() + buf := make([]byte, 1) + bufIOSeq := usermem.BytesIOSequence(buf) + if _, err := fd.Read(s.Ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR { + t.Errorf("wrong error reading directory: %v", err) + } + } + + if _, err := s.VFS.OpenAt( + s.Ctx, + s.Creds, + s.PathOpAtRoot("/proc/9999"), + &vfs.OpenOptions{}, + ); err != syserror.ENOENT { + t.Fatalf("wrong error from vfsfs.OpenAt(/proc/9999): %v", err) + } +} + +func TestTasksOffset(t *testing.T) { + s := setup(t) + defer s.Destroy() + + k := kernel.KernelFromContext(s.Ctx) + for i := 0; i < 3; i++ { + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root); err != nil { + t.Fatalf("CreateTask(): %v", err) + } + } + + for _, tc := range []struct { + name string + offset int64 + wants map[string]vfs.Dirent + }{ + { + name: "small offset", + offset: 100, + wants: 
map[string]vfs.Dirent{ + "self": selfLink, + "thread-self": threadSelfLink, + "1": proc1, + "2": proc2, + "3": proc3, + }, + }, + { + name: "offset at start", + offset: 256, + wants: map[string]vfs.Dirent{ + "self": selfLink, + "thread-self": threadSelfLink, + "1": proc1, + "2": proc2, + "3": proc3, + }, + }, + { + name: "skip /proc/self", + offset: 257, + wants: map[string]vfs.Dirent{ + "thread-self": threadSelfLink, + "1": proc1, + "2": proc2, + "3": proc3, + }, + }, + { + name: "skip symlinks", + offset: 258, + wants: map[string]vfs.Dirent{ + "1": proc1, + "2": proc2, + "3": proc3, + }, + }, + { + name: "skip first process", + offset: 260, + wants: map[string]vfs.Dirent{ + "2": proc2, + "3": proc3, + }, + }, + { + name: "last process", + offset: 261, + wants: map[string]vfs.Dirent{ + "3": proc3, + }, + }, + { + name: "after last", + offset: 262, + wants: nil, + }, + { + name: "TaskLimit+1", + offset: kernel.TasksLimit + 1, + wants: nil, + }, + { + name: "max", + offset: math.MaxInt64, + wants: nil, + }, + } { + t.Run(tc.name, func(t *testing.T) { + s := s.WithSubtest(t) + fd, err := s.VFS.OpenAt( + s.Ctx, + s.Creds, + s.PathOpAtRoot("/proc"), + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) + } + defer fd.DecRef() + if _, err := fd.Seek(s.Ctx, tc.offset, linux.SEEK_SET); err != nil { + t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err) + } + + var collector testutil.DirentCollector + if err := fd.IterDirents(s.Ctx, &collector); err != nil { + t.Fatalf("IterDirent(): %v", err) + } + + expectedTypes := make(map[string]testutil.DirentType) + expectedOffsets := make(map[string]int64) + for name, want := range tc.wants { + expectedTypes[name] = want.Type + if want.NextOff != 0 { + expectedOffsets[name] = want.NextOff + } + } + + collector.SkipDotsChecks(true) // We seek()ed past the dots. 
+ s.AssertAllDirentTypes(&collector, expectedTypes) + s.AssertDirentOffsets(&collector, expectedOffsets) + }) + } +} + +func TestTask(t *testing.T) { + s := setup(t) + defer s.Destroy() + + k := kernel.KernelFromContext(s.Ctx) + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + _, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root) + if err != nil { + t.Fatalf("CreateTask(): %v", err) + } + + collector := s.ListDirents(s.PathOpAtRoot("/proc/1")) + s.AssertAllDirentTypes(collector, taskStaticFiles) +} + +func TestProcSelf(t *testing.T) { + s := setup(t) + defer s.Destroy() + + k := kernel.KernelFromContext(s.Ctx) + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + task, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root) + if err != nil { + t.Fatalf("CreateTask(): %v", err) + } + + collector := s.WithTemporaryContext(task).ListDirents(&vfs.PathOperation{ + Root: s.Root, + Start: s.Root, + Path: fspath.Parse("/proc/self/"), + FollowFinalSymlink: true, + }) + s.AssertAllDirentTypes(collector, taskStaticFiles) +} + +func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.FileDescription) { + t.Logf("Iterating: %s", fd.MappedName(ctx)) + + var collector testutil.DirentCollector + if err := fd.IterDirents(ctx, &collector); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + if err := collector.Contains(".", linux.DT_DIR); err != nil { + t.Error(err.Error()) + } + if err := collector.Contains("..", linux.DT_DIR); err != nil { + t.Error(err.Error()) + } + + for _, d := range collector.Dirents() { + if d.Name == "." || d.Name == ".." 
{ + continue + } + absPath := path.Join(fd.MappedName(ctx), d.Name) + if d.Type == linux.DT_LNK { + link, err := s.VFS.ReadlinkAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(absPath)}, + ) + if err != nil { + t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", absPath, err) + } else { + t.Logf("Skipping symlink: %s => %s", absPath, link) + } + continue + } + + t.Logf("Opening: %s", absPath) + child, err := s.VFS.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(absPath)}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Errorf("vfsfs.OpenAt(%v) failed: %v", absPath, err) + continue + } + defer child.DecRef() + stat, err := child.Stat(ctx, vfs.StatOptions{}) + if err != nil { + t.Errorf("Stat(%v) failed: %v", absPath, err) + } + if got := linux.FileMode(stat.Mode).DirentType(); got != d.Type { + t.Errorf("wrong file mode, stat: %v, dirent: %v", got, d.Type) + } + if d.Type == linux.DT_DIR { + // Found another dir, let's do it again! + iterateDir(ctx, t, s, child) + } + } +} + +// TestTree iterates all directories and stats every file. 
+func TestTree(t *testing.T) { + s := setup(t) + defer s.Destroy() + + k := kernel.KernelFromContext(s.Ctx) + + pop := &vfs.PathOperation{ + Root: s.Root, + Start: s.Root, + Path: fspath.Parse("test-file"), + } + opts := &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_CREAT, + Mode: 0777, + } + file, err := s.VFS.OpenAt(s.Ctx, s.Creds, pop, opts) + if err != nil { + t.Fatalf("failed to create test file: %v", err) + } + defer file.DecRef() + + var tasks []*kernel.Task + for i := 0; i < 5; i++ { + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root) + if err != nil { + t.Fatalf("CreateTask(): %v", err) + } + // Add file to populate /proc/[pid]/fd and fdinfo directories. + task.FDTable().NewFDVFS2(task, 0, file, kernel.FDFlags{}) + tasks = append(tasks, task) + } + + ctx := tasks[0] + fd, err := s.VFS.OpenAt( + ctx, + auth.CredentialsFromContext(s.Ctx), + &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse("/proc")}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(/proc) failed: %v", err) + } + iterateDir(ctx, t, s, fd) + fd.DecRef() +} diff --git a/pkg/sentry/fsimpl/signalfd/BUILD b/pkg/sentry/fsimpl/signalfd/BUILD new file mode 100644 index 000000000..067c1657f --- /dev/null +++ b/pkg/sentry/fsimpl/signalfd/BUILD @@ -0,0 +1,20 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "signalfd", + srcs = ["signalfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/context", + "//pkg/sentry/kernel", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go new file mode 100644 index 000000000..242ba9b5d --- /dev/null +++ 
b/pkg/sentry/fsimpl/signalfd/signalfd.go @@ -0,0 +1,136 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package signalfd + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SignalFileDescription implements FileDescriptionImpl for signal fds. +type SignalFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD + + // target is the original signal target task. + // + // The semantics here are a bit broken. Linux will always use current + // for all reads, regardless of where the signalfd originated. We can't + // do exactly that because we need to plumb the context through + // EventRegister in order to support proper blocking behavior. This + // will undoubtedly become very complicated quickly. + target *kernel.Task + + // mu protects mask. + mu sync.Mutex + + // mask is the signal mask. Protected by mu. + mask linux.SignalSet +} + +var _ vfs.FileDescriptionImpl = (*SignalFileDescription)(nil) + +// New creates a new signal fd. 
+func New(vfsObj *vfs.VirtualFilesystem, target *kernel.Task, mask linux.SignalSet, flags uint32) (*vfs.FileDescription, error) { + vd := vfsObj.NewAnonVirtualDentry("[signalfd]") + defer vd.DecRef() + sfd := &SignalFileDescription{ + target: target, + mask: mask, + } + if err := sfd.vfsfd.Init(sfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &sfd.vfsfd, nil +} + +// Mask returns the signal mask. +func (sfd *SignalFileDescription) Mask() linux.SignalSet { + sfd.mu.Lock() + defer sfd.mu.Unlock() + return sfd.mask +} + +// SetMask sets the signal mask. +func (sfd *SignalFileDescription) SetMask(mask linux.SignalSet) { + sfd.mu.Lock() + defer sfd.mu.Unlock() + sfd.mask = mask +} + +// Read implements FileDescriptionImpl.Read. +func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + // Attempt to dequeue relevant signals. + info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0) + if err != nil { + // There must be no signal available. + return 0, syserror.ErrWouldBlock + } + + // Copy out the signal info using the specified format. + var buf [128]byte + binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{ + Signo: uint32(info.Signo), + Errno: info.Errno, + Code: info.Code, + PID: uint32(info.Pid()), + UID: uint32(info.Uid()), + Status: info.Status(), + Overrun: uint32(info.Overrun()), + Addr: info.Addr(), + }) + n, err := dst.CopyOut(ctx, buf[:]) + return int64(n), err +} + +// Readiness implements waiter.Waitable.Readiness. +func (sfd *SignalFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + sfd.mu.Lock() + defer sfd.mu.Unlock() + if mask&waiter.EventIn != 0 && sfd.target.PendingSignals()&sfd.mask != 0 { + return waiter.EventIn // Pending signals. + } + return 0 +} + +// EventRegister implements waiter.Waitable.EventRegister. 
+func (sfd *SignalFileDescription) EventRegister(entry *waiter.Entry, _ waiter.EventMask) { + sfd.mu.Lock() + defer sfd.mu.Unlock() + // Register for the signal set; ignore the passed events. + sfd.target.SignalRegister(entry, waiter.EventMask(sfd.mask)) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) { + // Unregister the original entry. + sfd.target.SignalUnregister(entry) +} + +// Release implements FileDescriptionImpl.Release() +func (sfd *SignalFileDescription) Release() {} diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD new file mode 100644 index 000000000..9453277b8 --- /dev/null +++ b/pkg/sentry/fsimpl/sockfs/BUILD @@ -0,0 +1,18 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "sockfs", + srcs = ["sockfs.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go new file mode 100644 index 000000000..ee0828a15 --- /dev/null +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -0,0 +1,109 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Package sockfs provides a filesystem implementation for anonymous sockets. +package sockfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// filesystemType implements vfs.FilesystemType. +type filesystemType struct{} + +// GetFilesystem implements FilesystemType.GetFilesystem. +func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + panic("sockfs.filesystemType.GetFilesystem should never be called") +} + +// Name implements FilesystemType.Name. +// +// Note that registering sockfs is unnecessary, except for the fact that it +// will not show up under /proc/filesystems as a result. This is a very minor +// discrepancy from Linux. +func (filesystemType) Name() string { + return "sockfs" +} + +type filesystem struct { + kernfs.Filesystem + + devMinor uint32 +} + +// NewFilesystem sets up and returns a new sockfs filesystem. +// +// Note that there should only ever be one instance of sockfs.Filesystem, +// backing a global socket mount. +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, err + } + fs := &filesystem{ + devMinor: devMinor, + } + fs.Filesystem.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) + return fs.Filesystem.VFSFilesystem(), nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. 
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + inode := vd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode) + b.PrependComponent(fmt.Sprintf("socket:[%d]", inode.InodeAttrs.Ino())) + return vfs.PrependPathSyntheticError{} +} + +// inode implements kernfs.Inode. +type inode struct { + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + kernfs.InodeAttrs + kernfs.InodeNoopRefCount +} + +// Open implements kernfs.Inode.Open. +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + return nil, syserror.ENXIO +} + +// NewDentry constructs and returns a sockfs dentry. +// +// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). +func NewDentry(creds *auth.Credentials, mnt *vfs.Mount) *vfs.Dentry { + fs := mnt.Filesystem().Impl().(*filesystem) + + // File mode matches net/socket.c:sock_alloc. + filemode := linux.FileMode(linux.S_IFSOCK | 0600) + i := &inode{} + i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode) + + d := &kernfs.Dentry{} + d.Init(i) + return d.VFSDentry() +} diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD new file mode 100644 index 000000000..a741e2bb6 --- /dev/null +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -0,0 +1,34 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +licenses(["notice"]) + +go_library( + name = "sys", + srcs = [ + "sys.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) + +go_test( + name = "sys_test", + srcs = ["sys_test.go"], + deps = [ + ":sys", + "//pkg/abi/linux", + "//pkg/sentry/fsimpl/testutil", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + 
"@com_github_google_go-cmp//cmp:go_default_library", + ], +) diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go new file mode 100644 index 000000000..01ce30a4d --- /dev/null +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -0,0 +1,151 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sys implements sysfs. +package sys + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Name is the default filesystem name. +const Name = "sysfs" + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + kernfs.Filesystem + + devMinor uint32 +} + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. 
+func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + + fs := &filesystem{ + devMinor: devMinor, + } + fs.VFSFilesystem().Init(vfsObj, &fsType, fs) + k := kernel.KernelFromContext(ctx) + maxCPUCores := k.ApplicationCores() + defaultSysDirMode := linux.FileMode(0755) + + root := fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ + "block": fs.newDir(creds, defaultSysDirMode, nil), + "bus": fs.newDir(creds, defaultSysDirMode, nil), + "class": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ + "power_supply": fs.newDir(creds, defaultSysDirMode, nil), + }), + "dev": fs.newDir(creds, defaultSysDirMode, nil), + "devices": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ + "system": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ + "cpu": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ + "online": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), + "possible": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), + "present": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), + }), + }), + }), + "firmware": fs.newDir(creds, defaultSysDirMode, nil), + "fs": fs.newDir(creds, defaultSysDirMode, nil), + "kernel": fs.newDir(creds, defaultSysDirMode, nil), + "module": fs.newDir(creds, defaultSysDirMode, nil), + "power": fs.newDir(creds, defaultSysDirMode, nil), + }) + return fs.VFSFilesystem(), root.VFSDentry(), nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + +// dir implements kernfs.Inode. 
+type dir struct { + kernfs.InodeAttrs + kernfs.InodeNoDynamicLookup + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.OrderedChildren + + locks vfs.FileLocks + + dentry kernfs.Dentry +} + +func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { + d := &dir{} + d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) + d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + d.dentry.Init(d) + + d.IncLinks(d.OrderedChildren.Populate(&d.dentry, contents)) + + return &d.dentry +} + +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return syserror.EPERM +} + +// Open implements kernfs.Inode.Open. +func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts) + if err != nil { + return nil, err + } + return fd.VFSFileDescription(), nil +} + +// cpuFile implements kernfs.Inode. +type cpuFile struct { + kernfs.DynamicBytesFile + maxCores uint +} + +// Generate implements vfs.DynamicBytesSource.Generate. 
+func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "0-%d\n", c.maxCores-1) + return nil +} + +func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) *kernfs.Dentry { + c := &cpuFile{maxCores: maxCores} + c.DynamicBytesFile.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode) + d := &kernfs.Dentry{} + d.Init(c) + return d +} diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go new file mode 100644 index 000000000..242d5fd12 --- /dev/null +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -0,0 +1,89 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package sys_test + +import ( + "fmt" + "testing" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func newTestSystem(t *testing.T) *testutil.System { + k, err := testutil.Boot() + if err != nil { + t.Fatalf("Failed to create test kernel: %v", err) + } + ctx := k.SupervisorContext() + creds := auth.CredentialsFromContext(ctx) + k.VFS().MustRegisterFilesystemType(sys.Name, sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + + mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{}) + if err != nil { + t.Fatalf("Failed to create new mount namespace: %v", err) + } + return testutil.NewSystem(ctx, t, k.VFS(), mns) +} + +func TestReadCPUFile(t *testing.T) { + s := newTestSystem(t) + defer s.Destroy() + k := kernel.KernelFromContext(s.Ctx) + maxCPUCores := k.ApplicationCores() + + expected := fmt.Sprintf("0-%d\n", maxCPUCores-1) + + for _, fname := range []string{"online", "possible", "present"} { + pop := s.PathOpAtRoot(fmt.Sprintf("devices/system/cpu/%s", fname)) + fd, err := s.VFS.OpenAt(s.Ctx, s.Creds, pop, &vfs.OpenOptions{}) + if err != nil { + t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) + } + defer fd.DecRef() + content, err := s.ReadToEnd(fd) + if err != nil { + t.Fatalf("Read failed: %v", err) + } + if diff := cmp.Diff(expected, content); diff != "" { + t.Fatalf("Read returned unexpected data:\n--- want\n+++ got\n%v", diff) + } + } +} + +func TestSysRootContainsExpectedEntries(t *testing.T) { + s := newTestSystem(t) + defer s.Destroy() + pop := s.PathOpAtRoot("/") + s.AssertAllDirentTypes(s.ListDirents(pop), map[string]testutil.DirentType{ + "block": linux.DT_DIR, + "bus": linux.DT_DIR, + "class": linux.DT_DIR, + "dev": linux.DT_DIR, + 
"devices": linux.DT_DIR, + "firmware": linux.DT_DIR, + "fs": linux.DT_DIR, + "kernel": linux.DT_DIR, + "module": linux.DT_DIR, + "power": linux.DT_DIR, + }) +} diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD new file mode 100644 index 000000000..0e4053a46 --- /dev/null +++ b/pkg/sentry/fsimpl/testutil/BUILD @@ -0,0 +1,37 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "testutil", + testonly = 1, + srcs = [ + "kernel.go", + "testutil.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/cpuid", + "//pkg/fspath", + "//pkg/memutil", + "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/mm", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/platform/kvm", + "//pkg/sentry/platform/ptrace", + "//pkg/sentry/time", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/usermem", + "@com_github_google_go-cmp//cmp:go_default_library", + ], +) diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go new file mode 100644 index 000000000..e743e8114 --- /dev/null +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -0,0 +1,176 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package testutil + +import ( + "flag" + "fmt" + "os" + "runtime" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" + + // Platforms are plugable. + _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm" + _ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" +) + +var ( + platformFlag = flag.String("platform", "ptrace", "specify which platform to use") +) + +// Boot initializes a new bare bones kernel for test. +func Boot() (*kernel.Kernel, error) { + platformCtr, err := platform.Lookup(*platformFlag) + if err != nil { + return nil, fmt.Errorf("platform not found: %v", err) + } + deviceFile, err := platformCtr.OpenDevice() + if err != nil { + return nil, fmt.Errorf("creating platform: %v", err) + } + plat, err := platformCtr.New(deviceFile) + if err != nil { + return nil, fmt.Errorf("creating platform: %v", err) + } + + kernel.VFS2Enabled = true + k := &kernel.Kernel{ + Platform: plat, + } + + mf, err := createMemoryFile() + if err != nil { + return nil, err + } + k.SetMemoryFile(mf) + + // Pass k as the platform since it is savable, unlike the actual platform. + vdso, err := loader.PrepareVDSO(k) + if err != nil { + return nil, fmt.Errorf("creating vdso: %v", err) + } + + // Create timekeeper. 
+ tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) + if err != nil { + return nil, fmt.Errorf("creating timekeeper: %v", err) + } + tk.SetClocks(time.NewCalibratedClocks()) + + creds := auth.NewRootCredentials(auth.NewRootUserNamespace()) + + // Initiate the Kernel object, which is required by the Context passed + // to createVFS in order to mount (among other things) procfs. + if err = k.Init(kernel.InitKernelArgs{ + ApplicationCores: uint(runtime.GOMAXPROCS(-1)), + FeatureSet: cpuid.HostFeatureSet(), + Timekeeper: tk, + RootUserNamespace: creds.UserNamespace, + Vdso: vdso, + RootUTSNamespace: kernel.NewUTSNamespace("hostname", "domain", creds.UserNamespace), + RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), + RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), + PIDNamespace: kernel.NewRootPIDNamespace(creds.UserNamespace), + }); err != nil { + return nil, fmt.Errorf("initializing kernel: %v", err) + } + + k.VFS().MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + + ls, err := limits.NewLinuxLimitSet() + if err != nil { + return nil, err + } + tg := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls) + k.TestOnly_SetGlobalInit(tg) + + return k, nil +} + +// CreateTask creates a new bare bones task for tests. 
+func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) { + k := kernel.KernelFromContext(ctx) + exe, err := newFakeExecutable(ctx, k.VFS(), auth.CredentialsFromContext(ctx), root) + if err != nil { + return nil, err + } + m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) + m.SetExecutable(fsbridge.NewVFSFile(exe)) + + config := &kernel.TaskConfig{ + Kernel: k, + ThreadGroup: tc, + TaskContext: &kernel.TaskContext{Name: name, MemoryManager: m}, + Credentials: auth.CredentialsFromContext(ctx), + NetworkNamespace: k.RootNetworkNamespace(), + AllowedCPUMask: sched.NewFullCPUSet(k.ApplicationCores()), + UTSNamespace: kernel.UTSNamespaceFromContext(ctx), + IPCNamespace: kernel.IPCNamespaceFromContext(ctx), + AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), + MountNamespaceVFS2: mntns, + FSContext: kernel.NewFSContextVFS2(root, cwd, 0022), + FDTable: k.NewFDTable(), + } + return k.TaskSet().NewTask(config) +} + +func newFakeExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry) (*vfs.FileDescription, error) { + const name = "executable" + pop := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(name), + } + opts := &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_CREAT, + Mode: 0777, + } + return vfsObj.OpenAt(ctx, creds, pop, opts) +} + +func createMemoryFile() (*pgalloc.MemoryFile, error) { + const memfileName = "test-memory" + memfd, err := memutil.CreateMemFD(memfileName, 0) + if err != nil { + return nil, fmt.Errorf("error creating memfd: %v", err) + } + memfile := os.NewFile(uintptr(memfd), memfileName) + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) + if err != nil { + memfile.Close() + return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) + } + return mf, nil +} diff --git a/pkg/sentry/fsimpl/testutil/testutil.go 
b/pkg/sentry/fsimpl/testutil/testutil.go new file mode 100644 index 000000000..0556af877 --- /dev/null +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -0,0 +1,284 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package testutil provides common test utilities for kernfs-based +// filesystems. +package testutil + +import ( + "fmt" + "io" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" +) + +// System represents the context for a single test. +// +// Test systems must be explicitly destroyed with System.Destroy. +type System struct { + t *testing.T + Ctx context.Context + Creds *auth.Credentials + VFS *vfs.VirtualFilesystem + Root vfs.VirtualDentry + MntNs *vfs.MountNamespace +} + +// NewSystem constructs a System. +// +// Precondition: Caller must hold a reference on MntNs, whose ownership +// is transferred to the new System. 
+func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System { + s := &System{ + t: t, + Ctx: ctx, + Creds: auth.CredentialsFromContext(ctx), + VFS: v, + MntNs: mns, + Root: mns.Root(), + } + return s +} + +// WithSubtest creates a temporary test system with a new test harness, +// referencing all other resources from the original system. This is useful when +// a system is reused for multiple subtests, and the T needs to change for each +// case. Note that this is safe when test cases run in parallel, as all +// resources referenced by the system are immutable, or handle interior +// mutations in a thread-safe manner. +// +// The returned system must not outlive the original and should not be destroyed +// via System.Destroy. +func (s *System) WithSubtest(t *testing.T) *System { + return &System{ + t: t, + Ctx: s.Ctx, + Creds: s.Creds, + VFS: s.VFS, + MntNs: s.MntNs, + Root: s.Root, + } +} + +// WithTemporaryContext constructs a temporary test system with a new context +// ctx. The temporary system borrows all resources and references from the +// original system. The returned temporary system must not outlive the original +// system, and should not be destroyed via System.Destroy. +func (s *System) WithTemporaryContext(ctx context.Context) *System { + return &System{ + t: s.t, + Ctx: ctx, + Creds: s.Creds, + VFS: s.VFS, + MntNs: s.MntNs, + Root: s.Root, + } +} + +// Destroy release resources associated with a test system. +func (s *System) Destroy() { + s.Root.DecRef() + s.MntNs.DecRef() // Reference on MntNs passed to NewSystem. +} + +// ReadToEnd reads the contents of fd until EOF to a string. 
+func (s *System) ReadToEnd(fd *vfs.FileDescription) (string, error) { + buf := make([]byte, usermem.PageSize) + bufIOSeq := usermem.BytesIOSequence(buf) + opts := vfs.ReadOptions{} + + var content strings.Builder + for { + n, err := fd.Read(s.Ctx, bufIOSeq, opts) + if n == 0 || err != nil { + if err == io.EOF { + err = nil + } + return content.String(), err + } + content.Write(buf[:n]) + } +} + +// PathOpAtRoot constructs a PathOperation with the given path from +// the root of the filesystem. +func (s *System) PathOpAtRoot(path string) *vfs.PathOperation { + return &vfs.PathOperation{ + Root: s.Root, + Start: s.Root, + Path: fspath.Parse(path), + } +} + +// GetDentryOrDie attempts to resolve a dentry referred to by the +// provided path operation. If unsuccessful, the test fails. +func (s *System) GetDentryOrDie(pop *vfs.PathOperation) vfs.VirtualDentry { + vd, err := s.VFS.GetDentryAt(s.Ctx, s.Creds, pop, &vfs.GetDentryOptions{}) + if err != nil { + s.t.Fatalf("GetDentryAt(pop:%+v) failed: %v", pop, err) + } + return vd +} + +// DirentType is an alias for values for linux_dirent64.d_type. +type DirentType = uint8 + +// ListDirents lists the Dirents for a directory at pop. +func (s *System) ListDirents(pop *vfs.PathOperation) *DirentCollector { + fd, err := s.VFS.OpenAt(s.Ctx, s.Creds, pop, &vfs.OpenOptions{Flags: linux.O_RDONLY}) + if err != nil { + s.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) + } + defer fd.DecRef() + + collector := &DirentCollector{} + if err := fd.IterDirents(s.Ctx, collector); err != nil { + s.t.Fatalf("IterDirent failed: %v", err) + } + return collector +} + +// AssertAllDirentTypes verifies that the set of dirents in collector contains +// exactly the specified set of expected entries. AssertAllDirentTypes respects +// collector.skipDots, and implicitly checks for "." and ".." accordingly. 
+func (s *System) AssertAllDirentTypes(collector *DirentCollector, expected map[string]DirentType) { + if expected == nil { + expected = make(map[string]DirentType) + } + // Also implicitly check for "." and "..", if enabled. + if !collector.skipDots { + expected["."] = linux.DT_DIR + expected[".."] = linux.DT_DIR + } + + dentryTypes := make(map[string]DirentType) + collector.mu.Lock() + for _, dirent := range collector.dirents { + dentryTypes[dirent.Name] = dirent.Type + } + collector.mu.Unlock() + if diff := cmp.Diff(expected, dentryTypes); diff != "" { + s.t.Fatalf("IterDirent had unexpected results:\n--- want\n+++ got\n%v", diff) + } +} + +// AssertDirentOffsets verifies that collector contains at least the entries +// specified in expected, with the given NextOff field. Entries specified in +// expected but missing from collector result in failure. Extra entries in +// collector are ignored. AssertDirentOffsets respects collector.skipDots, and +// implicitly checks for "." and ".." accordingly. +func (s *System) AssertDirentOffsets(collector *DirentCollector, expected map[string]int64) { + // Also implicitly check for "." and "..", if enabled. + if !collector.skipDots { + expected["."] = 1 + expected[".."] = 2 + } + + dentryNextOffs := make(map[string]int64) + collector.mu.Lock() + for _, dirent := range collector.dirents { + // Ignore extra entries in dentries that are not in expected. + if _, ok := expected[dirent.Name]; ok { + dentryNextOffs[dirent.Name] = dirent.NextOff + } + } + collector.mu.Unlock() + if diff := cmp.Diff(expected, dentryNextOffs); diff != "" { + s.t.Fatalf("IterDirent had unexpected results:\n--- want\n+++ got\n%v", diff) + } +} + +// DirentCollector provides an implementation for vfs.IterDirentsCallback for +// testing. It simply iterates to the end of a given directory FD and collects +// all dirents emitted by the callback. 
+type DirentCollector struct { + mu sync.Mutex + order []*vfs.Dirent + dirents map[string]*vfs.Dirent + // When the collector is used in various Assert* functions, should "." and + // ".." be implicitly checked? + skipDots bool +} + +// SkipDotsChecks enables or disables the implicit checks on "." and ".." when +// the collector is used in various Assert* functions. Note that "." and ".." +// are still collected if passed to d.Handle, so the caller should only disable +// the checks when they aren't expected. +func (d *DirentCollector) SkipDotsChecks(value bool) { + d.skipDots = value +} + +// Handle implements vfs.IterDirentsCallback.Handle. +func (d *DirentCollector) Handle(dirent vfs.Dirent) error { + d.mu.Lock() + if d.dirents == nil { + d.dirents = make(map[string]*vfs.Dirent) + } + d.order = append(d.order, &dirent) + d.dirents[dirent.Name] = &dirent + d.mu.Unlock() + return nil +} + +// Count returns the number of dirents currently in the collector. +func (d *DirentCollector) Count() int { + d.mu.Lock() + defer d.mu.Unlock() + return len(d.dirents) +} + +// Contains checks whether the collector has a dirent with the given name and +// type. +func (d *DirentCollector) Contains(name string, typ uint8) error { + d.mu.Lock() + defer d.mu.Unlock() + dirent, ok := d.dirents[name] + if !ok { + return fmt.Errorf("No dirent named %q found", name) + } + if dirent.Type != typ { + return fmt.Errorf("Dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent) + } + return nil +} + +// Dirents returns all dirents discovered by this collector. +func (d *DirentCollector) Dirents() map[string]*vfs.Dirent { + d.mu.Lock() + dirents := make(map[string]*vfs.Dirent) + for n, d := range d.dirents { + dirents[n] = d + } + d.mu.Unlock() + return dirents +} + +// OrderedDirents returns an ordered list of dirents as discovered by this +// collector. 
+func (d *DirentCollector) OrderedDirents() []*vfs.Dirent { + d.mu.Lock() + dirents := make([]*vfs.Dirent, len(d.order)) + copy(dirents, d.order) + d.mu.Unlock() + return dirents +} diff --git a/pkg/sentry/fsimpl/timerfd/BUILD b/pkg/sentry/fsimpl/timerfd/BUILD new file mode 100644 index 000000000..fbb02a271 --- /dev/null +++ b/pkg/sentry/fsimpl/timerfd/BUILD @@ -0,0 +1,17 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "timerfd", + srcs = ["timerfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/context", + "//pkg/sentry/kernel/time", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go new file mode 100644 index 000000000..2dc90d484 --- /dev/null +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -0,0 +1,144 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package timerfd implements timer fds. +package timerfd + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/context" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// TimerFileDescription implements FileDescriptionImpl for timer fds. It also +// implements ktime.TimerListener. 
+type TimerFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD + + events waiter.Queue + timer *ktime.Timer + + // val is the number of timer expirations since the last successful + // call to PRead, or SetTime. val must be accessed using atomic memory + // operations. + val uint64 +} + +var _ vfs.FileDescriptionImpl = (*TimerFileDescription)(nil) +var _ ktime.TimerListener = (*TimerFileDescription)(nil) + +// New returns a new timer fd. +func New(vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) { + vd := vfsObj.NewAnonVirtualDentry("[timerfd]") + defer vd.DecRef() + tfd := &TimerFileDescription{} + tfd.timer = ktime.NewTimer(clock, tfd) + if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &tfd.vfsfd, nil +} + +// Read implements FileDescriptionImpl.Read. +func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + const sizeofUint64 = 8 + if dst.NumBytes() < sizeofUint64 { + return 0, syserror.EINVAL + } + if val := atomic.SwapUint64(&tfd.val, 0); val != 0 { + var buf [sizeofUint64]byte + usermem.ByteOrder.PutUint64(buf[:], val) + if _, err := dst.CopyOut(ctx, buf[:]); err != nil { + // Linux does not undo consuming the number of + // expirations even if writing to userspace fails. + return 0, err + } + return sizeofUint64, nil + } + return 0, syserror.ErrWouldBlock +} + +// Clock returns the timer fd's Clock. +func (tfd *TimerFileDescription) Clock() ktime.Clock { + return tfd.timer.Clock() +} + +// GetTime returns the associated Timer's setting and the time at which it was +// observed. 
+func (tfd *TimerFileDescription) GetTime() (ktime.Time, ktime.Setting) { + return tfd.timer.Get() +} + +// SetTime atomically changes the associated Timer's setting, resets the number +// of expirations to 0, and returns the previous setting and the time at which +// it was observed. +func (tfd *TimerFileDescription) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) { + return tfd.timer.SwapAnd(s, func() { atomic.StoreUint64(&tfd.val, 0) }) +} + +// Readiness implements waiter.Waitable.Readiness. +func (tfd *TimerFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + var ready waiter.EventMask + if atomic.LoadUint64(&tfd.val) != 0 { + ready |= waiter.EventIn + } + return ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (tfd *TimerFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + tfd.events.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (tfd *TimerFileDescription) EventUnregister(e *waiter.Entry) { + tfd.events.EventUnregister(e) +} + +// PauseTimer pauses the associated Timer. +func (tfd *TimerFileDescription) PauseTimer() { + tfd.timer.Pause() +} + +// ResumeTimer resumes the associated Timer. +func (tfd *TimerFileDescription) ResumeTimer() { + tfd.timer.Resume() +} + +// Release implements FileDescriptionImpl.Release() +func (tfd *TimerFileDescription) Release() { + tfd.timer.Destroy() +} + +// Notify implements ktime.TimerListener.Notify. +func (tfd *TimerFileDescription) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { + atomic.AddUint64(&tfd.val, exp) + tfd.events.Notify(waiter.EventIn) + return ktime.Setting{}, false +} + +// Destroy implements ktime.TimerListener.Destroy. 
+func (tfd *TimerFileDescription) Destroy() {} diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD new file mode 100644 index 000000000..e73732a6b --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -0,0 +1,112 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +licenses(["notice"]) + +go_template_instance( + name = "dentry_list", + out = "dentry_list.go", + package = "tmpfs", + prefix = "dentry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*dentry", + "Linker": "*dentry", + }, +) + +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "tmpfs", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "dentry", + }, +) + +go_library( + name = "tmpfs", + srcs = [ + "dentry_list.go", + "device_file.go", + "directory.go", + "filesystem.go", + "fstree.go", + "named_pipe.go", + "regular_file.go", + "socket_file.go", + "symlink.go", + "tmpfs.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/context", + "//pkg/fspath", + "//pkg/log", + "//pkg/safemem", + "//pkg/sentry/arch", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/uniqueid", + "//pkg/sentry/usage", + "//pkg/sentry/vfs", + "//pkg/sentry/vfs/memxattr", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + ], +) + +go_test( + name = "benchmark_test", + size = "small", + srcs = ["benchmark_test.go"], + deps = [ + ":tmpfs", + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/refs", + "//pkg/sentry/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + 
"//pkg/syserror", + ], +) + +go_test( + name = "tmpfs_test", + size = "small", + srcs = [ + "pipe_test.go", + "regular_file_test.go", + "stat_test.go", + "tmpfs_test.go", + ], + library = ":tmpfs", + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/contexttest", + "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go new file mode 100644 index 000000000..2fb5c4d84 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -0,0 +1,486 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package benchmark_test + +import ( + "fmt" + "runtime" + "strings" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Differences from stat_benchmark: +// +// - Syscall interception, CopyInPath, copyOutStat, and overlayfs overheads are +// not included. +// +// - *MountStat benchmarks use a tmpfs root mount and a tmpfs submount at /tmp. 
+// Non-MountStat benchmarks use a tmpfs root mount and no submounts. +// stat_benchmark uses a varying root mount, a tmpfs submount at /tmp, and a +// subdirectory /tmp/<top_dir> (assuming TEST_TMPDIR == "/tmp"). Thus +// stat_benchmark at depth 1 does a comparable amount of work to *MountStat +// benchmarks at depth 2, and non-MountStat benchmarks at depth 3. +var depths = []int{1, 2, 3, 8, 64, 100} + +const ( + mountPointName = "tmp" + filename = "gvisor_test_temp_0_1557494568" +) + +// This is copied from syscalls/linux/sys_file.go, with the dependency on +// kernel.Task stripped out. +func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error { + var ( + d *fs.Dirent // The file. + rel *fs.Dirent // The relative directory for search (if required.) + err error + ) + + // Extract the working directory (maybe). + if len(path) > 0 && path[0] == '/' { + // Absolute path; rel can be nil. + } else if dirFD == linux.AT_FDCWD { + // Need to reference the working directory. + rel = wd + } else { + // Need to extract the given FD. + return syserror.EBADF + } + + // Lookup the node. + remainingTraversals := uint(linux.MaxSymlinkTraversals) + if resolve { + d, err = mntns.FindInode(ctx, root, rel, path, &remainingTraversals) + } else { + d, err = mntns.FindLink(ctx, root, rel, path, &remainingTraversals) + } + if err != nil { + return err + } + + err = fn(root, d) + d.DecRef() + return err +} + +func BenchmarkVFS1TmpfsStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + ctx := contexttest.Context(b) + + // Create VFS. 
+ tmpfsFS, ok := fs.FindFilesystem("tmpfs") + if !ok { + b.Fatalf("failed to find tmpfs filesystem type") + } + rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) + if err != nil { + b.Fatalf("failed to create tmpfs root mount: %v", err) + } + mntns, err := fs.NewMountNamespace(ctx, rootInode) + if err != nil { + b.Fatalf("failed to create mount namespace: %v", err) + } + defer mntns.DecRef() + + var filePathBuilder strings.Builder + filePathBuilder.WriteByte('/') + + // Create nested directories with given depth. + root := mntns.Root() + defer root.DecRef() + d := root + d.IncRef() + defer d.DecRef() + for i := depth; i > 0; i-- { + name := fmt.Sprintf("%d", i) + if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { + b.Fatalf("failed to create directory %q: %v", name, err) + } + next, err := d.Walk(ctx, root, name) + if err != nil { + b.Fatalf("failed to walk to directory %q: %v", name, err) + } + d.DecRef() + d = next + filePathBuilder.WriteString(name) + filePathBuilder.WriteByte('/') + } + + // Create the file that will be stat'd. + file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644)) + if err != nil { + b.Fatalf("failed to create file %q: %v", filename, err) + } + file.DecRef() + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + dirPath := false + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + uattr, err := d.Inode.UnstableAttr(ctx) + if err != nil { + return err + } + // Sanity check. 
+ if uattr.Perms.User.Execute { + b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode()) + } + return nil + }) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + } + // Don't include deferred cleanup in benchmark time. + b.StopTimer() + }) + } +} + +func BenchmarkVFS2TmpfsStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + ctx := contexttest.Context(b) + creds := auth.CredentialsFromContext(ctx) + + // Create VFS. + vfsObj := vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + b.Fatalf("VFS init: %v", err) + } + vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + if err != nil { + b.Fatalf("failed to create tmpfs root mount: %v", err) + } + defer mntns.DecRef() + + var filePathBuilder strings.Builder + filePathBuilder.WriteByte('/') + + // Create nested directories with given depth. + root := mntns.Root() + defer root.DecRef() + vd := root + vd.IncRef() + for i := depth; i > 0; i-- { + name := fmt.Sprintf("%d", i) + pop := vfs.PathOperation{ + Root: root, + Start: vd, + Path: fspath.Parse(name), + } + if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + b.Fatalf("failed to create directory %q: %v", name, err) + } + nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + b.Fatalf("failed to walk to directory %q: %v", name, err) + } + vd.DecRef() + vd = nextVD + filePathBuilder.WriteString(name) + filePathBuilder.WriteByte('/') + } + + // Create the file that will be stat'd. 
+ fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: vd, + Path: fspath.Parse(filename), + FollowFinalSymlink: true, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, + Mode: 0644, + }) + vd.DecRef() + vd = vfs.VirtualDentry{} + if err != nil { + b.Fatalf("failed to create file %q: %v", filename, err) + } + defer fd.DecRef() + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filePath), + FollowFinalSymlink: true, + }, &vfs.StatOptions{}) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + // Sanity check. + if stat.Mode&^linux.S_IFMT != 0644 { + b.Fatalf("got wrong permissions (%0o)", stat.Mode) + } + } + // Don't include deferred cleanup in benchmark time. + b.StopTimer() + }) + } +} + +func BenchmarkVFS1TmpfsMountStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + ctx := contexttest.Context(b) + + // Create VFS. + tmpfsFS, ok := fs.FindFilesystem("tmpfs") + if !ok { + b.Fatalf("failed to find tmpfs filesystem type") + } + rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) + if err != nil { + b.Fatalf("failed to create tmpfs root mount: %v", err) + } + mntns, err := fs.NewMountNamespace(ctx, rootInode) + if err != nil { + b.Fatalf("failed to create mount namespace: %v", err) + } + defer mntns.DecRef() + + var filePathBuilder strings.Builder + filePathBuilder.WriteByte('/') + + // Create and mount the submount. 
+ root := mntns.Root() + defer root.DecRef() + if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil { + b.Fatalf("failed to create mount point: %v", err) + } + mountPoint, err := root.Walk(ctx, root, mountPointName) + if err != nil { + b.Fatalf("failed to walk to mount point: %v", err) + } + defer mountPoint.DecRef() + submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) + if err != nil { + b.Fatalf("failed to create tmpfs submount: %v", err) + } + if err := mntns.Mount(ctx, mountPoint, submountInode); err != nil { + b.Fatalf("failed to mount tmpfs submount: %v", err) + } + filePathBuilder.WriteString(mountPointName) + filePathBuilder.WriteByte('/') + + // Create nested directories with given depth. + d, err := root.Walk(ctx, root, mountPointName) + if err != nil { + b.Fatalf("failed to walk to mount root: %v", err) + } + defer d.DecRef() + for i := depth; i > 0; i-- { + name := fmt.Sprintf("%d", i) + if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { + b.Fatalf("failed to create directory %q: %v", name, err) + } + next, err := d.Walk(ctx, root, name) + if err != nil { + b.Fatalf("failed to walk to directory %q: %v", name, err) + } + d.DecRef() + d = next + filePathBuilder.WriteString(name) + filePathBuilder.WriteByte('/') + } + + // Create the file that will be stat'd. 
+ file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644)) + if err != nil { + b.Fatalf("failed to create file %q: %v", filename, err) + } + file.DecRef() + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + dirPath := false + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + uattr, err := d.Inode.UnstableAttr(ctx) + if err != nil { + return err + } + // Sanity check. + if uattr.Perms.User.Execute { + b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode()) + } + return nil + }) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + } + // Don't include deferred cleanup in benchmark time. + b.StopTimer() + }) + } +} + +func BenchmarkVFS2TmpfsMountStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + ctx := contexttest.Context(b) + creds := auth.CredentialsFromContext(ctx) + + // Create VFS. + vfsObj := vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + b.Fatalf("VFS init: %v", err) + } + vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + if err != nil { + b.Fatalf("failed to create tmpfs root mount: %v", err) + } + defer mntns.DecRef() + + var filePathBuilder strings.Builder + filePathBuilder.WriteByte('/') + + // Create the mount point. 
+ root := mntns.Root() + defer root.DecRef() + pop := vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(mountPointName), + } + if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + b.Fatalf("failed to create mount point: %v", err) + } + // Save the mount point for later use. + mountPoint, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + b.Fatalf("failed to walk to mount point: %v", err) + } + defer mountPoint.DecRef() + // Create and mount the submount. + if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil { + b.Fatalf("failed to mount tmpfs submount: %v", err) + } + filePathBuilder.WriteString(mountPointName) + filePathBuilder.WriteByte('/') + + // Create nested directories with given depth. + vd, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + b.Fatalf("failed to walk to mount root: %v", err) + } + for i := depth; i > 0; i-- { + name := fmt.Sprintf("%d", i) + pop := vfs.PathOperation{ + Root: root, + Start: vd, + Path: fspath.Parse(name), + } + if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + b.Fatalf("failed to create directory %q: %v", name, err) + } + nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + b.Fatalf("failed to walk to directory %q: %v", name, err) + } + vd.DecRef() + vd = nextVD + filePathBuilder.WriteString(name) + filePathBuilder.WriteByte('/') + } + + // Create the file that will be stat'd. 
+ fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: vd, + Path: fspath.Parse(filename), + FollowFinalSymlink: true, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, + Mode: 0644, + }) + vd.DecRef() + if err != nil { + b.Fatalf("failed to create file %q: %v", filename, err) + } + fd.DecRef() + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filePath), + FollowFinalSymlink: true, + }, &vfs.StatOptions{}) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + // Sanity check. + if stat.Mode&^linux.S_IFMT != 0644 { + b.Fatalf("got wrong permissions (%0o)", stat.Mode) + } + } + // Don't include deferred cleanup in benchmark time. + b.StopTimer() + }) + } +} + +func init() { + // Turn off reference leak checking for a fair comparison between vfs1 and + // vfs2. + refs.SetLeakMode(refs.NoLeakChecking) +} diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go new file mode 100644 index 000000000..ac54d420d --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/device_file.go @@ -0,0 +1,49 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tmpfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +type deviceFile struct { + inode inode + kind vfs.DeviceKind + major uint32 + minor uint32 +} + +func (fs *filesystem) newDeviceFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode { + file := &deviceFile{ + kind: kind, + major: major, + minor: minor, + } + switch kind { + case vfs.BlockDevice: + mode |= linux.S_IFBLK + case vfs.CharDevice: + mode |= linux.S_IFCHR + default: + panic(fmt.Sprintf("invalid DeviceKind: %v", kind)) + } + file.inode.init(file, fs, kuid, kgid, mode) + file.inode.nlink = 1 // from parent directory + return &file.inode +} diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go new file mode 100644 index 000000000..0a1ad4765 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -0,0 +1,232 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tmpfs + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +type directory struct { + // Since directories can't be hard-linked, each directory can only be + // associated with a single dentry, which we can store in the directory + // struct. + dentry dentry + inode inode + + // childMap maps the names of the directory's children to their dentries. + // childMap is protected by filesystem.mu. + childMap map[string]*dentry + + // numChildren is len(childMap), but accessed using atomic memory + // operations to avoid locking in inode.statTo(). + numChildren int64 + + // childList is a list containing (1) child dentries and (2) fake dentries + // (with inode == nil) that represent the iteration position of + // directoryFDs. childList is used to support directoryFD.IterDirents() + // efficiently. childList is protected by iterMu. + iterMu sync.Mutex + childList dentryList +} + +func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *directory { + dir := &directory{} + dir.inode.init(dir, fs, kuid, kgid, linux.S_IFDIR|mode) + dir.inode.nlink = 2 // from "." and parent directory or ".." for root + dir.dentry.inode = &dir.inode + dir.dentry.vfsd.Init(&dir.dentry) + return dir +} + +// Preconditions: filesystem.mu must be locked for writing. dir must not +// already contain a child with the given name. +func (dir *directory) insertChildLocked(child *dentry, name string) { + child.parent = &dir.dentry + child.name = name + if dir.childMap == nil { + dir.childMap = make(map[string]*dentry) + } + dir.childMap[name] = child + atomic.AddInt64(&dir.numChildren, 1) + dir.iterMu.Lock() + dir.childList.PushBack(child) + dir.iterMu.Unlock() +} + +// Preconditions: filesystem.mu must be locked for writing. 
+func (dir *directory) removeChildLocked(child *dentry) { + delete(dir.childMap, child.name) + atomic.AddInt64(&dir.numChildren, -1) + dir.iterMu.Lock() + dir.childList.Remove(child) + dir.iterMu.Unlock() +} + +func (dir *directory) mayDelete(creds *auth.Credentials, child *dentry) error { + return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&dir.inode.mode)), auth.KUID(atomic.LoadUint32(&child.inode.uid))) +} + +type directoryFD struct { + fileDescription + vfs.DirectoryFileDescriptionDefaultImpl + + // Protected by directory.iterMu. + iter *dentry + off int64 +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *directoryFD) Release() { + if fd.iter != nil { + dir := fd.inode().impl.(*directory) + dir.iterMu.Lock() + dir.childList.Remove(fd.iter) + dir.iterMu.Unlock() + fd.iter = nil + } +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + fs := fd.filesystem() + dir := fd.inode().impl.(*directory) + + defer fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + + // fs.mu is required to read d.parent and dentry.name. + fs.mu.RLock() + defer fs.mu.RUnlock() + dir.iterMu.Lock() + defer dir.iterMu.Unlock() + + fd.inode().touchAtime(fd.vfsfd.Mount()) + + if fd.off == 0 { + if err := cb.Handle(vfs.Dirent{ + Name: ".", + Type: linux.DT_DIR, + Ino: dir.inode.ino, + NextOff: 1, + }); err != nil { + return err + } + fd.off++ + } + + if fd.off == 1 { + parentInode := genericParentOrSelf(&dir.dentry).inode + if err := cb.Handle(vfs.Dirent{ + Name: "..", + Type: parentInode.direntType(), + Ino: parentInode.ino, + NextOff: 2, + }); err != nil { + return err + } + fd.off++ + } + + var child *dentry + if fd.iter == nil { + // Start iteration at the beginning of dir. + child = dir.childList.Front() + fd.iter = &dentry{} + } else { + // Continue iteration from where we left off. 
+ child = fd.iter.Next() + dir.childList.Remove(fd.iter) + } + for child != nil { + // Skip other directoryFD iterators. + if child.inode != nil { + if err := cb.Handle(vfs.Dirent{ + Name: child.name, + Type: child.inode.direntType(), + Ino: child.inode.ino, + NextOff: fd.off + 1, + }); err != nil { + dir.childList.InsertBefore(child, fd.iter) + return err + } + fd.off++ + } + child = child.Next() + } + dir.childList.PushBack(fd.iter) + return nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + dir := fd.inode().impl.(*directory) + dir.iterMu.Lock() + defer dir.iterMu.Unlock() + + switch whence { + case linux.SEEK_SET: + // Use offset as given. + case linux.SEEK_CUR: + offset += fd.off + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + + // If the offset isn't changing (e.g. due to lseek(0, SEEK_CUR)), don't + // seek even if doing so might reposition the iterator due to concurrent + // mutation of the directory. Compare fs/libfs.c:dcache_dir_lseek(). + if fd.off == offset { + return offset, nil + } + + fd.off = offset + // Compensate for "." and "..". + remChildren := int64(0) + if offset >= 2 { + remChildren = offset - 2 + } + + // Ensure that fd.iter exists and is not linked into dir.childList. + if fd.iter == nil { + fd.iter = &dentry{} + } else { + dir.childList.Remove(fd.iter) + } + // Insert fd.iter before the remChildren'th child, or at the end of the + // list if remChildren >= number of children. + child := dir.childList.Front() + for child != nil { + // Skip other directoryFD iterators. 
+ if child.inode != nil { + if remChildren == 0 { + dir.childList.InsertBefore(child, fd.iter) + return offset, nil + } + remChildren-- + } + child = child.Next() + } + dir.childList.PushBack(fd.iter) + return offset, nil +} diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go new file mode 100644 index 000000000..a0f20c2d4 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -0,0 +1,859 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *filesystem) Sync(ctx context.Context) error { + // All filesystem state is in-memory. + return nil +} + +// stepLocked resolves rp.Component() to an existing file, starting from the +// given directory. +// +// stepLocked is loosely analogous to fs/namei.c:walk_component(). +// +// Preconditions: filesystem.mu must be locked. !rp.Done(). 
+func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { + dir, ok := d.inode.impl.(*directory) + if !ok { + return nil, syserror.ENOTDIR + } + if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } +afterSymlink: + name := rp.Component() + if name == "." { + rp.Advance() + return d, nil + } + if name == ".." { + if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return d, nil + } + if err := rp.CheckMount(&d.parent.vfsd); err != nil { + return nil, err + } + rp.Advance() + return d.parent, nil + } + if len(name) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG + } + child, ok := dir.childMap[name] + if !ok { + return nil, syserror.ENOENT + } + if err := rp.CheckMount(&child.vfsd); err != nil { + return nil, err + } + if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { + // Symlink traversal updates access time. + child.inode.touchAtime(rp.Mount()) + if err := rp.HandleSymlink(symlink.target); err != nil { + return nil, err + } + goto afterSymlink // don't check the current directory again + } + rp.Advance() + return child, nil +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory, starting from the given directory (which is usually +// rp.Start().Impl().(*dentry)). It does not check that the returned directory +// is searchable by the provider of rp. +// +// walkParentDirLocked is loosely analogous to Linux's +// fs/namei.c:path_parentat(). +// +// Preconditions: filesystem.mu must be locked. !rp.Done(). 
+func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) { + for !rp.Final() { + next, err := stepLocked(rp, d) + if err != nil { + return nil, err + } + d = next + } + dir, ok := d.inode.impl.(*directory) + if !ok { + return nil, syserror.ENOTDIR + } + return dir, nil +} + +// resolveLocked resolves rp to an existing file. +// +// resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). +// +// Preconditions: filesystem.mu must be locked. +func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) { + d := rp.Start().Impl().(*dentry) + for !rp.Done() { + next, err := stepLocked(rp, d) + if err != nil { + return nil, err + } + d = next + } + if rp.MustBeDir() && !d.inode.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// doCreateAt checks that creating a file at rp is permitted, then invokes +// create to do so. +// +// doCreateAt is loosely analogous to a conjunction of Linux's +// fs/namei.c:filename_create() and done_path_create(). +// +// Preconditions: !rp.Done(). For the final path component in rp, +// !rp.ShouldFollowSymlink(). +func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { + fs.mu.Lock() + defer fs.mu.Unlock() + parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + if err != nil { + return err + } + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + name := rp.Component() + if name == "." || name == ".." { + return syserror.EEXIST + } + if len(name) > linux.NAME_MAX { + return syserror.ENAMETOOLONG + } + if _, ok := parentDir.childMap[name]; ok { + return syserror.EEXIST + } + if !dir && rp.MustBeDir() { + return syserror.ENOENT + } + // tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only + // be dead if it was deleted. 
+ if parentDir.dentry.vfsd.IsDead() { + return syserror.ENOENT + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + if err := create(parentDir, name); err != nil { + return err + } + + ev := linux.IN_CREATE + if dir { + ev |= linux.IN_ISDIR + } + parentDir.inode.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + parentDir.inode.touchCMtime() + return nil +} + +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return err + } + return d.inode.checkPermissions(creds, ats) +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return nil, err + } + if opts.CheckSearchable { + if !d.inode.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + } + d.IncRef() + return &d.vfsd, nil +} + +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + dir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + if err != nil { + return nil, err + } + dir.dentry.IncRef() + return &dir.dentry.vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. 
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { + if rp.Mount() != vd.Mount() { + return syserror.EXDEV + } + d := vd.Dentry().Impl().(*dentry) + i := d.inode + if i.isDir() { + return syserror.EPERM + } + if err := vfs.MayLink(auth.CredentialsFromContext(ctx), linux.FileMode(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { + return err + } + if i.nlink == 0 { + return syserror.ENOENT + } + if i.nlink == maxLinks { + return syserror.EMLINK + } + i.incLinksLocked() + i.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */) + parentDir.insertChildLocked(fs.newDentry(i), name) + return nil + }) +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + return fs.doCreateAt(rp, true /* dir */, func(parentDir *directory, name string) error { + creds := rp.Credentials() + if parentDir.inode.nlink == maxLinks { + return syserror.EMLINK + } + parentDir.inode.incLinksLocked() // from child's ".." + childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode) + parentDir.insertChildLocked(&childDir.dentry, name) + return nil + }) +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. 
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { + creds := rp.Credentials() + var childInode *inode + switch opts.Mode.FileType() { + case linux.S_IFREG: + childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode) + case linux.S_IFIFO: + childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode) + case linux.S_IFBLK: + childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor) + case linux.S_IFCHR: + childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor) + case linux.S_IFSOCK: + childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint) + default: + return syserror.EINVAL + } + child := fs.newDentry(childInode) + parentDir.insertChildLocked(child, name) + return nil + }) +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + if opts.Flags&linux.O_TMPFILE != 0 { + // Not yet supported. + return nil, syserror.EOPNOTSUPP + } + + // Handle O_CREAT and !O_CREAT separately, since in the latter case we + // don't need fs.mu for writing. + if opts.Flags&linux.O_CREAT == 0 { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return nil, err + } + return d.open(ctx, rp, &opts, false /* afterCreate */) + } + + mustCreate := opts.Flags&linux.O_EXCL != 0 + start := rp.Start().Impl().(*dentry) + fs.mu.Lock() + defer fs.mu.Unlock() + if rp.Done() { + // Reject attempts to open directories with O_CREAT. 
+ if rp.MustBeDir() { + return nil, syserror.EISDIR + } + if mustCreate { + return nil, syserror.EEXIST + } + return start.open(ctx, rp, &opts, false /* afterCreate */) + } +afterTrailingSymlink: + parentDir, err := walkParentDirLocked(rp, start) + if err != nil { + return nil, err + } + // Check for search permission in the parent directory. + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + // Reject attempts to open directories with O_CREAT. + if rp.MustBeDir() { + return nil, syserror.EISDIR + } + name := rp.Component() + if name == "." || name == ".." { + return nil, syserror.EISDIR + } + if len(name) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG + } + // Determine whether or not we need to create a file. + child, ok := parentDir.childMap[name] + if !ok { + // Already checked for searchability above; now check for writability. + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return nil, err + } + defer rp.Mount().EndWrite() + // Create and open the child. + creds := rp.Credentials() + child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)) + parentDir.insertChildLocked(child, name) + fd, err := child.open(ctx, rp, &opts, true) + if err != nil { + return nil, err + } + parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) + parentDir.inode.touchCMtime() + return fd, nil + } + if mustCreate { + return nil, syserror.EEXIST + } + // Is the file mounted over? + if err := rp.CheckMount(&child.vfsd); err != nil { + return nil, err + } + // Do we need to resolve a trailing symlink? + if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { + // Symlink traversal updates access time. 
+ child.inode.touchAtime(rp.Mount()) + if err := rp.HandleSymlink(symlink.target); err != nil { + return nil, err + } + start = &parentDir.dentry + goto afterTrailingSymlink + } + // Open existing file. + if mustCreate { + return nil, syserror.EEXIST + } + return child.open(ctx, rp, &opts, false) +} + +func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(opts) + if !afterCreate { + if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + } + switch impl := d.inode.impl.(type) { + case *regularFile: + var fd regularFileFD + fd.LockFD.Init(&d.inode.locks) + if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil { + return nil, err + } + if !afterCreate && opts.Flags&linux.O_TRUNC != 0 { + if _, err := impl.truncate(0); err != nil { + return nil, err + } + } + return &fd.vfsfd, nil + case *directory: + // Can't open directories writably. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EISDIR + } + var fd directoryFD + fd.LockFD.Init(&d.inode.locks) + if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil { + return nil, err + } + return &fd.vfsfd, nil + case *symlink: + // TODO(gvisor.dev/issue/2782): Can't open symlinks without O_PATH. + return nil, syserror.ELOOP + case *namedPipe: + return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks) + case *deviceFile: + return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts) + case *socketFile: + return nil, syserror.ENXIO + default: + panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl)) + } +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. 
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return "", err + } + symlink, ok := d.inode.impl.(*symlink) + if !ok { + return "", syserror.EINVAL + } + symlink.inode.touchAtime(rp.Mount()) + return symlink.target, nil +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { + if opts.Flags != 0 { + // TODO(b/145974740): Support renameat2 flags. + return syserror.EINVAL + } + + // Resolve newParent first to verify that it's on this Mount. + fs.mu.Lock() + defer fs.mu.Unlock() + newParentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + if err != nil { + return err + } + newName := rp.Component() + if newName == "." || newName == ".." { + return syserror.EBUSY + } + mnt := rp.Mount() + if mnt != oldParentVD.Mount() { + return syserror.EXDEV + } + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + + oldParentDir := oldParentVD.Dentry().Impl().(*dentry).inode.impl.(*directory) + if err := oldParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + renamed, ok := oldParentDir.childMap[oldName] + if !ok { + return syserror.ENOENT + } + if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil { + return err + } + // Note that we don't need to call rp.CheckMount(), since if renamed is a + // mount point then we want to rename the mount point, not anything in the + // mounted filesystem. + if renamed.inode.isDir() { + if renamed == &newParentDir.dentry || genericIsAncestorDentry(renamed, &newParentDir.dentry) { + return syserror.EINVAL + } + if oldParentDir != newParentDir { + // Writability is needed to change renamed's "..". 
+ if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return err + } + } + } else { + if opts.MustBeDir || rp.MustBeDir() { + return syserror.ENOTDIR + } + } + + if err := newParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + replaced, ok := newParentDir.childMap[newName] + if ok { + replacedDir, ok := replaced.inode.impl.(*directory) + if ok { + if !renamed.inode.isDir() { + return syserror.EISDIR + } + if len(replacedDir.childMap) != 0 { + return syserror.ENOTEMPTY + } + } else { + if rp.MustBeDir() { + return syserror.ENOTDIR + } + if renamed.inode.isDir() { + return syserror.ENOTDIR + } + } + } else { + if renamed.inode.isDir() && newParentDir.inode.nlink == maxLinks { + return syserror.EMLINK + } + } + // tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can + // only be dead if it was deleted. + if newParentDir.dentry.vfsd.IsDead() { + return syserror.ENOENT + } + + // Linux places this check before some of those above; we do it here for + // simplicity, under the assumption that applications are not intentionally + // doing noop renames expecting them to succeed where non-noop renames + // would fail. + if renamed == replaced { + return nil + } + vfsObj := rp.VirtualFilesystem() + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + var replacedVFSD *vfs.Dentry + if replaced != nil { + replacedVFSD = &replaced.vfsd + } + if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { + return err + } + if replaced != nil { + newParentDir.removeChildLocked(replaced) + if replaced.inode.isDir() { + newParentDir.inode.decLinksLocked() // from replaced's ".." 
+ } + replaced.inode.decLinksLocked() + } + oldParentDir.removeChildLocked(renamed) + newParentDir.insertChildLocked(renamed, newName) + vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) + oldParentDir.inode.touchCMtime() + if oldParentDir != newParentDir { + if renamed.inode.isDir() { + oldParentDir.inode.decLinksLocked() + newParentDir.inode.incLinksLocked() + } + newParentDir.inode.touchCMtime() + } + renamed.inode.touchCtime() + + vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir()) + return nil +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + if err != nil { + return err + } + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + name := rp.Component() + if name == "." { + return syserror.EINVAL + } + if name == ".." { + return syserror.ENOTEMPTY + } + child, ok := parentDir.childMap[name] + if !ok { + return syserror.ENOENT + } + if err := parentDir.mayDelete(rp.Credentials(), child); err != nil { + return err + } + childDir, ok := child.inode.impl.(*directory) + if !ok { + return syserror.ENOTDIR + } + if len(childDir.childMap) != 0 { + return syserror.ENOTEMPTY + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + vfsObj := rp.VirtualFilesystem() + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { + return err + } + parentDir.removeChildLocked(child) + parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) + // Remove links for child, child/., and child/.. 
+ child.inode.decLinksLocked() + child.inode.decLinksLocked() + parentDir.inode.decLinksLocked() + vfsObj.CommitDeleteDentry(&child.vfsd) + parentDir.inode.touchCMtime() + return nil +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + fs.mu.RLock() + d, err := resolveLocked(rp) + if err != nil { + fs.mu.RUnlock() + return err + } + if err := d.inode.setStat(ctx, rp.Credentials(), &opts.Stat); err != nil { + fs.mu.RUnlock() + return err + } + fs.mu.RUnlock() + + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + d.InotifyWithParent(ev, 0, vfs.InodeEvent) + } + return nil +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return linux.Statx{}, err + } + var stat linux.Statx + d.inode.statTo(&stat) + return stat, nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + if _, err := resolveLocked(rp); err != nil { + return linux.Statfs{}, err + } + statfs := linux.Statfs{ + Type: linux.TMPFS_MAGIC, + BlockSize: usermem.PageSize, + FragmentSize: usermem.PageSize, + NameLength: linux.NAME_MAX, + // TODO(b/29637826): Allow configuring a tmpfs size and enforce it. + Blocks: 0, + BlocksFree: 0, + } + return statfs, nil +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. 
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { + creds := rp.Credentials() + child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target)) + parentDir.insertChildLocked(child, name) + return nil + }) +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + if err != nil { + return err + } + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + name := rp.Component() + if name == "." || name == ".." { + return syserror.EISDIR + } + child, ok := parentDir.childMap[name] + if !ok { + return syserror.ENOENT + } + if err := parentDir.mayDelete(rp.Credentials(), child); err != nil { + return err + } + if child.inode.isDir() { + return syserror.EISDIR + } + if rp.MustBeDir() { + return syserror.ENOTDIR + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + vfsObj := rp.VirtualFilesystem() + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { + return err + } + + // Generate inotify events. Note that this must take place before the link + // count of the child is decremented, or else the watches may be dropped + // before these events are added. + vfs.InotifyRemoveChild(&child.inode.watches, &parentDir.inode.watches, name) + + parentDir.removeChildLocked(child) + child.inode.decLinksLocked() + vfsObj.CommitDeleteDentry(&child.vfsd) + parentDir.inode.touchCMtime() + return nil +} + +// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. 
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return nil, err + } + if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } + switch impl := d.inode.impl.(type) { + case *socketFile: + return impl.ep, nil + default: + return nil, syserror.ECONNREFUSED + } +} + +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return nil, err + } + return d.inode.listxattr(size) +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return "", err + } + return d.inode.getxattr(rp.Credentials(), &opts) +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + fs.mu.RLock() + d, err := resolveLocked(rp) + if err != nil { + fs.mu.RUnlock() + return err + } + if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil { + fs.mu.RUnlock() + return err + } + fs.mu.RUnlock() + + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. 
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + fs.mu.RLock() + d, err := resolveLocked(rp) + if err != nil { + fs.mu.RUnlock() + return err + } + if err := d.inode.removexattr(rp.Credentials(), name); err != nil { + fs.mu.RUnlock() + return err + } + fs.mu.RUnlock() + + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + mnt := vd.Mount() + d := vd.Dentry().Impl().(*dentry) + for { + if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { + return vfs.PrependPathAtVFSRootError{} + } + if &d.vfsd == mnt.Root() { + return nil + } + if d.parent == nil { + if d.name != "" { + // This must be an anonymous memfd file. + b.PrependComponent("/" + d.name) + return vfs.PrependPathSyntheticError{} + } + return vfs.PrependPathAtNonMountRootError{} + } + b.PrependComponent(d.name) + d = d.parent + } +} diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go new file mode 100644 index 000000000..739350cf0 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -0,0 +1,38 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tmpfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + "gvisor.dev/gvisor/pkg/usermem" +) + +type namedPipe struct { + inode inode + + pipe *pipe.VFSPipe +} + +// Preconditions: +// * fs.mu must be locked. +// * rp.Mount().CheckBeginWrite() has been called successfully. +func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode { + file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)} + file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode) + file.inode.nlink = 1 // Only the parent has a link. + return &file.inode +} diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go new file mode 100644 index 000000000..1614f2c39 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -0,0 +1,238 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tmpfs + +import ( + "bytes" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +const fileName = "mypipe" + +func TestSeparateFDs(t *testing.T) { + ctx, creds, vfsObj, root := setup(t) + defer root.DecRef() + + // Open the read side. This is done in a concurrently because opening + // One end the pipe blocks until the other end is opened. + pop := vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(fileName), + FollowFinalSymlink: true, + } + rfdchan := make(chan *vfs.FileDescription) + go func() { + openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY} + rfd, _ := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + rfdchan <- rfd + }() + + // Open the write side. + openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY} + wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + if err != nil { + t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) + } + defer wfd.DecRef() + + rfd, ok := <-rfdchan + if !ok { + t.Fatalf("failed to open pipe for reading %q", fileName) + } + defer rfd.DecRef() + + const msg = "vamos azul" + checkEmpty(ctx, t, rfd) + checkWrite(ctx, t, wfd, msg) + checkRead(ctx, t, rfd, msg) +} + +func TestNonblockingRead(t *testing.T) { + ctx, creds, vfsObj, root := setup(t) + defer root.DecRef() + + // Open the read side as nonblocking. + pop := vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(fileName), + FollowFinalSymlink: true, + } + openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_NONBLOCK} + rfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + if err != nil { + t.Fatalf("failed to open pipe for reading %q: %v", fileName, err) + } + defer rfd.DecRef() + + // Open the write side. 
+ openOpts = vfs.OpenOptions{Flags: linux.O_WRONLY} + wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + if err != nil { + t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) + } + defer wfd.DecRef() + + const msg = "geh blau" + checkEmpty(ctx, t, rfd) + checkWrite(ctx, t, wfd, msg) + checkRead(ctx, t, rfd, msg) +} + +func TestNonblockingWriteError(t *testing.T) { + ctx, creds, vfsObj, root := setup(t) + defer root.DecRef() + + // Open the write side as nonblocking, which should return ENXIO. + pop := vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(fileName), + FollowFinalSymlink: true, + } + openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK} + _, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + if err != syserror.ENXIO { + t.Fatalf("expected ENXIO, but got error: %v", err) + } +} + +func TestSingleFD(t *testing.T) { + ctx, creds, vfsObj, root := setup(t) + defer root.DecRef() + + // Open the pipe as readable and writable. + pop := vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(fileName), + FollowFinalSymlink: true, + } + openOpts := vfs.OpenOptions{Flags: linux.O_RDWR} + fd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + if err != nil { + t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) + } + defer fd.DecRef() + + const msg = "forza blu" + checkEmpty(ctx, t, fd) + checkWrite(ctx, t, fd, msg) + checkRead(ctx, t, fd, msg) +} + +// setup creates a VFS with a pipe in the root directory at path fileName. The +// returned VirtualDentry must be DecRef()'d be the caller. It calls t.Fatal +// upon failure. +func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesystem, vfs.VirtualDentry) { + ctx := contexttest.Context(t) + creds := auth.CredentialsFromContext(ctx) + + // Create VFS. 
+ vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + if err != nil { + t.Fatalf("failed to create tmpfs root mount: %v", err) + } + + // Create the pipe. + root := mntns.Root() + pop := vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(fileName), + } + mknodOpts := vfs.MknodOptions{Mode: linux.ModeNamedPipe | 0644} + if err := vfsObj.MknodAt(ctx, creds, &pop, &mknodOpts); err != nil { + t.Fatalf("failed to create file %q: %v", fileName, err) + } + + // Sanity check: the file pipe exists and has the correct mode. + stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(fileName), + FollowFinalSymlink: true, + }, &vfs.StatOptions{}) + if err != nil { + t.Fatalf("stat(%q) failed: %v", fileName, err) + } + if stat.Mode&^linux.S_IFMT != 0644 { + t.Errorf("got wrong permissions (%0o)", stat.Mode) + } + if stat.Mode&linux.S_IFMT != linux.ModeNamedPipe { + t.Errorf("got wrong file type (%0o)", stat.Mode) + } + + return ctx, creds, vfsObj, root +} + +// checkEmpty calls t.Fatal if the pipe in fd is not empty. +func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) { + readData := make([]byte, 1) + dst := usermem.BytesIOSequence(readData) + bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{}) + if err != syserror.ErrWouldBlock { + t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err) + } + if bytesRead != 0 { + t.Fatalf("expected to read 0 bytes, but got %d", bytesRead) + } +} + +// checkWrite calls t.Fatal if it fails to write all of msg to fd. 
+func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) { + writeData := []byte(msg) + src := usermem.BytesIOSequence(writeData) + bytesWritten, err := fd.Write(ctx, src, vfs.WriteOptions{}) + if err != nil { + t.Fatalf("error writing to pipe %q: %v", fileName, err) + } + if bytesWritten != int64(len(writeData)) { + t.Fatalf("expected to write %d bytes, but wrote %d", len(writeData), bytesWritten) + } +} + +// checkRead calls t.Fatal if it fails to read msg from fd. +func checkRead(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) { + readData := make([]byte, len(msg)) + dst := usermem.BytesIOSequence(readData) + bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{}) + if err != nil { + t.Fatalf("error reading from pipe %q: %v", fileName, err) + } + if bytesRead != int64(len(msg)) { + t.Fatalf("expected to read %d bytes, but got %d", len(msg), bytesRead) + } + if !bytes.Equal(readData, []byte(msg)) { + t.Fatalf("expected to read %q from pipe, but got %q", msg, string(readData)) + } +} diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go new file mode 100644 index 000000000..1cdb46e6f --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -0,0 +1,626 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tmpfs + +import ( + "fmt" + "io" + "math" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// regularFile is a regular (=S_IFREG) tmpfs file. +type regularFile struct { + inode inode + + // memFile is a platform.File used to allocate pages to this regularFile. + memFile *pgalloc.MemoryFile + + // mapsMu protects mappings. + mapsMu sync.Mutex `state:"nosave"` + + // mappings tracks mappings of the file into memmap.MappingSpaces. + // + // Protected by mapsMu. + mappings memmap.MappingSet + + // writableMappingPages tracks how many pages of virtual memory are mapped + // as potentially writable from this file. If a page has multiple mappings, + // each mapping is counted separately. + // + // This counter is susceptible to overflow as we can potentially count + // mappings from many VMAs. We count pages rather than bytes to slightly + // mitigate this. + // + // Protected by mapsMu. + writableMappingPages uint64 + + // dataMu protects the fields below. + dataMu sync.RWMutex + + // data maps offsets into the file to offsets into memFile that store + // the file's data. + // + // Protected by dataMu. + data fsutil.FileRangeSet + + // seals represents file seals on this inode. + // + // Protected by dataMu. + seals uint32 + + // size is the size of data. + // + // Protected by both dataMu and inode.mu; reading it requires holding + // either mutex, while writing requires holding both AND using atomics. 
+ // Readers that do not require consistency (like Stat) may read the + // value atomically without holding either lock. + size uint64 +} + +func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode { + file := ®ularFile{ + memFile: fs.memFile, + seals: linux.F_SEAL_SEAL, + } + file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode) + file.inode.nlink = 1 // from parent directory + return &file.inode +} + +// truncate grows or shrinks the file to the given size. It returns true if the +// file size was updated. +func (rf *regularFile) truncate(newSize uint64) (bool, error) { + rf.inode.mu.Lock() + defer rf.inode.mu.Unlock() + return rf.truncateLocked(newSize) +} + +// Preconditions: rf.inode.mu must be held. +func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) { + oldSize := rf.size + if newSize == oldSize { + // Nothing to do. + return false, nil + } + + // Need to hold inode.mu and dataMu while modifying size. + rf.dataMu.Lock() + if newSize > oldSize { + // Can we grow the file? + if rf.seals&linux.F_SEAL_GROW != 0 { + rf.dataMu.Unlock() + return false, syserror.EPERM + } + // We only need to update the file size. + atomic.StoreUint64(&rf.size, newSize) + rf.dataMu.Unlock() + return true, nil + } + + // We are shrinking the file. First check if this is allowed. + if rf.seals&linux.F_SEAL_SHRINK != 0 { + rf.dataMu.Unlock() + return false, syserror.EPERM + } + + // Update the file size. + atomic.StoreUint64(&rf.size, newSize) + rf.dataMu.Unlock() + + // Invalidate past translations of truncated pages. + oldpgend := fs.OffsetPageEnd(int64(oldSize)) + newpgend := fs.OffsetPageEnd(int64(newSize)) + if newpgend < oldpgend { + rf.mapsMu.Lock() + rf.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/shmem.c:shmem_setattr() => + // mm/memory.c:unmap_mapping_range(evencows=1). 
+ InvalidatePrivate: true, + }) + rf.mapsMu.Unlock() + } + + // We are now guaranteed that there are no translations of truncated pages, + // and can remove them. + rf.dataMu.Lock() + rf.data.Truncate(newSize, rf.memFile) + rf.dataMu.Unlock() + return true, nil +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + rf.mapsMu.Lock() + defer rf.mapsMu.Unlock() + rf.dataMu.RLock() + defer rf.dataMu.RUnlock() + + // Reject writable mapping if F_SEAL_WRITE is set. + if rf.seals&linux.F_SEAL_WRITE != 0 && writable { + return syserror.EPERM + } + + rf.mappings.AddMapping(ms, ar, offset, writable) + if writable { + pagesBefore := rf.writableMappingPages + + // ar is guaranteed to be page aligned per memmap.Mappable. + rf.writableMappingPages += uint64(ar.Length() / usermem.PageSize) + + if rf.writableMappingPages < pagesBefore { + panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages)) + } + } + + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (rf *regularFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + rf.mapsMu.Lock() + defer rf.mapsMu.Unlock() + + rf.mappings.RemoveMapping(ms, ar, offset, writable) + + if writable { + pagesBefore := rf.writableMappingPages + + // ar is guaranteed to be page aligned per memmap.Mappable. + rf.writableMappingPages -= uint64(ar.Length() / usermem.PageSize) + + if rf.writableMappingPages > pagesBefore { + panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages)) + } + } +} + +// CopyMapping implements memmap.Mappable.CopyMapping. 
// Translate implements memmap.Mappable.Translate.
//
// It clamps required and optional to the page-rounded end of the file, asks
// rf.data.Fill to back the clamped ranges with pages from rf.memFile, and
// then returns one translation per populated segment that starts before
// required.End, each clipped to optional.
//
// A *memmap.BusError is returned when required starts at or beyond the
// page-rounded EOF (with no translations), when required extended beyond EOF
// (alongside the translations that could be produced), or when Fill failed
// before all of required was covered.
func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
	// NOTE(review): the exclusive lock is presumably needed because Fill can
	// insert new segments into rf.data — confirm against fsutil.
	rf.dataMu.Lock()
	defer rf.dataMu.Unlock()

	// Constrain translations to rf.size (rounded up) to prevent
	// translation to pages that may be concurrently truncated.
	pgend := fs.OffsetPageEnd(int64(rf.size))
	var beyondEOF bool
	if required.End > pgend {
		if required.Start >= pgend {
			// Nothing in required lies inside the file.
			return nil, &memmap.BusError{io.EOF}
		}
		// Part of required is still inside the file: translate that part,
		// but remember to report EOF to the caller afterwards.
		beyondEOF = true
		required.End = pgend
	}
	if optional.End > pgend {
		optional.End = pgend
	}

	cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
		// Newly-allocated pages are zeroed, so we don't need to do anything.
		return dsts.NumBytes(), nil
	})

	// Collect translations for every segment that begins before the end of
	// required; translatedEnd tracks how far the last one reaches.
	var ts []memmap.Translation
	var translatedEnd uint64
	for seg := rf.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
		segMR := seg.Range().Intersect(optional)
		ts = append(ts, memmap.Translation{
			Source: segMR,
			File:   rf.memFile,
			Offset: seg.FileRangeOf(segMR).Start,
			Perms:  usermem.AnyAccess,
		})
		translatedEnd = segMR.End
	}

	// Don't return the error returned by rf.data.Fill if it occurred outside
	// of required.
	if translatedEnd < required.End && cerr != nil {
		return ts, &memmap.BusError{cerr}
	}
	if beyondEOF {
		return ts, &memmap.BusError{io.EOF}
	}
	return ts, nil
}
+func (*regularFile) InvalidateUnsavable(context.Context) error { + return nil +} + +type regularFileFD struct { + fileDescription + + // off is the file offset. off is accessed using atomic memory operations. + // offMu serializes operations that may mutate off. + off int64 + offMu sync.Mutex +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *regularFileFD) Release() { + // noop +} + +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + f := fd.inode().impl.(*regularFile) + + f.inode.mu.Lock() + defer f.inode.mu.Unlock() + oldSize := f.size + size := offset + length + if oldSize >= size { + return nil + } + _, err := f.truncateLocked(size) + return err +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + + // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since + // all state is in-memory. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { + return 0, syserror.EOPNOTSUPP + } + + if dst.NumBytes() == 0 { + return 0, nil + } + f := fd.inode().impl.(*regularFile) + rw := getRegularFileReadWriter(f, offset) + n, err := dst.CopyOutFrom(ctx, rw) + putRegularFileReadWriter(rw) + fd.inode().touchAtime(fd.vfsfd.Mount()) + return n, err +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + fd.offMu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. 
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + + // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since + // all state is in-memory. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { + return 0, syserror.EOPNOTSUPP + } + + srclen := src.NumBytes() + if srclen == 0 { + return 0, nil + } + f := fd.inode().impl.(*regularFile) + if end := offset + srclen; end < offset { + // Overflow. + return 0, syserror.EINVAL + } + + var err error + srclen, err = vfs.CheckLimit(ctx, offset, srclen) + if err != nil { + return 0, err + } + src = src.TakeFirst64(srclen) + + f.inode.mu.Lock() + rw := getRegularFileReadWriter(f, offset) + n, err := src.CopyInTo(ctx, rw) + fd.inode().touchCMtimeLocked() + f.inode.mu.Unlock() + putRegularFileReadWriter(rw) + return n, err +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.offMu.Lock() + n, err := fd.PWrite(ctx, src, fd.off, opts) + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.offMu.Lock() + defer fd.offMu.Unlock() + switch whence { + case linux.SEEK_SET: + // use offset as specified + case linux.SEEK_CUR: + offset += fd.off + case linux.SEEK_END: + offset += int64(atomic.LoadUint64(&fd.inode().impl.(*regularFile).size)) + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. 
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + file := fd.inode().impl.(*regularFile) + return vfs.GenericConfigureMMap(&fd.vfsfd, file, opts) +} + +// regularFileReadWriter implements safemem.Reader and Safemem.Writer. +type regularFileReadWriter struct { + file *regularFile + + // Offset into the file to read/write at. Note that this may be + // different from the FD offset if PRead/PWrite is used. + off uint64 +} + +var regularFileReadWriterPool = sync.Pool{ + New: func() interface{} { + return ®ularFileReadWriter{} + }, +} + +func getRegularFileReadWriter(file *regularFile, offset int64) *regularFileReadWriter { + rw := regularFileReadWriterPool.Get().(*regularFileReadWriter) + rw.file = file + rw.off = uint64(offset) + return rw +} + +func putRegularFileReadWriter(rw *regularFileReadWriter) { + rw.file = nil + regularFileReadWriterPool.Put(rw) +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + rw.file.dataMu.RLock() + defer rw.file.dataMu.RUnlock() + size := rw.file.size + + // Compute the range to read (limited by file size and overflow-checked). + if rw.off >= size { + return 0, io.EOF + } + end := size + if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end { + end = rend + } + + var done uint64 + seg, gap := rw.file.data.Find(uint64(rw.off)) + for rw.off < end { + mr := memmap.MappableRange{uint64(rw.off), uint64(end)} + switch { + case seg.Ok(): + // Get internal mappings. + ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) + if err != nil { + return done, err + } + + // Copy from internal mappings. + n, err := safemem.CopySeq(dsts, ims) + done += n + rw.off += uint64(n) + dsts = dsts.DropFirst64(n) + if err != nil { + return done, err + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + // Tmpfs holes are zero-filled. 
+ gapmr := gap.Range().Intersect(mr) + dst := dsts.TakeFirst64(gapmr.Length()) + n, err := safemem.ZeroSeq(dst) + done += n + rw.off += uint64(n) + dsts = dsts.DropFirst64(n) + if err != nil { + return done, err + } + + // Continue. + seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} + } + } + return done, nil +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +// +// Preconditions: inode.mu must be held. +func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + // Hold dataMu so we can modify size. + rw.file.dataMu.Lock() + defer rw.file.dataMu.Unlock() + + // Compute the range to write (overflow-checked). + end := rw.off + srcs.NumBytes() + if end <= rw.off { + end = math.MaxInt64 + } + + // Check if seals prevent either file growth or all writes. + switch { + case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed + return 0, syserror.EPERM + case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed + // When growth is sealed, Linux effectively allows writes which would + // normally grow the file to partially succeed up to the current EOF, + // rounded down to the page boundary before the EOF. + // + // This happens because writes (and thus the growth check) for tmpfs + // files proceed page-by-page on Linux, and the final write to the page + // containing EOF fails, resulting in a partial write up to the start of + // that page. + // + // To emulate this behaviour, artifically truncate the write to the + // start of the page containing the current EOF. + // + // See Linux, mm/filemap.c:generic_perform_write() and + // mm/shmem.c:shmem_write_begin(). + if pgstart := uint64(usermem.Addr(rw.file.size).RoundDown()); end > pgstart { + end = pgstart + } + if end <= rw.off { + // Truncation would result in no data being written. + return 0, syserror.EPERM + } + } + + // Page-aligned mr for when we need to allocate memory. RoundUp can't + // overflow since end is an int64. 
+ pgstartaddr := usermem.Addr(rw.off).RoundDown() + pgendaddr, _ := usermem.Addr(end).RoundUp() + pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)} + + var ( + done uint64 + retErr error + ) + seg, gap := rw.file.data.Find(uint64(rw.off)) + for rw.off < end { + mr := memmap.MappableRange{uint64(rw.off), uint64(end)} + switch { + case seg.Ok(): + // Get internal mappings. + ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Write) + if err != nil { + retErr = err + goto exitLoop + } + + // Copy to internal mappings. + n, err := safemem.CopySeq(ims, srcs) + done += n + rw.off += uint64(n) + srcs = srcs.DropFirst64(n) + if err != nil { + retErr = err + goto exitLoop + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + // Allocate memory for the write. + gapMR := gap.Range().Intersect(pgMR) + fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs) + if err != nil { + retErr = err + goto exitLoop + } + + // Write to that memory as usual. + seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{} + } + } +exitLoop: + // If the write ends beyond the file's previous size, it causes the + // file to grow. + if rw.off > rw.file.size { + rw.file.size = rw.off + } + + return done, retErr +} + +// GetSeals returns the current set of seals on a memfd inode. +func GetSeals(fd *vfs.FileDescription) (uint32, error) { + f, ok := fd.Impl().(*regularFileFD) + if !ok { + return 0, syserror.EINVAL + } + rf := f.inode().impl.(*regularFile) + rf.dataMu.RLock() + defer rf.dataMu.RUnlock() + return rf.seals, nil +} + +// AddSeals adds new file seals to a memfd inode. 
+func AddSeals(fd *vfs.FileDescription, val uint32) error { + f, ok := fd.Impl().(*regularFileFD) + if !ok { + return syserror.EINVAL + } + rf := f.inode().impl.(*regularFile) + rf.mapsMu.Lock() + defer rf.mapsMu.Unlock() + rf.dataMu.RLock() + defer rf.dataMu.RUnlock() + + if rf.seals&linux.F_SEAL_SEAL != 0 { + // Seal applied which prevents addition of any new seals. + return syserror.EPERM + } + + // F_SEAL_WRITE can only be added if there are no active writable maps. + if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 { + if rf.writableMappingPages > 0 { + return syserror.EBUSY + } + } + + // Seals can only be added, never removed. + rf.seals |= val + return nil +} diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go new file mode 100644 index 000000000..146c7fdfe --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -0,0 +1,349 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tmpfs + +import ( + "bytes" + "fmt" + "io" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Test that we can write some data to a file and read it back.` +func TestSimpleWriteRead(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, 0644) + if err != nil { + t.Fatal(err) + } + defer cleanup() + + // Write. + data := []byte("foobarbaz") + n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + t.Fatalf("fd.Write failed: %v", err) + } + if n != int64(len(data)) { + t.Errorf("fd.Write got short write length %d, want %d", n, len(data)) + } + if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want { + t.Errorf("fd.Write left offset at %d, want %d", got, want) + } + + // Seek back to beginning. + if _, err := fd.Seek(ctx, 0, linux.SEEK_SET); err != nil { + t.Fatalf("fd.Seek failed: %v", err) + } + if got, want := fd.Impl().(*regularFileFD).off, int64(0); got != want { + t.Errorf("fd.Seek(0) left offset at %d, want %d", got, want) + } + + // Read. 
+ buf := make([]byte, len(data)) + n, err = fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) + if err != nil && err != io.EOF { + t.Fatalf("fd.Read failed: %v", err) + } + if n != int64(len(data)) { + t.Errorf("fd.Read got short read length %d, want %d", n, len(data)) + } + if got, want := string(buf), string(data); got != want { + t.Errorf("Read got %q want %s", got, want) + } + if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want { + t.Errorf("fd.Write left offset at %d, want %d", got, want) + } +} + +func TestPWrite(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, 0644) + if err != nil { + t.Fatal(err) + } + defer cleanup() + + // Fill file with 1k 'a's. + data := bytes.Repeat([]byte{'a'}, 1000) + n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + t.Fatalf("fd.Write failed: %v", err) + } + if n != int64(len(data)) { + t.Errorf("fd.Write got short write length %d, want %d", n, len(data)) + } + + // Write "gVisor is awesome" at various offsets. + buf := []byte("gVisor is awesome") + offsets := []int{0, 1, 2, 10, 20, 50, 100, len(data) - 100, len(data) - 1, len(data), len(data) + 1} + for _, offset := range offsets { + name := fmt.Sprintf("PWrite offset=%d", offset) + t.Run(name, func(t *testing.T) { + n, err := fd.PWrite(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.WriteOptions{}) + if err != nil { + t.Errorf("fd.PWrite got err %v want nil", err) + } + if n != int64(len(buf)) { + t.Errorf("fd.PWrite got %d bytes want %d", n, len(buf)) + } + + // Update data to reflect expected file contents. + if len(data) < offset+len(buf) { + data = append(data, make([]byte, (offset+len(buf))-len(data))...) + } + copy(data[offset:], buf) + + // Read the whole file and compare with data. 
+ readBuf := make([]byte, len(data)) + n, err = fd.PRead(ctx, usermem.BytesIOSequence(readBuf), 0, vfs.ReadOptions{}) + if err != nil { + t.Fatalf("fd.PRead failed: %v", err) + } + if n != int64(len(data)) { + t.Errorf("fd.PRead got short read length %d, want %d", n, len(data)) + } + if got, want := string(readBuf), string(data); got != want { + t.Errorf("PRead got %q want %s", got, want) + } + + }) + } +} + +func TestLocks(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, 0644) + if err != nil { + t.Fatal(err) + } + defer cleanup() + + uid1 := 123 + uid2 := 456 + if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, nil); err != nil { + t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) + } + if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, nil); err != nil { + t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) + } + if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil), syserror.ErrWouldBlock; got != want { + t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want) + } + if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil { + t.Fatalf("fd.Impl().UnlockBSD failed: err = %v", err) + } + if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil); err != nil { + t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) + } + + if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, 0, 1, linux.SEEK_SET, nil); err != nil { + t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) + } + if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 1, 2, linux.SEEK_SET, nil); err != nil { + t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) + } + if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, 0, 1, linux.SEEK_SET, nil); err != nil { + t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) + } + if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 0, 1, linux.SEEK_SET, nil), syserror.ErrWouldBlock; got != want { + t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want) + } + if err 
:= fd.Impl().UnlockPOSIX(ctx, uid1, 0, 1, linux.SEEK_SET); err != nil { + t.Fatalf("fd.Impl().UnlockPOSIX failed: err = %v", err) + } +} + +func TestPRead(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, 0644) + if err != nil { + t.Fatal(err) + } + defer cleanup() + + // Write 100 sequences of 'gVisor is awesome'. + data := bytes.Repeat([]byte("gVisor is awsome"), 100) + n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + t.Fatalf("fd.Write failed: %v", err) + } + if n != int64(len(data)) { + t.Errorf("fd.Write got short write length %d, want %d", n, len(data)) + } + + // Read various sizes from various offsets. + sizes := []int{0, 1, 2, 10, 20, 50, 100, 1000} + offsets := []int{0, 1, 2, 10, 20, 50, 100, 1000, len(data) - 100, len(data) - 1, len(data), len(data) + 1} + + for _, size := range sizes { + for _, offset := range offsets { + name := fmt.Sprintf("PRead offset=%d size=%d", offset, size) + t.Run(name, func(t *testing.T) { + var ( + wantRead []byte + wantErr error + ) + if offset < len(data) { + wantRead = data[offset:] + } else if size > 0 { + wantErr = io.EOF + } + if offset+size < len(data) { + wantRead = wantRead[:size] + } + buf := make([]byte, size) + n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.ReadOptions{}) + if err != wantErr { + t.Errorf("fd.PRead got err %v want %v", err, wantErr) + } + if n != int64(len(wantRead)) { + t.Errorf("fd.PRead got %d bytes want %d", n, len(wantRead)) + } + if got := string(buf[:n]); got != string(wantRead) { + t.Errorf("fd.PRead got %q want %q", got, string(wantRead)) + } + }) + } + } +} + +func TestTruncate(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, 0644) + if err != nil { + t.Fatal(err) + } + defer cleanup() + + // Fill the file with some data. 
+ data := bytes.Repeat([]byte("gVisor is awsome"), 100) + written, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + t.Fatalf("fd.Write failed: %v", err) + } + + // Size should be same as written. + sizeStatOpts := vfs.StatOptions{Mask: linux.STATX_SIZE} + stat, err := fd.Stat(ctx, sizeStatOpts) + if err != nil { + t.Fatalf("fd.Stat failed: %v", err) + } + if got, want := int64(stat.Size), written; got != want { + t.Errorf("fd.Stat got size %d, want %d", got, want) + } + + // Truncate down. + newSize := uint64(10) + if err := fd.SetStat(ctx, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_SIZE, + Size: newSize, + }, + }); err != nil { + t.Errorf("fd.Truncate failed: %v", err) + } + // Size should be updated. + statAfterTruncateDown, err := fd.Stat(ctx, sizeStatOpts) + if err != nil { + t.Fatalf("fd.Stat failed: %v", err) + } + if got, want := statAfterTruncateDown.Size, newSize; got != want { + t.Errorf("fd.Stat got size %d, want %d", got, want) + } + // We should only read newSize worth of data. + buf := make([]byte, 1000) + if n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0, vfs.ReadOptions{}); err != nil && err != io.EOF { + t.Fatalf("fd.PRead failed: %v", err) + } else if uint64(n) != newSize { + t.Errorf("fd.PRead got size %d, want %d", n, newSize) + } + // Mtime and Ctime should be bumped. + if got := statAfterTruncateDown.Mtime.ToNsec(); got <= stat.Mtime.ToNsec() { + t.Errorf("fd.Stat got Mtime %v, want > %v", got, stat.Mtime) + } + if got := statAfterTruncateDown.Ctime.ToNsec(); got <= stat.Ctime.ToNsec() { + t.Errorf("fd.Stat got Ctime %v, want > %v", got, stat.Ctime) + } + + // Truncate up. + newSize = 100 + if err := fd.SetStat(ctx, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_SIZE, + Size: newSize, + }, + }); err != nil { + t.Errorf("fd.Truncate failed: %v", err) + } + // Size should be updated. 
+ statAfterTruncateUp, err := fd.Stat(ctx, sizeStatOpts) + if err != nil { + t.Fatalf("fd.Stat failed: %v", err) + } + if got, want := statAfterTruncateUp.Size, newSize; got != want { + t.Errorf("fd.Stat got size %d, want %d", got, want) + } + // We should read newSize worth of data. + buf = make([]byte, 1000) + if n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0, vfs.ReadOptions{}); err != nil && err != io.EOF { + t.Fatalf("fd.PRead failed: %v", err) + } else if uint64(n) != newSize { + t.Errorf("fd.PRead got size %d, want %d", n, newSize) + } + // Bytes should be null after 10, since we previously truncated to 10. + for i := uint64(10); i < newSize; i++ { + if buf[i] != 0 { + t.Errorf("fd.PRead got byte %d=%x, want 0", i, buf[i]) + break + } + } + // Mtime and Ctime should be bumped. + if got := statAfterTruncateUp.Mtime.ToNsec(); got <= statAfterTruncateDown.Mtime.ToNsec() { + t.Errorf("fd.Stat got Mtime %v, want > %v", got, statAfterTruncateDown.Mtime) + } + if got := statAfterTruncateUp.Ctime.ToNsec(); got <= statAfterTruncateDown.Ctime.ToNsec() { + t.Errorf("fd.Stat got Ctime %v, want > %v", got, stat.Ctime) + } + + // Truncate to the current size. + newSize = statAfterTruncateUp.Size + if err := fd.SetStat(ctx, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_SIZE, + Size: newSize, + }, + }); err != nil { + t.Errorf("fd.Truncate failed: %v", err) + } + statAfterTruncateNoop, err := fd.Stat(ctx, sizeStatOpts) + if err != nil { + t.Fatalf("fd.Stat failed: %v", err) + } + // Mtime and Ctime should not be bumped, since operation is a noop. 
+ if got := statAfterTruncateNoop.Mtime.ToNsec(); got != statAfterTruncateUp.Mtime.ToNsec() { + t.Errorf("fd.Stat got Mtime %v, want %v", got, statAfterTruncateUp.Mtime) + } + if got := statAfterTruncateNoop.Ctime.ToNsec(); got != statAfterTruncateUp.Ctime.ToNsec() { + t.Errorf("fd.Stat got Ctime %v, want %v", got, statAfterTruncateUp.Ctime) + } +} diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go new file mode 100644 index 000000000..3ed650474 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go @@ -0,0 +1,34 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" +) + +// socketFile is a socket (=S_IFSOCK) tmpfs file. +type socketFile struct { + inode inode + ep transport.BoundEndpoint +} + +func (fs *filesystem) newSocketFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, ep transport.BoundEndpoint) *inode { + file := &socketFile{ep: ep} + file.inode.init(file, fs, kuid, kgid, mode) + file.inode.nlink = 1 // from parent directory + return &file.inode +} diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go new file mode 100644 index 000000000..f7ee4aab2 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go @@ -0,0 +1,236 @@ +// Copyright 2020 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "fmt" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func TestStatAfterCreate(t *testing.T) { + ctx := contexttest.Context(t) + mode := linux.FileMode(0644) + + // Run with different file types. + for _, typ := range []string{"file", "dir", "pipe"} { + t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { + var ( + fd *vfs.FileDescription + cleanup func() + err error + ) + switch typ { + case "file": + fd, cleanup, err = newFileFD(ctx, mode) + case "dir": + fd, cleanup, err = newDirFD(ctx, mode) + case "pipe": + fd, cleanup, err = newPipeFD(ctx, mode) + default: + panic(fmt.Sprintf("unknown typ %q", typ)) + } + if err != nil { + t.Fatal(err) + } + defer cleanup() + + got, err := fd.Stat(ctx, vfs.StatOptions{}) + if err != nil { + t.Fatalf("Stat failed: %v", err) + } + + // Atime, Ctime, Mtime should all be current time (non-zero). + atime, ctime, mtime := got.Atime.ToNsec(), got.Ctime.ToNsec(), got.Mtime.ToNsec() + if atime != ctime || ctime != mtime { + t.Errorf("got atime=%d ctime=%d mtime=%d, wanted equal values", atime, ctime, mtime) + } + if atime == 0 { + t.Errorf("got atime=%d, want non-zero", atime) + } + + // Btime should be 0, as it is not set by tmpfs. 
+ if btime := got.Btime.ToNsec(); btime != 0 { + t.Errorf("got btime %d, want 0", got.Btime.ToNsec()) + } + + // Size should be 0 (except for directories, which make up a size + // of 20 per entry, including the "." and ".." entries present in + // otherwise-empty directories). + wantSize := uint64(0) + if typ == "dir" { + wantSize = 40 + } + if got.Size != wantSize { + t.Errorf("got size %d, want %d", got.Size, wantSize) + } + + // Nlink should be 1 for files, 2 for dirs. + wantNlink := uint32(1) + if typ == "dir" { + wantNlink = 2 + } + if got.Nlink != wantNlink { + t.Errorf("got nlink %d, want %d", got.Nlink, wantNlink) + } + + // UID and GID are set from context creds. + creds := auth.CredentialsFromContext(ctx) + if got.UID != uint32(creds.EffectiveKUID) { + t.Errorf("got uid %d, want %d", got.UID, uint32(creds.EffectiveKUID)) + } + if got.GID != uint32(creds.EffectiveKGID) { + t.Errorf("got gid %d, want %d", got.GID, uint32(creds.EffectiveKGID)) + } + + // Mode. + wantMode := uint16(mode) + switch typ { + case "file": + wantMode |= linux.S_IFREG + case "dir": + wantMode |= linux.S_IFDIR + case "pipe": + wantMode |= linux.S_IFIFO + default: + panic(fmt.Sprintf("unknown typ %q", typ)) + } + + if got.Mode != wantMode { + t.Errorf("got mode %x, want %x", got.Mode, wantMode) + } + + // Ino. + if got.Ino == 0 { + t.Errorf("got ino %d, want not 0", got.Ino) + } + }) + } +} + +func TestSetStatAtime(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, 0644) + if err != nil { + t.Fatal(err) + } + defer cleanup() + + allStatOptions := vfs.StatOptions{Mask: linux.STATX_ALL} + + // Get initial stat. + initialStat, err := fd.Stat(ctx, allStatOptions) + if err != nil { + t.Fatalf("Stat failed: %v", err) + } + + // Set atime, but without the mask. 
+ if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: linux.Statx{ + Mask: 0, + Atime: linux.NsecToStatxTimestamp(100), + }}); err != nil { + t.Errorf("SetStat atime without mask failed: %v", err) + } + // Atime should be unchanged. + if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { + t.Errorf("Stat got error: %v", err) + } else if gotStat.Atime != initialStat.Atime { + t.Errorf("Stat got atime %d, want %d", gotStat.Atime, initialStat.Atime) + } + + // Set atime, this time included in the mask. + setStat := linux.Statx{ + Mask: linux.STATX_ATIME, + Atime: linux.NsecToStatxTimestamp(100), + } + if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil { + t.Errorf("SetStat atime with mask failed: %v", err) + } + if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { + t.Errorf("Stat got error: %v", err) + } else if gotStat.Atime != setStat.Atime { + t.Errorf("Stat got atime %d, want %d", gotStat.Atime, setStat.Atime) + } +} + +func TestSetStat(t *testing.T) { + ctx := contexttest.Context(t) + mode := linux.FileMode(0644) + + // Run with different file types. + for _, typ := range []string{"file", "dir", "pipe"} { + t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { + var ( + fd *vfs.FileDescription + cleanup func() + err error + ) + switch typ { + case "file": + fd, cleanup, err = newFileFD(ctx, mode) + case "dir": + fd, cleanup, err = newDirFD(ctx, mode) + case "pipe": + fd, cleanup, err = newPipeFD(ctx, mode) + default: + panic(fmt.Sprintf("unknown typ %q", typ)) + } + if err != nil { + t.Fatal(err) + } + defer cleanup() + + allStatOptions := vfs.StatOptions{Mask: linux.STATX_ALL} + + // Get initial stat. + initialStat, err := fd.Stat(ctx, allStatOptions) + if err != nil { + t.Fatalf("Stat failed: %v", err) + } + + // Set atime, but without the mask. 
+ if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: linux.Statx{ + Mask: 0, + Atime: linux.NsecToStatxTimestamp(100), + }}); err != nil { + t.Errorf("SetStat atime without mask failed: %v", err) + } + // Atime should be unchanged. + if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { + t.Errorf("Stat got error: %v", err) + } else if gotStat.Atime != initialStat.Atime { + t.Errorf("Stat got atime %d, want %d", gotStat.Atime, initialStat.Atime) + } + + // Set atime, this time included in the mask. + setStat := linux.Statx{ + Mask: linux.STATX_ATIME, + Atime: linux.NsecToStatxTimestamp(100), + } + if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil { + t.Errorf("SetStat atime with mask failed: %v", err) + } + if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { + t.Errorf("Stat got error: %v", err) + } else if gotStat.Atime != setStat.Atime { + t.Errorf("Stat got atime %d, want %d", gotStat.Atime, setStat.Atime) + } + }) + } +} diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go new file mode 100644 index 000000000..b0de5fabe --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/symlink.go @@ -0,0 +1,37 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tmpfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +type symlink struct { + inode inode + target string // immutable +} + +func (fs *filesystem) newSymlink(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, target string) *inode { + link := &symlink{ + target: target, + } + link.inode.init(link, fs, kuid, kgid, linux.S_IFLNK|mode) + link.inode.nlink = 1 // from parent directory + return &link.inode +} + +// O_PATH is unimplemented, so there's no way to get a FileDescription +// representing a symlink yet. diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go new file mode 100644 index 000000000..d7f4f0779 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -0,0 +1,787 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package tmpfs provides an in-memory filesystem whose contents are +// application-mutable, consistent with Linux's tmpfs. 
+// +// Lock order: +// +// filesystem.mu +// inode.mu +// regularFileFD.offMu +// *** "memmap.Mappable locks" below this point +// regularFile.mapsMu +// *** "memmap.Mappable locks taken by Translate" below this point +// regularFile.dataMu +// directory.iterMu +package tmpfs + +import ( + "fmt" + "math" + "strconv" + "strings" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/memxattr" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Name is the default filesystem name. +const Name = "tmpfs" + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + vfsfs vfs.Filesystem + + // memFile is used to allocate pages to for regular files. + memFile *pgalloc.MemoryFile + + // clock is a realtime clock used to set timestamps in file operations. + clock time.Clock + + // devMinor is the filesystem's minor device number. devMinor is immutable. + devMinor uint32 + + // mu serializes changes to the Dentry tree. + mu sync.RWMutex + + nextInoMinusOne uint64 // accessed using atomic memory operations +} + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// FilesystemOpts is used to pass configuration data to tmpfs. +type FilesystemOpts struct { + // RootFileType is the FileType of the filesystem root. Valid values + // are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR. + RootFileType uint16 + + // RootSymlinkTarget is the target of the root symlink. Only valid if + // RootFileType == S_IFLNK. 
+ RootSymlinkTarget string + + // FilesystemType allows setting a different FilesystemType for this + // tmpfs filesystem. This allows tmpfs to "impersonate" other + // filesystems, like ramdiskfs and cgroupfs. + FilesystemType vfs.FilesystemType +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx) + if memFileProvider == nil { + panic("MemoryFileProviderFromContext returned nil") + } + + rootFileType := uint16(linux.S_IFDIR) + newFSType := vfs.FilesystemType(&fstype) + tmpfsOpts, ok := opts.InternalData.(FilesystemOpts) + if ok { + if tmpfsOpts.RootFileType != 0 { + rootFileType = tmpfsOpts.RootFileType + } + if tmpfsOpts.FilesystemType != nil { + newFSType = tmpfsOpts.FilesystemType + } + } + + mopts := vfs.GenericParseMountOptions(opts.Data) + rootMode := linux.FileMode(0777) + if rootFileType == linux.S_IFDIR { + rootMode = 01777 + } + modeStr, ok := mopts["mode"] + if ok { + delete(mopts, "mode") + mode, err := strconv.ParseUint(modeStr, 8, 32) + if err != nil { + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr) + return nil, nil, syserror.EINVAL + } + rootMode = linux.FileMode(mode & 07777) + } + rootKUID := creds.EffectiveKUID + uidStr, ok := mopts["uid"] + if ok { + delete(mopts, "uid") + uid, err := strconv.ParseUint(uidStr, 10, 32) + if err != nil { + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr) + return nil, nil, syserror.EINVAL + } + kuid := creds.UserNamespace.MapToKUID(auth.UID(uid)) + if !kuid.Ok() { + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid) + return nil, nil, syserror.EINVAL + } + rootKUID = kuid + } + rootKGID := creds.EffectiveKGID + gidStr, ok := mopts["gid"] + if ok { + 
delete(mopts, "gid") + gid, err := strconv.ParseUint(gidStr, 10, 32) + if err != nil { + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr) + return nil, nil, syserror.EINVAL + } + kgid := creds.UserNamespace.MapToKGID(auth.GID(gid)) + if !kgid.Ok() { + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid) + return nil, nil, syserror.EINVAL + } + rootKGID = kgid + } + if len(mopts) != 0 { + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) + return nil, nil, syserror.EINVAL + } + + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + clock := time.RealtimeClockFromContext(ctx) + fs := filesystem{ + memFile: memFileProvider.MemoryFile(), + clock: clock, + devMinor: devMinor, + } + fs.vfsfs.Init(vfsObj, newFSType, &fs) + + var root *dentry + switch rootFileType { + case linux.S_IFREG: + root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode)) + case linux.S_IFLNK: + root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget)) + case linux.S_IFDIR: + root = &fs.newDirectory(rootKUID, rootKGID, rootMode).dentry + default: + fs.vfsfs.DecRef() + return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) + } + return &fs.vfsfs, &root.vfsd, nil +} + +// NewFilesystem returns a new tmpfs filesystem. +func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*vfs.Filesystem, *vfs.Dentry, error) { + return FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, "", vfs.GetFilesystemOptions{}) +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) +} + +// dentry implements vfs.DentryImpl. +type dentry struct { + vfsd vfs.Dentry + + // parent is this dentry's parent directory. Each referenced dentry holds a + // reference on parent.dentry. 
If this dentry is a filesystem root, parent + // is nil. parent is protected by filesystem.mu. + parent *dentry + + // name is the name of this dentry in its parent. If this dentry is a + // filesystem root, name is the empty string. name is protected by + // filesystem.mu. + name string + + // dentryEntry (ugh) links dentries into their parent directory.childList. + dentryEntry + + // inode is the inode represented by this dentry. Multiple Dentries may + // share a single non-directory inode (with hard links). inode is + // immutable. + // + // tmpfs doesn't count references on dentries; because the dentry tree is + // the sole source of truth, it is by definition always consistent with the + // state of the filesystem. However, it does count references on inodes, + // because inode resources are released when all references are dropped. + // dentry therefore forwards reference counting directly to inode. + inode *inode +} + +func (fs *filesystem) newDentry(inode *inode) *dentry { + d := &dentry{ + inode: inode, + } + d.vfsd.Init(d) + return d +} + +// IncRef implements vfs.DentryImpl.IncRef. +func (d *dentry) IncRef() { + d.inode.incRef() +} + +// TryIncRef implements vfs.DentryImpl.TryIncRef. +func (d *dentry) TryIncRef() bool { + return d.inode.tryIncRef() +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *dentry) DecRef() { + d.inode.decRef() +} + +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { + if d.inode.isDir() { + events |= linux.IN_ISDIR + } + + // tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates + // that d was deleted. + deleted := d.vfsd.IsDead() + + d.inode.fs.mu.RLock() + // The ordering below is important, Linux always notifies the parent first. 
+ if d.parent != nil { + d.parent.inode.watches.Notify(d.name, events, cookie, et, deleted) + } + d.inode.watches.Notify("", events, cookie, et, deleted) + d.inode.fs.mu.RUnlock() +} + +// Watches implements vfs.DentryImpl.Watches. +func (d *dentry) Watches() *vfs.Watches { + return &d.inode.watches +} + +// OnZeroWatches implements vfs.Dentry.OnZeroWatches. +func (d *dentry) OnZeroWatches() {} + +// inode represents a filesystem object. +type inode struct { + // fs is the owning filesystem. fs is immutable. + fs *filesystem + + // refs is a reference count. refs is accessed using atomic memory + // operations. + // + // A reference is held on all inodes as long as they are reachable in the + // filesystem tree, i.e. nlink is nonzero. This reference is dropped when + // nlink reaches 0. + refs int64 + + // xattrs implements extended attributes. + // + // TODO(b/148380782): Support xattrs other than user.* + xattrs memxattr.SimpleExtendedAttributes + + // Inode metadata. Writing multiple fields atomically requires holding + // mu, othewise atomic operations can be used. + mu sync.Mutex + mode uint32 // file type and mode + nlink uint32 // protected by filesystem.mu instead of inode.mu + uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic + gid uint32 // auth.KGID, but ... + ino uint64 // immutable + + // Linux's tmpfs has no concept of btime. + atime int64 // nanoseconds + ctime int64 // nanoseconds + mtime int64 // nanoseconds + + locks vfs.FileLocks + + // Inotify watches for this inode. 
+ watches vfs.Watches + + impl interface{} // immutable +} + +const maxLinks = math.MaxUint32 + +func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) { + if mode.FileType() == 0 { + panic("file type is required in FileMode") + } + i.fs = fs + i.refs = 1 + i.mode = uint32(mode) + i.uid = uint32(kuid) + i.gid = uint32(kgid) + i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) + // Tmpfs creation sets atime, ctime, and mtime to current time. + now := fs.clock.Now().Nanoseconds() + i.atime = now + i.ctime = now + i.mtime = now + // i.nlink initialized by caller + i.impl = impl +} + +// incLinksLocked increments i's link count. +// +// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. +// i.nlink < maxLinks. +func (i *inode) incLinksLocked() { + if i.nlink == 0 { + panic("tmpfs.inode.incLinksLocked() called with no existing links") + } + if i.nlink == maxLinks { + panic("tmpfs.inode.incLinksLocked() called with maximum link count") + } + atomic.AddUint32(&i.nlink, 1) +} + +// decLinksLocked decrements i's link count. If the link count reaches 0, we +// remove a reference on i as well. +// +// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. 
+func (i *inode) decLinksLocked() { + if i.nlink == 0 { + panic("tmpfs.inode.decLinksLocked() called with no existing links") + } + if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 { + i.decRef() + } +} + +func (i *inode) incRef() { + if atomic.AddInt64(&i.refs, 1) <= 1 { + panic("tmpfs.inode.incRef() called without holding a reference") + } +} + +func (i *inode) tryIncRef() bool { + for { + refs := atomic.LoadInt64(&i.refs) + if refs == 0 { + return false + } + if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) { + return true + } + } +} + +func (i *inode) decRef() { + if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { + i.watches.HandleDeletion() + if regFile, ok := i.impl.(*regularFile); ok { + // Release memory used by regFile to store data. Since regFile is + // no longer usable, we don't need to grab any locks or update any + // metadata. + regFile.data.DropAll(regFile.memFile) + } + } else if refs < 0 { + panic("tmpfs.inode.decRef() called without holding a reference") + } +} + +func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + mode := linux.FileMode(atomic.LoadUint32(&i.mode)) + return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))) +} + +// Go won't inline this function, and returning linux.Statx (which is quite +// big) means spending a lot of time in runtime.duffcopy(), so instead it's an +// output parameter. +// +// Note that Linux does not guarantee to return consistent data (in the case of +// a concurrent modification), so we do not require holding inode.mu. 
+func (i *inode) statTo(stat *linux.Statx) { + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | + linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | + linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME | + linux.STATX_MTIME + stat.Blksize = usermem.PageSize + stat.Nlink = atomic.LoadUint32(&i.nlink) + stat.UID = atomic.LoadUint32(&i.uid) + stat.GID = atomic.LoadUint32(&i.gid) + stat.Mode = uint16(atomic.LoadUint32(&i.mode)) + stat.Ino = i.ino + stat.Atime = linux.NsecToStatxTimestamp(i.atime) + stat.Ctime = linux.NsecToStatxTimestamp(i.ctime) + stat.Mtime = linux.NsecToStatxTimestamp(i.mtime) + stat.DevMajor = linux.UNNAMED_MAJOR + stat.DevMinor = i.fs.devMinor + switch impl := i.impl.(type) { + case *regularFile: + stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS + stat.Size = uint64(atomic.LoadUint64(&impl.size)) + // TODO(jamieliu): This should be impl.data.Span() / 512, but this is + // too expensive to compute here. Cache it in regularFile. + stat.Blocks = allocatedBlocksForSize(stat.Size) + case *directory: + // "20" is mm/shmem.c:BOGO_DIRENT_SIZE. + stat.Size = 20 * (2 + uint64(atomic.LoadInt64(&impl.numChildren))) + // stat.Blocks is 0. + case *symlink: + stat.Size = uint64(len(impl.target)) + // stat.Blocks is 0. + case *namedPipe, *socketFile: + // stat.Size and stat.Blocks are 0. + case *deviceFile: + // stat.Size and stat.Blocks are 0. 
+ stat.RdevMajor = impl.major + stat.RdevMinor = impl.minor + default: + panic(fmt.Sprintf("unknown inode type: %T", i.impl)) + } +} + +func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx) error { + if stat.Mask == 0 { + return nil + } + if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 { + return syserror.EPERM + } + mode := linux.FileMode(atomic.LoadUint32(&i.mode)) + if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { + return err + } + i.mu.Lock() + defer i.mu.Unlock() + var ( + needsMtimeBump bool + needsCtimeBump bool + ) + mask := stat.Mask + if mask&linux.STATX_MODE != 0 { + ft := atomic.LoadUint32(&i.mode) & linux.S_IFMT + atomic.StoreUint32(&i.mode, ft|uint32(stat.Mode&^linux.S_IFMT)) + needsCtimeBump = true + } + if mask&linux.STATX_UID != 0 { + atomic.StoreUint32(&i.uid, stat.UID) + needsCtimeBump = true + } + if mask&linux.STATX_GID != 0 { + atomic.StoreUint32(&i.gid, stat.GID) + needsCtimeBump = true + } + if mask&linux.STATX_SIZE != 0 { + switch impl := i.impl.(type) { + case *regularFile: + updated, err := impl.truncateLocked(stat.Size) + if err != nil { + return err + } + if updated { + needsMtimeBump = true + needsCtimeBump = true + } + case *directory: + return syserror.EISDIR + default: + return syserror.EINVAL + } + } + now := i.fs.clock.Now().Nanoseconds() + if mask&linux.STATX_ATIME != 0 { + if stat.Atime.Nsec == linux.UTIME_NOW { + atomic.StoreInt64(&i.atime, now) + } else { + atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped()) + } + needsCtimeBump = true + } + if mask&linux.STATX_MTIME != 0 { + if stat.Mtime.Nsec == linux.UTIME_NOW { + atomic.StoreInt64(&i.mtime, now) + } else { + atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped()) + } + needsCtimeBump = true + // Ignore the mtime bump, since we just set it ourselves. 
+ needsMtimeBump = false + } + if mask&linux.STATX_CTIME != 0 { + if stat.Ctime.Nsec == linux.UTIME_NOW { + atomic.StoreInt64(&i.ctime, now) + } else { + atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped()) + } + // Ignore the ctime bump, since we just set it ourselves. + needsCtimeBump = false + } + if needsMtimeBump { + atomic.StoreInt64(&i.mtime, now) + } + if needsCtimeBump { + atomic.StoreInt64(&i.ctime, now) + } + + return nil +} + +// allocatedBlocksForSize returns the number of 512B blocks needed to +// accommodate the given size in bytes, as appropriate for struct +// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block +// size is independent of the "preferred block size for I/O", struct +// stat::st_blksize and struct statx::stx_blksize.) +func allocatedBlocksForSize(size uint64) uint64 { + return (size + 511) / 512 +} + +func (i *inode) direntType() uint8 { + switch impl := i.impl.(type) { + case *regularFile: + return linux.DT_REG + case *directory: + return linux.DT_DIR + case *symlink: + return linux.DT_LNK + case *socketFile: + return linux.DT_SOCK + case *deviceFile: + switch impl.kind { + case vfs.BlockDevice: + return linux.DT_BLK + case vfs.CharDevice: + return linux.DT_CHR + default: + panic(fmt.Sprintf("unknown vfs.DeviceKind: %v", impl.kind)) + } + default: + panic(fmt.Sprintf("unknown inode type: %T", i.impl)) + } +} + +func (i *inode) isDir() bool { + return linux.FileMode(i.mode).FileType() == linux.S_IFDIR +} + +func (i *inode) touchAtime(mnt *vfs.Mount) { + if mnt.Flags.NoATime { + return + } + if err := mnt.CheckBeginWrite(); err != nil { + return + } + now := i.fs.clock.Now().Nanoseconds() + i.mu.Lock() + atomic.StoreInt64(&i.atime, now) + i.mu.Unlock() + mnt.EndWrite() +} + +// Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). 
+func (i *inode) touchCtime() { + now := i.fs.clock.Now().Nanoseconds() + i.mu.Lock() + atomic.StoreInt64(&i.ctime, now) + i.mu.Unlock() +} + +// Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). +func (i *inode) touchCMtime() { + now := i.fs.clock.Now().Nanoseconds() + i.mu.Lock() + atomic.StoreInt64(&i.mtime, now) + atomic.StoreInt64(&i.ctime, now) + i.mu.Unlock() +} + +// Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds +// inode.mu. +func (i *inode) touchCMtimeLocked() { + now := i.fs.clock.Now().Nanoseconds() + atomic.StoreInt64(&i.mtime, now) + atomic.StoreInt64(&i.ctime, now) +} + +func (i *inode) listxattr(size uint64) ([]string, error) { + return i.xattrs.Listxattr(size) +} + +func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { + if err := i.checkPermissions(creds, vfs.MayRead); err != nil { + return "", err + } + if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { + return "", syserror.EOPNOTSUPP + } + if !i.userXattrSupported() { + return "", syserror.ENODATA + } + return i.xattrs.Getxattr(opts) +} + +func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error { + if err := i.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + if !i.userXattrSupported() { + return syserror.EPERM + } + return i.xattrs.Setxattr(opts) +} + +func (i *inode) removexattr(creds *auth.Credentials, name string) error { + if err := i.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + if !i.userXattrSupported() { + return syserror.EPERM + } + return i.xattrs.Removexattr(name) +} + +// Extended attributes in the user.* namespace are only supported for regular +// files and directories. 
+func (i *inode) userXattrSupported() bool { + filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode) + return filetype == linux.S_IFREG || filetype == linux.S_IFDIR +} + +// fileDescription is embedded by tmpfs implementations of +// vfs.FileDescriptionImpl. +type fileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.LockFD +} + +func (fd *fileDescription) filesystem() *filesystem { + return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) +} + +func (fd *fileDescription) dentry() *dentry { + return fd.vfsfd.Dentry().Impl().(*dentry) +} + +func (fd *fileDescription) inode() *inode { + return fd.dentry().inode +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + var stat linux.Statx + fd.inode().statTo(&stat) + return stat, nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + creds := auth.CredentialsFromContext(ctx) + d := fd.dentry() + if err := d.inode.setStat(ctx, creds, &opts.Stat); err != nil { + return err + } + + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + d.InotifyWithParent(ev, 0, vfs.InodeEvent) + } + return nil +} + +// Listxattr implements vfs.FileDescriptionImpl.Listxattr. +func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { + return fd.inode().listxattr(size) +} + +// Getxattr implements vfs.FileDescriptionImpl.Getxattr. +func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) { + return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts) +} + +// Setxattr implements vfs.FileDescriptionImpl.Setxattr. 
+func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { + d := fd.dentry() + if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil { + return err + } + + // Generate inotify events. + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil +} + +// Removexattr implements vfs.FileDescriptionImpl.Removexattr. +func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { + d := fd.dentry() + if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil { + return err + } + + // Generate inotify events. + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil +} + +// NewMemfd creates a new tmpfs regular file and file description that can back +// an anonymous fd created by memfd_create. +func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name string) (*vfs.FileDescription, error) { + fs, ok := mount.Filesystem().Impl().(*filesystem) + if !ok { + panic("NewMemfd() called with non-tmpfs mount") + } + + // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with + // S_IRWXUGO. + inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777) + rf := inode.impl.(*regularFile) + if allowSeals { + rf.seals = 0 + } + + d := fs.newDentry(inode) + defer d.DecRef() + d.name = name + + // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with + // FMODE_READ | FMODE_WRITE. + var fd regularFileFD + fd.Init(&inode.locks) + flags := uint32(linux.O_RDWR) + if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. 
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX by forwarding the +// request, together with fd.vfsfd, to fd.Locks(). +func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} + +// Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all +// filesystem state is in-memory. +func (*fileDescription) Sync(context.Context) error { + return nil +} diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go new file mode 100644 index 000000000..a240fb276 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go @@ -0,0 +1,156 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// nextFileID is used to generate unique file names. The helpers below only +// update it through atomic.AddInt64. +var nextFileID int64 + +// newTmpfsRoot creates a new tmpfs mount, and returns the root.
If the error +// is not nil, then cleanup should be called when the root is no longer needed. +func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) { + creds := auth.CredentialsFromContext(ctx) + + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err) + } + + vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + if err != nil { + return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) + } + root := mntns.Root() + return vfsObj, root, func() { + root.DecRef() + mntns.DecRef() + }, nil +} + +// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If +// the returned err is not nil, then cleanup should be called when the FD is no +// longer needed. +func newFileFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + filename := fmt.Sprintf("tmpfs-test-file-%d", atomic.AddInt64(&nextFileID, 1)) + + // Create the file that will be write/read. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, + Mode: linux.ModeRegular | mode, + }) + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err) + } + + return fd, cleanup, nil +} + +// newDirFD is like newFileFD, but for directories. 
+func newDirFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + dirname := fmt.Sprintf("tmpfs-test-dir-%d", atomic.AddInt64(&nextFileID, 1)) + + // Create the dir. + if err := vfsObj.MkdirAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(dirname), + }, &vfs.MkdirOptions{ + Mode: linux.ModeDirectory | mode, + }); err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to create directory %q: %v", dirname, err) + } + + // Open the dir and return it. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(dirname), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_DIRECTORY, + }) + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to open directory %q: %v", dirname, err) + } + + return fd, cleanup, nil +} + +// newPipeFD is like newFileFD, but for pipes. +func newPipeFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + name := fmt.Sprintf("tmpfs-test-%d", atomic.AddInt64(&nextFileID, 1)) + + if err := vfsObj.MknodAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(name), + }, &vfs.MknodOptions{ + Mode: linux.ModeNamedPipe | mode, + }); err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to create pipe %q: %v", name, err) + } + + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(name), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR, + }) + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to open pipe %q: %v", name, err) + } + + return fd, cleanup, nil +} |