diff options
Diffstat (limited to 'runsc/fsgofer/fsgofer.go')
-rw-r--r-- | runsc/fsgofer/fsgofer.go | 1181 |
1 files changed, 1181 insertions, 0 deletions
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go new file mode 100644 index 000000000..b7521bda7 --- /dev/null +++ b/runsc/fsgofer/fsgofer.go @@ -0,0 +1,1181 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fsgofer implements p9.File giving access to local files using +// a simple mapping from a path prefix that is added to the path requested +// by the sandbox. Ex: +// +// prefix: "/docker/imgs/alpine" +// app path: /bin/ls => /docker/imgs/alpine/bin/ls +package fsgofer + +import ( + "fmt" + "io" + "math" + "os" + "path" + "path/filepath" + "runtime" + "strconv" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cleanup" + "gvisor.dev/gvisor/pkg/fd" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sync" +) + +const ( + // invalidMode is set to a value that doesn't match any other valid + // modes to ensure an unopened/closed file fails all mode checks. + invalidMode = p9.OpenFlags(math.MaxUint32) + + openFlags = syscall.O_NOFOLLOW | syscall.O_CLOEXEC +) + +type fileType int + +const ( + regular fileType = iota + directory + symlink + socket + unknown +) + +// String implements fmt.Stringer. +func (f fileType) String() string { + switch f { + case regular: + return "regular" + case directory: + return "directory" + case symlink: + return "symlink" + case socket: + return "socket" + } + return "unknown" +} + +// ControlSocketAddr generates an abstract unix socket name for the given id. +func ControlSocketAddr(id string) string { + return fmt.Sprintf("\x00runsc-gofer.%s", id) +} + +// Config sets configuration options for each attach point. +type Config struct { + // ROMount is set to true if this is a readonly mount. + ROMount bool + + // PanicOnWrite panics on attempts to write to RO mounts. + PanicOnWrite bool + + // HostUDS signals whether the gofer can mount a host's UDS. + HostUDS bool +} + +type attachPoint struct { + prefix string + conf Config + + // attachedMu protects attached. + attachedMu sync.Mutex + attached bool + + // deviceMu protects devices and nextDevice. + deviceMu sync.Mutex + + // nextDevice is the next device id that will be allocated. + nextDevice uint8 + + // devices is a map from actual host devices to "small" integers that + // can be combined with host inode to form a unique virtual inode id. + devices map[uint64]uint8 +} + +// NewAttachPoint creates a new attacher that gives local file +// access to all files under 'prefix'. 'prefix' must be an absolute path. +func NewAttachPoint(prefix string, c Config) (p9.Attacher, error) { + // Sanity check the prefix. + if !filepath.IsAbs(prefix) { + return nil, fmt.Errorf("attach point prefix must be absolute %q", prefix) + } + return &attachPoint{ + prefix: prefix, + conf: c, + devices: make(map[uint64]uint8), + }, nil +} + +// Attach implements p9.Attacher. +func (a *attachPoint) Attach() (p9.File, error) { + a.attachedMu.Lock() + defer a.attachedMu.Unlock() + + if a.attached { + return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) + } + + f, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) { + return fd.Open(a.prefix, openFlags|mode, 0) + }) + if err != nil { + return nil, fmt.Errorf("unable to open %q: %v", a.prefix, err) + } + + stat, err := fstat(f.FD()) + if err != nil { + return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err) + } + + lf, err := newLocalFile(a, f, a.prefix, stat) + if err != nil { + return nil, fmt.Errorf("unable to create localFile %q: %v", a.prefix, err) + } + a.attached = true + return lf, nil +} + +// makeQID returns a unique QID for the given stat buffer. +func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { + a.deviceMu.Lock() + defer a.deviceMu.Unlock() + + // First map the host device id to a unique 8-bit integer. + dev, ok := a.devices[stat.Dev] + if !ok { + a.devices[stat.Dev] = a.nextDevice + dev = a.nextDevice + a.nextDevice++ + if a.nextDevice < dev { + panic(fmt.Sprintf("device id overflow! map: %+v", a.devices)) + } + } + + // Construct a "virtual" inode id with the uint8 device number in the + // first 8 bits, and the rest of the bits from the host inode id. + maskedIno := stat.Ino & 0x00ffffffffffffff + if maskedIno != stat.Ino { + log.Warningf("first 8 bytes of host inode id %x will be truncated to construct virtual inode id", stat.Ino) + } + ino := uint64(dev)<<56 | maskedIno + return p9.QID{ + Type: p9.FileMode(stat.Mode).QIDType(), + Path: ino, + } +} + +// localFile implements p9.File wrapping a local file. The underlying file +// is opened during Walk() and stored in 'file' to be used with other +// operations. The file is opened as readonly, unless it's a symlink or there is +// no read access, which requires O_PATH. 'file' is dup'ed when Walk(nil) is +// called to clone the file. This reduces the number of walks that need to be +// done by the host file system when files are reused. +// +// The file may be reopened if the requested mode in Open() is not a subset of +// current mode. Consequently, 'file' could have a mode wider than requested and +// must be verified before read/write operations. Before the file is opened and +// after it's closed, 'mode' is set to an invalid value to prevent an unopened +// file from being used. +// +// The reason that the file is not opened initially as read-write is for better +// performance with 'overlay2' storage driver. overlay2 eagerly copies the +// entire file up when it's opened in write mode, and would perform badly when +// multiple files are only being opened for read (esp. startup). +type localFile struct { + p9.DefaultWalkGetAttr + + // attachPoint is the attachPoint that serves this localFile. + attachPoint *attachPoint + + // hostPath will be safely updated by the Renamed hook. + hostPath string + + // file is opened when localFile is created and it's never nil. It may be + // reopened if the Open() mode is wider than the mode the file was originally + // opened with. + file *fd.FD + + // mode is the mode in which the file was opened. Set to invalidMode + // if localFile isn't opened. + mode p9.OpenFlags + + // ft is the fileType for this file. + ft fileType + + // readDirMu protects against concurrent Readdir calls. + readDirMu sync.Mutex + + // lastDirentOffset is the last offset returned by Readdir(). If another call + // to Readdir is made at the same offset, the file doesn't need to be + // repositioned. This is an important optimization because the caller must + // always make one extra call to detect EOF (empty result, no error). + lastDirentOffset uint64 +} + +var procSelfFD *fd.FD + +// OpenProcSelfFD opens the /proc/self/fd directory, which will be used to +// reopen file descriptors. +func OpenProcSelfFD() error { + d, err := syscall.Open("/proc/self/fd", syscall.O_RDONLY|syscall.O_DIRECTORY, 0) + if err != nil { + return fmt.Errorf("error opening /proc/self/fd: %v", err) + } + procSelfFD = fd.New(d) + return nil +} + +func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) { + d, err := syscall.Openat(int(procSelfFD.FD()), strconv.Itoa(f.FD()), mode&^syscall.O_NOFOLLOW, 0) + if err != nil { + return nil, err + } + + return fd.New(d), nil +} + +func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, error) { + path := path.Join(parent.hostPath, name) + f, err := openAnyFile(path, func(mode int) (*fd.FD, error) { + return fd.OpenAt(parent.file, name, openFlags|mode, 0) + }) + return f, path, err +} + +// openAnyFile attempts to open the file in O_RDONLY and if it fails fallsback +// to O_PATH. 'path' is used for logging messages only. 'fn' is what does the +// actual file open and is customizable by the caller. +func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) { + // Attempt to open file in the following mode in order: + // 1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs. + // Use non-blocking to prevent getting stuck inside open(2) for + // FIFOs. This option has no effect on regular files. + // 2. PATH: for symlinks, sockets. + modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH} + + var err error + var file *fd.FD + for i, mode := range modes { + file, err = fn(mode) + if err == nil { + // openat succeeded, we're done. + break + } + switch e := extractErrno(err); e { + case syscall.ENOENT: + // File doesn't exist, no point in retrying. + return nil, e + } + // openat failed. Try again with next mode, preserving 'err' in case this + // was the last attempt. + log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|mode, path, err) + } + if err != nil { + // All attempts to open file have failed, return the last error. + log.Debugf("Failed to open file, path: %q, err: %v", path, err) + return nil, extractErrno(err) + } + + return file, nil +} + +func getSupportedFileType(stat syscall.Stat_t, permitSocket bool) (fileType, error) { + var ft fileType + switch stat.Mode & syscall.S_IFMT { + case syscall.S_IFREG: + ft = regular + case syscall.S_IFDIR: + ft = directory + case syscall.S_IFLNK: + ft = symlink + case syscall.S_IFSOCK: + if !permitSocket { + return unknown, syscall.EPERM + } + ft = socket + default: + return unknown, syscall.EPERM + } + return ft, nil +} + +func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) (*localFile, error) { + ft, err := getSupportedFileType(stat, a.conf.HostUDS) + if err != nil { + return nil, err + } + + return &localFile{ + attachPoint: a, + hostPath: path, + file: file, + mode: invalidMode, + ft: ft, + }, nil +} + +// newFDMaybe creates a fd.FD from a file, dup'ing the FD and setting it as +// non-blocking. If anything fails, returns nil. It's better to have a file +// without host FD, than to fail the operation. +func newFDMaybe(file *fd.FD) *fd.FD { + dupFD, err := syscall.Dup(file.FD()) + // Technically, the runtime may call the finalizer on file as soon as + // FD() returns. + runtime.KeepAlive(file) + if err != nil { + return nil + } + dup := fd.New(dupFD) + + // fd is blocking; non-blocking is required. + if err := syscall.SetNonblock(dup.FD(), true); err != nil { + dup.Close() + return nil + } + return dup +} + +func fstat(fd int) (syscall.Stat_t, error) { + var stat syscall.Stat_t + if err := syscall.Fstat(fd, &stat); err != nil { + return syscall.Stat_t{}, err + } + return stat, nil +} + +func stat(path string) (syscall.Stat_t, error) { + var stat syscall.Stat_t + if err := syscall.Stat(path, &stat); err != nil { + return syscall.Stat_t{}, err + } + return stat, nil +} + +func fchown(fd int, uid p9.UID, gid p9.GID) error { + return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) +} + +// Open implements p9.File. +func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { + if l.isOpen() { + panic(fmt.Sprintf("attempting to open already opened file: %q", l.hostPath)) + } + + // Check if control file can be used or if a new open must be created. + var newFile *fd.FD + if flags == p9.ReadOnly { + log.Debugf("Open reusing control file, flags: %v, %q", flags, l.hostPath) + newFile = l.file + } else { + // Ideally reopen would call name_to_handle_at (with empty name) and + // open_by_handle_at to reopen the file without using 'hostPath'. However, + // name_to_handle_at and open_by_handle_at aren't supported by overlay2. + log.Debugf("Open reopening file, flags: %v, %q", flags, l.hostPath) + var err error + // Constrain open flags to the open mode and O_TRUNC. + newFile, err = reopenProcFd(l.file, openFlags|(flags.OSFlags()&(syscall.O_ACCMODE|syscall.O_TRUNC))) + if err != nil { + return nil, p9.QID{}, 0, extractErrno(err) + } + } + + stat, err := fstat(newFile.FD()) + if err != nil { + if newFile != l.file { + newFile.Close() + } + return nil, p9.QID{}, 0, extractErrno(err) + } + + var fd *fd.FD + if stat.Mode&syscall.S_IFMT == syscall.S_IFREG { + // Donate FD for regular files only. + fd = newFDMaybe(newFile) + } + + // Close old file in case a new one was created. + if newFile != l.file { + if err := l.file.Close(); err != nil { + log.Warningf("Error closing file %q: %v", l.hostPath, err) + } + l.file = newFile + } + l.mode = flags & p9.OpenFlagsModeMask + return fd, l.attachPoint.makeQID(stat), 0, nil +} + +// Create implements p9.File. +func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return nil, nil, p9.QID{}, 0, syscall.EBADF + } + + // 'file' may be used for other operations (e.g. Walk), so read access is + // always added to flags. Note that resulting file might have a wider mode + // than needed for each particular case. + flags := openFlags | syscall.O_CREAT | syscall.O_EXCL + if mode == p9.WriteOnly { + flags |= syscall.O_RDWR + } else { + flags |= mode.OSFlags() + } + + child, err := fd.OpenAt(l.file, name, flags, uint32(perm.Permissions())) + if err != nil { + return nil, nil, p9.QID{}, 0, extractErrno(err) + } + cu := cleanup.Make(func() { + child.Close() + // Best effort attempt to remove the file in case of failure. + if err := syscall.Unlinkat(l.file.FD(), name); err != nil { + log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err) + } + }) + defer cu.Clean() + + if err := fchown(child.FD(), uid, gid); err != nil { + return nil, nil, p9.QID{}, 0, extractErrno(err) + } + stat, err := fstat(child.FD()) + if err != nil { + return nil, nil, p9.QID{}, 0, extractErrno(err) + } + + c := &localFile{ + attachPoint: l.attachPoint, + hostPath: path.Join(l.hostPath, name), + file: child, + mode: mode, + } + + cu.Release() + return newFDMaybe(c.file), c, l.attachPoint.makeQID(stat), 0, nil +} + +// Mkdir implements p9.File. +func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return p9.QID{}, syscall.EBADF + } + + if err := syscall.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil { + return p9.QID{}, extractErrno(err) + } + cu := cleanup.Make(func() { + // Best effort attempt to remove the dir in case of failure. + if err := unix.Unlinkat(l.file.FD(), name, unix.AT_REMOVEDIR); err != nil { + log.Warningf("error unlinking dir %q after failure: %v", path.Join(l.hostPath, name), err) + } + }) + defer cu.Clean() + + // Open directory to change ownership and stat it. + flags := syscall.O_DIRECTORY | syscall.O_RDONLY | openFlags + f, err := fd.OpenAt(l.file, name, flags, 0) + if err != nil { + return p9.QID{}, extractErrno(err) + } + defer f.Close() + + if err := fchown(f.FD(), uid, gid); err != nil { + return p9.QID{}, extractErrno(err) + } + stat, err := fstat(f.FD()) + if err != nil { + return p9.QID{}, extractErrno(err) + } + + cu.Release() + return l.attachPoint.makeQID(stat), nil +} + +// Walk implements p9.File. +func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { + // Duplicate current file if 'names' is empty. + if len(names) == 0 { + newFile, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) { + return reopenProcFd(l.file, openFlags|mode) + }) + if err != nil { + return nil, nil, extractErrno(err) + } + + stat, err := fstat(newFile.FD()) + if err != nil { + newFile.Close() + return nil, nil, extractErrno(err) + } + + c := &localFile{ + attachPoint: l.attachPoint, + hostPath: l.hostPath, + file: newFile, + mode: invalidMode, + } + return []p9.QID{l.attachPoint.makeQID(stat)}, c, nil + } + + var qids []p9.QID + last := l + for _, name := range names { + f, path, err := openAnyFileFromParent(last, name) + if last != l { + last.Close() + } + if err != nil { + return nil, nil, extractErrno(err) + } + stat, err := fstat(f.FD()) + if err != nil { + f.Close() + return nil, nil, extractErrno(err) + } + c, err := newLocalFile(last.attachPoint, f, path, stat) + if err != nil { + f.Close() + return nil, nil, extractErrno(err) + } + + qids = append(qids, l.attachPoint.makeQID(stat)) + last = c + } + return qids, last, nil +} + +// StatFS implements p9.File. +func (l *localFile) StatFS() (p9.FSStat, error) { + var s syscall.Statfs_t + if err := syscall.Fstatfs(l.file.FD(), &s); err != nil { + return p9.FSStat{}, extractErrno(err) + } + + // Populate with what's available. + return p9.FSStat{ + Type: uint32(s.Type), + BlockSize: uint32(s.Bsize), + Blocks: s.Blocks, + BlocksFree: s.Bfree, + BlocksAvailable: s.Bavail, + Files: s.Files, + FilesFree: s.Ffree, + NameLength: uint32(s.Namelen), + }, nil +} + +// FSync implements p9.File. +func (l *localFile) FSync() error { + if !l.isOpen() { + return syscall.EBADF + } + if err := syscall.Fsync(l.file.FD()); err != nil { + return extractErrno(err) + } + return nil +} + +// GetAttr implements p9.File. +func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { + stat, err := fstat(l.file.FD()) + if err != nil { + return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err) + } + + attr := p9.Attr{ + Mode: p9.FileMode(stat.Mode), + UID: p9.UID(stat.Uid), + GID: p9.GID(stat.Gid), + NLink: uint64(stat.Nlink), + RDev: stat.Rdev, + Size: uint64(stat.Size), + BlockSize: uint64(stat.Blksize), + Blocks: uint64(stat.Blocks), + ATimeSeconds: uint64(stat.Atim.Sec), + ATimeNanoSeconds: uint64(stat.Atim.Nsec), + MTimeSeconds: uint64(stat.Mtim.Sec), + MTimeNanoSeconds: uint64(stat.Mtim.Nsec), + CTimeSeconds: uint64(stat.Ctim.Sec), + CTimeNanoSeconds: uint64(stat.Ctim.Nsec), + } + valid := p9.AttrMask{ + Mode: true, + UID: true, + GID: true, + NLink: true, + RDev: true, + Size: true, + Blocks: true, + ATime: true, + MTime: true, + CTime: true, + } + + return l.attachPoint.makeQID(stat), valid, attr, nil +} + +// SetAttr implements p9.File. Due to mismatch in file API, options +// cannot be changed atomically and user may see partial changes when +// an error happens. +func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return syscall.EBADF + } + + allowed := p9.SetAttrMask{ + Permissions: true, + UID: true, + GID: true, + Size: true, + ATime: true, + MTime: true, + ATimeNotSystemTime: true, + MTimeNotSystemTime: true, + } + + if valid.Empty() { + // Nothing to do. + return nil + } + + // Handle all the sanity checks up front so that the client gets a + // consistent result that is not attribute dependent. + if !valid.IsSubsetOf(allowed) { + log.Warningf("SetAttr() failed for %q, mask: %v", l.hostPath, valid) + return syscall.EPERM + } + + // Check if it's possible to use cached file, or if another one needs to be + // opened for write. + f := l.file + if l.ft == regular && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { + var err error + f, err = reopenProcFd(l.file, openFlags|os.O_WRONLY) + if err != nil { + return extractErrno(err) + } + defer f.Close() + } + + // The semantics are to either return an error if no changes were made, + // or no error if *all* changes were made. Well, this can be impossible + // if the filesystem rejects at least one of the changes, especially + // since some operations are not easy to undo atomically. + // + // This could be made better if SetAttr actually returned the changes + // it did make, so the client can at least know what has changed. So + // we at least attempt to make all of the changes and return a generic + // error if any of them fails, which at least doesn't bias any change + // over another. + var err error + if valid.Permissions { + if cerr := syscall.Fchmod(f.FD(), uint32(attr.Permissions)); cerr != nil { + log.Debugf("SetAttr fchmod failed %q, err: %v", l.hostPath, cerr) + err = extractErrno(cerr) + } + } + + if valid.Size { + if terr := syscall.Ftruncate(f.FD(), int64(attr.Size)); terr != nil { + log.Debugf("SetAttr ftruncate failed %q, err: %v", l.hostPath, terr) + err = extractErrno(terr) + } + } + + if valid.ATime || valid.MTime { + utimes := [2]syscall.Timespec{ + {Sec: 0, Nsec: linux.UTIME_OMIT}, + {Sec: 0, Nsec: linux.UTIME_OMIT}, + } + if valid.ATime { + if valid.ATimeNotSystemTime { + utimes[0].Sec = int64(attr.ATimeSeconds) + utimes[0].Nsec = int64(attr.ATimeNanoSeconds) + } else { + utimes[0].Nsec = linux.UTIME_NOW + } + } + if valid.MTime { + if valid.MTimeNotSystemTime { + utimes[1].Sec = int64(attr.MTimeSeconds) + utimes[1].Nsec = int64(attr.MTimeNanoSeconds) + } else { + utimes[1].Nsec = linux.UTIME_NOW + } + } + + if l.ft == symlink { + // utimensat operates different that other syscalls. To operate on a + // symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty + // name. + parent, err := syscall.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0) + if err != nil { + return extractErrno(err) + } + defer syscall.Close(parent) + + if terr := utimensat(parent, path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil { + log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr) + err = extractErrno(terr) + } + } else { + // Directories and regular files can operate directly on the fd + // using empty name. + if terr := utimensat(f.FD(), "", utimes, 0); terr != nil { + log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr) + err = extractErrno(terr) + } + } + } + + if valid.UID || valid.GID { + uid := -1 + if valid.UID { + uid = int(attr.UID) + } + gid := -1 + if valid.GID { + gid = int(attr.GID) + } + if oerr := syscall.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil { + log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oerr) + err = extractErrno(oerr) + } + } + + return err +} + +func (*localFile) GetXattr(string, uint64) (string, error) { + return "", syscall.EOPNOTSUPP +} + +func (*localFile) SetXattr(string, string, uint32) error { + return syscall.EOPNOTSUPP +} + +func (*localFile) ListXattr(uint64) (map[string]struct{}, error) { + return nil, syscall.EOPNOTSUPP +} + +func (*localFile) RemoveXattr(string) error { + return syscall.EOPNOTSUPP +} + +// Allocate implements p9.File. +func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error { + if !l.isOpen() { + return syscall.EBADF + } + + if err := syscall.Fallocate(l.file.FD(), mode.ToLinux(), int64(offset), int64(length)); err != nil { + return extractErrno(err) + } + return nil +} + +// Rename implements p9.File; this should never be called. +func (*localFile) Rename(p9.File, string) error { + panic("rename called directly") +} + +// RenameAt implements p9.File.RenameAt. +func (l *localFile) RenameAt(oldName string, directory p9.File, newName string) error { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return syscall.EBADF + } + + newParent := directory.(*localFile) + if err := renameat(l.file.FD(), oldName, newParent.file.FD(), newName); err != nil { + return extractErrno(err) + } + return nil +} + +// ReadAt implements p9.File. +func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) { + if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { + return 0, syscall.EBADF + } + if !l.isOpen() { + return 0, syscall.EBADF + } + + r, err := l.file.ReadAt(p, int64(offset)) + switch err { + case nil, io.EOF: + return r, nil + default: + return r, extractErrno(err) + } +} + +// WriteAt implements p9.File. +func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) { + if l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { + return 0, syscall.EBADF + } + if !l.isOpen() { + return 0, syscall.EBADF + } + + w, err := l.file.WriteAt(p, int64(offset)) + if err != nil { + return w, extractErrno(err) + } + return w, nil +} + +// Symlink implements p9.File. +func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return p9.QID{}, syscall.EBADF + } + + if err := unix.Symlinkat(target, l.file.FD(), newName); err != nil { + return p9.QID{}, extractErrno(err) + } + cu := cleanup.Make(func() { + // Best effort attempt to remove the symlink in case of failure. + if err := syscall.Unlinkat(l.file.FD(), newName); err != nil { + log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), err) + } + }) + defer cu.Clean() + + // Open symlink to change ownership and stat it. + f, err := fd.OpenAt(l.file, newName, unix.O_PATH|openFlags, 0) + if err != nil { + return p9.QID{}, extractErrno(err) + } + defer f.Close() + + if err := fchown(f.FD(), uid, gid); err != nil { + return p9.QID{}, extractErrno(err) + } + stat, err := fstat(f.FD()) + if err != nil { + return p9.QID{}, extractErrno(err) + } + + cu.Release() + return l.attachPoint.makeQID(stat), nil +} + +// Link implements p9.File. +func (l *localFile) Link(target p9.File, newName string) error { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return syscall.EBADF + } + + targetFile := target.(*localFile) + if err := unix.Linkat(targetFile.file.FD(), "", l.file.FD(), newName, linux.AT_EMPTY_PATH); err != nil { + return extractErrno(err) + } + return nil +} + +// Mknod implements p9.File. +func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid p9.UID, gid p9.GID) (p9.QID, error) { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return p9.QID{}, syscall.EROFS + } + + hostPath := path.Join(l.hostPath, name) + + // Return EEXIST if the file already exists. + if _, err := stat(hostPath); err == nil { + return p9.QID{}, syscall.EEXIST + } + + // From mknod(2) man page: + // "EPERM: [...] if the filesystem containing pathname does not support + // the type of node requested." + if mode.FileType() != p9.ModeRegular { + return p9.QID{}, syscall.EPERM + } + + // Allow Mknod to create regular files. + if err := syscall.Mknod(hostPath, uint32(mode), 0); err != nil { + return p9.QID{}, err + } + + stat, err := stat(hostPath) + if err != nil { + return p9.QID{}, extractErrno(err) + } + return l.attachPoint.makeQID(stat), nil +} + +// UnlinkAt implements p9.File. +func (l *localFile) UnlinkAt(name string, flags uint32) error { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return syscall.EBADF + } + + if err := unix.Unlinkat(l.file.FD(), name, int(flags)); err != nil { + return extractErrno(err) + } + return nil +} + +// Readdir implements p9.File. +func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { + if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { + return nil, syscall.EBADF + } + if !l.isOpen() { + return nil, syscall.EBADF + } + + // Readdirnames is a cursor over directories, so seek back to 0 to ensure it's + // reading all directory contents. Take a lock because this operation is + // stateful. + l.readDirMu.Lock() + defer l.readDirMu.Unlock() + + skip := uint64(0) + + // Check if the file is at the correct position already. If not, seek to the + // beginning and read the entire directory again. + if l.lastDirentOffset != offset { + if _, err := syscall.Seek(l.file.FD(), 0, 0); err != nil { + return nil, extractErrno(err) + } + skip = offset + } + + dirents, err := l.readDirent(l.file.FD(), offset, count, skip) + if err == nil { + // On success, remember the offset that was returned at the current + // position. + l.lastDirentOffset = offset + uint64(len(dirents)) + } else { + // On failure, the state is unknown, force call to seek() next time. + l.lastDirentOffset = math.MaxUint64 + } + return dirents, err +} + +func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64) ([]p9.Dirent, error) { + var dirents []p9.Dirent + + // Limit 'count' to cap the slice size that is returned. + const maxCount = 100000 + if count > maxCount { + count = maxCount + } + + // Pre-allocate buffers that will be reused to get partial results. + direntsBuf := make([]byte, 8192) + names := make([]string, 0, 100) + + end := offset + uint64(count) + for offset < end { + dirSize, err := syscall.ReadDirent(f, direntsBuf) + if err != nil { + return dirents, err + } + if dirSize <= 0 { + return dirents, nil + } + + names := names[:0] + _, _, names = syscall.ParseDirent(direntsBuf[:dirSize], -1, names) + + // Skip over entries that the caller is not interested in. + if skip > 0 { + if skip > uint64(len(names)) { + skip -= uint64(len(names)) + names = names[:0] + } else { + names = names[skip:] + skip = 0 + } + } + for _, name := range names { + stat, err := statAt(l.file.FD(), name) + if err != nil { + log.Warningf("Readdir is skipping file with failed stat %q, err: %v", l.hostPath, err) + continue + } + qid := l.attachPoint.makeQID(stat) + offset++ + dirents = append(dirents, p9.Dirent{ + QID: qid, + Type: qid.Type, + Name: name, + Offset: offset, + }) + } + } + return dirents, nil +} + +// Readlink implements p9.File. +func (l *localFile) Readlink() (string, error) { + // Shamelessly stolen from os.Readlink (added upper bound limit to buffer). + const limit = 1024 * 1024 + for len := 128; len < limit; len *= 2 { + b := make([]byte, len) + n, err := unix.Readlinkat(l.file.FD(), "", b) + if err != nil { + return "", extractErrno(err) + } + if n < len { + return string(b[:n]), nil + } + } + return "", syscall.ENOMEM +} + +// Flush implements p9.File. +func (l *localFile) Flush() error { + return nil +} + +// Connect implements p9.File. +func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) { + if !l.attachPoint.conf.HostUDS { + return nil, syscall.ECONNREFUSED + } + + // TODO(gvisor.dev/issue/1003): Due to different app vs replacement + // mappings, the app path may have fit in the sockaddr, but we can't + // fit f.path in our sockaddr. We'd need to redirect through a shorter + // path in order to actually connect to this socket. + if len(l.hostPath) > linux.UnixPathMax { + return nil, syscall.ECONNREFUSED + } + + var stype int + switch flags { + case p9.StreamSocket: + stype = syscall.SOCK_STREAM + case p9.DgramSocket: + stype = syscall.SOCK_DGRAM + case p9.SeqpacketSocket: + stype = syscall.SOCK_SEQPACKET + default: + return nil, syscall.ENXIO + } + + f, err := syscall.Socket(syscall.AF_UNIX, stype, 0) + if err != nil { + return nil, err + } + + if err := syscall.SetNonblock(f, true); err != nil { + syscall.Close(f) + return nil, err + } + + sa := syscall.SockaddrUnix{Name: l.hostPath} + if err := syscall.Connect(f, &sa); err != nil { + syscall.Close(f) + return nil, err + } + + return fd.New(f), nil +} + +// Close implements p9.File. +func (l *localFile) Close() error { + l.mode = invalidMode + err := l.file.Close() + l.file = nil + return err +} + +func (l *localFile) isOpen() bool { + return l.mode != invalidMode +} + +// Renamed implements p9.Renamed. +func (l *localFile) Renamed(newDir p9.File, newName string) { + l.hostPath = path.Join(newDir.(*localFile).hostPath, newName) +} + +// extractErrno tries to determine the errno. +func extractErrno(err error) syscall.Errno { + if err == nil { + // This should never happen. The likely result will be that + // some user gets the frustrating "error: SUCCESS" message. + log.Warningf("extractErrno called with nil error!") + return 0 + } + + switch err { + case os.ErrNotExist: + return syscall.ENOENT + case os.ErrExist: + return syscall.EEXIST + case os.ErrPermission: + return syscall.EACCES + case os.ErrInvalid: + return syscall.EINVAL + } + + // See if it's an errno or a common wrapped error. + switch e := err.(type) { + case syscall.Errno: + return e + case *os.PathError: + return extractErrno(e.Err) + case *os.LinkError: + return extractErrno(e.Err) + case *os.SyscallError: + return extractErrno(e.Err) + } + + // Fall back to EIO. + log.Debugf("Unknown error: %v, defaulting to EIO", err) + return syscall.EIO +} |