summaryrefslogtreecommitdiffhomepage
path: root/runsc/fsgofer
diff options
context:
space:
mode:
Diffstat (limited to 'runsc/fsgofer')
-rw-r--r--runsc/fsgofer/filter/config.go182
-rw-r--r--runsc/fsgofer/filter/extra_filters.go28
-rw-r--r--runsc/fsgofer/filter/extra_filters_msan.go33
-rw-r--r--runsc/fsgofer/filter/extra_filters_race.go42
-rw-r--r--runsc/fsgofer/filter/filter.go33
-rw-r--r--runsc/fsgofer/fsgofer.go1057
-rw-r--r--runsc/fsgofer/fsgofer_unsafe.go107
7 files changed, 1482 insertions, 0 deletions
diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go
new file mode 100644
index 000000000..4faab2946
--- /dev/null
+++ b/runsc/fsgofer/filter/config.go
@@ -0,0 +1,182 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package filter
+
+import (
+ "os"
+ "syscall"
+
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+)
+
+// allowedSyscalls is the set of syscalls executed by the gofer.
+var allowedSyscalls = seccomp.SyscallRules{
+ syscall.SYS_ACCEPT: {},
+ syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+ {seccomp.AllowValue(linux.ARCH_GET_FS)},
+ {seccomp.AllowValue(linux.ARCH_SET_FS)},
+ },
+ syscall.SYS_CLOCK_GETTIME: {},
+ syscall.SYS_CLONE: []seccomp.Rule{
+ {
+ seccomp.AllowValue(
+ syscall.CLONE_VM |
+ syscall.CLONE_FS |
+ syscall.CLONE_FILES |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_SYSVSEM |
+ syscall.CLONE_THREAD),
+ },
+ },
+ syscall.SYS_CLOSE: {},
+ syscall.SYS_DUP: {},
+ syscall.SYS_EPOLL_CTL: {},
+ syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_EVENTFD2: []seccomp.Rule{
+ {
+ seccomp.AllowValue(0),
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_EXIT: {},
+ syscall.SYS_EXIT_GROUP: {},
+ syscall.SYS_FALLOCATE: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_FCHMOD: {},
+ syscall.SYS_FCHOWNAT: {},
+ syscall.SYS_FCNTL: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.F_GETFL),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.F_SETFL),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.F_GETFD),
+ },
+ },
+ syscall.SYS_FSTAT: {},
+ syscall.SYS_FSTATFS: {},
+ syscall.SYS_FSYNC: {},
+ syscall.SYS_FTRUNCATE: {},
+ syscall.SYS_FUTEX: {
+ seccomp.Rule{
+ seccomp.AllowAny{},
+ seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(0),
+ },
+ seccomp.Rule{
+ seccomp.AllowAny{},
+ seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_GETDENTS64: {},
+ syscall.SYS_GETPID: {},
+ unix.SYS_GETRANDOM: {},
+ syscall.SYS_GETTID: {},
+ syscall.SYS_GETTIMEOFDAY: {},
+ syscall.SYS_LINKAT: {},
+ syscall.SYS_LSEEK: {},
+ syscall.SYS_MADVISE: {},
+ syscall.SYS_MKDIRAT: {},
+ syscall.SYS_MMAP: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MAP_SHARED),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
+ },
+ },
+ syscall.SYS_MPROTECT: {},
+ syscall.SYS_MUNMAP: {},
+ syscall.SYS_NANOSLEEP: {},
+ syscall.SYS_NEWFSTATAT: {},
+ syscall.SYS_OPENAT: {},
+ syscall.SYS_POLL: {},
+ syscall.SYS_PREAD64: {},
+ syscall.SYS_PWRITE64: {},
+ syscall.SYS_READ: {},
+ syscall.SYS_READLINKAT: {},
+ syscall.SYS_RECVMSG: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
+ },
+ },
+ syscall.SYS_RENAMEAT: {},
+ syscall.SYS_RESTART_SYSCALL: {},
+ syscall.SYS_RT_SIGPROCMASK: {},
+ syscall.SYS_SCHED_YIELD: {},
+ syscall.SYS_SENDMSG: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
+ },
+ },
+ syscall.SYS_SHUTDOWN: []seccomp.Rule{
+ {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
+ },
+ syscall.SYS_SIGALTSTACK: {},
+ syscall.SYS_SYMLINKAT: {},
+ syscall.SYS_TGKILL: []seccomp.Rule{
+ {
+ seccomp.AllowValue(uint64(os.Getpid())),
+ },
+ },
+ syscall.SYS_UNLINKAT: {},
+ syscall.SYS_UTIMENSAT: {},
+ syscall.SYS_WRITE: {},
+}
diff --git a/runsc/fsgofer/filter/extra_filters.go b/runsc/fsgofer/filter/extra_filters.go
new file mode 100644
index 000000000..5c5ec4e06
--- /dev/null
+++ b/runsc/fsgofer/filter/extra_filters.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !msan,!race
+
+package filter
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters returns additional filters for syscalls used by
+// Go intrumentation tools, e.g. -race, -msan.
+// Returns empty when disabled.
+func instrumentationFilters() seccomp.SyscallRules {
+ return nil
+}
diff --git a/runsc/fsgofer/filter/extra_filters_msan.go b/runsc/fsgofer/filter/extra_filters_msan.go
new file mode 100644
index 000000000..553060bc3
--- /dev/null
+++ b/runsc/fsgofer/filter/extra_filters_msan.go
@@ -0,0 +1,33 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build msan
+
+package filter
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters returns additional filters for syscalls used by MSAN.
+func instrumentationFilters() seccomp.SyscallRules {
+ log.Warningf("*** SECCOMP WARNING: MSAN is enabled: syscall filters less restrictive!")
+ return seccomp.SyscallRules{
+ syscall.SYS_SCHED_GETAFFINITY: {},
+ syscall.SYS_SET_ROBUST_LIST: {},
+ }
+}
diff --git a/runsc/fsgofer/filter/extra_filters_race.go b/runsc/fsgofer/filter/extra_filters_race.go
new file mode 100644
index 000000000..28555f898
--- /dev/null
+++ b/runsc/fsgofer/filter/extra_filters_race.go
@@ -0,0 +1,42 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build race
+
+package filter
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters returns additional filters for syscalls used by TSAN.
+func instrumentationFilters() seccomp.SyscallRules {
+ log.Warningf("*** SECCOMP WARNING: TSAN is enabled: syscall filters less restrictive!")
+ return seccomp.SyscallRules{
+ syscall.SYS_BRK: {},
+ syscall.SYS_CLONE: {},
+ syscall.SYS_FUTEX: {},
+ syscall.SYS_MADVISE: {},
+ syscall.SYS_MMAP: {},
+ syscall.SYS_MUNLOCK: {},
+ syscall.SYS_NANOSLEEP: {},
+ syscall.SYS_OPEN: {},
+ syscall.SYS_SET_ROBUST_LIST: {},
+ // Used within glibc's malloc.
+ syscall.SYS_TIME: {},
+ }
+}
diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go
new file mode 100644
index 000000000..ff8154369
--- /dev/null
+++ b/runsc/fsgofer/filter/filter.go
@@ -0,0 +1,33 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package filter defines all syscalls the gofer is allowed to make, and
+// installs seccomp filters to prevent prohibited syscalls in case it's
+// compromised.
+package filter
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+)
+
+// Install installs seccomp filters.
+func Install() error {
+ s := allowedSyscalls
+
+ // Set of additional filters used by -race and -msan. Returns empty
+ // when not enabled.
+ s.Merge(instrumentationFilters())
+
+ return seccomp.Install(s)
+}
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
new file mode 100644
index 000000000..2cf50290a
--- /dev/null
+++ b/runsc/fsgofer/fsgofer.go
@@ -0,0 +1,1057 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fsgofer implements p9.File giving access to local files using
+// a simple mapping from a path prefix that is added to the path requested
+// by the sandbox. Ex:
+//
+// prefix: "/docker/imgs/alpine"
+// app path: /bin/ls => /docker/imgs/alpine/bin/ls
+package fsgofer
+
+import (
+ "fmt"
+ "io"
+ "math"
+ "os"
+ "path"
+ "path/filepath"
+ "runtime"
+ "sync"
+ "syscall"
+
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/fd"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/p9"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+const (
+ // invalidMode is set to a value that doesn't match any other valid
+ // modes to ensure an unopened/closed file fails all mode checks.
+ invalidMode = p9.OpenFlags(math.MaxUint32)
+
+ openFlags = syscall.O_NOFOLLOW | syscall.O_CLOEXEC
+)
+
+type fileType int
+
+const (
+ regular fileType = iota
+ directory
+ symlink
+ unknown
+)
+
+// String implements fmt.Stringer.
+func (f fileType) String() string {
+ switch f {
+ case regular:
+ return "regular"
+ case directory:
+ return "directory"
+ case symlink:
+ return "symlink"
+ }
+ return "unknown"
+}
+
+// ControlSocketAddr generates an abstract unix socket name for the given id.
+func ControlSocketAddr(id string) string {
+ return fmt.Sprintf("\x00runsc-gofer.%s", id)
+}
+
+// Config sets configuration options for each attach point.
+type Config struct {
+ // ROMount is set to true if this is a readonly mount.
+ ROMount bool
+
+ // PanicOnWrite panics on attempts to write to RO mounts.
+ PanicOnWrite bool
+}
+
+type attachPoint struct {
+ prefix string
+ conf Config
+
+ // attachedMu protects attached.
+ attachedMu sync.Mutex
+ attached bool
+
+ // deviceMu protects devices and nextDevice.
+ deviceMu sync.Mutex
+
+ // nextDevice is the next device id that will be allocated.
+ nextDevice uint8
+
+ // devices is a map from actual host devices to "small" integers that
+ // can be combined with host inode to form a unique virtual inode id.
+ devices map[uint64]uint8
+}
+
+// NewAttachPoint creates a new attacher that gives local file
+// access to all files under 'prefix'. 'prefix' must be an absolute path.
+func NewAttachPoint(prefix string, c Config) (p9.Attacher, error) {
+ // Sanity check the prefix.
+ if !filepath.IsAbs(prefix) {
+ return nil, fmt.Errorf("attach point prefix must be absolute %q", prefix)
+ }
+ return &attachPoint{
+ prefix: prefix,
+ conf: c,
+ devices: make(map[uint64]uint8),
+ }, nil
+}
+
+// Attach implements p9.Attacher.
+func (a *attachPoint) Attach() (p9.File, error) {
+ // dirFD (1st argument) is ignored because 'prefix' is always absolute.
+ stat, err := statAt(-1, a.prefix)
+ if err != nil {
+ return nil, fmt.Errorf("stat file %q, err: %v", a.prefix, err)
+ }
+ mode := syscall.O_RDWR
+ if a.conf.ROMount || stat.Mode&syscall.S_IFDIR != 0 {
+ mode = syscall.O_RDONLY
+ }
+
+ // Open the root directory.
+ f, err := fd.Open(a.prefix, openFlags|mode, 0)
+ if err != nil {
+ return nil, fmt.Errorf("unable to open file %q, err: %v", a.prefix, err)
+ }
+
+ a.attachedMu.Lock()
+ defer a.attachedMu.Unlock()
+ if a.attached {
+ f.Close()
+ return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix)
+ }
+ a.attached = true
+
+ return newLocalFile(a, f, a.prefix, stat)
+}
+
+// makeQID returns a unique QID for the given stat buffer.
+func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
+ a.deviceMu.Lock()
+ defer a.deviceMu.Unlock()
+
+ // First map the host device id to a unique 8-bit integer.
+ dev, ok := a.devices[stat.Dev]
+ if !ok {
+ a.devices[stat.Dev] = a.nextDevice
+ dev = a.nextDevice
+ a.nextDevice++
+ if a.nextDevice < dev {
+ panic(fmt.Sprintf("device id overflow! map: %+v", a.devices))
+ }
+ }
+
+ // Construct a "virtual" inode id with the uint8 device number in the
+ // first 8 bits, and the rest of the bits from the host inode id.
+ maskedIno := stat.Ino & 0x00ffffffffffffff
+ if maskedIno != stat.Ino {
+ log.Warningf("first 8 bytes of host inode id %x will be truncated to construct virtual inode id", stat.Ino)
+ }
+ ino := uint64(dev)<<56 | maskedIno
+ log.Debugf("host inode %x on device %x mapped to virtual inode %x", stat.Ino, stat.Dev, ino)
+
+ return p9.QID{
+ Type: p9.FileMode(stat.Mode).QIDType(),
+ Path: ino,
+ }
+}
+
+// localFile implements p9.File wrapping a local file. The underlying file
+// is opened during Walk() and stored in 'file' to be used with other
+// operations. The file is opened as readonly, unless it's a symlink or there is
+// no read access, which requires O_PATH. 'file' is dup'ed when Walk(nil) is
+// called to clone the file. This reduces the number of walks that need to be
+// done by the host file system when files are reused.
+//
+// The file may be reopened if the requested mode in Open() is not a subset of
+// current mode. Consequently, 'file' could have a mode wider than requested and
+// must be verified before read/write operations. Before the file is opened and
+// after it's closed, 'mode' is set to an invalid value to prevent an unopened
+// file from being used.
+//
+// The reason that the file is not opened initially as read-write is for better
+// performance with 'overlay2' storage driver. overlay2 eagerly copies the
+// entire file up when it's opened in write mode, and would perform badly when
+type localFile struct {
+ p9.DefaultWalkGetAttr
+
+ // attachPoint is the attachPoint that serves this localFile.
+ attachPoint *attachPoint
+
+ // hostPath will be safely updated by the Renamed hook.
+ hostPath string
+
+ // file is opened when localFile is created and it's never nil. It may be
+ // reopened if the Open() mode is wider than the mode the file was originally
+ // opened with.
+ file *fd.FD
+
+ // mode is the mode in which the file was opened. Set to invalidMode
+ // if localFile isn't opened.
+ mode p9.OpenFlags
+
+ // ft is the fileType for this file.
+ ft fileType
+
+ // readDirMu protects against concurrent Readdir calls.
+ readDirMu sync.Mutex
+
+ // lastDirentOffset is the last offset returned by Readdir(). If another call
+ // to Readdir is made at the same offset, the file doesn't need to be
+ // repositioned. This is an important optimization because the caller must
+ // always make one extra call to detect EOF (empty result, no error).
+ lastDirentOffset uint64
+}
+
+func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, error) {
+ path := path.Join(parent.hostPath, name)
+ f, err := openAnyFile(path, func(mode int) (*fd.FD, error) {
+ return fd.OpenAt(parent.file, name, openFlags|mode, 0)
+ })
+ return f, path, err
+}
+
+// openAnyFile attempts to open the file in O_RDONLY and if it fails fallsback
+// to O_PATH. 'path' is used for logging messages only. 'fn' is what does the
+// actual file open and is customizable by the caller.
+func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) {
+ // Attempt to open file in the following mode in order:
+ // 1. RDONLY | NONBLOCK: for all files, works for directories and ro mounts too.
+ // Use non-blocking to prevent getting stuck inside open(2) for FIFOs. This option
+ // has no effect on regular files.
+ // 2. PATH: for symlinks
+ modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH}
+
+ var err error
+ var file *fd.FD
+ for i, mode := range modes {
+ file, err = fn(mode)
+ if err == nil {
+ // openat succeeded, we're done.
+ break
+ }
+ switch e := extractErrno(err); e {
+ case syscall.ENOENT:
+ // File doesn't exist, no point in retrying.
+ return nil, e
+ }
+ // openat failed. Try again with next mode, preserving 'err' in case this
+ // was the last attempt.
+ log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|mode, path, err)
+ }
+ if err != nil {
+ // All attempts to open file have failed, return the last error.
+ log.Debugf("Failed to open file, path: %q, err: %v", path, err)
+ return nil, extractErrno(err)
+ }
+
+ return file, nil
+}
+
+func getSupportedFileType(stat syscall.Stat_t) (fileType, error) {
+ var ft fileType
+ switch stat.Mode & syscall.S_IFMT {
+ case syscall.S_IFREG:
+ ft = regular
+ case syscall.S_IFDIR:
+ ft = directory
+ case syscall.S_IFLNK:
+ ft = symlink
+ default:
+ return unknown, syscall.EPERM
+ }
+ return ft, nil
+}
+
+func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) (*localFile, error) {
+ ft, err := getSupportedFileType(stat)
+ if err != nil {
+ return nil, err
+ }
+
+ return &localFile{
+ attachPoint: a,
+ hostPath: path,
+ file: file,
+ mode: invalidMode,
+ ft: ft,
+ }, nil
+}
+
+// newFDMaybe creates a fd.FD from a file, dup'ing the FD and setting it as
+// non-blocking. If anything fails, returns nil. It's better to have a file
+// without host FD, than to fail the operation.
+func newFDMaybe(file *fd.FD) *fd.FD {
+ dupFD, err := syscall.Dup(file.FD())
+ // Technically, the runtime may call the finalizer on file as soon as
+ // FD() returns.
+ runtime.KeepAlive(file)
+ if err != nil {
+ return nil
+ }
+ dup := fd.New(dupFD)
+
+ // fd is blocking; non-blocking is required.
+ if err := syscall.SetNonblock(dup.FD(), true); err != nil {
+ dup.Close()
+ return nil
+ }
+ return dup
+}
+
+func stat(fd int) (syscall.Stat_t, error) {
+ var stat syscall.Stat_t
+ if err := syscall.Fstat(fd, &stat); err != nil {
+ return syscall.Stat_t{}, err
+ }
+ return stat, nil
+}
+
+func fchown(fd int, uid p9.UID, gid p9.GID) error {
+ return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW)
+}
+
+// Open implements p9.File.
+func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
+ if l.isOpen() {
+ panic(fmt.Sprintf("attempting to open already opened file: %q", l.hostPath))
+ }
+
+ // Check if control file can be used or if a new open must be created.
+ var newFile *fd.FD
+ if mode == p9.ReadOnly {
+ log.Debugf("Open reusing control file, mode: %v, %q", mode, l.hostPath)
+ newFile = l.file
+ } else {
+ // Ideally reopen would call name_to_handle_at (with empty name) and
+ // open_by_handle_at to reopen the file without using 'hostPath'. However,
+ // name_to_handle_at and open_by_handle_at aren't supported by overlay2.
+ log.Debugf("Open reopening file, mode: %v, %q", mode, l.hostPath)
+ var err error
+ newFile, err = fd.Open(l.hostPath, openFlags|mode.OSFlags(), 0)
+ if err != nil {
+ return nil, p9.QID{}, 0, extractErrno(err)
+ }
+ }
+
+ stat, err := stat(newFile.FD())
+ if err != nil {
+ if newFile != l.file {
+ newFile.Close()
+ }
+ return nil, p9.QID{}, 0, extractErrno(err)
+ }
+
+ var fd *fd.FD
+ if stat.Mode&syscall.S_IFMT == syscall.S_IFREG {
+ // Donate FD for regular files only.
+ fd = newFDMaybe(newFile)
+ }
+
+ // Close old file in case a new one was created.
+ if newFile != l.file {
+ if err := l.file.Close(); err != nil {
+ log.Warningf("Error closing file %q: %v", l.hostPath, err)
+ }
+ l.file = newFile
+ }
+ l.mode = mode
+ return fd, l.attachPoint.makeQID(stat), 0, nil
+}
+
+// Create implements p9.File.
+func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) {
+ conf := l.attachPoint.conf
+ if conf.ROMount {
+ if conf.PanicOnWrite {
+ panic("attempt to write to RO mount")
+ }
+ return nil, nil, p9.QID{}, 0, syscall.EBADF
+ }
+
+ // 'file' may be used for other operations (e.g. Walk), so read access is
+ // always added to flags. Note that resulting file might have a wider mode
+ // than needed for each particular case.
+ flags := openFlags | syscall.O_CREAT | syscall.O_EXCL
+ if mode == p9.WriteOnly {
+ flags |= syscall.O_RDWR
+ } else {
+ flags |= mode.OSFlags()
+ }
+
+ child, err := fd.OpenAt(l.file, name, flags, uint32(perm.Permissions()))
+ if err != nil {
+ return nil, nil, p9.QID{}, 0, extractErrno(err)
+ }
+ cu := specutils.MakeCleanup(func() {
+ child.Close()
+ // Best effort attempt to remove the file in case of failure.
+ if err := syscall.Unlinkat(l.file.FD(), name); err != nil {
+ log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err)
+ }
+ })
+ defer cu.Clean()
+
+ if err := fchown(child.FD(), uid, gid); err != nil {
+ return nil, nil, p9.QID{}, 0, extractErrno(err)
+ }
+ stat, err := stat(child.FD())
+ if err != nil {
+ return nil, nil, p9.QID{}, 0, extractErrno(err)
+ }
+
+ c := &localFile{
+ attachPoint: l.attachPoint,
+ hostPath: path.Join(l.hostPath, name),
+ file: child,
+ mode: mode,
+ }
+
+ cu.Release()
+ return newFDMaybe(c.file), c, l.attachPoint.makeQID(stat), 0, nil
+}
+
+// Mkdir implements p9.File.
+func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) {
+ conf := l.attachPoint.conf
+ if conf.ROMount {
+ if conf.PanicOnWrite {
+ panic("attempt to write to RO mount")
+ }
+ return p9.QID{}, syscall.EBADF
+ }
+
+ if err := syscall.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil {
+ return p9.QID{}, extractErrno(err)
+ }
+ cu := specutils.MakeCleanup(func() {
+ // Best effort attempt to remove the dir in case of failure.
+ if err := unix.Unlinkat(l.file.FD(), name, unix.AT_REMOVEDIR); err != nil {
+ log.Warningf("error unlinking dir %q after failure: %v", path.Join(l.hostPath, name), err)
+ }
+ })
+ defer cu.Clean()
+
+ // Open directory to change ownership and stat it.
+ flags := syscall.O_DIRECTORY | syscall.O_RDONLY | openFlags
+ f, err := fd.OpenAt(l.file, name, flags, 0)
+ if err != nil {
+ return p9.QID{}, extractErrno(err)
+ }
+ defer f.Close()
+
+ if err := fchown(f.FD(), uid, gid); err != nil {
+ return p9.QID{}, extractErrno(err)
+ }
+ stat, err := stat(f.FD())
+ if err != nil {
+ return p9.QID{}, extractErrno(err)
+ }
+
+ cu.Release()
+ return l.attachPoint.makeQID(stat), nil
+}
+
+// Walk implements p9.File.
+func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) {
+ // Duplicate current file if 'names' is empty.
+ if len(names) == 0 {
+ newFile, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) {
+ return fd.Open(l.hostPath, openFlags|mode, 0)
+ })
+ if err != nil {
+ return nil, nil, extractErrno(err)
+ }
+
+ stat, err := stat(newFile.FD())
+ if err != nil {
+ newFile.Close()
+ return nil, nil, extractErrno(err)
+ }
+
+ c := &localFile{
+ attachPoint: l.attachPoint,
+ hostPath: l.hostPath,
+ file: newFile,
+ mode: invalidMode,
+ }
+ return []p9.QID{l.attachPoint.makeQID(stat)}, c, nil
+ }
+
+ var qids []p9.QID
+ last := l
+ for _, name := range names {
+ f, path, err := openAnyFileFromParent(last, name)
+ if last != l {
+ last.Close()
+ }
+ if err != nil {
+ return nil, nil, extractErrno(err)
+ }
+ stat, err := stat(f.FD())
+ if err != nil {
+ f.Close()
+ return nil, nil, extractErrno(err)
+ }
+ c, err := newLocalFile(last.attachPoint, f, path, stat)
+ if err != nil {
+ f.Close()
+ return nil, nil, extractErrno(err)
+ }
+
+ qids = append(qids, l.attachPoint.makeQID(stat))
+ last = c
+ }
+ return qids, last, nil
+}
+
+// StatFS implements p9.File.
+func (l *localFile) StatFS() (p9.FSStat, error) {
+ var s syscall.Statfs_t
+ if err := syscall.Fstatfs(l.file.FD(), &s); err != nil {
+ return p9.FSStat{}, extractErrno(err)
+ }
+
+ // Populate with what's available.
+ return p9.FSStat{
+ Type: uint32(s.Type),
+ BlockSize: uint32(s.Bsize),
+ Blocks: s.Blocks,
+ BlocksFree: s.Bfree,
+ BlocksAvailable: s.Bavail,
+ Files: s.Files,
+ FilesFree: s.Ffree,
+ NameLength: uint32(s.Namelen),
+ }, nil
+}
+
+// FSync implements p9.File.
+func (l *localFile) FSync() error {
+ if !l.isOpen() {
+ return syscall.EBADF
+ }
+ if err := syscall.Fsync(l.file.FD()); err != nil {
+ return extractErrno(err)
+ }
+ return nil
+}
+
+// GetAttr implements p9.File.
+func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) {
+ stat, err := stat(l.file.FD())
+ if err != nil {
+ return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err)
+ }
+
+ attr := p9.Attr{
+ Mode: p9.FileMode(stat.Mode),
+ UID: p9.UID(stat.Uid),
+ GID: p9.GID(stat.Gid),
+ NLink: stat.Nlink,
+ RDev: stat.Rdev,
+ Size: uint64(stat.Size),
+ BlockSize: uint64(stat.Blksize),
+ Blocks: uint64(stat.Blocks),
+ ATimeSeconds: uint64(stat.Atim.Sec),
+ ATimeNanoSeconds: uint64(stat.Atim.Nsec),
+ MTimeSeconds: uint64(stat.Mtim.Sec),
+ MTimeNanoSeconds: uint64(stat.Mtim.Nsec),
+ CTimeSeconds: uint64(stat.Ctim.Sec),
+ CTimeNanoSeconds: uint64(stat.Ctim.Nsec),
+ }
+ valid := p9.AttrMask{
+ Mode: true,
+ UID: true,
+ GID: true,
+ NLink: true,
+ RDev: true,
+ Size: true,
+ Blocks: true,
+ ATime: true,
+ MTime: true,
+ CTime: true,
+ }
+
+ return l.attachPoint.makeQID(stat), valid, attr, nil
+}
+
+// SetAttr implements p9.File. Due to mismatch in file API, options
+// cannot be changed atomicaly and user may see partial changes when
+// an error happens.
+func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
+ conf := l.attachPoint.conf
+ if conf.ROMount {
+ if conf.PanicOnWrite {
+ panic("attempt to write to RO mount")
+ }
+ return syscall.EBADF
+ }
+
+ allowed := p9.SetAttrMask{
+ Permissions: true,
+ UID: true,
+ GID: true,
+ Size: true,
+ ATime: true,
+ MTime: true,
+ ATimeNotSystemTime: true,
+ MTimeNotSystemTime: true,
+ }
+
+ if valid.Empty() {
+ // Nothing to do.
+ return nil
+ }
+
+ // Handle all the sanity checks up front so that the client gets a
+ // consistent result that is not attribute dependent.
+ if !valid.IsSubsetOf(allowed) {
+ log.Warningf("SetAttr() failed for %q, mask: %v", l.hostPath, valid)
+ return syscall.EPERM
+ }
+
+ // Check if it's possible to use cached file, or if another one needs to be
+ // opened for write.
+ f := l.file
+ if l.ft == regular && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
+ var err error
+ f, err = fd.Open(l.hostPath, openFlags|syscall.O_WRONLY, 0)
+ if err != nil {
+ return extractErrno(err)
+ }
+ defer f.Close()
+ }
+
+ // The semantics are to either return an error if no changes were made,
+ // or no error if *all* changes were made. Well, this can be impossible
+ // if the filesystem rejects at least one of the changes, especially
+ // since some operations are not easy to undo atomically.
+ //
+ // This could be made better if SetAttr actually returned the changes
+ // it did make, so the client can at least know what has changed. So
+ // we at least attempt to make all of the changes and return a generic
+ // error if any of them fails, which at least doesn't bias any change
+ // over another.
+ var err error
+ if valid.Permissions {
+ if cerr := syscall.Fchmod(f.FD(), uint32(attr.Permissions)); cerr != nil {
+ log.Debugf("SetAttr fchmod failed %q, err: %v", l.hostPath, cerr)
+ err = extractErrno(cerr)
+ }
+ }
+
+ if valid.Size {
+ if terr := syscall.Ftruncate(f.FD(), int64(attr.Size)); terr != nil {
+ log.Debugf("SetAttr ftruncate failed %q, err: %v", l.hostPath, terr)
+ err = extractErrno(terr)
+ }
+ }
+
+ if valid.ATime || valid.MTime {
+ utimes := [2]syscall.Timespec{
+ {Sec: 0, Nsec: linux.UTIME_OMIT},
+ {Sec: 0, Nsec: linux.UTIME_OMIT},
+ }
+ if valid.ATime {
+ if valid.ATimeNotSystemTime {
+ utimes[0].Sec = int64(attr.ATimeSeconds)
+ utimes[0].Nsec = int64(attr.ATimeNanoSeconds)
+ } else {
+ utimes[0].Nsec = linux.UTIME_NOW
+ }
+ }
+ if valid.MTime {
+ if valid.MTimeNotSystemTime {
+ utimes[1].Sec = int64(attr.MTimeSeconds)
+ utimes[1].Nsec = int64(attr.MTimeNanoSeconds)
+ } else {
+ utimes[1].Nsec = linux.UTIME_NOW
+ }
+ }
+
+ if l.ft == symlink {
+ // utimensat operates different that other syscalls. To operate on a
+ // symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty
+ // name.
+ parent, err := syscall.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0)
+ if err != nil {
+ return extractErrno(err)
+ }
+ defer syscall.Close(parent)
+
+ if terr := utimensat(parent, path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil {
+ log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr)
+ err = extractErrno(terr)
+ }
+ } else {
+ // Directories and regular files can operate directly on the fd
+ // using empty name.
+ if terr := utimensat(f.FD(), "", utimes, 0); terr != nil {
+ log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr)
+ err = extractErrno(terr)
+ }
+ }
+ }
+
+ if valid.UID || valid.GID {
+ uid := -1
+ if valid.UID {
+ uid = int(attr.UID)
+ }
+ gid := -1
+ if valid.GID {
+ gid = int(attr.GID)
+ }
+ if oerr := syscall.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil {
+ log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oerr)
+ err = extractErrno(oerr)
+ }
+ }
+
+ return err
+}
+
+// Allocate implements p9.File.
+func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error {
+ if !l.isOpen() {
+ return syscall.EBADF
+ }
+
+ if err := syscall.Fallocate(l.file.FD(), mode.ToLinux(), int64(offset), int64(length)); err != nil {
+ return extractErrno(err)
+ }
+ return nil
+}
+
+// Rename implements p9.File; this should never be called.
+func (l *localFile) Rename(p9.File, string) error {
+ panic("rename called directly")
+}
+
+// RenameAt implements p9.File.RenameAt.
+func (l *localFile) RenameAt(oldName string, directory p9.File, newName string) error {
+ conf := l.attachPoint.conf
+ if conf.ROMount {
+ if conf.PanicOnWrite {
+ panic("attempt to write to RO mount")
+ }
+ return syscall.EBADF
+ }
+
+ newParent := directory.(*localFile)
+ if err := renameat(l.file.FD(), oldName, newParent.file.FD(), newName); err != nil {
+ return extractErrno(err)
+ }
+ return nil
+}
+
+// ReadAt implements p9.File.
+func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) {
+ if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite {
+ return 0, syscall.EBADF
+ }
+ if !l.isOpen() {
+ return 0, syscall.EBADF
+ }
+
+ r, err := l.file.ReadAt(p, int64(offset))
+ switch err {
+ case nil, io.EOF:
+ return r, nil
+ default:
+ return r, extractErrno(err)
+ }
+}
+
+// WriteAt implements p9.File.
+func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) {
+ if l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
+ return 0, syscall.EBADF
+ }
+ if !l.isOpen() {
+ return 0, syscall.EBADF
+ }
+
+ w, err := l.file.WriteAt(p, int64(offset))
+ if err != nil {
+ return w, extractErrno(err)
+ }
+ return w, nil
+}
+
+// Symlink implements p9.File.
+func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) {
+ conf := l.attachPoint.conf
+ if conf.ROMount {
+ if conf.PanicOnWrite {
+ panic("attempt to write to RO mount")
+ }
+ return p9.QID{}, syscall.EBADF
+ }
+
+ if err := unix.Symlinkat(target, l.file.FD(), newName); err != nil {
+ return p9.QID{}, extractErrno(err)
+ }
+ cu := specutils.MakeCleanup(func() {
+ // Best effort attempt to remove the symlink in case of failure.
+ if err := syscall.Unlinkat(l.file.FD(), newName); err != nil {
+ log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), err)
+ }
+ })
+ defer cu.Clean()
+
+ // Open symlink to change ownership and stat it.
+ f, err := fd.OpenAt(l.file, newName, unix.O_PATH|openFlags, 0)
+ if err != nil {
+ return p9.QID{}, extractErrno(err)
+ }
+ defer f.Close()
+
+ if err := fchown(f.FD(), uid, gid); err != nil {
+ return p9.QID{}, extractErrno(err)
+ }
+ stat, err := stat(f.FD())
+ if err != nil {
+ return p9.QID{}, extractErrno(err)
+ }
+
+ cu.Release()
+ return l.attachPoint.makeQID(stat), nil
+}
+
+// Link implements p9.File.
+func (l *localFile) Link(target p9.File, newName string) error {
+ conf := l.attachPoint.conf
+ if conf.ROMount {
+ if conf.PanicOnWrite {
+ panic("attempt to write to RO mount")
+ }
+ return syscall.EBADF
+ }
+
+ targetFile := target.(*localFile)
+ if err := unix.Linkat(targetFile.file.FD(), "", l.file.FD(), newName, linux.AT_EMPTY_PATH); err != nil {
+ return extractErrno(err)
+ }
+ return nil
+}
+
+// Mknod implements p9.File.
+//
+// Not implemented.
+func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) {
+ // From mknod(2) man page:
+ // "EPERM: [...] if the filesystem containing pathname does not support
+ // the type of node requested."
+ return p9.QID{}, syscall.EPERM
+}
+
+// UnlinkAt implements p9.File.
+func (l *localFile) UnlinkAt(name string, flags uint32) error {
+ conf := l.attachPoint.conf
+ if conf.ROMount {
+ if conf.PanicOnWrite {
+ panic("attempt to write to RO mount")
+ }
+ return syscall.EBADF
+ }
+
+ if err := unix.Unlinkat(l.file.FD(), name, int(flags)); err != nil {
+ return extractErrno(err)
+ }
+ return nil
+}
+
+// Readdir implements p9.File.
+func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
+ if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite {
+ return nil, syscall.EBADF
+ }
+ if !l.isOpen() {
+ return nil, syscall.EBADF
+ }
+
+ // Readdirnames is a cursor over directories, so seek back to 0 to ensure it's
+ // reading all directory contents. Take a lock because this operation is
+ // stateful.
+ l.readDirMu.Lock()
+ defer l.readDirMu.Unlock()
+
+ skip := uint64(0)
+
+ // Check if the file is at the correct position already. If not, seek to the
+ // beginning and read the entire directory again.
+ if l.lastDirentOffset != offset {
+ if _, err := syscall.Seek(l.file.FD(), 0, 0); err != nil {
+ return nil, extractErrno(err)
+ }
+ skip = offset
+ }
+
+ dirents, err := l.readDirent(l.file.FD(), offset, count, skip)
+ if err == nil {
+ // On success, remember the offset that was returned at the current
+ // position.
+ l.lastDirentOffset = offset + uint64(len(dirents))
+ } else {
+ // On failure, the state is unknown, force call to seek() next time.
+ l.lastDirentOffset = math.MaxUint64
+ }
+ return dirents, err
+}
+
+func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64) ([]p9.Dirent, error) {
+ // Limit 'count' to cap the slice size that is returned.
+ const maxCount = 100000
+ if count > maxCount {
+ count = maxCount
+ }
+
+ dirents := make([]p9.Dirent, 0, count)
+
+ // Pre-allocate buffers that will be reused to get partial results.
+ direntsBuf := make([]byte, 8192)
+ names := make([]string, 0, 100)
+
+ end := offset + uint64(count)
+ for offset < end {
+ dirSize, err := syscall.ReadDirent(f, direntsBuf)
+ if err != nil {
+ return dirents, err
+ }
+ if dirSize <= 0 {
+ return dirents, nil
+ }
+
+ names := names[:0]
+ _, _, names = syscall.ParseDirent(direntsBuf[:dirSize], -1, names)
+
+ // Skip over entries that the caller is not interested in.
+ if skip > 0 {
+ if skip > uint64(len(names)) {
+ skip -= uint64(len(names))
+ names = names[:0]
+ } else {
+ names = names[skip:]
+ skip = 0
+ }
+ }
+ for _, name := range names {
+ stat, err := statAt(l.file.FD(), name)
+ if err != nil {
+ log.Warningf("Readdir is skipping file with failed stat %q, err: %v", l.hostPath, err)
+ continue
+ }
+ qid := l.attachPoint.makeQID(stat)
+ offset++
+ dirents = append(dirents, p9.Dirent{
+ QID: qid,
+ Type: qid.Type,
+ Name: name,
+ Offset: offset,
+ })
+ }
+ }
+ return dirents, nil
+}
+
+// Readlink implements p9.File.
+func (l *localFile) Readlink() (string, error) {
+ // Shamelessly stolen from os.Readlink (added upper bound limit to buffer).
+ const limit = 1024 * 1024
+ for len := 128; len < limit; len *= 2 {
+ b := make([]byte, len)
+ n, err := unix.Readlinkat(l.file.FD(), "", b)
+ if err != nil {
+ return "", extractErrno(err)
+ }
+ if n < len {
+ return string(b[:n]), nil
+ }
+ }
+ return "", syscall.ENOMEM
+}
+
+// Flush implements p9.File.
+func (l *localFile) Flush() error {
+ return nil
+}
+
+// Connect implements p9.File.
+func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) {
+ return nil, syscall.ECONNREFUSED
+}
+
+// Close implements p9.File.
+func (l *localFile) Close() error {
+ l.mode = invalidMode
+ err := l.file.Close()
+ l.file = nil
+ return err
+}
+
+func (l *localFile) isOpen() bool {
+ return l.mode != invalidMode
+}
+
+// Renamed implements p9.Renamed.
+func (l *localFile) Renamed(newDir p9.File, newName string) {
+ l.hostPath = path.Join(newDir.(*localFile).hostPath, newName)
+}
+
+// extractErrno tries to determine the errno.
+func extractErrno(err error) syscall.Errno {
+ if err == nil {
+ // This should never happen. The likely result will be that
+ // some user gets the frustrating "error: SUCCESS" message.
+ log.Warningf("extractErrno called with nil error!")
+ return 0
+ }
+
+ switch err {
+ case os.ErrNotExist:
+ return syscall.ENOENT
+ case os.ErrExist:
+ return syscall.EEXIST
+ case os.ErrPermission:
+ return syscall.EACCES
+ case os.ErrInvalid:
+ return syscall.EINVAL
+ }
+
+ // See if it's an errno or a common wrapped error.
+ switch e := err.(type) {
+ case syscall.Errno:
+ return e
+ case *os.PathError:
+ return extractErrno(e.Err)
+ case *os.LinkError:
+ return extractErrno(e.Err)
+ case *os.SyscallError:
+ return extractErrno(e.Err)
+ }
+
+ // Fall back to EIO.
+ log.Debugf("Unknown error: %v, defaulting to EIO", err)
+ return syscall.EIO
+}
diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go
new file mode 100644
index 000000000..58af5e44d
--- /dev/null
+++ b/runsc/fsgofer/fsgofer_unsafe.go
@@ -0,0 +1,107 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsgofer
+
+import (
+ "syscall"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/syserr"
+)
+
+func statAt(dirFd int, name string) (syscall.Stat_t, error) {
+ nameBytes, err := syscall.BytePtrFromString(name)
+ if err != nil {
+ return syscall.Stat_t{}, err
+ }
+ namePtr := unsafe.Pointer(nameBytes)
+
+ var stat syscall.Stat_t
+ statPtr := unsafe.Pointer(&stat)
+
+ if _, _, errno := syscall.Syscall6(
+ syscall.SYS_NEWFSTATAT,
+ uintptr(dirFd),
+ uintptr(namePtr),
+ uintptr(statPtr),
+ linux.AT_SYMLINK_NOFOLLOW,
+ 0,
+ 0); errno != 0 {
+
+ return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+ }
+ return stat, nil
+}
+
+func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error {
+ // utimensat(2) doesn't accept empty name, instead name must be nil to make it
+ // operate directly on 'dirFd' unlike other *at syscalls.
+ var namePtr unsafe.Pointer
+ if name != "" {
+ nameBytes, err := syscall.BytePtrFromString(name)
+ if err != nil {
+ return err
+ }
+ namePtr = unsafe.Pointer(nameBytes)
+ }
+
+ timesPtr := unsafe.Pointer(&times[0])
+
+ if _, _, errno := syscall.Syscall6(
+ syscall.SYS_UTIMENSAT,
+ uintptr(dirFd),
+ uintptr(namePtr),
+ uintptr(timesPtr),
+ uintptr(flags),
+ 0,
+ 0); errno != 0 {
+
+ return syserr.FromHost(errno).ToError()
+ }
+ return nil
+}
+
+func renameat(oldDirFD int, oldName string, newDirFD int, newName string) error {
+ var oldNamePtr unsafe.Pointer
+ if oldName != "" {
+ nameBytes, err := syscall.BytePtrFromString(oldName)
+ if err != nil {
+ return err
+ }
+ oldNamePtr = unsafe.Pointer(nameBytes)
+ }
+ var newNamePtr unsafe.Pointer
+ if newName != "" {
+ nameBytes, err := syscall.BytePtrFromString(newName)
+ if err != nil {
+ return err
+ }
+ newNamePtr = unsafe.Pointer(nameBytes)
+ }
+
+ if _, _, errno := syscall.Syscall6(
+ syscall.SYS_RENAMEAT,
+ uintptr(oldDirFD),
+ uintptr(oldNamePtr),
+ uintptr(newDirFD),
+ uintptr(newNamePtr),
+ 0,
+ 0); errno != 0 {
+
+ return syserr.FromHost(errno).ToError()
+ }
+ return nil
+}