// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package container import ( "bufio" "fmt" "os" "path/filepath" "strings" "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/specutils" ) type mapping struct { set bool val uint32 } var optionsMap = map[string]mapping{ "acl": {set: true, val: syscall.MS_POSIXACL}, "async": {set: false, val: syscall.MS_SYNCHRONOUS}, "atime": {set: false, val: syscall.MS_NOATIME}, "bind": {set: true, val: syscall.MS_BIND}, "defaults": {set: true, val: 0}, "dev": {set: false, val: syscall.MS_NODEV}, "diratime": {set: false, val: syscall.MS_NODIRATIME}, "dirsync": {set: true, val: syscall.MS_DIRSYNC}, "exec": {set: false, val: syscall.MS_NOEXEC}, "iversion": {set: true, val: syscall.MS_I_VERSION}, "loud": {set: false, val: syscall.MS_SILENT}, "mand": {set: true, val: syscall.MS_MANDLOCK}, "noacl": {set: false, val: syscall.MS_POSIXACL}, "noatime": {set: true, val: syscall.MS_NOATIME}, "nodev": {set: true, val: syscall.MS_NODEV}, "nodiratime": {set: true, val: syscall.MS_NODIRATIME}, "noexec": {set: true, val: syscall.MS_NOEXEC}, "noiversion": {set: false, val: syscall.MS_I_VERSION}, "nomand": {set: false, val: syscall.MS_MANDLOCK}, "norelatime": {set: false, val: syscall.MS_RELATIME}, "nostrictatime": {set: false, val: syscall.MS_STRICTATIME}, "nosuid": {set: true, val: syscall.MS_NOSUID}, "private": {set: true, val: syscall.MS_PRIVATE}, "rbind": {set: true, val: syscall.MS_BIND | syscall.MS_REC}, "relatime": {set: true, val: syscall.MS_RELATIME}, "remount": {set: true, val: syscall.MS_REMOUNT}, "ro": {set: true, val: syscall.MS_RDONLY}, "rprivate": {set: true, val: syscall.MS_PRIVATE | syscall.MS_REC}, "rw": {set: false, val: syscall.MS_RDONLY}, "silent": {set: true, val: syscall.MS_SILENT}, "strictatime": {set: true, val: syscall.MS_STRICTATIME}, "suid": {set: false, val: syscall.MS_NOSUID}, "sync": {set: true, val: syscall.MS_SYNCHRONOUS}, } // setupFS creates the container directory structure under 'spec.Root.Path'. // This allows the gofer serving the containers to be chroot under this // directory to create an extra layer to security in case the gofer gets // compromised. // Returns list of mounts equivalent to 'spec.Mounts' with all destination paths // cleaned and with symlinks resolved. func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]specs.Mount, error) { rv := make([]specs.Mount, 0, len(spec.Mounts)) for _, m := range spec.Mounts { if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { rv = append(rv, m) continue } // It's possible that 'm.Destination' follows symlinks inside the // container. dst, err := resolveSymlinks(spec.Root.Path, m.Destination) if err != nil { return nil, fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err) } flags := optionsToFlags(m.Options) flags |= syscall.MS_BIND log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags) if err := specutils.Mount(m.Source, dst, m.Type, flags); err != nil { return nil, fmt.Errorf("failed to mount %v: %v", m, err) } // Make the mount a slave, so that for recursive bind mount, umount won't // propagate to the source. flags = syscall.MS_SLAVE | syscall.MS_REC if err := syscall.Mount("", dst, "", uintptr(flags), ""); err != nil { return nil, fmt.Errorf("failed to rslave mount dst: %q, flags: %#x, err: %v", dst, flags, err) } cpy := m relDst, err := filepath.Rel(spec.Root.Path, dst) if err != nil { panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, spec.Root.Path, err)) } cpy.Destination = filepath.Join("/", relDst) rv = append(rv, cpy) } if spec.Process.Cwd != "" { dst, err := resolveSymlinks(spec.Root.Path, spec.Process.Cwd) if err != nil { return nil, fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err) } if err := os.MkdirAll(dst, 0755); err != nil { return nil, err } } // If root is read only, check if it needs to be remounted as readonly. if spec.Root.Readonly { isMountPoint, readonly, err := mountInfo(spec.Root.Path) if err != nil { return nil, err } if readonly { return rv, nil } if !isMountPoint { // Readonly root is not a mount point nor read-only. Can't do much other // than just logging a warning. The gofer will prevent files to be open // in write mode. log.Warningf("Mount where root is located is not read-only and cannot be changed: %q", spec.Root.Path) return rv, nil } // If root is a mount point but not read-only, we can change mount options // to make it read-only for extra safety. log.Infof("Remounting root as readonly: %q", spec.Root.Path) flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC) src := spec.Root.Path if err := syscall.Mount(src, src, "bind", flags, ""); err != nil { return nil, fmt.Errorf("failed to remount root as read-only with source: %q, target: %q, flags: %#x, err: %v", spec.Root.Path, spec.Root.Path, flags, err) } } return rv, nil } // mountInfo returns whether the path is a mount point and whether the mount // that path belongs to is read-only. func mountInfo(path string) (bool, bool, error) { // Mounts are listed by their real paths. realPath, err := filepath.EvalSymlinks(path) if err != nil { return false, false, err } f, err := os.Open("/proc/mounts") if err != nil { return false, false, err } scanner := bufio.NewScanner(f) var mountPoint string var readonly bool for scanner.Scan() { line := scanner.Text() parts := strings.Split(line, " ") if len(parts) < 4 { return false, false, fmt.Errorf("invalid /proc/mounts line format %q", line) } mp := parts[1] opts := strings.Split(parts[3], ",") // Find the closest submount to the path. if strings.Contains(realPath, mp) && len(mp) > len(mountPoint) { mountPoint = mp readonly = specutils.ContainsStr(opts, "ro") } } if err := scanner.Err(); err != nil { return false, false, err } return mountPoint == realPath, readonly, nil } // destroyFS unmounts mounts done by runsc under `spec.Root.Path`. This // recovers the container rootfs into the original state. func destroyFS(spec *specs.Spec) error { for _, m := range spec.Mounts { if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { continue } // It's possible that 'm.Destination' follows symlinks inside the // container. dst, err := resolveSymlinks(spec.Root.Path, m.Destination) if err != nil { return err } flags := syscall.MNT_DETACH log.Infof("Unmounting dst: %q, flags: %#x", dst, flags) // Do not return error if dst is not a mountpoint. // Based on http://man7.org/linux/man-pages/man2/umount.2.html // For kernel version 2.6+ and MNT_DETACH flag, EINVAL means // the dst is not a mount point. if err := syscall.Unmount(dst, flags); err != nil && !os.IsNotExist(err) && err != syscall.EINVAL { return err } } return nil } // resolveSymlinks walks 'rel' having 'root' as the root directory. If there are // symlinks, they are evaluated relative to 'root' to ensure the end result is // the same as if the process was running inside the container. func resolveSymlinks(root, rel string) (string, error) { return resolveSymlinksImpl(root, root, rel, 255) } func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) { if followCount == 0 { return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel)) } rel = filepath.Clean(rel) for _, name := range strings.Split(rel, string(filepath.Separator)) { if name == "" { continue } // Note that Join() resolves things like ".." and returns a clean path. path := filepath.Join(base, name) if !strings.HasPrefix(path, root) { // One cannot '..' their way out of root. path = root continue } fi, err := os.Lstat(path) if err != nil { if !os.IsNotExist(err) { return "", err } // Not found means there is no symlink to check. Just keep walking dirs. base = path continue } if fi.Mode()&os.ModeSymlink != 0 { link, err := os.Readlink(path) if err != nil { return "", err } if filepath.IsAbs(link) { base = root } base, err = resolveSymlinksImpl(root, base, link, followCount-1) if err != nil { return "", err } continue } base = path } return base, nil } func optionsToFlags(opts []string) uint32 { var rv uint32 for _, opt := range opts { if m, ok := optionsMap[opt]; ok { if m.set { rv |= m.val } else { rv ^= m.val } } else { log.Warningf("Ignoring mount option %q", opt) } } return rv }