// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
	"context"
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"syscall"

	"github.com/google/subcommands"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/p9"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/unet"
	"gvisor.dev/gvisor/runsc/config"
	"gvisor.dev/gvisor/runsc/flag"
	"gvisor.dev/gvisor/runsc/fsgofer"
	"gvisor.dev/gvisor/runsc/fsgofer/filter"
	"gvisor.dev/gvisor/runsc/specutils"
)

var caps = []string{
	"CAP_CHOWN",
	"CAP_DAC_OVERRIDE",
	"CAP_DAC_READ_SEARCH",
	"CAP_FOWNER",
	"CAP_FSETID",
	"CAP_SYS_CHROOT",
}

// goferCaps is the minimal set of capabilities needed by the Gofer to operate
// on files.
var goferCaps = &specs.LinuxCapabilities{
	Bounding:  caps,
	Effective: caps,
	Permitted: caps,
}

// Gofer implements subcommands.Command for the "gofer" command, which starts a
// filesystem gofer.  This command should not be called directly.
type Gofer struct {
	bundleDir string
	ioFDs     intFlags
	applyCaps bool
	setUpRoot bool

	specFD   int
	mountsFD int
}

// Name implements subcommands.Command.
func (*Gofer) Name() string {
	return "gofer"
}

// Synopsis implements subcommands.Command.
func (*Gofer) Synopsis() string {
	return "launch a gofer process that serves files over 9P protocol (internal use only)"
}

// Usage implements subcommands.Command.
func (*Gofer) Usage() string {
	return `gofer [flags]`
}

// SetFlags implements subcommands.Command.
func (g *Gofer) SetFlags(f *flag.FlagSet) {
	f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
	f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec")
	f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do")
	f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process")
	f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec")
	f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).")
}

// Execute implements subcommands.Command.
func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
	if g.bundleDir == "" || len(g.ioFDs) < 1 || g.specFD < 0 {
		f.Usage()
		return subcommands.ExitUsageError
	}

	conf := args[0].(*config.Config)

	specFile := os.NewFile(uintptr(g.specFD), "spec file")
	defer specFile.Close()
	spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile, conf)
	if err != nil {
		Fatalf("reading spec: %v", err)
	}

	if g.setUpRoot {
		if err := setupRootFS(spec, conf); err != nil {
			Fatalf("Error setting up root FS: %v", err)
		}
	}
	if g.applyCaps {
		// Disable caps when calling myself again.
		// Note: minimal argument handling for the default case to keep it simple.
		args := os.Args
		args = append(args, "--apply-caps=false", "--setup-root=false")
		if err := setCapsAndCallSelf(args, goferCaps); err != nil {
			Fatalf("Unable to apply caps: %v", err)
		}
		panic("unreachable")
	}

	// Find what path is going to be served by this gofer.
	root := spec.Root.Path
	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		root = "/root"
	}

	// Resolve mount points paths, then replace mounts from our spec and send the
	// mount list over to the sandbox, so they are both in sync.
	//
	// Note that all mount points have been mounted in the proper location in
	// setupRootFS().
	cleanMounts, err := resolveMounts(conf, spec.Mounts, root)
	if err != nil {
		Fatalf("Failure to resolve mounts: %v", err)
	}
	spec.Mounts = cleanMounts
	go func() {
		if err := g.writeMounts(cleanMounts); err != nil {
			panic(fmt.Sprintf("Failed to write mounts: %v", err))
		}
	}()

	specutils.LogSpec(spec)

	// fsgofer should run with a umask of 0, because we want to preserve file
	// modes exactly as sent by the sandbox, which will have applied its own umask.
	syscall.Umask(0)

	if err := fsgofer.OpenProcSelfFD(); err != nil {
		Fatalf("failed to open /proc/self/fd: %v", err)
	}

	if err := syscall.Chroot(root); err != nil {
		Fatalf("failed to chroot to %q: %v", root, err)
	}
	if err := syscall.Chdir("/"); err != nil {
		Fatalf("changing working dir: %v", err)
	}
	log.Infof("Process chroot'd to %q", root)

	// Start with root mount, then add any other additional mount as needed.
	ats := make([]p9.Attacher, 0, len(spec.Mounts)+1)
	ap, err := fsgofer.NewAttachPoint("/", fsgofer.Config{
		ROMount: spec.Root.Readonly || conf.Overlay,
	})
	if err != nil {
		Fatalf("creating attach point: %v", err)
	}
	ats = append(ats, ap)
	log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, g.ioFDs[0], spec.Root.Readonly)

	mountIdx := 1 // first one is the root
	for _, m := range spec.Mounts {
		if specutils.Is9PMount(m) {
			cfg := fsgofer.Config{
				ROMount: isReadonlyMount(m.Options) || conf.Overlay,
				HostUDS: conf.FSGoferHostUDS,
			}
			ap, err := fsgofer.NewAttachPoint(m.Destination, cfg)
			if err != nil {
				Fatalf("creating attach point: %v", err)
			}
			ats = append(ats, ap)

			if mountIdx >= len(g.ioFDs) {
				Fatalf("no FD found for mount. Did you forget --io-fd? mount: %d, %v", len(g.ioFDs), m)
			}
			log.Infof("Serving %q mapped on FD %d (ro: %t)", m.Destination, g.ioFDs[mountIdx], cfg.ROMount)
			mountIdx++
		}
	}
	if mountIdx != len(g.ioFDs) {
		Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs))
	}

	if conf.FSGoferHostUDS {
		filter.InstallUDSFilters()
	}

	if err := filter.Install(); err != nil {
		Fatalf("installing seccomp filters: %v", err)
	}

	runServers(ats, g.ioFDs)
	return subcommands.ExitSuccess
}

func runServers(ats []p9.Attacher, ioFDs []int) {
	// Run the loops and wait for all to exit.
	var wg sync.WaitGroup
	for i, ioFD := range ioFDs {
		wg.Add(1)
		go func(ioFD int, at p9.Attacher) {
			socket, err := unet.NewSocket(ioFD)
			if err != nil {
				Fatalf("creating server on FD %d: %v", ioFD, err)
			}
			s := p9.NewServer(at)
			if err := s.Handle(socket); err != nil {
				Fatalf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", ioFD, err)
			}
			wg.Done()
		}(ioFD, ats[i])
	}
	wg.Wait()
	log.Infof("All 9P servers exited.")
}

func (g *Gofer) writeMounts(mounts []specs.Mount) error {
	bytes, err := json.Marshal(mounts)
	if err != nil {
		return err
	}

	f := os.NewFile(uintptr(g.mountsFD), "mounts file")
	defer f.Close()

	for written := 0; written < len(bytes); {
		w, err := f.Write(bytes[written:])
		if err != nil {
			return err
		}
		written += w
	}
	return nil
}

func isReadonlyMount(opts []string) bool {
	for _, o := range opts {
		if o == "ro" {
			return true
		}
	}
	return false
}

func setupRootFS(spec *specs.Spec, conf *config.Config) error {
	// Convert all shared mounts into slaves to be sure that nothing will be
	// propagated outside of our namespace.
	if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil {
		Fatalf("error converting mounts: %v", err)
	}

	root := spec.Root.Path
	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		// runsc can't be re-executed without /proc, so we create a tmpfs mount,
		// mount ./proc and ./root there, then move this mount to the root and after
		// setCapsAndCallSelf, runsc will chroot into /root.
		//
		// We need a directory to construct a new root and we know that
		// runsc can't start without /proc, so we can use it for this.
		flags := uintptr(syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC)
		if err := syscall.Mount("runsc-root", "/proc", "tmpfs", flags, ""); err != nil {
			Fatalf("error mounting tmpfs: %v", err)
		}

		// Prepare tree structure for pivot_root(2).
		os.Mkdir("/proc/proc", 0755)
		os.Mkdir("/proc/root", 0755)
		if err := syscall.Mount("runsc-proc", "/proc/proc", "proc", flags|syscall.MS_RDONLY, ""); err != nil {
			Fatalf("error mounting proc: %v", err)
		}
		root = "/proc/root"
	}

	// Mount root path followed by submounts.
	if err := syscall.Mount(spec.Root.Path, root, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
		return fmt.Errorf("mounting root on root (%q) err: %v", root, err)
	}

	flags := uint32(syscall.MS_SLAVE | syscall.MS_REC)
	if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
		flags = specutils.PropOptionsToFlags([]string{spec.Linux.RootfsPropagation})
	}
	if err := syscall.Mount("", root, "", uintptr(flags), ""); err != nil {
		return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", root, flags, err)
	}

	// Replace the current spec, with the clean spec with symlinks resolved.
	if err := setupMounts(conf, spec.Mounts, root); err != nil {
		Fatalf("error setting up FS: %v", err)
	}

	// Create working directory if needed.
	if spec.Process.Cwd != "" {
		dst, err := resolveSymlinks(root, spec.Process.Cwd)
		if err != nil {
			return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err)
		}
		log.Infof("Create working directory %q if needed", spec.Process.Cwd)
		if err := os.MkdirAll(dst, 0755); err != nil {
			return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err)
		}
	}

	// Check if root needs to be remounted as readonly.
	if spec.Root.Readonly || conf.Overlay {
		// If root is a mount point but not read-only, we can change mount options
		// to make it read-only for extra safety.
		log.Infof("Remounting root as readonly: %q", root)
		flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC)
		if err := syscall.Mount(root, root, "bind", flags, ""); err != nil {
			return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", root, root, flags, err)
		}
	}

	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		if err := pivotRoot("/proc"); err != nil {
			Fatalf("failed to change the root file system: %v", err)
		}
		if err := os.Chdir("/"); err != nil {
			Fatalf("failed to change working directory")
		}
	}
	return nil
}

// setupMounts binds mount all mounts specified in the spec in their correct
// location inside root. It will resolve relative paths and symlinks. It also
// creates directories as needed.
func setupMounts(conf *config.Config, mounts []specs.Mount, root string) error {
	for _, m := range mounts {
		if m.Type != "bind" || !specutils.IsSupportedDevMount(m) {
			continue
		}

		dst, err := resolveSymlinks(root, m.Destination)
		if err != nil {
			return fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err)
		}

		flags := specutils.OptionsToFlags(m.Options) | syscall.MS_BIND
		if conf.Overlay {
			// Force mount read-only if writes are not going to be sent to it.
			flags |= syscall.MS_RDONLY
		}

		log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags)
		if err := specutils.Mount(m.Source, dst, m.Type, flags); err != nil {
			return fmt.Errorf("mounting %v: %v", m, err)
		}

		// Set propagation options that cannot be set together with other options.
		flags = specutils.PropOptionsToFlags(m.Options)
		if flags != 0 {
			if err := syscall.Mount("", dst, "", uintptr(flags), ""); err != nil {
				return fmt.Errorf("mount dst: %q, flags: %#x, err: %v", dst, flags, err)
			}
		}
	}
	return nil
}

// resolveMounts resolved relative paths and symlinks to mount points.
//
// Note: mount points must already be in place for resolution to work.
// Otherwise, it may follow symlinks to locations that would be overwritten
// with another mount point and return the wrong location. In short, make sure
// setupMounts() has been called before.
func resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
	cleanMounts := make([]specs.Mount, 0, len(mounts))
	for _, m := range mounts {
		if m.Type != "bind" || !specutils.IsSupportedDevMount(m) {
			cleanMounts = append(cleanMounts, m)
			continue
		}
		dst, err := resolveSymlinks(root, m.Destination)
		if err != nil {
			return nil, fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err)
		}
		relDst, err := filepath.Rel(root, dst)
		if err != nil {
			panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, root, err))
		}

		opts, err := adjustMountOptions(conf, filepath.Join(root, relDst), m.Options)
		if err != nil {
			return nil, err
		}

		cpy := m
		cpy.Destination = filepath.Join("/", relDst)
		cpy.Options = opts
		cleanMounts = append(cleanMounts, cpy)
	}
	return cleanMounts, nil
}

// ResolveSymlinks walks 'rel' having 'root' as the root directory. If there are
// symlinks, they are evaluated relative to 'root' to ensure the end result is
// the same as if the process was running inside the container.
func resolveSymlinks(root, rel string) (string, error) {
	return resolveSymlinksImpl(root, root, rel, 255)
}

func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) {
	if followCount == 0 {
		return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel))
	}

	rel = filepath.Clean(rel)
	for _, name := range strings.Split(rel, string(filepath.Separator)) {
		if name == "" {
			continue
		}
		// Note that Join() resolves things like ".." and returns a clean path.
		path := filepath.Join(base, name)
		if !strings.HasPrefix(path, root) {
			// One cannot '..' their way out of root.
			base = root
			continue
		}
		fi, err := os.Lstat(path)
		if err != nil {
			if !os.IsNotExist(err) {
				return "", err
			}
			// Not found means there is no symlink to check. Just keep walking dirs.
			base = path
			continue
		}
		if fi.Mode()&os.ModeSymlink != 0 {
			link, err := os.Readlink(path)
			if err != nil {
				return "", err
			}
			if filepath.IsAbs(link) {
				base = root
			}
			base, err = resolveSymlinksImpl(root, base, link, followCount-1)
			if err != nil {
				return "", err
			}
			continue
		}
		base = path
	}
	return base, nil
}

// adjustMountOptions adds 'overlayfs_stale_read' if mounting over overlayfs.
func adjustMountOptions(conf *config.Config, path string, opts []string) ([]string, error) {
	rv := make([]string, len(opts))
	copy(rv, opts)

	if conf.OverlayfsStaleRead {
		statfs := syscall.Statfs_t{}
		if err := syscall.Statfs(path, &statfs); err != nil {
			return nil, err
		}
		if statfs.Type == unix.OVERLAYFS_SUPER_MAGIC {
			rv = append(rv, "overlayfs_stale_read")
		}
	}
	return rv, nil
}