// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
	"context"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/google/subcommands"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/control"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/urpc"
	"gvisor.dev/gvisor/runsc/config"
	"gvisor.dev/gvisor/runsc/console"
	"gvisor.dev/gvisor/runsc/container"
	"gvisor.dev/gvisor/runsc/flag"
	"gvisor.dev/gvisor/runsc/specutils"
)

// Exec implements subcommands.Command for the "exec" command.
type Exec struct {
	cwd string
	env stringSlice
	// user contains the UID and GID with which to run the new process.
	user            user
	extraKGIDs      stringSlice
	caps            stringSlice
	detach          bool
	processPath     string
	pidFile         string
	internalPidFile string

	// consoleSocket is the path to an AF_UNIX socket which will receive a
	// file descriptor referencing the master end of the console's
	// pseudoterminal.
	consoleSocket string
}

// Name implements subcommands.Command.Name.
func (*Exec) Name() string {
	return "exec"
}

// Synopsis implements subcommands.Command.Synopsis.
func (*Exec) Synopsis() string {
	return "execute new process inside the container"
}

// Usage implements subcommands.Command.Usage.
func (*Exec) Usage() string {
	return `exec [command options] <container-id> <command> [command options] || --process process.json <container-id>


Where "<container-id>" is the name for the instance of the container and
"<command>" is the command to be executed in the container.
"<command>" can't be empty unless a "-process" flag provided.

EXAMPLE:
If the container is configured to run /bin/ps the following will
output a list of processes running in the container:

       # runc exec <container-id> ps

OPTIONS:
`
}

// SetFlags implements subcommands.Command.SetFlags.
func (ex *Exec) SetFlags(f *flag.FlagSet) {
	f.StringVar(&ex.cwd, "cwd", "", "current working directory")
	f.Var(&ex.env, "env", "set environment variables (e.g. '-env PATH=/bin -env TERM=xterm')")
	f.Var(&ex.user, "user", "UID (format: <uid>[:<gid>])")
	f.Var(&ex.extraKGIDs, "additional-gids", "additional gids")
	f.Var(&ex.caps, "cap", "add a capability to the bounding set for the process")
	f.BoolVar(&ex.detach, "detach", false, "detach from the container's process")
	f.StringVar(&ex.processPath, "process", "", "path to the process.json")
	f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to")
	f.StringVar(&ex.internalPidFile, "internal-pid-file", "", "filename that the container-internal pid will be written to")
	f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal")
}

// Execute implements subcommands.Command.Execute. It starts a process in an
// already created container.
func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
	conf := args[0].(*config.Config)
	e, id, err := ex.parseArgs(f, conf.EnableRaw)
	if err != nil {
		Fatalf("parsing process spec: %v", err)
	}
	waitStatus := args[1].(*syscall.WaitStatus)

	c, err := container.Load(conf.RootDir, id)
	if err != nil {
		Fatalf("loading sandbox: %v", err)
	}

	log.Debugf("Exec arguments: %+v", e)
	log.Debugf("Exec capablities: %+v", e.Capabilities)

	// Replace empty settings with defaults from container.
	if e.WorkingDirectory == "" {
		e.WorkingDirectory = c.Spec.Process.Cwd
	}
	if e.Envv == nil {
		e.Envv, err = resolveEnvs(c.Spec.Process.Env, ex.env)
		if err != nil {
			Fatalf("getting environment variables: %v", err)
		}
	}

	if e.Capabilities == nil {
		e.Capabilities, err = specutils.Capabilities(conf.EnableRaw, c.Spec.Process.Capabilities)
		if err != nil {
			Fatalf("creating capabilities: %v", err)
		}
		log.Infof("Using exec capabilities from container: %+v", e.Capabilities)
	}

	// containerd expects an actual process to represent the container being
	// executed. If detach was specified, starts a child in non-detach mode,
	// write the child's PID to the pid file. So when the container returns, the
	// child process will also return and signal containerd.
	if ex.detach {
		return ex.execChildAndWait(waitStatus)
	}
	return ex.exec(c, e, waitStatus)
}

func (ex *Exec) exec(c *container.Container, e *control.ExecArgs, waitStatus *syscall.WaitStatus) subcommands.ExitStatus {
	// Start the new process and get it pid.
	pid, err := c.Execute(e)
	if err != nil {
		return Errorf("executing processes for container: %v", err)
	}

	if e.StdioIsPty {
		// Forward signals sent to this process to the foreground
		// process in the sandbox.
		stopForwarding := c.ForwardSignals(pid, true /* fgProcess */)
		defer stopForwarding()
	}

	// Write the sandbox-internal pid if required.
	if ex.internalPidFile != "" {
		pidStr := []byte(strconv.Itoa(int(pid)))
		if err := ioutil.WriteFile(ex.internalPidFile, pidStr, 0644); err != nil {
			return Errorf("writing internal pid file %q: %v", ex.internalPidFile, err)
		}
	}

	// Generate the pid file after the internal pid file is generated, so that
	// users can safely assume that the internal pid file is ready after
	// `runsc exec -d` returns.
	if ex.pidFile != "" {
		if err := ioutil.WriteFile(ex.pidFile, []byte(strconv.Itoa(os.Getpid())), 0644); err != nil {
			return Errorf("writing pid file: %v", err)
		}
	}

	// Wait for the process to exit.
	ws, err := c.WaitPID(pid)
	if err != nil {
		return Errorf("waiting on pid %d: %v", pid, err)
	}
	*waitStatus = ws
	return subcommands.ExitSuccess
}

func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus {
	var args []string
	for _, a := range os.Args[1:] {
		if !strings.Contains(a, "detach") {
			args = append(args, a)
		}
	}

	// The command needs to write a pid file so that execChildAndWait can tell
	// when it has started. If no pid-file was provided, we should use a
	// filename in a temp directory.
	pidFile := ex.pidFile
	if pidFile == "" {
		tmpDir, err := ioutil.TempDir("", "exec-pid-")
		if err != nil {
			Fatalf("creating TempDir: %v", err)
		}
		defer os.RemoveAll(tmpDir)
		pidFile = filepath.Join(tmpDir, "pid")
		args = append(args, "--pid-file="+pidFile)
	}

	cmd := exec.Command(specutils.ExePath, args...)
	cmd.Args[0] = "runsc-exec"

	// Exec stdio defaults to current process stdio.
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr

	// If the console control socket file is provided, then create a new
	// pty master/replica pair and set the TTY on the sandbox process.
	if ex.consoleSocket != "" {
		// Create a new TTY pair and send the master on the provided socket.
		tty, err := console.NewWithSocket(ex.consoleSocket)
		if err != nil {
			Fatalf("setting up console with socket %q: %v", ex.consoleSocket, err)
		}
		defer tty.Close()

		// Set stdio to the new TTY replica.
		cmd.Stdin = tty
		cmd.Stdout = tty
		cmd.Stderr = tty
		cmd.SysProcAttr = &syscall.SysProcAttr{
			Setsid:  true,
			Setctty: true,
			// The Ctty FD must be the FD in the child process's FD
			// table. Since we set cmd.Stdin/Stdout/Stderr to the
			// tty FD, we can use any of 0, 1, or 2 here.
			// See https://github.com/golang/go/issues/29458.
			Ctty: 0,
		}
	}

	if err := cmd.Start(); err != nil {
		Fatalf("failure to start child exec process, err: %v", err)
	}

	log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, specutils.ExePath, args)

	// Wait for PID file to ensure that child process has started. Otherwise,
	// '--process' file is deleted as soon as this process returns and the child
	// may fail to read it.
	ready := func() (bool, error) {
		pidb, err := ioutil.ReadFile(pidFile)
		if err == nil {
			// File appeared, check whether pid is fully written.
			pid, err := strconv.Atoi(string(pidb))
			if err != nil {
				return false, nil
			}
			return pid == cmd.Process.Pid, nil
		}
		if pe, ok := err.(*os.PathError); !ok || pe.Err != syscall.ENOENT {
			return false, err
		}
		// No file yet, continue to wait...
		return false, nil
	}
	if err := specutils.WaitForReady(cmd.Process.Pid, 10*time.Second, ready); err != nil {
		// Don't log fatal error here, otherwise it will override the error logged
		// by the child process that has failed to start.
		log.Warningf("Unexpected error waiting for PID file, err: %v", err)
		return subcommands.ExitFailure
	}

	*waitStatus = 0
	return subcommands.ExitSuccess
}

// parseArgs parses exec information from the command line or a JSON file
// depending on whether the --process flag was used. Returns an ExecArgs and
// the ID of the container to be used.
func (ex *Exec) parseArgs(f *flag.FlagSet, enableRaw bool) (*control.ExecArgs, string, error) {
	if ex.processPath == "" {
		// Requires at least a container ID and command.
		if f.NArg() < 2 {
			f.Usage()
			return nil, "", fmt.Errorf("both a container-id and command are required")
		}
		e, err := ex.argsFromCLI(f.Args()[1:], enableRaw)
		return e, f.Arg(0), err
	}
	// Requires only the container ID.
	if f.NArg() != 1 {
		f.Usage()
		return nil, "", fmt.Errorf("a container-id is required")
	}
	e, err := ex.argsFromProcessFile(enableRaw)
	return e, f.Arg(0), err
}

func (ex *Exec) argsFromCLI(argv []string, enableRaw bool) (*control.ExecArgs, error) {
	extraKGIDs := make([]auth.KGID, 0, len(ex.extraKGIDs))
	for _, s := range ex.extraKGIDs {
		kgid, err := strconv.Atoi(s)
		if err != nil {
			Fatalf("parsing GID: %s, %v", s, err)
		}
		extraKGIDs = append(extraKGIDs, auth.KGID(kgid))
	}

	var caps *auth.TaskCapabilities
	if len(ex.caps) > 0 {
		var err error
		caps, err = capabilities(ex.caps, enableRaw)
		if err != nil {
			return nil, fmt.Errorf("capabilities error: %v", err)
		}
	}

	return &control.ExecArgs{
		Argv:             argv,
		WorkingDirectory: ex.cwd,
		KUID:             ex.user.kuid,
		KGID:             ex.user.kgid,
		ExtraKGIDs:       extraKGIDs,
		Capabilities:     caps,
		StdioIsPty:       ex.consoleSocket != "",
		FilePayload:      urpc.FilePayload{[]*os.File{os.Stdin, os.Stdout, os.Stderr}},
	}, nil
}

func (ex *Exec) argsFromProcessFile(enableRaw bool) (*control.ExecArgs, error) {
	f, err := os.Open(ex.processPath)
	if err != nil {
		return nil, fmt.Errorf("error opening process file: %s, %v", ex.processPath, err)
	}
	defer f.Close()
	var p specs.Process
	if err := json.NewDecoder(f).Decode(&p); err != nil {
		return nil, fmt.Errorf("error parsing process file: %s, %v", ex.processPath, err)
	}
	return argsFromProcess(&p, enableRaw)
}

// argsFromProcess performs all the non-IO conversion from the Process struct
// to ExecArgs.
func argsFromProcess(p *specs.Process, enableRaw bool) (*control.ExecArgs, error) {
	// Create capabilities.
	var caps *auth.TaskCapabilities
	if p.Capabilities != nil {
		var err error
		// Starting from Docker 19, capabilities are explicitly set for exec (instead
		// of nil like before). So we can't distinguish 'exec' from
		// 'exec --privileged', as both specify CAP_NET_RAW. Therefore, filter
		// CAP_NET_RAW in the same way as container start.
		caps, err = specutils.Capabilities(enableRaw, p.Capabilities)
		if err != nil {
			return nil, fmt.Errorf("error creating capabilities: %v", err)
		}
	}

	// Convert the spec's additional GIDs to KGIDs.
	extraKGIDs := make([]auth.KGID, 0, len(p.User.AdditionalGids))
	for _, GID := range p.User.AdditionalGids {
		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
	}

	return &control.ExecArgs{
		Argv:             p.Args,
		Envv:             p.Env,
		WorkingDirectory: p.Cwd,
		KUID:             auth.KUID(p.User.UID),
		KGID:             auth.KGID(p.User.GID),
		ExtraKGIDs:       extraKGIDs,
		Capabilities:     caps,
		StdioIsPty:       p.Terminal,
		FilePayload:      urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}},
	}, nil
}

// resolveEnvs transforms lists of environment variables into a single list of
// environment variables. If a variable is defined multiple times, the last
// value is used.
func resolveEnvs(envs ...[]string) ([]string, error) {
	// First create a map of variable names to values. This removes any
	// duplicates.
	envMap := make(map[string]string)
	for _, env := range envs {
		for _, str := range env {
			parts := strings.SplitN(str, "=", 2)
			if len(parts) != 2 {
				return nil, fmt.Errorf("invalid variable: %s", str)
			}
			envMap[parts[0]] = parts[1]
		}
	}
	// Reassemble envMap into a list of environment variables of the form
	// NAME=VALUE.
	env := make([]string, 0, len(envMap))
	for k, v := range envMap {
		env = append(env, fmt.Sprintf("%s=%s", k, v))
	}
	return env, nil
}

// capabilities takes a list of capabilities as strings and returns an
// auth.TaskCapabilities struct with those capabilities in every capability set.
// This mimics runc's behavior.
func capabilities(cs []string, enableRaw bool) (*auth.TaskCapabilities, error) {
	var specCaps specs.LinuxCapabilities
	for _, cap := range cs {
		specCaps.Ambient = append(specCaps.Ambient, cap)
		specCaps.Bounding = append(specCaps.Bounding, cap)
		specCaps.Effective = append(specCaps.Effective, cap)
		specCaps.Inheritable = append(specCaps.Inheritable, cap)
		specCaps.Permitted = append(specCaps.Permitted, cap)
	}
	// Starting from Docker 19, capabilities are explicitly set for exec (instead
	// of nil like before). So we can't distinguish 'exec' from
	// 'exec --privileged', as both specify CAP_NET_RAW. Therefore, filter
	// CAP_NET_RAW in the same way as container start.
	return specutils.Capabilities(enableRaw, &specCaps)
}

// stringSlice allows a flag to be used multiple times, where each occurrence
// adds a value to the flag. For example, a flag called "x" could be invoked
// via "runsc exec -x foo -x bar", and the corresponding stringSlice would be
// {"x", "y"}.
type stringSlice []string

// String implements flag.Value.String.
func (ss *stringSlice) String() string {
	return fmt.Sprintf("%v", *ss)
}

// Get implements flag.Value.Get.
func (ss *stringSlice) Get() interface{} {
	return ss
}

// Set implements flag.Value.Set.
func (ss *stringSlice) Set(s string) error {
	*ss = append(*ss, s)
	return nil
}

// user allows -user to convey a UID and, optionally, a GID separated by a
// colon.
type user struct {
	kuid auth.KUID
	kgid auth.KGID
}

func (u *user) String() string {
	return fmt.Sprintf("%+v", *u)
}

func (u *user) Get() interface{} {
	return u
}

func (u *user) Set(s string) error {
	parts := strings.SplitN(s, ":", 2)
	kuid, err := strconv.Atoi(parts[0])
	if err != nil {
		return fmt.Errorf("couldn't parse UID: %s", parts[0])
	}
	u.kuid = auth.KUID(kuid)
	if len(parts) > 1 {
		kgid, err := strconv.Atoi(parts[1])
		if err != nil {
			return fmt.Errorf("couldn't parse GID: %s", parts[1])
		}
		u.kgid = auth.KGID(kgid)
	}
	return nil
}