// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package specutils

import (
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"syscall"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/syndtr/gocapability/capability"
	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/log"
)

// nsCloneFlag returns the clone flag that can be used to set a namespace of
// the given type.
func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr {
	switch nst {
	case specs.IPCNamespace:
		return syscall.CLONE_NEWIPC
	case specs.MountNamespace:
		return syscall.CLONE_NEWNS
	case specs.NetworkNamespace:
		return syscall.CLONE_NEWNET
	case specs.PIDNamespace:
		return syscall.CLONE_NEWPID
	case specs.UTSNamespace:
		return syscall.CLONE_NEWUTS
	case specs.UserNamespace:
		return syscall.CLONE_NEWUSER
	case specs.CgroupNamespace:
		panic("cgroup namespace has no associated clone flag")
	default:
		panic(fmt.Sprintf("unknown namespace %v", nst))
	}
}

// nsPath returns the path of the namespace for the current process and the
// given namespace.
func nsPath(nst specs.LinuxNamespaceType) string {
	base := "/proc/self/ns"
	switch nst {
	case specs.CgroupNamespace:
		return filepath.Join(base, "cgroup")
	case specs.IPCNamespace:
		return filepath.Join(base, "ipc")
	case specs.MountNamespace:
		return filepath.Join(base, "mnt")
	case specs.NetworkNamespace:
		return filepath.Join(base, "net")
	case specs.PIDNamespace:
		return filepath.Join(base, "pid")
	case specs.UserNamespace:
		return filepath.Join(base, "user")
	case specs.UTSNamespace:
		return filepath.Join(base, "uts")
	default:
		panic(fmt.Sprintf("unknown namespace %v", nst))
	}
}

// GetNS returns true and the namespace with the given type from the slice of
// namespaces in the spec.  It returns false if the slice does not contain a
// namespace with the type.
func GetNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) {
	if s.Linux == nil {
		return specs.LinuxNamespace{}, false
	}
	for _, ns := range s.Linux.Namespaces {
		if ns.Type == nst {
			return ns, true
		}
	}
	return specs.LinuxNamespace{}, false
}

// FilterNS returns a slice of namespaces from the spec with types that match
// those in the `filter` slice.
func FilterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace {
	if s.Linux == nil {
		return nil
	}
	var out []specs.LinuxNamespace
	for _, nst := range filter {
		if ns, ok := GetNS(nst, s); ok {
			out = append(out, ns)
		}
	}
	return out
}

// setNS sets the namespace of the given type.  It must be called with
// OSThreadLocked.
func setNS(fd, nsType uintptr) error {
	if _, _, err := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 {
		return err
	}
	return nil
}

// ApplyNS applies the namespace on the current thread and returns a function
// that will restore the namespace to the original value.
//
// Preconditions: Must be called with os thread locked.
func ApplyNS(ns specs.LinuxNamespace) (func(), error) {
	log.Infof("Applying namespace %v at path %q", ns.Type, ns.Path)
	newNS, err := os.Open(ns.Path)
	if err != nil {
		return nil, fmt.Errorf("error opening %q: %v", ns.Path, err)
	}
	defer newNS.Close()

	// Store current namespace to restore back.
	curPath := nsPath(ns.Type)
	oldNS, err := os.Open(curPath)
	if err != nil {
		return nil, fmt.Errorf("error opening %q: %v", curPath, err)
	}

	// Set namespace to the one requested and setup function to restore it back.
	flag := nsCloneFlag(ns.Type)
	if err := setNS(newNS.Fd(), flag); err != nil {
		oldNS.Close()
		return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err)
	}
	return func() {
		log.Infof("Restoring namespace %v", ns.Type)
		defer oldNS.Close()
		if err := setNS(oldNS.Fd(), flag); err != nil {
			panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err))
		}
	}, nil
}

// StartInNS joins or creates the given namespaces and calls cmd.Start before
// restoring the namespaces to the original values.
func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error {
	// We are about to setup namespaces, which requires the os thread being
	// locked so that Go doesn't change the thread out from under us.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	if cmd.SysProcAttr == nil {
		cmd.SysProcAttr = &syscall.SysProcAttr{}
	}

	for _, ns := range nss {
		if ns.Path == "" {
			// No path.  Just set a flag to create a new namespace.
			cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type)
			continue
		}
		// Join the given namespace, and restore the current namespace
		// before exiting.
		restoreNS, err := ApplyNS(ns)
		if err != nil {
			return err
		}
		defer restoreNS()
	}

	return cmd.Start()
}

// SetUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd.
func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) {
	if s.Linux == nil {
		return
	}
	if cmd.SysProcAttr == nil {
		cmd.SysProcAttr = &syscall.SysProcAttr{}
	}
	for _, idMap := range s.Linux.UIDMappings {
		log.Infof("Mapping host uid %d to container uid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size)
		cmd.SysProcAttr.UidMappings = append(cmd.SysProcAttr.UidMappings, syscall.SysProcIDMap{
			ContainerID: int(idMap.ContainerID),
			HostID:      int(idMap.HostID),
			Size:        int(idMap.Size),
		})
	}
	for _, idMap := range s.Linux.GIDMappings {
		log.Infof("Mapping host gid %d to container gid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size)
		cmd.SysProcAttr.GidMappings = append(cmd.SysProcAttr.GidMappings, syscall.SysProcIDMap{
			ContainerID: int(idMap.ContainerID),
			HostID:      int(idMap.HostID),
			Size:        int(idMap.Size),
		})
	}
}

// HasCapabilities returns true if the user has all capabilities in 'cs'.
func HasCapabilities(cs ...capability.Cap) bool {
	caps, err := capability.NewPid2(os.Getpid())
	if err != nil {
		return false
	}
	if err := caps.Load(); err != nil {
		return false
	}
	for _, c := range cs {
		if !caps.Get(capability.EFFECTIVE, c) {
			return false
		}
	}
	return true
}

// MaybeRunAsRoot ensures the process runs with capabilities needed to create a
// sandbox, e.g. CAP_SYS_ADMIN, CAP_SYS_CHROOT, etc. If capabilities are needed,
// it will create a new user namespace and re-execute the process as root
// inside the namespace with the same arguments and environment.
//
// This function returns immediately when no new capability is needed. If
// another process is executed, it returns straight from here with the same exit
// code as the child.
func MaybeRunAsRoot() error {
	if HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT, capability.CAP_SETUID, capability.CAP_SETGID) {
		return nil
	}

	// Current process doesn't have required capabilities, create user namespace
	// and run as root inside the namespace to acquire capabilities.
	log.Infof("*** Re-running as root in new user namespace ***")

	cmd := exec.Command("/proc/self/exe", os.Args[1:]...)

	cmd.SysProcAttr = &syscall.SysProcAttr{
		Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
		// Set current user/group as root inside the namespace. Since we may not
		// have CAP_SETUID/CAP_SETGID, just map root to the current user/group.
		UidMappings: []syscall.SysProcIDMap{
			{ContainerID: 0, HostID: os.Getuid(), Size: 1},
		},
		GidMappings: []syscall.SysProcIDMap{
			{ContainerID: 0, HostID: os.Getgid(), Size: 1},
		},
		Credential:                 &syscall.Credential{Uid: 0, Gid: 0},
		GidMappingsEnableSetgroups: false,
	}

	cmd.Env = os.Environ()
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		if exit, ok := err.(*exec.ExitError); ok {
			if ws, ok := exit.Sys().(syscall.WaitStatus); ok {
				os.Exit(ws.ExitStatus())
			}
			log.Warningf("No wait status provided, exiting with -1: %v", err)
			os.Exit(-1)
		}
		return fmt.Errorf("re-executing self: %v", err)
	}
	// Child completed with success.
	os.Exit(0)
	panic("unreachable")
}