// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
)

// TaskConfig defines the configuration of a new Task (see below). It is the
// sole argument to TaskSet.NewTask, which reads every field when building the
// Task.
type TaskConfig struct {
	// Kernel is the owning Kernel. It is stored in the new task and its
	// Platform is used to create the task's platform context, so it must be
	// non-nil.
	*Kernel

	// Parent is the new task's parent. Parent may be nil. If it is non-nil,
	// the new task is added to Parent's set of children.
	Parent *Task

	// ThreadGroup is the ThreadGroup the new task belongs to. If the thread
	// group has no leader yet, the new task becomes its leader.
	*ThreadGroup

	// TaskContext is the TaskContext of the new task. It is copied by value
	// into the task on creation; NewTask releases it if creation fails.
	*TaskContext

	// TaskResources is the TaskResources of the new task. It is copied by
	// value into the task on creation; NewTask releases it if creation fails.
	*TaskResources

	// Credentials is the Credentials of the new task.
	Credentials *auth.Credentials

	// Niceness is the niceness of the new task.
	Niceness int

	// If NetworkNamespaced is true, the new task should observe a non-root
	// network namespace.
	NetworkNamespaced bool

	// AllowedCPUMask contains the cpus that this task can run on. The task
	// stores its own copy of the mask (made with Copy).
	AllowedCPUMask sched.CPUSet

	// UTSNamespace is the UTSNamespace of the new task.
	UTSNamespace *UTSNamespace

	// IPCNamespace is the IPCNamespace of the new task.
	IPCNamespace *IPCNamespace
}

// NewTask builds the task described by cfg and registers it in ts.
//
// Ownership of cfg.TaskContext and cfg.TaskResources passes to NewTask
// unconditionally: if task creation fails, NewTask releases both before
// returning the error, so the caller must not use or release them afterwards.
//
// The returned task has not been started; the caller must call Task.Start.
func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) {
	task, err := ts.newTask(cfg)
	if err == nil {
		return task, nil
	}

	// newTask only takes ownership on success, so drop the context and
	// resources here on the caller's behalf.
	cfg.TaskContext.release()
	cfg.TaskResources.release()
	return nil, err
}

// newTask is a helper for TaskSet.NewTask that only takes ownership of TaskContext
// and TaskResources of the TaskConfig if it succeeds.
//
// newTask constructs the Task from cfg, then, while holding ts.mu and the
// thread group's signalHandlers.mu, assigns thread IDs and links the task into
// its parent and thread group.
//
// It returns syserror.EINTR if the thread group is exiting or in the middle of
// an exec, or the error from assignTIDsLocked if a thread ID cannot be
// allocated. On either failure the task has not been made visible anywhere.
func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
	tg := cfg.ThreadGroup
	tc := cfg.TaskContext
	// TaskContext and TaskResources are copied by value into the task.
	// runState is set to a typed nil *runApp.
	// NOTE(review): presumably runState is an interface-typed field, so this
	// typed nil compares non-nil in Task.Start's runState check — confirm.
	// NOTE(review): the platform context created by NewContext below is not
	// explicitly released on the failure paths; confirm that none is needed.
	t := &Task{
		taskNode: taskNode{
			tg:       tg,
			parent:   cfg.Parent,
			children: make(map[*Task]struct{}),
		},
		runState:       (*runApp)(nil),
		interruptChan:  make(chan struct{}, 1),
		signalStack:    arch.SignalStack{Flags: arch.SignalStackFlagDisable},
		tc:             *tc,
		tr:             *cfg.TaskResources,
		p:              cfg.Kernel.Platform.NewContext(),
		k:              cfg.Kernel,
		ptraceTracees:  make(map[*Task]struct{}),
		allowedCPUMask: cfg.AllowedCPUMask.Copy(),
		ioUsage:        &usage.IO{},
		creds:          cfg.Credentials,
		niceness:       cfg.Niceness,
		netns:          cfg.NetworkNamespaced,
		utsns:          cfg.UTSNamespace,
		ipcns:          cfg.IPCNamespace,
		rseqCPU:        -1,
		futexWaiter:    futex.NewWaiter(),
	}
	// endStopCond shares the thread group's signal handler mutex as its lock.
	t.endStopCond.L = &t.tg.signalHandlers.mu
	// Store a typed nil tracer.
	// NOTE(review): presumably so that later loads of ptraceTracer always
	// yield a *Task — confirm against the field's declaration.
	t.ptraceTracer.Store((*Task)(nil))
	// We don't construct t.blockingTimer until Task.run(); see that function
	// for justification.

	// Make the new task (and possibly thread group) visible to the rest of
	// the system atomically. Locks are taken in the order ts.mu,
	// tg.signalHandlers.mu, t.mu and released in reverse order by the
	// deferred unlocks when newTask returns.
	ts.mu.Lock()
	defer ts.mu.Unlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()
	if tg.exiting || tg.execing != nil {
		// If the caller is in the same thread group, then what we return
		// doesn't matter too much since the caller will exit before it returns
		// to userspace. If the caller isn't in the same thread group, then
		// we're in uncharted territory and can return whatever we want.
		return nil, syserror.EINTR
	}
	if err := ts.assignTIDsLocked(t); err != nil {
		return nil, err
	}
	// Below this point, newTask is expected not to fail (there is no rollback
	// of assignTIDsLocked or any of the following).

	// Logging on t's behalf will panic if t.logPrefix hasn't been initialized.
	// This is the earliest point at which we can do so (since t now has thread
	// IDs).
	t.updateLogPrefixLocked()

	// Register t as a child of its parent, if it has one.
	if t.parent != nil {
		t.parent.children[t] = struct{}{}
	}

	if tg.leader == nil {
		// New thread group: t becomes its leader, and the group either starts
		// a new session (no parent process group) or joins the parent's
		// process group.
		tg.leader = t
		if parentPG := tg.parentPG(); parentPG == nil {
			tg.createSession()
		} else {
			// Inherit the process group.
			parentPG.incRefWithParent(parentPG)
			tg.processGroup = parentPG
		}
	}
	// Add t to the thread group's task list and bump its task counters.
	tg.tasks.PushBack(t)
	tg.tasksCount++
	tg.liveTasks++
	tg.activeTasks++

	// Propagate external TaskSet stops to the new task.
	t.stopCount = ts.stopCount

	t.mu.Lock()
	defer t.mu.Unlock()

	// Pick a CPU from the allowed mask using t's TID in the root PID
	// namespace.
	t.cpu = assignCPU(t.allowedCPUMask, ts.Root.tids[t])

	t.startTime = t.k.RealtimeClock().Now()

	return t, nil
}

// assignTIDsLocked allocates a thread ID for new task t in the PID namespace
// of t's thread group and in every ancestor of that namespace, recording the
// mapping in both directions (tasks and tids) in each namespace.
//
// If allocation fails in any namespace, the mappings already recorded in the
// descendant namespaces are removed again and the allocation error is
// returned.
//
// Preconditions: ts.mu must be locked for writing.
func (ts *TaskSet) assignTIDsLocked(t *Task) error {
	// assignment remembers one namespace/TID pair registered so far, so that
	// it can be undone if a later namespace fails.
	type assignment struct {
		pidns *PIDNamespace
		id    ThreadID
	}
	var done []assignment

	pidns := t.tg.pidns
	for pidns != nil {
		id, err := pidns.allocateTID()
		if err != nil {
			// Roll back everything registered in descendant namespaces.
			for i := range done {
				delete(done[i].pidns.tasks, done[i].id)
				delete(done[i].pidns.tids, t)
			}
			return err
		}

		pidns.tasks[id] = t
		pidns.tids[t] = id
		done = append(done, assignment{pidns: pidns, id: id})

		// Continue with the enclosing namespace.
		pidns = pidns.parent
	}
	return nil
}

// allocateTID picks a ThreadID that is not currently present in ns.tasks and
// records it as ns.last.
//
// The search starts just after ns.last and wraps from TasksLimit back to
// InitTID + 1. It returns ENOMEM if the namespace is exiting, and EAGAIN if
// the search arrives back at ns.last without finding a free ID.
//
// Preconditions: ns.owner.mu must be locked for writing.
func (ns *PIDNamespace) allocateTID() (ThreadID, error) {
	if ns.exiting {
		// "In this case, a subsequent fork(2) into this PID namespace will
		// fail with the error ENOMEM; it is not possible to create a new
		// processes [sic] in a PID namespace whose init process has
		// terminated." - pid_namespaces(7)
		return 0, syserror.ENOMEM
	}

	// ns.last is only written immediately before returning, so start equals
	// ns.last for the whole scan.
	start := ns.last
	candidate := start
	for {
		// Advance, wrapping past the limit to the first non-init TID.
		candidate++
		if candidate > TasksLimit {
			candidate = InitTID + 1
		}

		_, inUse := ns.tasks[candidate]
		switch {
		case !inUse:
			// Free slot: remember where the next search should begin.
			ns.last = candidate
			return candidate, nil
		case candidate == start:
			// Back at the starting point: every TID is taken.
			return 0, syserror.EAGAIN
		}
	}
}

// Start launches the task goroutine. It must be called exactly once for each
// task returned by NewTask.
//
// 'tid' must be the task's TID in the root PID namespace. It is used for
// debugging purposes only: it is passed as the argument to Task.run so that
// it shows up in stack dumps.
func (t *Task) Start(tid ThreadID) {
	if t.runState == nil {
		// A restored task may be "starting" after having already exited;
		// there is nothing to run in that case.
		return
	}

	// Account for the goroutine at task, thread group and TaskSet level
	// before it is launched.
	tg := t.tg
	t.goroutineStopped.Add(1)
	tg.liveGoroutines.Add(1)
	tg.pidns.owner.liveGoroutines.Add(1)
	tg.pidns.owner.runningGoroutines.Add(1)

	// Task is now running in system mode.
	t.accountTaskGoroutineLeave(TaskGoroutineNonexistent)

	// Passing the root-namespace TID as an argument makes it visible in stack
	// dumps.
	rootTID := uintptr(tid)
	go t.run(rootTID) // S/R-SAFE: synchronizes with saving through stops
}