// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package proc

import (
	"bytes"
	"fmt"
	"io"
	"sort"
	"strconv"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.dev/gvisor/pkg/sentry/fs/proc/device"
	"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/sentry/mm"
	"gvisor.dev/gvisor/pkg/sentry/usage"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

// LINT.IfChange

// getTaskMM gets the kernel task's MemoryManager. No additional reference is
// taken on mm here. This is safe because MemoryManager.destroy is required to
// leave the MemoryManager in a state where it's still usable as a
// DynamicBytesSource.
func getTaskMM(t *kernel.Task) *mm.MemoryManager {
	var tmm *mm.MemoryManager
	t.WithMuLocked(func(t *kernel.Task) {
		if mm := t.MemoryManager(); mm != nil {
			tmm = mm
		}
	})
	return tmm
}

// getTaskMMIncRef returns t's MemoryManager. If getTaskMMIncRef succeeds, the
// MemoryManager's users count is incremented, and must be decremented by the
// caller when it is no longer in use.
func getTaskMMIncRef(t *kernel.Task) (*mm.MemoryManager, error) {
	if t.ExitState() == kernel.TaskExitDead {
		return nil, linuxerr.ESRCH
	}
	var m *mm.MemoryManager
	t.WithMuLocked(func(t *kernel.Task) {
		m = t.MemoryManager()
	})
	if m == nil || !m.IncUsers() {
		return nil, io.EOF
	}
	return m, nil
}

func checkTaskState(t *kernel.Task) error {
	switch t.ExitState() {
	case kernel.TaskExitZombie:
		return linuxerr.EACCES
	case kernel.TaskExitDead:
		return linuxerr.ESRCH
	}
	return nil
}

// taskDir represents a task-level directory.
//
// +stateify savable
type taskDir struct {
	ramfs.Dir

	t *kernel.Task
}

var _ fs.InodeOperations = (*taskDir)(nil)

// newTaskDir creates a new proc task entry.
func (p *proc) newTaskDir(ctx context.Context, t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode {
	contents := map[string]*fs.Inode{
		"auxv":          newAuxvec(ctx, t, msrc),
		"cmdline":       newExecArgInode(ctx, t, msrc, cmdlineExecArg),
		"comm":          newComm(ctx, t, msrc),
		"cwd":           newCwd(ctx, t, msrc),
		"environ":       newExecArgInode(ctx, t, msrc, environExecArg),
		"exe":           newExe(ctx, t, msrc),
		"fd":            newFdDir(ctx, t, msrc),
		"fdinfo":        newFdInfoDir(ctx, t, msrc),
		"gid_map":       newGIDMap(ctx, t, msrc),
		"io":            newIO(ctx, t, msrc, isThreadGroup),
		"maps":          newMaps(ctx, t, msrc),
		"mem":           newMem(ctx, t, msrc),
		"mountinfo":     seqfile.NewSeqFileInode(ctx, &mountInfoFile{t: t}, msrc),
		"mounts":        seqfile.NewSeqFileInode(ctx, &mountsFile{t: t}, msrc),
		"net":           newNetDir(ctx, t, msrc),
		"ns":            newNamespaceDir(ctx, t, msrc),
		"oom_score":     newOOMScore(ctx, msrc),
		"oom_score_adj": newOOMScoreAdj(ctx, t, msrc),
		"smaps":         newSmaps(ctx, t, msrc),
		"stat":          newTaskStat(ctx, t, msrc, isThreadGroup, p.pidns),
		"statm":         newStatm(ctx, t, msrc),
		"status":        newStatus(ctx, t, msrc, p.pidns),
		"uid_map":       newUIDMap(ctx, t, msrc),
	}
	if isThreadGroup {
		contents["task"] = p.newSubtasks(ctx, t, msrc)
	}
	if len(p.cgroupControllers) > 0 {
		contents["cgroup"] = newCGroupInode(ctx, msrc, p.cgroupControllers)
	}

	// N.B. taskOwnedInodeOps enforces dumpability-based ownership.
	d := &taskDir{
		Dir: *ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)),
		t:   t,
	}
	return newProcInode(ctx, d, msrc, fs.SpecialDirectory, t)
}

// subtasks represents a /proc/TID/task directory.
//
// +stateify savable
type subtasks struct {
	ramfs.Dir

	t *kernel.Task
	p *proc
}

var _ fs.InodeOperations = (*subtasks)(nil)

func (p *proc) newSubtasks(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
	s := &subtasks{
		Dir: *ramfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)),
		t:   t,
		p:   p,
	}
	return newProcInode(ctx, s, msrc, fs.SpecialDirectory, t)
}

// UnstableAttr returns unstable attributes of the subtasks.
func (s *subtasks) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) {
	uattr, err := s.Dir.UnstableAttr(ctx, inode)
	if err != nil {
		return fs.UnstableAttr{}, err
	}
	// We can't rely on ramfs' implementation because the task directories are
	// generated dynamically.
	uattr.Links = uint64(2 + s.t.ThreadGroup().Count())
	return uattr, nil
}

// GetFile implements fs.InodeOperations.GetFile.
func (s *subtasks) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
	return fs.NewFile(ctx, dirent, flags, &subtasksFile{t: s.t, pidns: s.p.pidns}), nil
}

// +stateify savable
type subtasksFile struct {
	fsutil.DirFileOperations        `state:"nosave"`
	fsutil.FileUseInodeUnstableAttr `state:"nosave"`

	t     *kernel.Task
	pidns *kernel.PIDNamespace
}

// Readdir implements fs.FileOperations.Readdir.
func (f *subtasksFile) Readdir(ctx context.Context, file *fs.File, ser fs.DentrySerializer) (int64, error) {
	dirCtx := fs.DirCtx{
		Serializer: ser,
	}

	// Note that unlike most Readdir implementations, the offset here is
	// not an index into the subtasks, but rather the TID of the next
	// subtask to emit.
	offset := file.Offset()

	tasks := f.t.ThreadGroup().MemberIDs(f.pidns)
	if len(tasks) == 0 {
		return offset, linuxerr.ENOENT
	}

	if offset == 0 {
		// Serialize "." and "..".
		root := fs.RootFromContext(ctx)
		if root != nil {
			defer root.DecRef(ctx)
		}
		dot, dotdot := file.Dirent.GetDotAttrs(root)
		if err := dirCtx.DirEmit(".", dot); err != nil {
			return offset, err
		}
		if err := dirCtx.DirEmit("..", dotdot); err != nil {
			return offset, err
		}
	}

	// Serialize tasks.
	taskInts := make([]int, 0, len(tasks))
	for _, tid := range tasks {
		taskInts = append(taskInts, int(tid))
	}

	sort.Sort(sort.IntSlice(taskInts))
	// Find the task to start at.
	idx := sort.SearchInts(taskInts, int(offset))
	if idx == len(taskInts) {
		return offset, nil
	}
	taskInts = taskInts[idx:]

	var tid int
	for _, tid = range taskInts {
		name := strconv.FormatUint(uint64(tid), 10)
		attr := fs.GenericDentAttr(fs.SpecialDirectory, device.ProcDevice)
		if err := dirCtx.DirEmit(name, attr); err != nil {
			// Returned offset is next tid to serialize.
			return int64(tid), err
		}
	}
	// We serialized them all.  Next offset should be higher than last
	// serialized tid.
	return int64(tid) + 1, nil
}

var _ fs.FileOperations = (*subtasksFile)(nil)

// Lookup loads an Inode in a task's subtask directory into a Dirent.
func (s *subtasks) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) {
	tid, err := strconv.ParseUint(p, 10, 32)
	if err != nil {
		return nil, linuxerr.ENOENT
	}

	task := s.p.pidns.TaskWithID(kernel.ThreadID(tid))
	if task == nil {
		return nil, linuxerr.ENOENT
	}
	if task.ThreadGroup() != s.t.ThreadGroup() {
		return nil, linuxerr.ENOENT
	}

	td := s.p.newTaskDir(ctx, task, dir.MountSource, false)
	return fs.NewDirent(ctx, td, p), nil
}

// exe is an fs.InodeOperations symlink for the /proc/PID/exe file.
//
// +stateify savable
type exe struct {
	ramfs.Symlink

	t *kernel.Task
}

func newExe(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
	exeSymlink := &exe{
		Symlink: *ramfs.NewSymlink(ctx, fs.RootOwner, ""),
		t:       t,
	}
	return newProcInode(ctx, exeSymlink, msrc, fs.Symlink, t)
}

func (e *exe) executable() (file fsbridge.File, err error) {
	if err := checkTaskState(e.t); err != nil {
		return nil, err
	}
	mm := getTaskMM(e.t)
	if mm == nil {
		return nil, linuxerr.EACCES
	}

	// The MemoryManager may be destroyed, in which case
	// MemoryManager.destroy will simply set the executable to nil
	// (with locks held).
	file = mm.Executable()
	if file == nil {
		err = linuxerr.ESRCH
	}
	return
}

// Readlink implements fs.InodeOperations.
func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
	if !kernel.ContextCanTrace(ctx, e.t, false) {
		return "", linuxerr.EACCES
	}

	// Pull out the executable for /proc/TID/exe.
	exec, err := e.executable()
	if err != nil {
		return "", err
	}
	defer exec.DecRef(ctx)

	return exec.PathnameWithDeleted(ctx), nil
}

// cwd is an fs.InodeOperations symlink for the /proc/PID/cwd file.
//
// +stateify savable
type cwd struct {
	ramfs.Symlink

	t *kernel.Task
}

func newCwd(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
	cwdSymlink := &cwd{
		Symlink: *ramfs.NewSymlink(ctx, fs.RootOwner, ""),
		t:       t,
	}
	return newProcInode(ctx, cwdSymlink, msrc, fs.Symlink, t)
}

// Readlink implements fs.InodeOperations.
func (e *cwd) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
	if !kernel.ContextCanTrace(ctx, e.t, false) {
		return "", linuxerr.EACCES
	}
	if err := checkTaskState(e.t); err != nil {
		return "", err
	}
	cwd := e.t.FSContext().WorkingDirectory()
	if cwd == nil {
		// It could have raced with process deletion.
		return "", linuxerr.ESRCH
	}
	defer cwd.DecRef(ctx)

	root := fs.RootFromContext(ctx)
	if root == nil {
		// It could have raced with process deletion.
		return "", linuxerr.ESRCH
	}
	defer root.DecRef(ctx)

	name, _ := cwd.FullName(root)
	return name, nil
}

// namespaceSymlink represents a symlink in the namespacefs, such as the files
// in /proc/<pid>/ns.
//
// +stateify savable
type namespaceSymlink struct {
	ramfs.Symlink

	t *kernel.Task
}

func newNamespaceSymlink(ctx context.Context, t *kernel.Task, msrc *fs.MountSource, name string) *fs.Inode {
	// TODO(rahat): Namespace symlinks should contain the namespace name and the
	// inode number for the namespace instance, so for example user:[123456]. We
	// currently fake the inode number by sticking the symlink inode in its
	// place.
	target := fmt.Sprintf("%s:[%d]", name, device.ProcDevice.NextIno())
	n := &namespaceSymlink{
		Symlink: *ramfs.NewSymlink(ctx, fs.RootOwner, target),
		t:       t,
	}
	return newProcInode(ctx, n, msrc, fs.Symlink, t)
}

// Readlink reads the symlink value.
func (n *namespaceSymlink) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
	if err := checkTaskState(n.t); err != nil {
		return "", err
	}
	return n.Symlink.Readlink(ctx, inode)
}

// Getlink implements fs.InodeOperations.Getlink.
func (n *namespaceSymlink) Getlink(ctx context.Context, inode *fs.Inode) (*fs.Dirent, error) {
	if !kernel.ContextCanTrace(ctx, n.t, false) {
		return nil, linuxerr.EACCES
	}
	if err := checkTaskState(n.t); err != nil {
		return nil, err
	}

	// Create a new regular file to fake the namespace file.
	iops := fsutil.NewNoReadWriteFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0777), linux.PROC_SUPER_MAGIC)
	return fs.NewDirent(ctx, newProcInode(ctx, iops, inode.MountSource, fs.RegularFile, nil), n.Symlink.Target), nil
}

func newNamespaceDir(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
	contents := map[string]*fs.Inode{
		"net":  newNamespaceSymlink(ctx, t, msrc, "net"),
		"pid":  newNamespaceSymlink(ctx, t, msrc, "pid"),
		"user": newNamespaceSymlink(ctx, t, msrc, "user"),
	}
	d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0511))
	return newProcInode(ctx, d, msrc, fs.SpecialDirectory, t)
}

// memData implements fs.Inode for /proc/[pid]/mem.
//
// +stateify savable
type memData struct {
	fsutil.SimpleFileInode

	t *kernel.Task
}

// memDataFile implements fs.FileOperations for /proc/[pid]/mem.
//
// +stateify savable
type memDataFile struct {
	fsutil.FileGenericSeek          `state:"nosave"`
	fsutil.FileNoIoctl              `state:"nosave"`
	fsutil.FileNoMMap               `state:"nosave"`
	fsutil.FileNoWrite              `state:"nosave"`
	fsutil.FileNoSplice             `state:"nosave"`
	fsutil.FileNoopFlush            `state:"nosave"`
	fsutil.FileNoopFsync            `state:"nosave"`
	fsutil.FileNoopRelease          `state:"nosave"`
	fsutil.FileNotDirReaddir        `state:"nosave"`
	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
	waiter.AlwaysReady              `state:"nosave"`

	t *kernel.Task
}

func newMem(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
	inode := &memData{
		SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0400), linux.PROC_SUPER_MAGIC),
		t:               t,
	}
	return newProcInode(ctx, inode, msrc, fs.SpecialFile, t)
}

// Truncate implements fs.InodeOperations.Truncate.
func (m *memData) Truncate(context.Context, *fs.Inode, int64) error {
	return nil
}

// GetFile implements fs.InodeOperations.GetFile.
func (m *memData) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
	// TODO(gvisor.dev/issue/260): Add check for PTRACE_MODE_ATTACH_FSCREDS
	// Permission to read this file is governed by PTRACE_MODE_ATTACH_FSCREDS
	// Since we dont implement setfsuid/setfsgid we can just use PTRACE_MODE_ATTACH
	if !kernel.ContextCanTrace(ctx, m.t, true) {
		return nil, linuxerr.EACCES
	}
	if err := checkTaskState(m.t); err != nil {
		return nil, err
	}
	// Enable random access reads
	flags.Pread = true
	return fs.NewFile(ctx, dirent, flags, &memDataFile{t: m.t}), nil
}

// Read implements fs.FileOperations.Read.
func (m *memDataFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
	if dst.NumBytes() == 0 {
		return 0, nil
	}
	mm, err := getTaskMMIncRef(m.t)
	if err != nil {
		return 0, nil
	}
	defer mm.DecUsers(ctx)
	// Buffer the read data because of MM locks
	buf := make([]byte, dst.NumBytes())
	n, readErr := mm.CopyIn(ctx, hostarch.Addr(offset), buf, usermem.IOOpts{IgnorePermissions: true})
	if n > 0 {
		if _, err := dst.CopyOut(ctx, buf[:n]); err != nil {
			return 0, linuxerr.EFAULT
		}
		return int64(n), nil
	}
	if readErr != nil {
		return 0, linuxerr.EIO
	}
	return 0, nil
}

// mapsData implements seqfile.SeqSource for /proc/[pid]/maps.
//
// +stateify savable
type mapsData struct {
	t *kernel.Task
}

func newMaps(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
	return newProcInode(ctx, seqfile.NewSeqFile(ctx, &mapsData{t}), msrc, fs.SpecialFile, t)
}

// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate.
func (md *mapsData) NeedsUpdate(generation int64) bool {
	if mm := getTaskMM(md.t); mm != nil {
		return mm.NeedsUpdate(generation)
	}
	return true
}

// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData.
func (md *mapsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
	if mm := getTaskMM(md.t); mm != nil {
		return mm.ReadMapsSeqFileData(ctx, h)
	}
	return []seqfile.SeqData{}, 0
}

// smapsData implements seqfile.SeqSource for /proc/[pid]/smaps.
//
// +stateify savable
type smapsData struct {
	t *kernel.Task
}

func newSmaps(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
	return newProcInode(ctx, seqfile.NewSeqFile(ctx, &smapsData{t}), msrc, fs.SpecialFile, t)
}

// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate.
func (sd *smapsData) NeedsUpdate(generation int64) bool {
	if mm := getTaskMM(sd.t); mm != nil {
		return mm.NeedsUpdate(generation)
	}
	return true
}

// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData.
func (sd *smapsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
	if mm := getTaskMM(sd.t); mm != nil {
		return mm.ReadSmapsSeqFileData(ctx, h)
	}
	return []seqfile.SeqData{}, 0
}

// +stateify savable
type taskStatData struct {
	t *kernel.Task

	// If tgstats is true, accumulate fault stats (not implemented) and CPU
	// time across all tasks in t's thread group.
	tgstats bool

	// pidns is the PID namespace associated with the proc filesystem that
	// includes the file using this statData.
	pidns *kernel.PIDNamespace
}

func newTaskStat(ctx context.Context, t *kernel.Task, msrc *fs.MountSource, showSubtasks bool, pidns *kernel.PIDNamespace) *fs.Inode {
	return newProcInode(ctx, seqfile.NewSeqFile(ctx, &taskStatData{t, showSubtasks /* tgstats */, pidns}), msrc, fs.SpecialFile, t)
}

// NeedsUpdate returns whether the generation is old or not.
func (s *taskStatData) NeedsUpdate(generation int64) bool {
	return true
}

// ReadSeqFileData returns data for the SeqFile reader.
// SeqData, the current generation and where in the file the handle corresponds to.
func (s *taskStatData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
	if h != nil {
		return nil, 0
	}

	var buf bytes.Buffer

	fmt.Fprintf(&buf, "%d ", s.pidns.IDOfTask(s.t))
	fmt.Fprintf(&buf, "(%s) ", s.t.Name())
	fmt.Fprintf(&buf, "%c ", s.t.StateStatus()[0])
	ppid := kernel.ThreadID(0)
	if parent := s.t.Parent(); parent != nil {
		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
	}
	fmt.Fprintf(&buf, "%d ", ppid)
	fmt.Fprintf(&buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup()))
	fmt.Fprintf(&buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session()))
	fmt.Fprintf(&buf, "0 0 " /* tty_nr tpgid */)
	fmt.Fprintf(&buf, "0 " /* flags */)
	fmt.Fprintf(&buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */)
	var cputime usage.CPUStats
	if s.tgstats {
		cputime = s.t.ThreadGroup().CPUStats()
	} else {
		cputime = s.t.CPUStats()
	}
	fmt.Fprintf(&buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
	cputime = s.t.ThreadGroup().JoinedChildCPUStats()
	fmt.Fprintf(&buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
	fmt.Fprintf(&buf, "%d %d ", s.t.Priority(), s.t.Niceness())
	fmt.Fprintf(&buf, "%d ", s.t.ThreadGroup().Count())

	// itrealvalue. Since kernel 2.6.17, this field is no longer
	// maintained, and is hard coded as 0.
	fmt.Fprintf(&buf, "0 ")

	// Start time is relative to boot time, expressed in clock ticks.
	fmt.Fprintf(&buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime())))

	var vss, rss uint64
	if mm := getTaskMM(s.t); mm != nil {
		vss = mm.VirtualMemorySize()
		rss = mm.ResidentSetSize()
	}
	fmt.Fprintf(&buf, "%d %d ", vss, rss/hostarch.PageSize)

	// rsslim.
	fmt.Fprintf(&buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur)

	fmt.Fprintf(&buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */)
	fmt.Fprintf(&buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */)
	fmt.Fprintf(&buf, "0 0 " /* nswap cnswap */)
	terminationSignal := linux.Signal(0)
	if s.t == s.t.ThreadGroup().Leader() {
		terminationSignal = s.t.ThreadGroup().TerminationSignal()
	}
	fmt.Fprintf(&buf, "%d ", terminationSignal)
	fmt.Fprintf(&buf, "0 0 0 " /* processor rt_priority policy */)
	fmt.Fprintf(&buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */)
	fmt.Fprintf(&buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */)
	fmt.Fprintf(&buf, "0\n" /* exit_code */)

	return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*taskStatData)(nil)}}, 0
}

// statmData implements seqfile.SeqSource for /proc/[pid]/statm.
//
// +stateify savable
type statmData struct {
	t *kernel.Task
}

func newStatm(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
	return newProcInode(ctx, seqfile.NewSeqFile(ctx, &statmData{t}), msrc, fs.SpecialFile, t)
}

// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate.
func (s *statmData) NeedsUpdate(generation int64) bool {
	return true
}

// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData.
func (s *statmData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
	if h != nil {
		return nil, 0
	}

	var vss, rss uint64
	if mm := getTaskMM(s.t); mm != nil {
		vss = mm.VirtualMemorySize()
		rss = mm.ResidentSetSize()
	}

	var buf bytes.Buffer
	fmt.Fprintf(&buf, "%d %d 0 0 0 0 0\n", vss/hostarch.PageSize, rss/hostarch.PageSize)

	return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*statmData)(nil)}}, 0
}

// statusData implements seqfile.SeqSource for /proc/[pid]/status.
//
// +stateify savable
type statusData struct {
	t     *kernel.Task
	pidns *kernel.PIDNamespace
}

func newStatus(ctx context.Context, t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace) *fs.Inode {
	return newProcInode(ctx, seqfile.NewSeqFile(ctx, &statusData{t, pidns}), msrc, fs.SpecialFile, t)
}

// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate.
func (s *statusData) NeedsUpdate(generation int64) bool {
	return true
}

// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData.
func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
	if h != nil {
		return nil, 0
	}

	var buf bytes.Buffer
	fmt.Fprintf(&buf, "Name:\t%s\n", s.t.Name())
	fmt.Fprintf(&buf, "State:\t%s\n", s.t.StateStatus())
	fmt.Fprintf(&buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup()))
	fmt.Fprintf(&buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t))
	ppid := kernel.ThreadID(0)
	if parent := s.t.Parent(); parent != nil {
		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
	}
	fmt.Fprintf(&buf, "PPid:\t%d\n", ppid)
	tpid := kernel.ThreadID(0)
	if tracer := s.t.Tracer(); tracer != nil {
		tpid = s.pidns.IDOfTask(tracer)
	}
	fmt.Fprintf(&buf, "TracerPid:\t%d\n", tpid)
	var fds int
	var vss, rss, data uint64
	s.t.WithMuLocked(func(t *kernel.Task) {
		if fdTable := t.FDTable(); fdTable != nil {
			fds = fdTable.CurrentMaxFDs()
		}
	})

	if mm := getTaskMM(s.t); mm != nil {
		vss = mm.VirtualMemorySize()
		rss = mm.ResidentSetSize()
		data = mm.VirtualDataSize()
	}
	fmt.Fprintf(&buf, "FDSize:\t%d\n", fds)
	fmt.Fprintf(&buf, "VmSize:\t%d kB\n", vss>>10)
	fmt.Fprintf(&buf, "VmRSS:\t%d kB\n", rss>>10)
	fmt.Fprintf(&buf, "VmData:\t%d kB\n", data>>10)
	fmt.Fprintf(&buf, "Threads:\t%d\n", s.t.ThreadGroup().Count())
	creds := s.t.Credentials()
	fmt.Fprintf(&buf, "CapInh:\t%016x\n", creds.InheritableCaps)
	fmt.Fprintf(&buf, "CapPrm:\t%016x\n", creds.PermittedCaps)
	fmt.Fprintf(&buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
	fmt.Fprintf(&buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
	fmt.Fprintf(&buf, "Seccomp:\t%d\n", s.t.SeccompMode())
	// We unconditionally report a single NUMA node. See
	// pkg/sentry/syscalls/linux/sys_mempolicy.go.
	fmt.Fprintf(&buf, "Mems_allowed:\t1\n")
	fmt.Fprintf(&buf, "Mems_allowed_list:\t0\n")
	return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*statusData)(nil)}}, 0
}

// ioUsage is the /proc/<pid>/io and /proc/<pid>/task/<tid>/io data provider.
type ioUsage interface {
	// IOUsage returns the io usage data.
	IOUsage() *usage.IO
}

// +stateify savable
type ioData struct {
	ioUsage
}

func newIO(ctx context.Context, t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode {
	if isThreadGroup {
		return newProcInode(ctx, seqfile.NewSeqFile(ctx, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t)
	}
	return newProcInode(ctx, seqfile.NewSeqFile(ctx, &ioData{t}), msrc, fs.SpecialFile, t)
}

// NeedsUpdate returns whether the generation is old or not.
func (i *ioData) NeedsUpdate(generation int64) bool {
	return true
}

// ReadSeqFileData returns data for the SeqFile reader.
// SeqData, the current generation and where in the file the handle corresponds to.
func (i *ioData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
	if h != nil {
		return nil, 0
	}

	io := usage.IO{}
	io.Accumulate(i.IOUsage())

	var buf bytes.Buffer
	fmt.Fprintf(&buf, "rchar: %d\n", io.CharsRead)
	fmt.Fprintf(&buf, "wchar: %d\n", io.CharsWritten)
	fmt.Fprintf(&buf, "syscr: %d\n", io.ReadSyscalls)
	fmt.Fprintf(&buf, "syscw: %d\n", io.WriteSyscalls)
	fmt.Fprintf(&buf, "read_bytes: %d\n", io.BytesRead)
	fmt.Fprintf(&buf, "write_bytes: %d\n", io.BytesWritten)
	fmt.Fprintf(&buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled)

	return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*ioData)(nil)}}, 0
}

// comm is a file containing the command name for a task.
//
// On Linux, /proc/[pid]/comm is writable, and writing to the comm file changes
// the thread name. We don't implement this yet as there are no known users of
// this feature.
//
// +stateify savable
type comm struct {
	fsutil.SimpleFileInode

	t *kernel.Task
}

// newComm returns a new comm file.
func newComm(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
	c := &comm{
		SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC),
		t:               t,
	}
	return newProcInode(ctx, c, msrc, fs.SpecialFile, t)
}

// Check implements fs.InodeOperations.Check.
func (c *comm) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool {
	// This file can always be read or written by members of the same
	// thread group. See fs/proc/base.c:proc_tid_comm_permission.
	//
	// N.B. This check is currently a no-op as we don't yet support writing
	// and this file is world-readable anyways.
	t := kernel.TaskFromContext(ctx)
	if t != nil && t.ThreadGroup() == c.t.ThreadGroup() && !p.Execute {
		return true
	}

	return fs.ContextCanAccessFile(ctx, inode, p)
}

// GetFile implements fs.InodeOperations.GetFile.
func (c *comm) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
	return fs.NewFile(ctx, dirent, flags, &commFile{t: c.t}), nil
}

// +stateify savable
type commFile struct {
	fsutil.FileGenericSeek          `state:"nosave"`
	fsutil.FileNoIoctl              `state:"nosave"`
	fsutil.FileNoMMap               `state:"nosave"`
	fsutil.FileNoSplice             `state:"nosave"`
	fsutil.FileNoWrite              `state:"nosave"`
	fsutil.FileNoopFlush            `state:"nosave"`
	fsutil.FileNoopFsync            `state:"nosave"`
	fsutil.FileNoopRelease          `state:"nosave"`
	fsutil.FileNotDirReaddir        `state:"nosave"`
	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
	waiter.AlwaysReady              `state:"nosave"`

	t *kernel.Task
}

var _ fs.FileOperations = (*commFile)(nil)

// Read implements fs.FileOperations.Read.
func (f *commFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
	if offset < 0 {
		return 0, linuxerr.EINVAL
	}

	buf := []byte(f.t.Name() + "\n")
	if offset >= int64(len(buf)) {
		return 0, io.EOF
	}

	n, err := dst.CopyOut(ctx, buf[offset:])
	return int64(n), err
}

// auxvec is a file containing the auxiliary vector for a task.
//
// +stateify savable
type auxvec struct {
	fsutil.SimpleFileInode

	t *kernel.Task
}

// newAuxvec returns a new auxvec file.
func newAuxvec(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
	a := &auxvec{
		SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC),
		t:               t,
	}
	return newProcInode(ctx, a, msrc, fs.SpecialFile, t)
}

// GetFile implements fs.InodeOperations.GetFile.
func (a *auxvec) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
	return fs.NewFile(ctx, dirent, flags, &auxvecFile{t: a.t}), nil
}

// +stateify savable
type auxvecFile struct {
	fsutil.FileGenericSeek          `state:"nosave"`
	fsutil.FileNoIoctl              `state:"nosave"`
	fsutil.FileNoMMap               `state:"nosave"`
	fsutil.FileNoSplice             `state:"nosave"`
	fsutil.FileNoWrite              `state:"nosave"`
	fsutil.FileNoopFlush            `state:"nosave"`
	fsutil.FileNoopFsync            `state:"nosave"`
	fsutil.FileNoopRelease          `state:"nosave"`
	fsutil.FileNotDirReaddir        `state:"nosave"`
	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
	waiter.AlwaysReady              `state:"nosave"`

	t *kernel.Task
}

// Read implements fs.FileOperations.Read.
func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
	if offset < 0 {
		return 0, linuxerr.EINVAL
	}

	m, err := getTaskMMIncRef(f.t)
	if err != nil {
		return 0, err
	}
	defer m.DecUsers(ctx)
	auxv := m.Auxv()

	// Space for buffer with AT_NULL (0) terminator at the end.
	size := (len(auxv) + 1) * 16
	if offset >= int64(size) {
		return 0, io.EOF
	}

	buf := make([]byte, size)
	for i, e := range auxv {
		hostarch.ByteOrder.PutUint64(buf[16*i:], e.Key)
		hostarch.ByteOrder.PutUint64(buf[16*i+8:], uint64(e.Value))
	}

	n, err := dst.CopyOut(ctx, buf[offset:])
	return int64(n), err
}

// newOOMScore returns a oom_score file. It is a stub that always returns 0.
// TODO(gvisor.dev/issue/1967)
func newOOMScore(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
	return newStaticProcInode(ctx, msrc, []byte("0\n"))
}

// oomScoreAdj is a file containing the oom_score adjustment for a task.
//
// +stateify savable
type oomScoreAdj struct {
	fsutil.SimpleFileInode

	t *kernel.Task
}

// +stateify savable
type oomScoreAdjFile struct {
	fsutil.FileGenericSeek          `state:"nosave"`
	fsutil.FileNoIoctl              `state:"nosave"`
	fsutil.FileNoMMap               `state:"nosave"`
	fsutil.FileNoSplice             `state:"nosave"`
	fsutil.FileNoopFlush            `state:"nosave"`
	fsutil.FileNoopFsync            `state:"nosave"`
	fsutil.FileNoopRelease          `state:"nosave"`
	fsutil.FileNotDirReaddir        `state:"nosave"`
	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
	waiter.AlwaysReady              `state:"nosave"`

	t *kernel.Task
}

// newOOMScoreAdj returns a oom_score_adj file.
func newOOMScoreAdj(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
	i := &oomScoreAdj{
		SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC),
		t:               t,
	}
	return newProcInode(ctx, i, msrc, fs.SpecialFile, t)
}

// Truncate implements fs.InodeOperations.Truncate. Truncate is called when
// O_TRUNC is specified for any kind of existing Dirent but is not called via
// (f)truncate for proc files.
func (*oomScoreAdj) Truncate(context.Context, *fs.Inode, int64) error {
	return nil
}

// GetFile implements fs.InodeOperations.GetFile.
func (o *oomScoreAdj) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
	return fs.NewFile(ctx, dirent, flags, &oomScoreAdjFile{t: o.t}), nil
}

// Read implements fs.FileOperations.Read.
func (f *oomScoreAdjFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
	if f.t.ExitState() == kernel.TaskExitDead {
		return 0, linuxerr.ESRCH
	}
	var buf bytes.Buffer
	fmt.Fprintf(&buf, "%d\n", f.t.OOMScoreAdj())
	if offset >= int64(buf.Len()) {
		return 0, io.EOF
	}
	n, err := dst.CopyOut(ctx, buf.Bytes()[offset:])
	return int64(n), err
}

// Write implements fs.FileOperations.Write.
func (f *oomScoreAdjFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
	if src.NumBytes() == 0 {
		return 0, nil
	}

	// Limit input size so as not to impact performance if input size is large.
	src = src.TakeFirst(hostarch.PageSize - 1)

	var v int32
	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
	if err != nil {
		return 0, err
	}

	if f.t.ExitState() == kernel.TaskExitDead {
		return 0, linuxerr.ESRCH
	}
	if err := f.t.SetOOMScoreAdj(v); err != nil {
		return 0, err
	}

	return n, nil
}

// LINT.ThenChange(../../fsimpl/proc/task.go|../../fsimpl/proc/task_files.go)