diff options
Diffstat (limited to 'pkg/sentry')
309 files changed, 14124 insertions, 2467 deletions
diff --git a/pkg/sentry/BUILD b/pkg/sentry/BUILD index 53989301f..2d6379c86 100644 --- a/pkg/sentry/BUILD +++ b/pkg/sentry/BUILD @@ -8,5 +8,7 @@ package_group( packages = [ "//pkg/sentry/...", "//runsc/...", + # Code generated by go_marshal relies on go_marshal libraries. + "//tools/go_marshal/...", ], ) diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 7aace2d7b..18c73cc24 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -1,9 +1,9 @@ load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +load("@rules_cc//cc:defs.bzl", "cc_proto_library") +load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) -load("//tools/go_stateify:defs.bzl", "go_library") - go_library( name = "arch", srcs = [ @@ -42,6 +42,12 @@ proto_library( visibility = ["//visibility:public"], ) +cc_proto_library( + name = "registers_cc_proto", + visibility = ["//visibility:public"], + deps = [":registers_proto"], +) + go_proto_library( name = "registers_go_proto", importpath = "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto", diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 9e7db8b30..67daa6c24 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -305,7 +305,7 @@ func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) { buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs()) return c.Native(uintptr(usermem.ByteOrder.Uint64(buf[addr:]))), nil } - // TODO(b/34088053): debug registers + // Note: x86 debug registers are missing. return c.Native(0), nil } @@ -320,6 +320,6 @@ func (c *context64) PtracePokeUser(addr, data uintptr) error { _, err := c.PtraceSetRegs(bytes.NewBuffer(buf)) return err } - // TODO(b/34088053): debug registers + // Note: x86 debug registers are missing. return nil } diff --git a/pkg/sentry/context/context.go b/pkg/sentry/context/context.go index dfd62cbdb..23e009ef3 100644 --- a/pkg/sentry/context/context.go +++ b/pkg/sentry/context/context.go @@ -12,10 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package context defines the sentry's Context type. +// Package context defines an internal context type. +// +// The given Context conforms to the standard Go context, but mandates +// additional methods that are specific to the kernel internals. Note however, +// that the Context described by this package carries additional constraints +// regarding concurrent access and retaining beyond the scope of a call. +// +// See the Context type for complete details. package context import ( + "context" + "time" + "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/log" ) @@ -59,6 +69,7 @@ func ThreadGroupIDFromContext(ctx Context) (tgid int32, ok bool) { type Context interface { log.Logger amutex.Sleeper + context.Context // UninterruptibleSleepStart indicates the beginning of an uninterruptible // sleep state (equivalent to Linux's TASK_UNINTERRUPTIBLE). If deactivate @@ -72,19 +83,36 @@ type Context interface { // AddressSpace is activated. Normally activate is the same value as the // deactivate parameter passed to UninterruptibleSleepStart. UninterruptibleSleepFinish(activate bool) +} + +// NoopSleeper is a noop implementation of amutex.Sleeper and UninterruptibleSleep +// methods for anonymous embedding in other types that do not implement sleeps. +type NoopSleeper struct { + amutex.NoopSleeper +} + +// UninterruptibleSleepStart does nothing. +func (NoopSleeper) UninterruptibleSleepStart(bool) {} + +// UninterruptibleSleepFinish does nothing. +func (NoopSleeper) UninterruptibleSleepFinish(bool) {} + +// Deadline returns zero values, meaning no deadline. +func (NoopSleeper) Deadline() (time.Time, bool) { + return time.Time{}, false +} + +// Done returns nil. +func (NoopSleeper) Done() <-chan struct{} { + return nil +} - // Value returns the value associated with this Context for key, or nil if - // no value is associated with key. Successive calls to Value with the same - // key returns the same result. - // - // A key identifies a specific value in a Context. Functions that wish to - // retrieve values from Context typically allocate a key in a global - // variable then use that key as the argument to Context.Value. A key can - // be any type that supports equality; packages should define keys as an - // unexported type to avoid collisions. - Value(key interface{}) interface{} +// Err returns nil. +func (NoopSleeper) Err() error { + return nil } +// logContext implements basic logging. type logContext struct { log.Logger NoopSleeper @@ -95,19 +123,6 @@ func (logContext) Value(key interface{}) interface{} { return nil } -// NoopSleeper is a noop implementation of amutex.Sleeper and -// Context.UninterruptibleSleep* methods for anonymous embedding in other types -// that do not want to notify kernel.Task about sleeps. -type NoopSleeper struct { - amutex.NoopSleeper -} - -// UninterruptibleSleepStart does nothing. -func (NoopSleeper) UninterruptibleSleepStart(bool) {} - -// UninterruptibleSleepFinish does nothing. -func (NoopSleeper) UninterruptibleSleepFinish(bool) {} - // bgContext is the context returned by context.Background. var bgContext = &logContext{Logger: log.Log()} diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD index 3b6841b7e..581e7aa96 100644 --- a/pkg/sentry/context/contexttest/BUILD +++ b/pkg/sentry/context/contexttest/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "contexttest", testonly = 1, diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index bf802d1b6..5522cecd0 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -1,4 +1,5 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") package(licenses = ["notice"]) diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go index 1f78d54a2..e1f2fea60 100644 --- a/pkg/sentry/control/pprof.go +++ b/pkg/sentry/control/pprof.go @@ -22,6 +22,7 @@ import ( "sync" "gvisor.dev/gvisor/pkg/fd" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/urpc" ) @@ -56,6 +57,9 @@ type Profile struct { // traceFile is the current execution trace output file. traceFile *fd.FD + + // Kernel is the kernel under profile. + Kernel *kernel.Kernel } // StartCPUProfile is an RPC stub which starts recording the CPU profile in a @@ -147,6 +151,9 @@ func (p *Profile) StartTrace(o *ProfileOpts, _ *struct{}) error { return err } + // Ensure all trace contexts are registered. + p.Kernel.RebuildTraceContexts() + p.traceFile = output return nil } @@ -158,9 +165,15 @@ func (p *Profile) StopTrace(_, _ *struct{}) error { defer p.mu.Unlock() if p.traceFile == nil { - return errors.New("Execution tracing not start") + return errors.New("Execution tracing not started") } + // Similarly to the case above, if tasks have not ended traces, we will + // lose information. Thus we need to rebuild the tasks in order to have + // complete information. This will not lose information if multiple + // traces are overlapping. + p.Kernel.RebuildTraceContexts() + trace.Stop() p.traceFile.Close() p.traceFile = nil diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index c35faeb4c..ced51c66c 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -268,14 +268,17 @@ func (proc *Proc) Ps(args *PsArgs, out *string) error { } // Process contains information about a single process in a Sandbox. -// TODO(b/117881927): Implement TTY field. type Process struct { UID auth.KUID `json:"uid"` PID kernel.ThreadID `json:"pid"` // Parent PID - PPID kernel.ThreadID `json:"ppid"` + PPID kernel.ThreadID `json:"ppid"` + Threads []kernel.ThreadID `json:"threads"` // Processor utilization C int32 `json:"c"` + // TTY name of the process. Will be of the form "pts/N" if there is a + // TTY, or "?" if there is not. + TTY string `json:"tty"` // Start time STime string `json:"stime"` // CPU time @@ -285,18 +288,19 @@ type Process struct { } // ProcessListToTable prints a table with the following format: -// UID PID PPID C STIME TIME CMD -// 0 1 0 0 14:04 505262ns tail +// UID PID PPID C TTY STIME TIME CMD +// 0 1 0 0 pty/4 14:04 505262ns tail func ProcessListToTable(pl []*Process) string { var buf bytes.Buffer tw := tabwriter.NewWriter(&buf, 10, 1, 3, ' ', 0) - fmt.Fprint(tw, "UID\tPID\tPPID\tC\tSTIME\tTIME\tCMD") + fmt.Fprint(tw, "UID\tPID\tPPID\tC\tTTY\tSTIME\tTIME\tCMD") for _, d := range pl { - fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s", + fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s", d.UID, d.PID, d.PPID, d.C, + d.TTY, d.STime, d.Time, d.Cmd) @@ -307,7 +311,7 @@ func ProcessListToTable(pl []*Process) string { // ProcessListToJSON will return the JSON representation of ps. func ProcessListToJSON(pl []*Process) (string, error) { - b, err := json.Marshal(pl) + b, err := json.MarshalIndent(pl, "", " ") if err != nil { return "", fmt.Errorf("couldn't marshal process list %v: %v", pl, err) } @@ -334,7 +338,9 @@ func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error { ts := k.TaskSet() now := k.RealtimeClock().Now() for _, tg := range ts.Root.ThreadGroups() { - pid := tg.PIDNamespace().IDOfThreadGroup(tg) + pidns := tg.PIDNamespace() + pid := pidns.IDOfThreadGroup(tg) + // If tg has already been reaped ignore it. if pid == 0 { continue @@ -345,16 +351,19 @@ func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error { ppid := kernel.ThreadID(0) if p := tg.Leader().Parent(); p != nil { - ppid = p.PIDNamespace().IDOfThreadGroup(p.ThreadGroup()) + ppid = pidns.IDOfThreadGroup(p.ThreadGroup()) } + threads := tg.MemberIDs(pidns) *out = append(*out, &Process{ - UID: tg.Leader().Credentials().EffectiveKUID, - PID: pid, - PPID: ppid, - STime: formatStartTime(now, tg.Leader().StartTime()), - C: percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now), - Time: tg.CPUStats().SysTime.String(), - Cmd: tg.Leader().Name(), + UID: tg.Leader().Credentials().EffectiveKUID, + PID: pid, + PPID: ppid, + Threads: threads, + STime: formatStartTime(now, tg.Leader().StartTime()), + C: percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now), + Time: tg.CPUStats().SysTime.String(), + Cmd: tg.Leader().Name(), + TTY: ttyName(tg.TTY()), }) } sort.Slice(*out, func(i, j int) bool { return (*out)[i].PID < (*out)[j].PID }) @@ -395,3 +404,10 @@ func percentCPU(stats usage.CPUStats, startTime, now ktime.Time) int32 { } return int32(percentCPU) } + +func ttyName(tty *kernel.TTY) string { + if tty == nil { + return "?" + } + return fmt.Sprintf("pts/%d", tty.Index) +} diff --git a/pkg/sentry/control/proc_test.go b/pkg/sentry/control/proc_test.go index d8ada2694..0a88459b2 100644 --- a/pkg/sentry/control/proc_test.go +++ b/pkg/sentry/control/proc_test.go @@ -34,7 +34,7 @@ func TestProcessListTable(t *testing.T) { }{ { pl: []*Process{}, - expected: "UID PID PPID C STIME TIME CMD", + expected: "UID PID PPID C TTY STIME TIME CMD", }, { pl: []*Process{ @@ -43,6 +43,7 @@ func TestProcessListTable(t *testing.T) { PID: 0, PPID: 0, C: 0, + TTY: "?", STime: "0", Time: "0", Cmd: "zero", @@ -52,14 +53,15 @@ func TestProcessListTable(t *testing.T) { PID: 1, PPID: 1, C: 1, + TTY: "pts/4", STime: "1", Time: "1", Cmd: "one", }, }, - expected: `UID PID PPID C STIME TIME CMD -0 0 0 0 0 0 zero -1 1 1 1 1 1 one`, + expected: `UID PID PPID C TTY STIME TIME CMD +0 0 0 0 ? 0 0 zero +1 1 1 1 pts/4 1 1 one`, }, } diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD index 7e8918722..1098ed777 100644 --- a/pkg/sentry/device/BUILD +++ b/pkg/sentry/device/BUILD @@ -1,6 +1,7 @@ -package(licenses = ["notice"]) +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) go_library( name = "device", diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index d7259b47b..c035ffff7 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -1,7 +1,8 @@ -package(licenses = ["notice"]) - +load("@io_bazel_rules_go//go:def.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) go_library( name = "fs", @@ -67,9 +68,9 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/state", + "//pkg/syncutil", "//pkg/syserror", "//pkg/waiter", - "//third_party/gvsync", ], ) diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index 80e106e6f..a0d9e8496 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "dev", srcs = [ diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index fbca06761..3cb73bd78 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -1126,7 +1126,7 @@ func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error { // Remove removes the given file or symlink. The root dirent is used to // resolve name, and must not be nil. -func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string) error { +func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath bool) error { // Check the root. if root == nil { panic("Dirent.Remove: root must not be nil") @@ -1151,6 +1151,8 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string) error { // Remove cannot remove directories. if IsDir(child.Inode.StableAttr) { return syscall.EISDIR + } else if dirPath { + return syscall.ENOTDIR } // Remove cannot remove a mount point. diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go index 884e3ff06..47bc72a88 100644 --- a/pkg/sentry/fs/dirent_refs_test.go +++ b/pkg/sentry/fs/dirent_refs_test.go @@ -343,7 +343,7 @@ func TestRemoveExtraRefs(t *testing.T) { } d := f.Dirent - if err := test.root.Remove(contexttest.Context(t), test.root, name); err != nil { + if err := test.root.Remove(contexttest.Context(t), test.root, name, false /* dirPath */); err != nil { t.Fatalf("root.Remove(root, %q) failed: %v", name, err) } diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD index bf00b9c09..277ee4c31 100644 --- a/pkg/sentry/fs/fdpipe/BUILD +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -1,6 +1,7 @@ -package(licenses = ["notice"]) +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) go_library( name = "fdpipe", diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go index 8e4d839e1..577445148 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go @@ -25,6 +25,7 @@ import ( "time" "github.com/google/uuid" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index bb8117f89..c0a6e884b 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -515,6 +515,11 @@ type lockedReader struct { // File is the file to read from. File *File + + // Offset is the offset to start at. + // + // This applies only to Read, not ReadAt. + Offset int64 } // Read implements io.Reader.Read. @@ -522,7 +527,8 @@ func (r *lockedReader) Read(buf []byte) (int, error) { if r.Ctx.Interrupted() { return 0, syserror.ErrInterrupted } - n, err := r.File.FileOperations.Read(r.Ctx, r.File, usermem.BytesIOSequence(buf), r.File.offset) + n, err := r.File.FileOperations.Read(r.Ctx, r.File, usermem.BytesIOSequence(buf), r.Offset) + r.Offset += n return int(n), err } @@ -544,11 +550,21 @@ type lockedWriter struct { // File is the file to write to. File *File + + // Offset is the offset to start at. + // + // This applies only to Write, not WriteAt. + Offset int64 } // Write implements io.Writer.Write. func (w *lockedWriter) Write(buf []byte) (int, error) { - return w.WriteAt(buf, w.File.offset) + if w.Ctx.Interrupted() { + return 0, syserror.ErrInterrupted + } + n, err := w.WriteAt(buf, w.Offset) + w.Offset += int64(n) + return int(n), err } // WriteAt implements io.Writer.WriteAt. @@ -562,6 +578,9 @@ func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) { // io.Copy, since our own Write interface does not have this same // contract. Enforce that here. for written < len(buf) { + if w.Ctx.Interrupted() { + return written, syserror.ErrInterrupted + } var n int64 n, err = w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf[written:]), offset+int64(written)) if n > 0 { diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index d86f5bf45..b88303f17 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -15,6 +15,8 @@ package fs import ( + "io" + "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -105,8 +107,11 @@ type FileOperations interface { // on the destination, following by a buffered copy with standard Read // and Write operations. // + // If dup is set, the data should be duplicated into the destination + // and retained. + // // The same preconditions as Read apply. - WriteTo(ctx context.Context, file *File, dst *File, opts SpliceOpts) (int64, error) + WriteTo(ctx context.Context, file *File, dst io.Writer, count int64, dup bool) (int64, error) // Write writes src to file at offset and returns the number of bytes // written which must be greater than or equal to 0. Like Read, file @@ -126,7 +131,7 @@ type FileOperations interface { // source. See WriteTo for details regarding how this is called. // // The same preconditions as Write apply; FileFlags.Write must be set. - ReadFrom(ctx context.Context, file *File, src *File, opts SpliceOpts) (int64, error) + ReadFrom(ctx context.Context, file *File, src io.Reader, count int64) (int64, error) // Fsync writes buffered modifications of file and/or flushes in-flight // operations to backing storage based on syncType. The range to sync is diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 9820f0b13..225e40186 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -15,6 +15,7 @@ package fs import ( + "io" "sync" "gvisor.dev/gvisor/pkg/refs" @@ -268,9 +269,9 @@ func (f *overlayFileOperations) Read(ctx context.Context, file *File, dst userme } // WriteTo implements FileOperations.WriteTo. -func (f *overlayFileOperations) WriteTo(ctx context.Context, file *File, dst *File, opts SpliceOpts) (n int64, err error) { +func (f *overlayFileOperations) WriteTo(ctx context.Context, file *File, dst io.Writer, count int64, dup bool) (n int64, err error) { err = f.onTop(ctx, file, func(file *File, ops FileOperations) error { - n, err = ops.WriteTo(ctx, file, dst, opts) + n, err = ops.WriteTo(ctx, file, dst, count, dup) return err // Will overwrite itself. }) return @@ -285,9 +286,9 @@ func (f *overlayFileOperations) Write(ctx context.Context, file *File, src userm } // ReadFrom implements FileOperations.ReadFrom. -func (f *overlayFileOperations) ReadFrom(ctx context.Context, file *File, src *File, opts SpliceOpts) (n int64, err error) { +func (f *overlayFileOperations) ReadFrom(ctx context.Context, file *File, src io.Reader, count int64) (n int64, err error) { // See above; f.upper must be non-nil. - return f.upper.FileOperations.ReadFrom(ctx, f.upper, src, opts) + return f.upper.FileOperations.ReadFrom(ctx, f.upper, src, count) } // Fsync implements FileOperations.Fsync. diff --git a/pkg/sentry/fs/filetest/BUILD b/pkg/sentry/fs/filetest/BUILD index a9d6d9301..358dc2be3 100644 --- a/pkg/sentry/fs/filetest/BUILD +++ b/pkg/sentry/fs/filetest/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "filetest", testonly = 1, diff --git a/pkg/sentry/fs/flags.go b/pkg/sentry/fs/flags.go index 0fab876a9..4338ae1fa 100644 --- a/pkg/sentry/fs/flags.go +++ b/pkg/sentry/fs/flags.go @@ -64,6 +64,10 @@ type FileFlags struct { // NonSeekable indicates that file.offset isn't used. NonSeekable bool + + // Truncate indicates that the file should be truncated before opened. + // This is only applicable if the file is regular. + Truncate bool } // SettableFileFlags is a subset of FileFlags above that can be changed @@ -118,6 +122,9 @@ func (f FileFlags) ToLinux() (mask uint) { if f.LargeFile { mask |= linux.O_LARGEFILE } + if f.Truncate { + mask |= linux.O_TRUNC + } switch { case f.Read && f.Write: diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 6499f87ac..b2e8d9c77 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -1,7 +1,8 @@ -package(licenses = ["notice"]) - +load("@io_bazel_rules_go//go:def.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) go_template_instance( name = "dirty_set_impl", diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index 626b9126a..fc5b3b1a1 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -15,6 +15,8 @@ package fsutil import ( + "io" + "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -228,12 +230,12 @@ func (FileNoIoctl) Ioctl(context.Context, *fs.File, usermem.IO, arch.SyscallArgu type FileNoSplice struct{} // WriteTo implements fs.FileOperations.WriteTo. -func (FileNoSplice) WriteTo(context.Context, *fs.File, *fs.File, fs.SpliceOpts) (int64, error) { +func (FileNoSplice) WriteTo(context.Context, *fs.File, io.Writer, int64, bool) (int64, error) { return 0, syserror.ENOSYS } // ReadFrom implements fs.FileOperations.ReadFrom. -func (FileNoSplice) ReadFrom(context.Context, *fs.File, *fs.File, fs.SpliceOpts) (int64, error) { +func (FileNoSplice) ReadFrom(context.Context, *fs.File, io.Reader, int64) (int64, error) { return 0, syserror.ENOSYS } diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index e239f12a5..b06a71cc2 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -209,3 +209,29 @@ func (f *HostFileMapper) unmapAndRemoveLocked(chunkStart uint64, m mapping) { } delete(f.mappings, chunkStart) } + +// RegenerateMappings must be called when the file description mapped by f +// changes, to replace existing mappings of the previous file description. +func (f *HostFileMapper) RegenerateMappings(fd int) error { + f.mapsMu.Lock() + defer f.mapsMu.Unlock() + + for chunkStart, m := range f.mappings { + prot := syscall.PROT_READ + if m.writable { + prot |= syscall.PROT_WRITE + } + _, _, errno := syscall.Syscall6( + syscall.SYS_MMAP, + m.addr, + chunkSize, + uintptr(prot), + syscall.MAP_SHARED|syscall.MAP_FIXED, + uintptr(fd), + uintptr(chunkStart)) + if errno != 0 { + return errno + } + } + return nil +} diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go index d2495cb83..30475f340 100644 --- a/pkg/sentry/fs/fsutil/host_mappable.go +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -100,13 +100,30 @@ func (h *HostMappable) Translate(ctx context.Context, required, optional memmap. } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. -func (h *HostMappable) InvalidateUnsavable(ctx context.Context) error { +func (h *HostMappable) InvalidateUnsavable(_ context.Context) error { h.mu.Lock() h.mappings.InvalidateAll(memmap.InvalidateOpts{}) h.mu.Unlock() return nil } +// NotifyChangeFD must be called after the file description represented by +// CachedFileObject.FD() changes. +func (h *HostMappable) NotifyChangeFD() error { + // Update existing sentry mappings to refer to the new file description. + if err := h.hostFileMapper.RegenerateMappings(h.backingFile.FD()); err != nil { + return err + } + + // Shoot down existing application mappings of the old file description; + // they will be remapped with the new file description on demand. + h.mu.Lock() + defer h.mu.Unlock() + + h.mappings.InvalidateAll(memmap.InvalidateOpts{}) + return nil +} + // MapInternal implements platform.File.MapInternal. func (h *HostMappable) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { return h.hostFileMapper.MapInternal(fr, h.backingFile.FD(), at.Write) @@ -144,7 +161,7 @@ func (h *HostMappable) Truncate(ctx context.Context, newSize int64) error { mask := fs.AttrMask{Size: true} attr := fs.UnstableAttr{Size: newSize} - if err := h.backingFile.SetMaskedAttributes(ctx, mask, attr); err != nil { + if err := h.backingFile.SetMaskedAttributes(ctx, mask, attr, false); err != nil { return err } diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index d404a79d4..798920d18 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -140,12 +140,16 @@ type CachedFileObject interface { // WriteFromBlocksAt may return a partial write without an error. WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) - // SetMaskedAttributes sets the attributes in attr that are true in mask - // on the backing file. + // SetMaskedAttributes sets the attributes in attr that are true in + // mask on the backing file. If the mask contains only ATime or MTime + // and the CachedFileObject has an FD to the file, then this operation + // is a noop unless forceSetTimestamps is true. This avoids an extra + // RPC to the gofer in the open-read/write-close case, when the + // timestamps on the file will be updated by the host kernel for us. // // SetMaskedAttributes may be called at any point, regardless of whether // the file was opened. - SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error + SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr, forceSetTimestamps bool) error // Allocate allows the caller to reserve disk space for the inode. // It's equivalent to fallocate(2) with 'mode=0'. @@ -224,7 +228,7 @@ func (c *CachingInodeOperations) SetPermissions(ctx context.Context, inode *fs.I now := ktime.NowFromContext(ctx) masked := fs.AttrMask{Perms: true} - if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Perms: perms}); err != nil { + if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Perms: perms}, false); err != nil { return false } c.attr.Perms = perms @@ -246,7 +250,7 @@ func (c *CachingInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, UID: owner.UID.Ok(), GID: owner.GID.Ok(), } - if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Owner: owner}); err != nil { + if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Owner: owner}, false); err != nil { return err } if owner.UID.Ok() { @@ -282,7 +286,9 @@ func (c *CachingInodeOperations) SetTimestamps(ctx context.Context, inode *fs.In AccessTime: !ts.ATimeOmit, ModificationTime: !ts.MTimeOmit, } - if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{AccessTime: ts.ATime, ModificationTime: ts.MTime}); err != nil { + // Call SetMaskedAttributes with forceSetTimestamps = true to make sure + // the timestamp is updated. + if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{AccessTime: ts.ATime, ModificationTime: ts.MTime}, true); err != nil { return err } if !ts.ATimeOmit { @@ -305,7 +311,7 @@ func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, now := ktime.NowFromContext(ctx) masked := fs.AttrMask{Size: true} attr := fs.UnstableAttr{Size: size} - if err := c.backingFile.SetMaskedAttributes(ctx, masked, attr); err != nil { + if err := c.backingFile.SetMaskedAttributes(ctx, masked, attr, false); err != nil { c.dataMu.Unlock() return err } @@ -394,7 +400,7 @@ func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) c.dirtyAttr.Size = false // Write out cached attributes. - if err := c.backingFile.SetMaskedAttributes(ctx, c.dirtyAttr, c.attr); err != nil { + if err := c.backingFile.SetMaskedAttributes(ctx, c.dirtyAttr, c.attr, false); err != nil { c.attrMu.Unlock() return err } @@ -953,6 +959,23 @@ func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error return nil } +// NotifyChangeFD must be called after the file description represented by +// CachedFileObject.FD() changes. +func (c *CachingInodeOperations) NotifyChangeFD() error { + // Update existing sentry mappings to refer to the new file description. + if err := c.hostFileMapper.RegenerateMappings(c.backingFile.FD()); err != nil { + return err + } + + // Shoot down existing application mappings of the old file description; + // they will be remapped with the new file description on demand. + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + + c.mappings.InvalidateAll(memmap.InvalidateOpts{}) + return nil +} + // Evict implements pgalloc.EvictableMemoryUser.Evict. func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.EvictableRange) { c.mapsMu.Lock() @@ -1021,7 +1044,6 @@ func (c *CachingInodeOperations) DecRef(fr platform.FileRange) { } c.refs.MergeAdjacent(fr) c.dataMu.Unlock() - } // MapInternal implements platform.File.MapInternal. This is used when we diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index eb5730c35..129f314c8 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -39,7 +39,7 @@ func (noopBackingFile) WriteFromBlocksAt(ctx context.Context, srcs safemem.Block return srcs.NumBytes(), nil } -func (noopBackingFile) SetMaskedAttributes(context.Context, fs.AttrMask, fs.UnstableAttr) error { +func (noopBackingFile) SetMaskedAttributes(context.Context, fs.AttrMask, fs.UnstableAttr, bool) error { return nil } @@ -230,7 +230,7 @@ func (f *sliceBackingFile) WriteFromBlocksAt(ctx context.Context, srcs safemem.B return w.WriteFromBlocks(srcs) } -func (*sliceBackingFile) SetMaskedAttributes(context.Context, fs.AttrMask, fs.UnstableAttr) error { +func (*sliceBackingFile) SetMaskedAttributes(context.Context, fs.AttrMask, fs.UnstableAttr, bool) error { return nil } diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index 6b993928c..4a005c605 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -1,6 +1,7 @@ -package(licenses = ["notice"]) +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) go_library( name = "gofer", diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 9e2e412cd..7960b9c7b 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -214,28 +214,64 @@ func (f *fileOperations) readdirAll(ctx context.Context) (map[string]fs.DentAttr return entries, nil } +// maybeSync will call FSync on the file if either the cache policy or file +// flags require it. +func (f *fileOperations) maybeSync(ctx context.Context, file *fs.File, offset, n int64) error { + if n == 0 { + // Nothing to sync. + return nil + } + + if f.inodeOperations.session().cachePolicy.writeThrough(file.Dirent.Inode) { + // Call WriteOut directly, as some "writethrough" filesystems + // do not support sync. + return f.inodeOperations.cachingInodeOps.WriteOut(ctx, file.Dirent.Inode) + } + + flags := file.Flags() + var syncType fs.SyncType + switch { + case flags.Direct || flags.Sync: + syncType = fs.SyncAll + case flags.DSync: + syncType = fs.SyncData + default: + // No need to sync. + return nil + } + + return f.Fsync(ctx, file, offset, offset+n, syncType) +} + // Write implements fs.FileOperations.Write. func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { if fs.IsDir(file.Dirent.Inode.StableAttr) { // Not all remote file systems enforce this so this client does. return 0, syserror.EISDIR } - cp := f.inodeOperations.session().cachePolicy - if cp.useCachingInodeOps(file.Dirent.Inode) { - n, err := f.inodeOperations.cachingInodeOps.Write(ctx, src, offset) - if err != nil { - return n, err - } - if cp.writeThrough(file.Dirent.Inode) { - // Write out the file. - err = f.inodeOperations.cachingInodeOps.WriteOut(ctx, file.Dirent.Inode) - } - return n, err + + var ( + n int64 + err error + ) + // The write is handled in different ways depending on the cache policy + // and availability of a host-mappable FD. + if f.inodeOperations.session().cachePolicy.useCachingInodeOps(file.Dirent.Inode) { + n, err = f.inodeOperations.cachingInodeOps.Write(ctx, src, offset) + } else if f.inodeOperations.fileState.hostMappable != nil { + n, err = f.inodeOperations.fileState.hostMappable.Write(ctx, src, offset) + } else { + n, err = src.CopyInTo(ctx, f.handles.readWriterAt(ctx, offset)) } - if f.inodeOperations.fileState.hostMappable != nil { - return f.inodeOperations.fileState.hostMappable.Write(ctx, src, offset) + + // We may need to sync the written bytes. + if syncErr := f.maybeSync(ctx, file, offset, n); syncErr != nil { + // Sync failed. Report 0 bytes written, since none of them are + // guaranteed to have been synced. + return 0, syncErr } - return src.CopyInTo(ctx, f.handles.readWriterAt(ctx, offset)) + + return n, err } // incrementReadCounters increments the read counters for the read starting at the given time. We @@ -273,7 +309,7 @@ func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IO } // Fsync implements fs.FileOperations.Fsync. -func (f *fileOperations) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error { +func (f *fileOperations) Fsync(ctx context.Context, file *fs.File, start, end int64, syncType fs.SyncType) error { switch syncType { case fs.SyncAll, fs.SyncData: if err := file.Dirent.Inode.WriteOut(ctx); err != nil { diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go index 9aa68a70e..bb8312849 100644 --- a/pkg/sentry/fs/gofer/file_state.go +++ b/pkg/sentry/fs/gofer/file_state.go @@ -28,8 +28,14 @@ func (f *fileOperations) afterLoad() { // Manually load the open handles. var err error + + // The file may have been opened with Truncate, but we don't + // want to re-open it with Truncate or we will lose data. + flags := f.flags + flags.Truncate = false + // TODO(b/38173783): Context is not plumbed to save/restore. - f.handles, err = f.inodeOperations.fileState.getHandles(context.Background(), f.flags) + f.handles, err = f.inodeOperations.fileState.getHandles(context.Background(), flags, f.inodeOperations.cachingInodeOps) if err != nil { return fmt.Errorf("failed to re-open handle: %v", err) } diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index 8f8ab5d29..cf96dd9fa 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -58,6 +58,11 @@ const ( // If present, sets CachingInodeOperationsOptions.LimitHostFDTranslation to // true. limitHostFDTranslationKey = "limit_host_fd_translation" + + // overlayfsStaleRead if present closes cached readonly file after the first + // write. This is done to workaround a limitation of overlayfs in kernels + // before 4.19 where open FDs are not updated after the file is copied up. + overlayfsStaleRead = "overlayfs_stale_read" ) // defaultAname is the default attach name. @@ -145,6 +150,7 @@ type opts struct { version string privateunixsocket bool limitHostFDTranslation bool + overlayfsStaleRead bool } // options parses mount(2) data into structured options. @@ -247,6 +253,11 @@ func options(data string) (opts, error) { delete(options, limitHostFDTranslationKey) } + if _, ok := options[overlayfsStaleRead]; ok { + o.overlayfsStaleRead = true + delete(options, overlayfsStaleRead) + } + // Fail to attach if the caller wanted us to do something that we // don't support. if len(options) > 0 { diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index 27eeae3d9..b86c49b39 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -39,14 +39,22 @@ type handles struct { // Host is an *fd.FD handle. May be nil. Host *fd.FD + + // isHostBorrowed tells whether 'Host' is owned or borrowed. If owned, it's + // closed on destruction, otherwise it's released. + isHostBorrowed bool } // DecRef drops a reference on handles. func (h *handles) DecRef() { h.DecRefWithDestructor(func() { if h.Host != nil { - if err := h.Host.Close(); err != nil { - log.Warningf("error closing host file: %v", err) + if h.isHostBorrowed { + h.Host.Release() + } else { + if err := h.Host.Close(); err != nil { + log.Warningf("error closing host file: %v", err) + } } } // FIXME(b/38173783): Context is not plumbed here. @@ -56,7 +64,7 @@ func (h *handles) DecRef() { }) } -func newHandles(ctx context.Context, file contextFile, flags fs.FileFlags) (*handles, error) { +func newHandles(ctx context.Context, client *p9.Client, file contextFile, flags fs.FileFlags) (*handles, error) { _, newFile, err := file.walk(ctx, nil) if err != nil { return nil, err @@ -73,6 +81,9 @@ func newHandles(ctx context.Context, file contextFile, flags fs.FileFlags) (*han default: panic("impossible fs.FileFlags") } + if flags.Truncate && p9.VersionSupportsOpenTruncateFlag(client.Version()) { + p9flags |= p9.OpenTruncate + } hostFile, _, _, err := newFile.open(ctx, p9flags) if err != nil { diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 95b064aea..91263ebdc 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -100,7 +100,7 @@ type inodeFileState struct { // true. // // Once readHandles becomes non-nil, it can't be changed until - // inodeFileState.Release(), because of a defect in the + // inodeFileState.Release()*, because of a defect in the // fsutil.CachedFileObject interface: there's no way for the caller of // fsutil.CachedFileObject.FD() to keep the returned FD open, so if we // racily replace readHandles after inodeFileState.FD() has returned @@ -108,6 +108,9 @@ type inodeFileState struct { // FD. writeHandles can be changed if writeHandlesRW is false, since // inodeFileState.FD() can't return a write-only FD, but can't be changed // if writeHandlesRW is true for the same reason. + // + // * There is one notable exception in recreateReadHandles(), where it dup's + // the FD and invalidates the page cache. readHandles *handles `state:"nosave"` writeHandles *handles `state:"nosave"` writeHandlesRW bool `state:"nosave"` @@ -175,48 +178,135 @@ func (i *inodeFileState) setSharedHandlesLocked(flags fs.FileFlags, h *handles) // getHandles returns a set of handles for a new file using i opened with the // given flags. -func (i *inodeFileState) getHandles(ctx context.Context, flags fs.FileFlags) (*handles, error) { +func (i *inodeFileState) getHandles(ctx context.Context, flags fs.FileFlags, cache *fsutil.CachingInodeOperations) (*handles, error) { if !i.canShareHandles() { - return newHandles(ctx, i.file, flags) + return newHandles(ctx, i.s.client, i.file, flags) } + i.handlesMu.Lock() - defer i.handlesMu.Unlock() - // Do we already have usable shared handles? - if flags.Write { + h, invalidate, err := i.getHandlesLocked(ctx, flags) + i.handlesMu.Unlock() + + if invalidate { + cache.NotifyChangeFD() + if i.hostMappable != nil { + i.hostMappable.NotifyChangeFD() + } + } + + return h, err +} + +// getHandlesLocked returns a pointer to cached handles and a boolean indicating +// whether previously open read handle was recreated. Host mappings must be +// invalidated if so. +func (i *inodeFileState) getHandlesLocked(ctx context.Context, flags fs.FileFlags) (*handles, bool, error) { + // Check if we are able to use cached handles. + if flags.Truncate && p9.VersionSupportsOpenTruncateFlag(i.s.client.Version()) { + // If we are truncating (and the gofer supports it), then we + // always need a new handle. Don't return one from the cache. + } else if flags.Write { if i.writeHandles != nil && (i.writeHandlesRW || !flags.Read) { + // File is opened for writing, and we have cached write + // handles that we can use. i.writeHandles.IncRef() - return i.writeHandles, nil + return i.writeHandles, false, nil } } else if i.readHandles != nil { + // File is opened for reading and we have cached handles. i.readHandles.IncRef() - return i.readHandles, nil + return i.readHandles, false, nil } - // No; get new handles and cache them for future sharing. - h, err := newHandles(ctx, i.file, flags) + + // Get new handles and cache them for future sharing. + h, err := newHandles(ctx, i.s.client, i.file, flags) if err != nil { - return nil, err + return nil, false, err + } + + // Read handles invalidation is needed if: + // - Mount option 'overlayfs_stale_read' is set + // - Read handle is open: nothing to invalidate otherwise + // - Write handle is not open: file was not open for write and is being open + // for write now (will trigger copy up in overlayfs). + invalidate := false + if i.s.overlayfsStaleRead && i.readHandles != nil && i.writeHandles == nil && flags.Write { + if err := i.recreateReadHandles(ctx, h, flags); err != nil { + return nil, false, err + } + invalidate = true } i.setSharedHandlesLocked(flags, h) - return h, nil + return h, invalidate, nil +} + +func (i *inodeFileState) recreateReadHandles(ctx context.Context, writer *handles, flags fs.FileFlags) error { + h := writer + if !flags.Read { + // Writer can't be used for read, must create a new handle. + var err error + h, err = newHandles(ctx, i.s.client, i.file, fs.FileFlags{Read: true}) + if err != nil { + return err + } + defer h.DecRef() + } + + if i.readHandles.Host == nil { + // If current readHandles doesn't have a host FD, it can simply be replaced. + i.readHandles.DecRef() + + h.IncRef() + i.readHandles = h + return nil + } + + if h.Host == nil { + // Current read handle has a host FD and can't be replaced with one that + // doesn't, because it breaks fsutil.CachedFileObject.FD() contract. + log.Warningf("Read handle can't be invalidated, reads may return stale data") + return nil + } + + // Due to a defect in the fsutil.CachedFileObject interface, + // readHandles.Host.FD() may be used outside locks, making it impossible to + // reliably close it. To workaround it, we dup the new FD into the old one, so + // operations on the old will see the new data. Then, make the new handle take + // ownereship of the old FD and mark the old readHandle to not close the FD + // when done. + if err := syscall.Dup3(h.Host.FD(), i.readHandles.Host.FD(), 0); err != nil { + return err + } + + h.Host.Close() + h.Host = fd.New(i.readHandles.Host.FD()) + i.readHandles.isHostBorrowed = true + i.readHandles.DecRef() + + h.IncRef() + i.readHandles = h + return nil } // ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt. func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { i.handlesMu.RLock() - defer i.handlesMu.RUnlock() - return i.readHandles.readWriterAt(ctx, int64(offset)).ReadToBlocks(dsts) + n, err := i.readHandles.readWriterAt(ctx, int64(offset)).ReadToBlocks(dsts) + i.handlesMu.RUnlock() + return n, err } // WriteFromBlocksAt implements fsutil.CachedFileObject.WriteFromBlocksAt. func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { i.handlesMu.RLock() - defer i.handlesMu.RUnlock() - return i.writeHandles.readWriterAt(ctx, int64(offset)).WriteFromBlocks(srcs) + n, err := i.writeHandles.readWriterAt(ctx, int64(offset)).WriteFromBlocks(srcs) + i.handlesMu.RUnlock() + return n, err } // SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes. -func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error { - if i.skipSetAttr(mask) { +func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr, forceSetTimestamps bool) error { + if i.skipSetAttr(mask, forceSetTimestamps) { return nil } as, ans := attr.AccessTime.Unix() @@ -251,13 +341,14 @@ func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMa // when: // - Mask is empty // - Mask contains only attributes that cannot be set in the gofer -// - Mask contains only atime and/or mtime, and host FD exists +// - forceSetTimestamps is false and mask contains only atime and/or mtime +// and host FD exists // // Updates to atime and mtime can be skipped because cached value will be // "close enough" to host value, given that operation went directly to host FD. // Skipping atime updates is particularly important to reduce the number of // operations sent to the Gofer for readonly files. -func (i *inodeFileState) skipSetAttr(mask fs.AttrMask) bool { +func (i *inodeFileState) skipSetAttr(mask fs.AttrMask, forceSetTimestamps bool) bool { // First remove attributes that cannot be updated. cpy := mask cpy.Type = false @@ -277,6 +368,12 @@ func (i *inodeFileState) skipSetAttr(mask fs.AttrMask) bool { return false } + // If forceSetTimestamps was passed, then we cannot skip. + if forceSetTimestamps { + return false + } + + // Skip if we have a host FD. i.handlesMu.RLock() defer i.handlesMu.RUnlock() return (i.readHandles != nil && i.readHandles.Host != nil) || @@ -442,7 +539,7 @@ func (i *inodeOperations) NonBlockingOpen(ctx context.Context, p fs.PermMask) (* } func (i *inodeOperations) getFileDefault(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - h, err := i.fileState.getHandles(ctx, flags) + h, err := i.fileState.getHandles(ctx, flags, i.cachingInodeOps) if err != nil { return nil, err } diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 8c17603f8..c09f3b71c 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -234,6 +234,8 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, if err != nil { return nil, err } + // We're not going to use newFile after return. + defer newFile.close(ctx) // Stabilize the endpoint map while creation is in progress. unlock := i.session().endpoints.lock() @@ -254,7 +256,6 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, // Get the attributes of the file to create inode key. qid, mask, attr, err := getattr(ctx, newFile) if err != nil { - newFile.close(ctx) return nil, err } @@ -270,7 +271,6 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, // cloned and re-opened multiple times after creation. _, unopened, err := i.fileState.file.walk(ctx, []string{name}) if err != nil { - newFile.close(ctx) return nil, err } diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 50da865c1..4e358a46a 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -122,6 +122,10 @@ type session struct { // CachingInodeOperations created by the session. limitHostFDTranslation bool + // overlayfsStaleRead when set causes the readonly handle to be invalidated + // after file is open for write. + overlayfsStaleRead bool + // connID is a unique identifier for the session connection. connID string `state:"wait"` @@ -139,9 +143,9 @@ type session struct { // socket files. This allows unix domain sockets to be used with paths that // belong to a gofer. // - // TODO(b/77154739): there are few possible races with someone stat'ing the - // file and another deleting it concurrently, where the file will not be - // reported as socket file. + // TODO(gvisor.dev/issue/1200): there are few possible races with someone + // stat'ing the file and another deleting it concurrently, where the file + // will not be reported as socket file. endpoints *endpointMaps `state:"wait"` } @@ -257,6 +261,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF aname: o.aname, superBlockFlags: superBlockFlags, limitHostFDTranslation: o.limitHostFDTranslation, + overlayfsStaleRead: o.overlayfsStaleRead, mounter: mounter, } s.EnableLeakCheck("gofer.session") diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index b1080fb1a..23daeb528 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -1,6 +1,7 @@ -package(licenses = ["notice"]) +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) go_library( name = "host", @@ -20,6 +21,8 @@ go_library( "socket_unsafe.go", "tty.go", "util.go", + "util_amd64_unsafe.go", + "util_arm64_unsafe.go", "util_unsafe.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/fs/host", diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 894ab01f0..a6e4a09e3 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -114,7 +114,7 @@ func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.Blo } // SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes. -func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error { +func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr, _ bool) error { if mask.Empty() { return nil } @@ -163,7 +163,7 @@ func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, err return unstableAttr(i.mops, &s), nil } -// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes. +// Allocate implements fsutil.CachedFileObject.Allocate. func (i *inodeFileState) Allocate(_ context.Context, offset, length int64) error { return syscall.Fallocate(i.FD(), 0, offset, length) } diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 2392787cb..107336a3e 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -385,3 +385,6 @@ func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 { func (c *ConnectedEndpoint) Release() { c.ref.DecRefWithDestructor(c.close) } + +// CloseUnread implements transport.ConnectedEndpoint.CloseUnread. +func (c *ConnectedEndpoint) CloseUnread() {} diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index 2526412a4..90331e3b2 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -43,12 +43,15 @@ type TTYFileOperations struct { // fgProcessGroup is the foreground process group that is currently // connected to this TTY. fgProcessGroup *kernel.ProcessGroup + + termios linux.KernelTermios } // newTTYFile returns a new fs.File that wraps a TTY FD. func newTTYFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File { return fs.NewFile(ctx, dirent, flags, &TTYFileOperations{ fileOperations: fileOperations{iops: iops}, + termios: linux.DefaultSlaveTermios, }) } @@ -97,9 +100,12 @@ func (t *TTYFileOperations) Write(ctx context.Context, file *fs.File, src userme t.mu.Lock() defer t.mu.Unlock() - // Are we allowed to do the write? - if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { - return 0, err + // Check whether TOSTOP is enabled. This corresponds to the check in + // drivers/tty/n_tty.c:n_tty_write(). + if t.termios.LEnabled(linux.TOSTOP) { + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } } return t.fileOperations.Write(ctx, file, src, offset) } @@ -144,6 +150,9 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO return 0, err } err := ioctlSetTermios(fd, ioctl, &termios) + if err == nil { + t.termios.FromTermios(termios) + } return 0, err case linux.TIOCGPGRP: diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go index bad61a9a1..e37e687c6 100644 --- a/pkg/sentry/fs/host/util.go +++ b/pkg/sentry/fs/host/util.go @@ -155,7 +155,7 @@ func unstableAttr(mo *superOperations, s *syscall.Stat_t) fs.UnstableAttr { AccessTime: ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec), ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec), StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec), - Links: s.Nlink, + Links: uint64(s.Nlink), } } diff --git a/pkg/sentry/fs/host/util_amd64_unsafe.go b/pkg/sentry/fs/host/util_amd64_unsafe.go new file mode 100644 index 000000000..66da6e9f5 --- /dev/null +++ b/pkg/sentry/fs/host/util_amd64_unsafe.go @@ -0,0 +1,41 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package host + +import ( + "syscall" + "unsafe" +) + +func fstatat(fd int, name string, flags int) (syscall.Stat_t, error) { + var stat syscall.Stat_t + namePtr, err := syscall.BytePtrFromString(name) + if err != nil { + return stat, err + } + _, _, errno := syscall.Syscall6( + syscall.SYS_NEWFSTATAT, + uintptr(fd), + uintptr(unsafe.Pointer(namePtr)), + uintptr(unsafe.Pointer(&stat)), + uintptr(flags), + 0, 0) + if errno != 0 { + return stat, errno + } + return stat, nil +} diff --git a/pkg/sentry/fs/host/util_arm64_unsafe.go b/pkg/sentry/fs/host/util_arm64_unsafe.go new file mode 100644 index 000000000..e8cb94aeb --- /dev/null +++ b/pkg/sentry/fs/host/util_arm64_unsafe.go @@ -0,0 +1,41 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package host + +import ( + "syscall" + "unsafe" +) + +func fstatat(fd int, name string, flags int) (syscall.Stat_t, error) { + var stat syscall.Stat_t + namePtr, err := syscall.BytePtrFromString(name) + if err != nil { + return stat, err + } + _, _, errno := syscall.Syscall6( + syscall.SYS_FSTATAT, + uintptr(fd), + uintptr(unsafe.Pointer(namePtr)), + uintptr(unsafe.Pointer(&stat)), + uintptr(flags), + 0, 0) + if errno != 0 { + return stat, errno + } + return stat, nil +} diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go index 2b76f1065..3ab36b088 100644 --- a/pkg/sentry/fs/host/util_unsafe.go +++ b/pkg/sentry/fs/host/util_unsafe.go @@ -116,22 +116,3 @@ func setTimestamps(fd int, ts fs.TimeSpec) error { } return nil } - -func fstatat(fd int, name string, flags int) (syscall.Stat_t, error) { - var stat syscall.Stat_t - namePtr, err := syscall.BytePtrFromString(name) - if err != nil { - return stat, err - } - _, _, errno := syscall.Syscall6( - syscall.SYS_NEWFSTATAT, - uintptr(fd), - uintptr(unsafe.Pointer(namePtr)), - uintptr(unsafe.Pointer(&stat)), - uintptr(flags), - 0, 0) - if errno != 0 { - return stat, errno - } - return stat, nil -} diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index f4ddfa406..91e2fde2f 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -270,6 +270,14 @@ func (i *Inode) Getxattr(name string) (string, error) { return i.InodeOperations.Getxattr(i, name) } +// Setxattr calls i.InodeOperations.Setxattr with i as the Inode. +func (i *Inode) Setxattr(name, value string) error { + if i.overlay != nil { + return overlaySetxattr(i.overlay, name, value) + } + return i.InodeOperations.Setxattr(i, name, value) +} + // Listxattr calls i.InodeOperations.Listxattr with i as the Inode. func (i *Inode) Listxattr() (map[string]struct{}, error) { if i.overlay != nil { @@ -344,6 +352,10 @@ func (i *Inode) SetTimestamps(ctx context.Context, d *Dirent, ts TimeSpec) error // Truncate calls i.InodeOperations.Truncate with i as the Inode. func (i *Inode) Truncate(ctx context.Context, d *Dirent, size int64) error { + if IsDir(i.StableAttr) { + return syserror.EISDIR + } + if i.overlay != nil { return overlayTruncate(ctx, i.overlay, d, size) } diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 246b97161..13d11e001 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -15,6 +15,7 @@ package fs import ( + "fmt" "strings" "gvisor.dev/gvisor/pkg/abi/linux" @@ -207,6 +208,11 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name } func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name string, flags FileFlags, perm FilePermissions) (*File, error) { + // Sanity check. + if parent.Inode.overlay == nil { + panic(fmt.Sprintf("overlayCreate called with non-overlay parent inode (parent InodeOperations type is %T)", parent.Inode.InodeOperations)) + } + // Dirent.Create takes renameMu if the Inode is an overlay Inode. if err := copyUpLockedForRename(ctx, parent); err != nil { return nil, err @@ -430,7 +436,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena } func overlayBind(ctx context.Context, o *overlayEntry, parent *Dirent, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) { - if err := copyUp(ctx, parent); err != nil { + if err := copyUpLockedForRename(ctx, parent); err != nil { return nil, err } @@ -456,7 +462,9 @@ func overlayBind(ctx context.Context, o *overlayEntry, parent *Dirent, name stri inode.DecRef() return nil, err } - return NewDirent(ctx, newOverlayInode(ctx, entry, inode.MountSource), name), nil + // Use the parent's MountSource, since that corresponds to the overlay, + // and not the upper filesystem. + return NewDirent(ctx, newOverlayInode(ctx, entry, parent.Inode.MountSource), name), nil } func overlayBoundEndpoint(o *overlayEntry, path string) transport.BoundEndpoint { @@ -544,6 +552,11 @@ func overlayGetxattr(o *overlayEntry, name string) (string, error) { return s, err } +// TODO(b/146028302): Support setxattr for overlayfs. +func overlaySetxattr(o *overlayEntry, name, value string) error { + return syserror.EOPNOTSUPP +} + func overlayListxattr(o *overlayEntry) (map[string]struct{}, error) { o.copyMu.RLock() defer o.copyMu.RUnlock() diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index c7f4e2d13..ba3e0233d 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -15,6 +15,7 @@ package fs import ( + "io" "sync" "sync/atomic" @@ -172,7 +173,7 @@ func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ i } // WriteTo implements FileOperations.WriteTo. -func (*Inotify) WriteTo(context.Context, *File, *File, SpliceOpts) (int64, error) { +func (*Inotify) WriteTo(context.Context, *File, io.Writer, int64, bool) (int64, error) { return 0, syserror.ENOSYS } @@ -182,7 +183,7 @@ func (*Inotify) Fsync(context.Context, *File, int64, int64, SyncType) error { } // ReadFrom implements FileOperations.ReadFrom. -func (*Inotify) ReadFrom(context.Context, *File, *File, SpliceOpts) (int64, error) { +func (*Inotify) ReadFrom(context.Context, *File, io.Reader, int64) (int64, error) { return 0, syserror.ENOSYS } diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD index 08d7c0c57..8d62642e7 100644 --- a/pkg/sentry/fs/lock/BUILD +++ b/pkg/sentry/fs/lock/BUILD @@ -1,7 +1,8 @@ -package(licenses = ["notice"]) - +load("@io_bazel_rules_go//go:def.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) go_template_instance( name = "lock_range", diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index 1d3ff39e0..25573e986 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syncutil" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/third_party/gvsync" ) // The virtual filesystem implements an overlay configuration. For a high-level @@ -199,7 +199,7 @@ type overlayEntry struct { upper *Inode // dirCacheMu protects dirCache. - dirCacheMu gvsync.DowngradableRWMutex `state:"nosave"` + dirCacheMu syncutil.DowngradableRWMutex `state:"nosave"` // dirCache is cache of DentAttrs from upper and lower Inodes. dirCache *SortedDentryMap diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index c7599d1f6..75cbb0622 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -1,6 +1,7 @@ -package(licenses = ["notice"]) +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) go_library( name = "proc", @@ -51,6 +52,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/tcpip/header", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 5e28982c5..402919924 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -18,6 +18,7 @@ import ( "bytes" "fmt" "io" + "reflect" "time" "gvisor.dev/gvisor/pkg/abi/linux" @@ -33,6 +34,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip/header" ) // newNet creates a new proc net entry. @@ -40,7 +43,8 @@ func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSo var contents map[string]*fs.Inode if s := p.k.NetworkStack(); s != nil { contents = map[string]*fs.Inode{ - "dev": seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc), + "dev": seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc), + "snmp": seqfile.NewSeqFileInode(ctx, &netSnmp{s: s}, msrc), // The following files are simple stubs until they are // implemented in netstack, if the file contains a @@ -57,7 +61,7 @@ func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSo // (ClockGetres returns 1ns resolution). "psched": newStaticProcInode(ctx, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))), "ptype": newStaticProcInode(ctx, msrc, []byte("Type Device Function")), - "route": newStaticProcInode(ctx, msrc, []byte("Iface Destination Gateway Flags RefCnt Use Metric Mask MTU Window IRTT")), + "route": seqfile.NewSeqFileInode(ctx, &netRoute{s: s}, msrc), "tcp": seqfile.NewSeqFileInode(ctx, &netTCP{k: k}, msrc), "udp": seqfile.NewSeqFileInode(ctx, &netUDP{k: k}, msrc), "unix": seqfile.NewSeqFileInode(ctx, &netUnix{k: k}, msrc), @@ -66,7 +70,7 @@ func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSo if s.SupportsIPv6() { contents["if_inet6"] = seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc) contents["ipv6_route"] = newStaticProcInode(ctx, msrc, []byte("")) - contents["tcp6"] = newStaticProcInode(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode")) + contents["tcp6"] = seqfile.NewSeqFileInode(ctx, &netTCP6{k: k}, msrc) contents["udp6"] = newStaticProcInode(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode")) } } @@ -195,6 +199,193 @@ func (n *netDev) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se return data, 0 } +// netSnmp implements seqfile.SeqSource for /proc/net/snmp. +// +// +stateify savable +type netSnmp struct { + s inet.Stack +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (n *netSnmp) NeedsUpdate(generation int64) bool { + return true +} + +type snmpLine struct { + prefix string + header string +} + +var snmp = []snmpLine{ + { + prefix: "Ip", + header: "Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates", + }, + { + prefix: "Icmp", + header: "InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps", + }, + { + prefix: "IcmpMsg", + }, + { + prefix: "Tcp", + header: "RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors", + }, + { + prefix: "Udp", + header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti", + }, + { + prefix: "UdpLite", + header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti", + }, +} + +func toSlice(a interface{}) []uint64 { + v := reflect.Indirect(reflect.ValueOf(a)) + return v.Slice(0, v.Len()).Interface().([]uint64) +} + +func sprintSlice(s []uint64) string { + if len(s) == 0 { + return "" + } + r := fmt.Sprint(s) + return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice. +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. See Linux's +// net/core/net-procfs.c:dev_seq_show. +func (n *netSnmp) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + contents := make([]string, 0, len(snmp)*2) + types := []interface{}{ + &inet.StatSNMPIP{}, + &inet.StatSNMPICMP{}, + nil, // TODO(gvisor.dev/issue/628): Support IcmpMsg stats. + &inet.StatSNMPTCP{}, + &inet.StatSNMPUDP{}, + &inet.StatSNMPUDPLite{}, + } + for i, stat := range types { + line := snmp[i] + if stat == nil { + contents = append( + contents, + fmt.Sprintf("%s:\n", line.prefix), + fmt.Sprintf("%s:\n", line.prefix), + ) + continue + } + if err := n.s.Statistics(stat, line.prefix); err != nil { + if err == syserror.EOPNOTSUPP { + log.Infof("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) + } else { + log.Warningf("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) + } + } + var values string + if line.prefix == "Tcp" { + tcp := stat.(*inet.StatSNMPTCP) + // "Tcp" needs special processing because MaxConn is signed. RFC 2012. + values = fmt.Sprintf("%s %d %s", sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:])) + } else { + values = sprintSlice(toSlice(stat)) + } + contents = append( + contents, + fmt.Sprintf("%s: %s\n", line.prefix, line.header), + fmt.Sprintf("%s: %s\n", line.prefix, values), + ) + } + + data := make([]seqfile.SeqData, 0, len(snmp)*2) + for _, l := range contents { + data = append(data, seqfile.SeqData{Buf: []byte(l), Handle: (*netSnmp)(nil)}) + } + + return data, 0 +} + +// netRoute implements seqfile.SeqSource for /proc/net/route. +// +// +stateify savable +type netRoute struct { + s inet.Stack +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (n *netRoute) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. +// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show. +func (n *netRoute) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + interfaces := n.s.Interfaces() + contents := []string{"Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT"} + for _, rt := range n.s.RouteTable() { + // /proc/net/route only includes ipv4 routes. + if rt.Family != linux.AF_INET { + continue + } + + // /proc/net/route does not include broadcast or multicast routes. + if rt.Type == linux.RTN_BROADCAST || rt.Type == linux.RTN_MULTICAST { + continue + } + + iface, ok := interfaces[rt.OutputInterface] + if !ok || iface.Name == "lo" { + continue + } + + var ( + gw uint32 + prefix uint32 + flags = linux.RTF_UP + ) + if len(rt.GatewayAddr) == header.IPv4AddressSize { + flags |= linux.RTF_GATEWAY + gw = usermem.ByteOrder.Uint32(rt.GatewayAddr) + } + if len(rt.DstAddr) == header.IPv4AddressSize { + prefix = usermem.ByteOrder.Uint32(rt.DstAddr) + } + l := fmt.Sprintf( + "%s\t%08X\t%08X\t%04X\t%d\t%d\t%d\t%08X\t%d\t%d\t%d", + iface.Name, + prefix, + gw, + flags, + 0, // RefCnt. + 0, // Use. + 0, // Metric. + (uint32(1)<<rt.DstLen)-1, + 0, // MTU. + 0, // Window. + 0, // RTT. + ) + contents = append(contents, l) + } + + var data []seqfile.SeqData + for _, l := range contents { + l = fmt.Sprintf("%-127s\n", l) + data = append(data, seqfile.SeqData{Buf: []byte(l), Handle: (*netRoute)(nil)}) + } + + return data, 0 +} + // netUnix implements seqfile.SeqSource for /proc/net/unix. // // +stateify savable @@ -310,44 +501,51 @@ func networkToHost16(n uint16) uint16 { return usermem.ByteOrder.Uint16(buf[:]) } -func writeInetAddr(w io.Writer, a linux.SockAddrInet) { - // linux.SockAddrInet.Port is stored in the network byte order and is - // printed like a number in host byte order. Note that all numbers in host - // byte order are printed with the most-significant byte first when - // formatted with %X. See get_tcp4_sock() and udp4_format_sock() in Linux. - port := networkToHost16(a.Port) - - // linux.SockAddrInet.Addr is stored as a byte slice in big-endian order - // (i.e. most-significant byte in index 0). Linux represents this as a - // __be32 which is a typedef for an unsigned int, and is printed with - // %X. This means that for a little-endian machine, Linux prints the - // least-significant byte of the address first. To emulate this, we first - // invert the byte order for the address using usermem.ByteOrder.Uint32, - // which makes it have the equivalent encoding to a __be32 on a little - // endian machine. Note that this operation is a no-op on a big endian - // machine. Then similar to Linux, we format it with %X, which will print - // the most-significant byte of the __be32 address first, which is now - // actually the least-significant byte of the original address in - // linux.SockAddrInet.Addr on little endian machines, due to the conversion. - addr := usermem.ByteOrder.Uint32(a.Addr[:]) - - fmt.Fprintf(w, "%08X:%04X ", addr, port) -} +func writeInetAddr(w io.Writer, family int, i linux.SockAddr) { + switch family { + case linux.AF_INET: + var a linux.SockAddrInet + if i != nil { + a = *i.(*linux.SockAddrInet) + } -// netTCP implements seqfile.SeqSource for /proc/net/tcp. -// -// +stateify savable -type netTCP struct { - k *kernel.Kernel -} + // linux.SockAddrInet.Port is stored in the network byte order and is + // printed like a number in host byte order. Note that all numbers in host + // byte order are printed with the most-significant byte first when + // formatted with %X. See get_tcp4_sock() and udp4_format_sock() in Linux. + port := networkToHost16(a.Port) + + // linux.SockAddrInet.Addr is stored as a byte slice in big-endian order + // (i.e. most-significant byte in index 0). Linux represents this as a + // __be32 which is a typedef for an unsigned int, and is printed with + // %X. This means that for a little-endian machine, Linux prints the + // least-significant byte of the address first. To emulate this, we first + // invert the byte order for the address using usermem.ByteOrder.Uint32, + // which makes it have the equivalent encoding to a __be32 on a little + // endian machine. Note that this operation is a no-op on a big endian + // machine. Then similar to Linux, we format it with %X, which will print + // the most-significant byte of the __be32 address first, which is now + // actually the least-significant byte of the original address in + // linux.SockAddrInet.Addr on little endian machines, due to the conversion. + addr := usermem.ByteOrder.Uint32(a.Addr[:]) + + fmt.Fprintf(w, "%08X:%04X ", addr, port) + case linux.AF_INET6: + var a linux.SockAddrInet6 + if i != nil { + a = *i.(*linux.SockAddrInet6) + } -// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. -func (*netTCP) NeedsUpdate(generation int64) bool { - return true + port := networkToHost16(a.Port) + addr0 := usermem.ByteOrder.Uint32(a.Addr[0:4]) + addr1 := usermem.ByteOrder.Uint32(a.Addr[4:8]) + addr2 := usermem.ByteOrder.Uint32(a.Addr[8:12]) + addr3 := usermem.ByteOrder.Uint32(a.Addr[12:16]) + fmt.Fprintf(w, "%08X%08X%08X%08X:%04X ", addr0, addr1, addr2, addr3, port) + } } -// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. -func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +func commonReadSeqFileDataTCP(ctx context.Context, n seqfile.SeqHandle, k *kernel.Kernel, h seqfile.SeqHandle, fa int, header []byte) ([]seqfile.SeqData, int64) { // t may be nil here if our caller is not part of a task goroutine. This can // happen for example if we're here for "sentryctl cat". When t is nil, // degrade gracefully and retrieve what we can. @@ -358,7 +556,7 @@ func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se } var buf bytes.Buffer - for _, se := range n.k.ListSockets() { + for _, se := range k.ListSockets() { s := se.Sock.Get() if s == nil { log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID) @@ -369,7 +567,7 @@ func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se if !ok { panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) } - if family, stype, _ := sops.Type(); !(family == linux.AF_INET && stype == linux.SOCK_STREAM) { + if family, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { s.DecRef() // Not tcp4 sockets. continue @@ -384,22 +582,22 @@ func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se fmt.Fprintf(&buf, "%4d: ", se.ID) // Field: local_adddress. - var localAddr linux.SockAddrInet + var localAddr linux.SockAddr if t != nil { if local, _, err := sops.GetSockName(t); err == nil { - localAddr = *local.(*linux.SockAddrInet) + localAddr = local } } - writeInetAddr(&buf, localAddr) + writeInetAddr(&buf, fa, localAddr) // Field: rem_address. - var remoteAddr linux.SockAddrInet + var remoteAddr linux.SockAddr if t != nil { if remote, _, err := sops.GetPeerName(t); err == nil { - remoteAddr = *remote.(*linux.SockAddrInet) + remoteAddr = remote } } - writeInetAddr(&buf, remoteAddr) + writeInetAddr(&buf, fa, remoteAddr) // Field: state; socket state. fmt.Fprintf(&buf, "%02X ", sops.State()) @@ -465,7 +663,7 @@ func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se data := []seqfile.SeqData{ { - Buf: []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n"), + Buf: header, Handle: n, }, { @@ -476,6 +674,42 @@ func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se return data, 0 } +// netTCP implements seqfile.SeqSource for /proc/net/tcp. +// +// +stateify savable +type netTCP struct { + k *kernel.Kernel +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (*netTCP) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. +func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + header := []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n") + return commonReadSeqFileDataTCP(ctx, n, n.k, h, linux.AF_INET, header) +} + +// netTCP6 implements seqfile.SeqSource for /proc/net/tcp6. +// +// +stateify savable +type netTCP6 struct { + k *kernel.Kernel +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (*netTCP6) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. +func (n *netTCP6) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + header := []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n") + return commonReadSeqFileDataTCP(ctx, n, n.k, h, linux.AF_INET6, header) +} + // netUDP implements seqfile.SeqSource for /proc/net/udp. // // +stateify savable @@ -529,7 +763,7 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se localAddr = *local.(*linux.SockAddrInet) } } - writeInetAddr(&buf, localAddr) + writeInetAddr(&buf, linux.AF_INET, &localAddr) // Field: rem_address. var remoteAddr linux.SockAddrInet @@ -538,7 +772,7 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se remoteAddr = *remote.(*linux.SockAddrInet) } } - writeInetAddr(&buf, remoteAddr) + writeInetAddr(&buf, linux.AF_INET, &remoteAddr) // Field: state; socket state. fmt.Fprintf(&buf, "%02X ", sops.State()) diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 0ef13f2f5..56e92721e 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -230,7 +230,7 @@ func (rpf *rootProcFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dent // But for whatever crazy reason, you can still walk to the given node. for _, tg := range rpf.iops.pidns.ThreadGroups() { if leader := tg.Leader(); leader != nil { - name := strconv.FormatUint(uint64(tg.ID()), 10) + name := strconv.FormatUint(uint64(rpf.iops.pidns.IDOfThreadGroup(tg)), 10) m[name] = fs.GenericDentAttr(fs.SpecialDirectory, device.ProcDevice) names = append(names, name) } diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD index 20c3eefc8..fe7067be1 100644 --- a/pkg/sentry/fs/proc/seqfile/BUILD +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -1,6 +1,7 @@ -package(licenses = ["notice"]) +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) go_library( name = "seqfile", diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index f3b63dfc2..bd93f83fa 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -64,7 +64,7 @@ var _ fs.InodeOperations = (*tcpMemInode)(nil) func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, dir tcpMemDir) *fs.Inode { tm := &tcpMemInode{ - SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC), s: s, dir: dir, } @@ -77,6 +77,11 @@ func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, dir return fs.NewInode(ctx, tm, msrc, sattr) } +// Truncate implements fs.InodeOperations.Truncate. +func (tcpMemInode) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + // GetFile implements fs.InodeOperations.GetFile. func (m *tcpMemInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { flags.Pread = true @@ -168,14 +173,15 @@ func writeSize(dirType tcpMemDir, s inet.Stack, size inet.TCPBufferSize) error { // +stateify savable type tcpSack struct { + fsutil.SimpleFileInode + stack inet.Stack `state:"wait"` enabled *bool - fsutil.SimpleFileInode } func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { ts := &tcpSack{ - SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC), stack: s, } sattr := fs.StableAttr{ @@ -187,6 +193,11 @@ func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *f return fs.NewInode(ctx, ts, msrc, sattr) } +// Truncate implements fs.InodeOperations.Truncate. +func (tcpSack) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + // GetFile implements fs.InodeOperations.GetFile. func (s *tcpSack) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { flags.Pread = true diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 87184ec67..9bf4b4527 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -67,29 +67,28 @@ type taskDir struct { var _ fs.InodeOperations = (*taskDir)(nil) // newTaskDir creates a new proc task entry. -func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, showSubtasks bool) *fs.Inode { +func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode { contents := map[string]*fs.Inode{ - "auxv": newAuxvec(t, msrc), - "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), - "comm": newComm(t, msrc), - "environ": newExecArgInode(t, msrc, environExecArg), - "exe": newExe(t, msrc), - "fd": newFdDir(t, msrc), - "fdinfo": newFdInfoDir(t, msrc), - "gid_map": newGIDMap(t, msrc), - // FIXME(b/123511468): create the correct io file for threads. - "io": newIO(t, msrc), + "auxv": newAuxvec(t, msrc), + "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), + "comm": newComm(t, msrc), + "environ": newExecArgInode(t, msrc, environExecArg), + "exe": newExe(t, msrc), + "fd": newFdDir(t, msrc), + "fdinfo": newFdInfoDir(t, msrc), + "gid_map": newGIDMap(t, msrc), + "io": newIO(t, msrc, isThreadGroup), "maps": newMaps(t, msrc), "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), "ns": newNamespaceDir(t, msrc), "smaps": newSmaps(t, msrc), - "stat": newTaskStat(t, msrc, showSubtasks, p.pidns), + "stat": newTaskStat(t, msrc, isThreadGroup, p.pidns), "statm": newStatm(t, msrc), "status": newStatus(t, msrc, p.pidns), "uid_map": newUIDMap(t, msrc), } - if showSubtasks { + if isThreadGroup { contents["task"] = p.newSubtasks(t, msrc) } if len(p.cgroupControllers) > 0 { @@ -605,6 +604,10 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ( fmt.Fprintf(&buf, "CapEff:\t%016x\n", creds.EffectiveCaps) fmt.Fprintf(&buf, "CapBnd:\t%016x\n", creds.BoundingCaps) fmt.Fprintf(&buf, "Seccomp:\t%d\n", s.t.SeccompMode()) + // We unconditionally report a single NUMA node. See + // pkg/sentry/syscalls/linux/sys_mempolicy.go. + fmt.Fprintf(&buf, "Mems_allowed:\t1\n") + fmt.Fprintf(&buf, "Mems_allowed_list:\t0\n") return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*statusData)(nil)}}, 0 } @@ -619,8 +622,11 @@ type ioData struct { ioUsage } -func newIO(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t) +func newIO(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode { + if isThreadGroup { + return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t) + } + return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t}), msrc, fs.SpecialFile, t) } // NeedsUpdate returns whether the generation is old or not. @@ -639,7 +645,7 @@ func (i *ioData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se io.Accumulate(i.IOUsage()) var buf bytes.Buffer - fmt.Fprintf(&buf, "char: %d\n", io.CharsRead) + fmt.Fprintf(&buf, "rchar: %d\n", io.CharsRead) fmt.Fprintf(&buf, "wchar: %d\n", io.CharsWritten) fmt.Fprintf(&buf, "syscr: %d\n", io.ReadSyscalls) fmt.Fprintf(&buf, "syscw: %d\n", io.WriteSyscalls) diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index 516efcc4c..012cb3e44 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -1,6 +1,7 @@ -package(licenses = ["notice"]) +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) go_library( name = "ramfs", diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go index eed1c2854..311798811 100644 --- a/pkg/sentry/fs/splice.go +++ b/pkg/sentry/fs/splice.go @@ -18,7 +18,6 @@ import ( "io" "sync/atomic" - "gvisor.dev/gvisor/pkg/secio" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/syserror" ) @@ -33,146 +32,131 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64, } // Check whether or not the objects being sliced are stream-oriented - // (i.e. pipes or sockets). If yes, we elide checks and offset locks. - srcPipe := IsPipe(src.Dirent.Inode.StableAttr) || IsSocket(src.Dirent.Inode.StableAttr) - dstPipe := IsPipe(dst.Dirent.Inode.StableAttr) || IsSocket(dst.Dirent.Inode.StableAttr) + // (i.e. pipes or sockets). For all stream-oriented files and files + // where a specific offiset is not request, we acquire the file mutex. + // This has two important side effects. First, it provides the standard + // protection against concurrent writes that would mutate the offset. + // Second, it prevents Splice deadlocks. Only internal anonymous files + // implement the ReadFrom and WriteTo methods directly, and since such + // anonymous files are referred to by a unique fs.File object, we know + // that the file mutex takes strict precedence over internal locks. + // Since we enforce lock ordering here, we can't deadlock by using + // using a file in two different splice operations simultaneously. + srcPipe := !IsRegular(src.Dirent.Inode.StableAttr) + dstPipe := !IsRegular(dst.Dirent.Inode.StableAttr) + dstAppend := !dstPipe && dst.Flags().Append + srcLock := srcPipe || !opts.SrcOffset + dstLock := dstPipe || !opts.DstOffset || dstAppend - if !dstPipe && !opts.DstOffset && !srcPipe && !opts.SrcOffset { + switch { + case srcLock && dstLock: switch { case dst.UniqueID < src.UniqueID: // Acquire dst first. if !dst.mu.Lock(ctx) { return 0, syserror.ErrInterrupted } - defer dst.mu.Unlock() if !src.mu.Lock(ctx) { + dst.mu.Unlock() return 0, syserror.ErrInterrupted } - defer src.mu.Unlock() case dst.UniqueID > src.UniqueID: // Acquire src first. if !src.mu.Lock(ctx) { return 0, syserror.ErrInterrupted } - defer src.mu.Unlock() if !dst.mu.Lock(ctx) { + src.mu.Unlock() return 0, syserror.ErrInterrupted } - defer dst.mu.Unlock() case dst.UniqueID == src.UniqueID: // Acquire only one lock; it's the same file. This is a // bit of a edge case, but presumably it's possible. if !dst.mu.Lock(ctx) { return 0, syserror.ErrInterrupted } - defer dst.mu.Unlock() + srcLock = false // Only need one unlock. } // Use both offsets (locked). opts.DstStart = dst.offset opts.SrcStart = src.offset - } else if !dstPipe && !opts.DstOffset { + case dstLock: // Acquire only dst. if !dst.mu.Lock(ctx) { return 0, syserror.ErrInterrupted } - defer dst.mu.Unlock() opts.DstStart = dst.offset // Safe: locked. - } else if !srcPipe && !opts.SrcOffset { + case srcLock: // Acquire only src. if !src.mu.Lock(ctx) { return 0, syserror.ErrInterrupted } - defer src.mu.Unlock() opts.SrcStart = src.offset // Safe: locked. } - // Check append-only mode and the limit. - if !dstPipe { + var err error + if dstAppend { unlock := dst.Dirent.Inode.lockAppendMu(dst.Flags().Append) defer unlock() - if dst.Flags().Append { - if opts.DstOffset { - // We need to acquire the lock. - if !dst.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted - } - defer dst.mu.Unlock() - } - // Figure out the appropriate offset to use. - if err := dst.offsetForAppend(ctx, &opts.DstStart); err != nil { - return 0, err - } - } + // Figure out the appropriate offset to use. + err = dst.offsetForAppend(ctx, &opts.DstStart) + } + if err == nil && !dstPipe { // Enforce file limits. limit, ok := dst.checkLimit(ctx, opts.DstStart) switch { case ok && limit == 0: - return 0, syserror.ErrExceedsFileSizeLimit + err = syserror.ErrExceedsFileSizeLimit case ok && limit < opts.Length: opts.Length = limit // Cap the write. } } + if err != nil { + if dstLock { + dst.mu.Unlock() + } + if srcLock { + src.mu.Unlock() + } + return 0, err + } - // Attempt to do a WriteTo; this is likely the most efficient. - // - // The underlying implementation may be able to donate buffers. - newOpts := SpliceOpts{ - Length: opts.Length, - SrcStart: opts.SrcStart, - SrcOffset: !srcPipe, - Dup: opts.Dup, - DstStart: opts.DstStart, - DstOffset: !dstPipe, + // Construct readers and writers for the splice. This is used to + // provide a safer locking path for the WriteTo/ReadFrom operations + // (since they will otherwise go through public interface methods which + // conflict with locking done above), and simplifies the fallback path. + w := &lockedWriter{ + Ctx: ctx, + File: dst, + Offset: opts.DstStart, } - n, err := src.FileOperations.WriteTo(ctx, src, dst, newOpts) - if n == 0 && err != nil { - // Attempt as a ReadFrom. If a WriteTo, a ReadFrom may also - // be more efficient than a copy if buffers are cached or readily - // available. (It's unlikely that they can actually be donate - n, err = dst.FileOperations.ReadFrom(ctx, dst, src, newOpts) + r := &lockedReader{ + Ctx: ctx, + File: src, + Offset: opts.SrcStart, } - if n == 0 && err != nil { - // If we've failed up to here, and at least one of the sources - // is a pipe or socket, then we can't properly support dup. - // Return an error indicating that this operation is not - // supported. - if (srcPipe || dstPipe) && newOpts.Dup { - return 0, syserror.EINVAL - } - // We failed to splice the files. But that's fine; we just fall - // back to a slow path in this case. This copies without doing - // any mode changes, so should still be more efficient. - var ( - r io.Reader - w io.Writer - ) - fw := &lockedWriter{ - Ctx: ctx, - File: dst, - } - if newOpts.DstOffset { - // Use the provided offset. - w = secio.NewOffsetWriter(fw, newOpts.DstStart) - } else { - // Writes will proceed with no offset. - w = fw - } - fr := &lockedReader{ - Ctx: ctx, - File: src, - } - if newOpts.SrcOffset { - // Limit to the given offset and length. - r = io.NewSectionReader(fr, opts.SrcStart, opts.Length) - } else { - // Limit just to the given length. - r = &io.LimitedReader{fr, opts.Length} - } + // Attempt to do a WriteTo; this is likely the most efficient. + n, err := src.FileOperations.WriteTo(ctx, src, w, opts.Length, opts.Dup) + if n == 0 && err == syserror.ENOSYS && !opts.Dup { + // Attempt as a ReadFrom. If a WriteTo, a ReadFrom may also be + // more efficient than a copy if buffers are cached or readily + // available. (It's unlikely that they can actually be donated). + n, err = dst.FileOperations.ReadFrom(ctx, dst, r, opts.Length) + } - // Copy between the two. - n, err = io.Copy(w, r) + // Support one last fallback option, but only if at least one of + // the source and destination are regular files. This is because + // if we block at some point, we could lose data. If the source is + // not a pipe then reading is not destructive; if the destination + // is a regular file, then it is guaranteed not to block writing. + if n == 0 && err == syserror.ENOSYS && !opts.Dup && (!dstPipe || !srcPipe) { + // Fallback to an in-kernel copy. + n, err = io.Copy(w, &io.LimitedReader{ + R: r, + N: opts.Length, + }) } // Update offsets, if required. @@ -185,5 +169,13 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64, } } + // Drop locks. + if dstLock { + dst.mu.Unlock() + } + if srcLock { + src.mu.Unlock() + } + return n, err } diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD index 70fa3af89..25f0f124e 100644 --- a/pkg/sentry/fs/sys/BUILD +++ b/pkg/sentry/fs/sys/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "sys", srcs = [ diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD index 1d80daeaf..a215c1b95 100644 --- a/pkg/sentry/fs/timerfd/BUILD +++ b/pkg/sentry/fs/timerfd/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "timerfd", srcs = ["timerfd.go"], diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index 59403d9db..f8bf663bb 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -141,9 +141,10 @@ func (t *TimerOperations) Write(context.Context, *fs.File, usermem.IOSequence, i } // Notify implements ktime.TimerListener.Notify. -func (t *TimerOperations) Notify(exp uint64) { +func (t *TimerOperations) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { atomic.AddUint64(&t.val, exp) t.events.Notify(waiter.EventIn) + return ktime.Setting{}, false } // Destroy implements ktime.TimerListener.Destroy. diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index 8f7eb5757..59ce400c2 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -1,6 +1,7 @@ -package(licenses = ["notice"]) +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) go_library( name = "tmpfs", diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 159fb7c08..69089c8a8 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -324,7 +324,7 @@ type Fifo struct { // NewFifo creates a new named pipe. func NewFifo(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource) *fs.Inode { // First create a pipe. - p := pipe.NewPipe(ctx, true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize) + p := pipe.NewPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize) // Build pipe InodeOperations. iops := pipe.NewInodeOperations(ctx, perms, p) diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 5e9327aec..95ad98cb0 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -1,6 +1,7 @@ -package(licenses = ["notice"]) +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) go_library( name = "tty", @@ -23,6 +24,7 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 1d128532b..2f639c823 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -129,6 +129,9 @@ func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode { // Release implements fs.InodeOperations.Release. func (d *dirInodeOperations) Release(ctx context.Context) { + d.mu.Lock() + defer d.mu.Unlock() + d.master.DecRef() if len(d.slaves) != 0 { panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d)) diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index 92ec1ca18..934828c12 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -76,6 +76,11 @@ func newMasterInode(ctx context.Context, d *dirInodeOperations, owner fs.FileOwn func (mi *masterInodeOperations) Release(ctx context.Context) { } +// Truncate implements fs.InodeOperations.Truncate. +func (masterInodeOperations) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + // GetFile implements fs.InodeOperations.GetFile. // // It allocates a new terminal. @@ -172,6 +177,19 @@ func (mf *masterFileOperations) Ioctl(ctx context.Context, _ *fs.File, io userme return 0, mf.t.ld.windowSize(ctx, io, args) case linux.TIOCSWINSZ: return 0, mf.t.ld.setWindowSize(ctx, io, args) + case linux.TIOCSCTTY: + // Make the given terminal the controlling terminal of the + // calling process. + return 0, mf.t.setControllingTTY(ctx, io, args, true /* isMaster */) + case linux.TIOCNOTTY: + // Release this process's controlling terminal. + return 0, mf.t.releaseControllingTTY(ctx, io, args, true /* isMaster */) + case linux.TIOCGPGRP: + // Get the foreground process group. + return mf.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */) + case linux.TIOCSPGRP: + // Set the foreground process group. + return mf.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) return 0, syserror.ENOTTY @@ -185,8 +203,6 @@ func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) { linux.TCSETS, linux.TCSETSW, linux.TCSETSF, - linux.TIOCGPGRP, - linux.TIOCSPGRP, linux.TIOCGWINSZ, linux.TIOCSWINSZ, linux.TIOCSETD, @@ -200,8 +216,6 @@ func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) { linux.TIOCEXCL, linux.TIOCNXCL, linux.TIOCGEXCL, - linux.TIOCNOTTY, - linux.TIOCSCTTY, linux.TIOCGSID, linux.TIOCGETD, linux.TIOCVHANGUP, diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index e30266404..2a51e6bab 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -72,6 +72,11 @@ func (si *slaveInodeOperations) Release(ctx context.Context) { si.t.DecRef() } +// Truncate implements fs.InodeOperations.Truncate. +func (slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + // GetFile implements fs.InodeOperations.GetFile. // // This may race with destruction of the terminal. If the terminal is gone, it @@ -152,9 +157,16 @@ func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem case linux.TIOCSCTTY: // Make the given terminal the controlling terminal of the // calling process. - // TODO(b/129283598): Implement once we have support for job - // control. - return 0, nil + return 0, sf.si.t.setControllingTTY(ctx, io, args, false /* isMaster */) + case linux.TIOCNOTTY: + // Release this process's controlling terminal. + return 0, sf.si.t.releaseControllingTTY(ctx, io, args, false /* isMaster */) + case linux.TIOCGPGRP: + // Get the foreground process group. + return sf.si.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */) + case linux.TIOCSPGRP: + // Set the foreground process group. + return sf.si.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) return 0, syserror.ENOTTY diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index b7cecb2ed..917f90cc0 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -17,7 +17,10 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/usermem" ) // Terminal is a pseudoterminal. @@ -26,23 +29,100 @@ import ( type Terminal struct { refs.AtomicRefCount - // n is the terminal index. + // n is the terminal index. It is immutable. n uint32 - // d is the containing directory. + // d is the containing directory. It is immutable. d *dirInodeOperations - // ld is the line discipline of the terminal. + // ld is the line discipline of the terminal. It is immutable. ld *lineDiscipline + + // masterKTTY contains the controlling process of the master end of + // this terminal. This field is immutable. + masterKTTY *kernel.TTY + + // slaveKTTY contains the controlling process of the slave end of this + // terminal. This field is immutable. + slaveKTTY *kernel.TTY } func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal { termios := linux.DefaultSlaveTermios t := Terminal{ - d: d, - n: n, - ld: newLineDiscipline(termios), + d: d, + n: n, + ld: newLineDiscipline(termios), + masterKTTY: &kernel.TTY{Index: n}, + slaveKTTY: &kernel.TTY{Index: n}, } t.EnableLeakCheck("tty.Terminal") return &t } + +// setControllingTTY makes tm the controlling terminal of the calling thread +// group. +func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { + task := kernel.TaskFromContext(ctx) + if task == nil { + panic("setControllingTTY must be called from a task context") + } + + return task.ThreadGroup().SetControllingTTY(tm.tty(isMaster), args[2].Int()) +} + +// releaseControllingTTY removes tm as the controlling terminal of the calling +// thread group. +func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { + task := kernel.TaskFromContext(ctx) + if task == nil { + panic("releaseControllingTTY must be called from a task context") + } + + return task.ThreadGroup().ReleaseControllingTTY(tm.tty(isMaster)) +} + +// foregroundProcessGroup gets the process group ID of tm's foreground process. +func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { + task := kernel.TaskFromContext(ctx) + if task == nil { + panic("foregroundProcessGroup must be called from a task context") + } + + ret, err := task.ThreadGroup().ForegroundProcessGroup(tm.tty(isMaster)) + if err != nil { + return 0, err + } + + // Write it out to *arg. + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err +} + +// foregroundProcessGroup sets tm's foreground process. +func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { + task := kernel.TaskFromContext(ctx) + if task == nil { + panic("setForegroundProcessGroup must be called from a task context") + } + + // Read in the process group ID. + var pgid int32 + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + + ret, err := task.ThreadGroup().SetForegroundProcessGroup(tm.tty(isMaster), kernel.ProcessGroupID(pgid)) + return uintptr(ret), err +} + +func (tm *Terminal) tty(isMaster bool) *kernel.TTY { + if isMaster { + return tm.masterKTTY + } + return tm.slaveKTTY +} diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index 9e8ebb907..880b7bcd3 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -1,8 +1,9 @@ -package(licenses = ["notice"]) - -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") +package(licenses = ["notice"]) + go_template_instance( name = "dirent_list", out = "dirent_list.go", @@ -37,6 +38,7 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/fd", + "//pkg/fspath", "//pkg/log", "//pkg/sentry/arch", "//pkg/sentry/context", diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD index 9fddb4c4c..bfc46dfa6 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/BUILD +++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_test") +load("@io_bazel_rules_go//go:def.bzl", "go_test") package(licenses = ["notice"]) diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index 10a8083a0..177ce2cb9 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -50,7 +50,7 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys // Create VFS. vfsObj := vfs.New() vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) if err != nil { f.Close() return nil, nil, nil, nil, err @@ -81,7 +81,11 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf ctx := contexttest.Context(b) creds := auth.CredentialsFromContext(ctx) - if err := vfsfs.NewMount(ctx, creds, imagePath, pop, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())}); err != nil { + if err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: int(f.Fd()), + }, + }); err != nil { b.Fatalf("failed to mount tmpfs submount: %v", err) } return func() { diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go index cea89bcd9..a2d8c3ad6 100644 --- a/pkg/sentry/fsimpl/ext/block_map_file.go +++ b/pkg/sentry/fsimpl/ext/block_map_file.go @@ -154,7 +154,7 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds toRead = len(dst) } - n, _ := f.regFile.inode.dev.ReadAt(dst[:toRead], curPhyBlkOff+int64(relFileOff)) + n, _ := f.regFile.inode.fs.dev.ReadAt(dst[:toRead], curPhyBlkOff+int64(relFileOff)) if n < toRead { return n, syserror.EIO } @@ -174,7 +174,7 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds curChildOff := relFileOff % childCov for i := startIdx; i < endIdx; i++ { var childPhyBlk uint32 - err := readFromDisk(f.regFile.inode.dev, curPhyBlkOff+int64(i*4), &childPhyBlk) + err := readFromDisk(f.regFile.inode.fs.dev, curPhyBlkOff+int64(i*4), &childPhyBlk) if err != nil { return read, err } diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go index 213aa3919..181727ef7 100644 --- a/pkg/sentry/fsimpl/ext/block_map_test.go +++ b/pkg/sentry/fsimpl/ext/block_map_test.go @@ -87,12 +87,14 @@ func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) { mockDisk := make([]byte, mockBMDiskSize) regFile := regularFile{ inode: inode{ + fs: &filesystem{ + dev: bytes.NewReader(mockDisk), + }, diskInode: &disklayout.InodeNew{ InodeOld: disklayout.InodeOld{ SizeLo: getMockBMFileFize(), }, }, - dev: bytes.NewReader(mockDisk), blkSize: uint64(mockBMBlkSize), }, } diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go index 054fb42b6..a080cb189 100644 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -41,16 +41,18 @@ func newDentry(in *inode) *dentry { } // IncRef implements vfs.DentryImpl.IncRef. -func (d *dentry) IncRef(vfsfs *vfs.Filesystem) { +func (d *dentry) IncRef() { d.inode.incRef() } // TryIncRef implements vfs.DentryImpl.TryIncRef. -func (d *dentry) TryIncRef(vfsfs *vfs.Filesystem) bool { +func (d *dentry) TryIncRef() bool { return d.inode.tryIncRef() } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef(vfsfs *vfs.Filesystem) { - d.inode.decRef(vfsfs.Impl().(*filesystem)) +func (d *dentry) DecRef() { + // FIXME(b/134676337): filesystem.mu may not be locked as required by + // inode.decRef(). + d.inode.decRef() } diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index b51f3e18d..91802dc1e 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -190,10 +190,10 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba } if !cb.Handle(vfs.Dirent{ - Name: child.diskDirent.FileName(), - Type: fs.ToDirentType(childType), - Ino: uint64(child.diskDirent.Inode()), - Off: fd.off, + Name: child.diskDirent.FileName(), + Type: fs.ToDirentType(childType), + Ino: uint64(child.diskDirent.Inode()), + NextOff: fd.off + 1, }) { dir.childList.InsertBefore(child, fd.iter) return nil @@ -301,8 +301,8 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in return offset, nil } -// IterDirents implements vfs.FileDescriptionImpl.IterDirents. -func (fd *directoryFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error { +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *directoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { // mmap(2) specifies that EACCESS should be returned for non-regular file fds. return syserror.EACCES } diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD index 907d35b7e..fcfaf5c3e 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/BUILD +++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD @@ -1,6 +1,7 @@ -package(licenses = ["notice"]) +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) go_library( name = "disklayout", diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go index f10accafc..4b7d17dc6 100644 --- a/pkg/sentry/fsimpl/ext/ext.go +++ b/pkg/sentry/fsimpl/ext/ext.go @@ -40,14 +40,14 @@ var _ vfs.FilesystemType = (*FilesystemType)(nil) // Currently there are two ways of mounting an ext(2/3/4) fs: // 1. Specify a mount with our internal special MountType in the OCI spec. // 2. Expose the device to the container and mount it from application layer. -func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, error) { +func getDeviceFd(source string, opts vfs.GetFilesystemOptions) (io.ReaderAt, error) { if opts.InternalData == nil { // User mount call. // TODO(b/134676337): Open the device specified by `source` and return that. panic("unimplemented") } - // NewFilesystem call originated from within the sentry. + // GetFilesystem call originated from within the sentry. devFd, ok := opts.InternalData.(int) if !ok { return nil, errors.New("internal data for ext fs must be an int containing the file descriptor to device") @@ -91,8 +91,8 @@ func isCompatible(sb disklayout.SuperBlock) bool { return true } -// NewFilesystem implements vfs.FilesystemType.NewFilesystem. -func (FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { // TODO(b/134676337): Ensure that the user is mounting readonly. If not, // EACCESS should be returned according to mount(2). Filesystem independent // flags (like readonly) are currently not available in pkg/sentry/vfs. @@ -103,7 +103,7 @@ func (FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials } fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)} - fs.vfsfs.Init(&fs) + fs.vfsfs.Init(vfsObj, &fs) fs.sb, err = readSuperBlock(dev) if err != nil { return nil, nil, err diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 63cf7aeaf..e9f756732 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -66,7 +66,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys // Create VFS. vfsObj := vfs.New() vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) if err != nil { f.Close() return nil, nil, nil, nil, err @@ -147,55 +147,54 @@ func TestSeek(t *testing.T) { t.Fatalf("vfsfs.OpenAt failed: %v", err) } - if n, err := fd.Impl().Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil { + if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil { t.Errorf("expected seek position 0, got %d and error %v", n, err) } - stat, err := fd.Impl().Stat(ctx, vfs.StatOptions{}) + stat, err := fd.Stat(ctx, vfs.StatOptions{}) if err != nil { t.Errorf("fd.stat failed for file %s in image %s: %v", test.path, test.image, err) } // We should be able to seek beyond the end of file. size := int64(stat.Size) - if n, err := fd.Impl().Seek(ctx, size, linux.SEEK_SET); n != size || err != nil { + if n, err := fd.Seek(ctx, size, linux.SEEK_SET); n != size || err != nil { t.Errorf("expected seek position %d, got %d and error %v", size, n, err) } // EINVAL should be returned if the resulting offset is negative. - if _, err := fd.Impl().Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL { + if _, err := fd.Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL { t.Errorf("expected error EINVAL but got %v", err) } - if n, err := fd.Impl().Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil { + if n, err := fd.Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil { t.Errorf("expected seek position %d, got %d and error %v", size+3, n, err) } // Make sure negative offsets work with SEEK_CUR. - if n, err := fd.Impl().Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil { + if n, err := fd.Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil { t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err) } // EINVAL should be returned if the resulting offset is negative. - if _, err := fd.Impl().Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL { + if _, err := fd.Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL { t.Errorf("expected error EINVAL but got %v", err) } // Make sure SEEK_END works with regular files. - switch fd.Impl().(type) { - case *regularFileFD: + if _, ok := fd.Impl().(*regularFileFD); ok { // Seek back to 0. - if n, err := fd.Impl().Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil { + if n, err := fd.Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil { t.Errorf("expected seek position %d, got %d and error %v", 0, n, err) } // Seek forward beyond EOF. - if n, err := fd.Impl().Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil { + if n, err := fd.Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil { t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err) } // EINVAL should be returned if the resulting offset is negative. - if _, err := fd.Impl().Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL { + if _, err := fd.Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL { t.Errorf("expected error EINVAL but got %v", err) } } @@ -456,7 +455,7 @@ func TestRead(t *testing.T) { want := make([]byte, 1) for { n, err := f.Read(want) - fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}) + fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}) if diff := cmp.Diff(got, want); diff != "" { t.Errorf("file data mismatch (-want +got):\n%s", diff) @@ -464,7 +463,7 @@ func TestRead(t *testing.T) { // Make sure there is no more file data left after getting EOF. if n == 0 || err == io.EOF { - if n, _ := fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 { + if n, _ := fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 { t.Errorf("extra unexpected file data in file %s in image %s", test.absPath, test.image) } @@ -509,27 +508,27 @@ func TestIterDirents(t *testing.T) { } wantDirents := []vfs.Dirent{ - vfs.Dirent{ + { Name: ".", Type: linux.DT_DIR, }, - vfs.Dirent{ + { Name: "..", Type: linux.DT_DIR, }, - vfs.Dirent{ + { Name: "lost+found", Type: linux.DT_DIR, }, - vfs.Dirent{ + { Name: "file.txt", Type: linux.DT_REG, }, - vfs.Dirent{ + { Name: "bigfile.txt", Type: linux.DT_REG, }, - vfs.Dirent{ + { Name: "symlink.txt", Type: linux.DT_LNK, }, @@ -574,7 +573,7 @@ func TestIterDirents(t *testing.T) { } cb := &iterDirentsCb{} - if err = fd.Impl().IterDirents(ctx, cb); err != nil { + if err = fd.IterDirents(ctx, cb); err != nil { t.Fatalf("dir fd.IterDirents() failed: %v", err) } @@ -584,7 +583,7 @@ func TestIterDirents(t *testing.T) { // Ignore the inode number and offset of dirents because those are likely to // change as the underlying image changes. cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool { - return p.String() == "Ino" || p.String() == "Off" + return p.String() == "Ino" || p.String() == "NextOff" }, cmp.Ignore()) if diff := cmp.Diff(cb.dirents, test.want, cmpIgnoreFields); diff != "" { t.Errorf("dirents mismatch (-want +got):\n%s", diff) diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go index 38b68a2d3..3d3ebaca6 100644 --- a/pkg/sentry/fsimpl/ext/extent_file.go +++ b/pkg/sentry/fsimpl/ext/extent_file.go @@ -99,7 +99,7 @@ func (f *extentFile) buildExtTree() error { func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*disklayout.ExtentNode, error) { var header disklayout.ExtentHeader off := entry.PhysicalBlock() * f.regFile.inode.blkSize - err := readFromDisk(f.regFile.inode.dev, int64(off), &header) + err := readFromDisk(f.regFile.inode.fs.dev, int64(off), &header) if err != nil { return nil, err } @@ -115,7 +115,7 @@ func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*diskla curEntry = &disklayout.ExtentIdx{} } - err := readFromDisk(f.regFile.inode.dev, int64(off), curEntry) + err := readFromDisk(f.regFile.inode.fs.dev, int64(off), curEntry) if err != nil { return nil, err } @@ -229,7 +229,7 @@ func (f *extentFile) readFromExtent(ex *disklayout.Extent, off uint64, dst []byt toRead = len(dst) } - n, _ := f.regFile.inode.dev.ReadAt(dst[:toRead], int64(readStart)) + n, _ := f.regFile.inode.fs.dev.ReadAt(dst[:toRead], int64(readStart)) if n < toRead { return n, syserror.EIO } diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go index 42d0a484b..a2382daa3 100644 --- a/pkg/sentry/fsimpl/ext/extent_test.go +++ b/pkg/sentry/fsimpl/ext/extent_test.go @@ -180,13 +180,15 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, [] mockExtentFile := &extentFile{ regFile: regularFile{ inode: inode{ + fs: &filesystem{ + dev: bytes.NewReader(mockDisk), + }, diskInode: &disklayout.InodeNew{ InodeOld: disklayout.InodeOld{ SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root), }, }, blkSize: mockExtentBlkSize, - dev: bytes.NewReader(mockDisk), }, }, } diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go index a0065343b..5eca2b83f 100644 --- a/pkg/sentry/fsimpl/ext/file_description.go +++ b/pkg/sentry/fsimpl/ext/file_description.go @@ -36,16 +36,13 @@ type fileDescription struct { } func (fd *fileDescription) filesystem() *filesystem { - return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem) + return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) } func (fd *fileDescription) inode() *inode { - return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode + return fd.vfsfd.Dentry().Impl().(*dentry).inode } -// OnClose implements vfs.FileDescriptionImpl.OnClose. -func (fd *fileDescription) OnClose() error { return nil } - // StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) { return fd.flags, nil diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 2d15e8aaf..d7e87979a 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -20,6 +20,7 @@ import ( "sync" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -441,3 +442,46 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return syserror.EROFS } + +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { + _, _, err := fs.walk(rp, false) + if err != nil { + return nil, err + } + return nil, syserror.ENOTSUP +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { + _, _, err := fs.walk(rp, false) + if err != nil { + return "", err + } + return "", syserror.ENOTSUP +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + _, _, err := fs.walk(rp, false) + if err != nil { + return err + } + return syserror.ENOTSUP +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. +func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + _, _, err := fs.walk(rp, false) + if err != nil { + return err + } + return syserror.ENOTSUP +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + return vfs.GenericPrependPath(vfsroot, vd, b) +} diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index e6c847a71..24249525c 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -16,7 +16,6 @@ package ext import ( "fmt" - "io" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -42,13 +41,13 @@ type inode struct { // refs is a reference count. refs is accessed using atomic memory operations. refs int64 + // fs is the containing filesystem. + fs *filesystem + // inodeNum is the inode number of this inode on disk. This is used to // identify inodes within the ext filesystem. inodeNum uint32 - // dev represents the underlying device. Same as filesystem.dev. - dev io.ReaderAt - // blkSize is the fs data block size. Same as filesystem.sb.BlockSize(). blkSize uint64 @@ -81,10 +80,10 @@ func (in *inode) tryIncRef() bool { // decRef decrements the inode ref count and releases the inode resources if // the ref count hits 0. // -// Precondition: Must have locked fs.mu. -func (in *inode) decRef(fs *filesystem) { +// Precondition: Must have locked filesystem.mu. +func (in *inode) decRef() { if refs := atomic.AddInt64(&in.refs, -1); refs == 0 { - delete(fs.inodeCache, in.inodeNum) + delete(in.fs.inodeCache, in.inodeNum) } else if refs < 0 { panic("ext.inode.decRef() called without holding a reference") } @@ -117,8 +116,8 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { // Build the inode based on its type. inode := inode{ + fs: fs, inodeNum: inodeNum, - dev: fs.dev, blkSize: blkSize, diskInode: diskInode, } @@ -154,11 +153,14 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v if err := in.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } + mnt := rp.Mount() switch in.impl.(type) { case *regularFile: var fd regularFileFD fd.flags = flags - fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + mnt.IncRef() + vfsd.IncRef() + fd.vfsfd.Init(&fd, mnt, vfsd) return &fd.vfsfd, nil case *directory: // Can't open directories writably. This check is not necessary for a read @@ -167,8 +169,10 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.EISDIR } var fd directoryFD - fd.vfsfd.Init(&fd, rp.Mount(), vfsd) fd.flags = flags + mnt.IncRef() + vfsd.IncRef() + fd.vfsfd.Init(&fd, mnt, vfsd) return &fd.vfsfd, nil case *symlink: if flags&linux.O_PATH == 0 { @@ -177,7 +181,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v } var fd symlinkFD fd.flags = flags - fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + mnt.IncRef() + vfsd.IncRef() + fd.vfsfd.Init(&fd, mnt, vfsd) return &fd.vfsfd, nil default: panic(fmt.Sprintf("unknown inode type: %T", in.impl)) diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go index ffc76ba5b..aec33e00a 100644 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -152,8 +152,8 @@ func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) ( return offset, nil } -// IterDirents implements vfs.FileDescriptionImpl.IterDirents. -func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error { +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { // TODO(b/134676337): Implement mmap(2). return syserror.ENODEV } diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go index e06548a98..bdf8705c1 100644 --- a/pkg/sentry/fsimpl/ext/symlink.go +++ b/pkg/sentry/fsimpl/ext/symlink.go @@ -105,7 +105,7 @@ func (fd *symlinkFD) Seek(ctx context.Context, offset int64, whence int32) (int6 return 0, syserror.EBADF } -// IterDirents implements vfs.FileDescriptionImpl.IterDirents. -func (fd *symlinkFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error { +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *symlinkFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return syserror.EBADF } diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD new file mode 100644 index 000000000..52596c090 --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -0,0 +1,60 @@ +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "slot_list", + out = "slot_list.go", + package = "kernfs", + prefix = "slot", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*slot", + "Linker": "*slot", + }, +) + +go_library( + name = "kernfs", + srcs = [ + "dynamic_bytes_file.go", + "fd_impl_util.go", + "filesystem.go", + "inode_impl_util.go", + "kernfs.go", + "slot_list.go", + ], + importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/fspath", + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/context", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", + "//pkg/sentry/usermem", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) + +go_test( + name = "kernfs_test", + size = "small", + srcs = ["kernfs_test.go"], + deps = [ + ":kernfs", + "//pkg/abi/linux", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/usermem", + "//pkg/sentry/vfs", + "//pkg/syserror", + "@com_github_google_go-cmp//cmp:go_default_library", + ], +) diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go new file mode 100644 index 000000000..30c06baf0 --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -0,0 +1,131 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// DynamicBytesFile implements kernfs.Inode and represents a read-only +// file whose contents are backed by a vfs.DynamicBytesSource. +// +// Must be initialized with Init before first use. +type DynamicBytesFile struct { + InodeAttrs + InodeNoopRefCount + InodeNotDirectory + InodeNotSymlink + + data vfs.DynamicBytesSource +} + +// Init intializes a dynamic bytes file. +func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource) { + f.InodeAttrs.Init(creds, ino, linux.ModeRegular|0444) + f.data = data +} + +// Open implements Inode.Open. +func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &DynamicBytesFD{} + fd.Init(rp.Mount(), vfsd, f.data, flags) + return &fd.vfsfd, nil +} + +// SetStat implements Inode.SetStat. +func (f *DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { + // DynamicBytesFiles are immutable. + return syserror.EPERM +} + +// DynamicBytesFD implements vfs.FileDescriptionImpl for an FD backed by a +// DynamicBytesFile. +// +// Must be initialized with Init before first use. +type DynamicBytesFD struct { + vfs.FileDescriptionDefaultImpl + vfs.DynamicBytesFileDescriptionImpl + + vfsfd vfs.FileDescription + inode Inode + flags uint32 +} + +// Init initializes a DynamicBytesFD. +func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) { + m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. + d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. + fd.flags = flags + fd.inode = d.Impl().(*Dentry).inode + fd.SetDataSource(data) + fd.vfsfd.Init(fd, m, d) +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *DynamicBytesFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence) +} + +// Read implmenets vfs.FileDescriptionImpl.Read. +func (fd *DynamicBytesFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts) +} + +// PRead implmenets vfs.FileDescriptionImpl.PRead. +func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *DynamicBytesFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return fd.FileDescriptionDefaultImpl.Write(ctx, src, opts) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return fd.FileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts) +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *DynamicBytesFD) Release() {} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() + return fd.inode.Stat(fs), nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error { + // DynamicBytesFiles are immutable. + return syserror.EPERM +} + +// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. +func (fd *DynamicBytesFD) StatusFlags(ctx context.Context) (uint32, error) { + return fd.flags, nil +} + +// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. +func (fd *DynamicBytesFD) SetStatusFlags(ctx context.Context, flags uint32) error { + // None of the flags settable by fcntl(F_SETFL) are supported, so this is a + // no-op. + return nil +} diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go new file mode 100644 index 000000000..d6c18937a --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -0,0 +1,207 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory +// inode that uses OrderChildren to track child nodes. GenericDirectoryFD is not +// compatible with dynamic directories. +// +// Note that GenericDirectoryFD holds a lock over OrderedChildren while calling +// IterDirents callback. The IterDirents callback therefore cannot hash or +// unhash children, or recursively call IterDirents on the same underlying +// inode. +// +// Must be initialize with Init before first use. +type GenericDirectoryFD struct { + vfs.FileDescriptionDefaultImpl + vfs.DirectoryFileDescriptionDefaultImpl + + vfsfd vfs.FileDescription + children *OrderedChildren + flags uint32 + off int64 +} + +// Init initializes a GenericDirectoryFD. +func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) { + m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. + d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. + fd.children = children + fd.flags = flags + fd.vfsfd.Init(fd, m, d) +} + +// VFSFileDescription returns a pointer to the vfs.FileDescription representing +// this object. +func (fd *GenericDirectoryFD) VFSFileDescription() *vfs.FileDescription { + return &fd.vfsfd +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *GenericDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return fd.FileDescriptionDefaultImpl.ConfigureMMap(ctx, opts) +} + +// Read implmenets vfs.FileDescriptionImpl.Read. +func (fd *GenericDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return fd.DirectoryFileDescriptionDefaultImpl.Read(ctx, dst, opts) +} + +// PRead implmenets vfs.FileDescriptionImpl.PRead. +func (fd *GenericDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return fd.DirectoryFileDescriptionDefaultImpl.PRead(ctx, dst, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *GenericDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return fd.DirectoryFileDescriptionDefaultImpl.Write(ctx, src, opts) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts) +} + +// Release implements vfs.FileDecriptionImpl.Release. +func (fd *GenericDirectoryFD) Release() {} + +func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem { + return fd.vfsfd.VirtualDentry().Mount().Filesystem() +} + +func (fd *GenericDirectoryFD) inode() Inode { + return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode +} + +// IterDirents implements vfs.FileDecriptionImpl.IterDirents. IterDirents holds +// o.mu when calling cb. +func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + vfsFS := fd.filesystem() + fs := vfsFS.Impl().(*Filesystem) + vfsd := fd.vfsfd.VirtualDentry().Dentry() + + fs.mu.Lock() + defer fs.mu.Unlock() + + // Handle ".". + if fd.off == 0 { + stat := fd.inode().Stat(vfsFS) + dirent := vfs.Dirent{ + Name: ".", + Type: linux.DT_DIR, + Ino: stat.Ino, + NextOff: 1, + } + if !cb.Handle(dirent) { + return nil + } + fd.off++ + } + + // Handle "..". + if fd.off == 1 { + parentInode := vfsd.ParentOrSelf().Impl().(*Dentry).inode + stat := parentInode.Stat(vfsFS) + dirent := vfs.Dirent{ + Name: "..", + Type: linux.FileMode(stat.Mode).DirentType(), + Ino: stat.Ino, + NextOff: 2, + } + if !cb.Handle(dirent) { + return nil + } + fd.off++ + } + + // Handle static children. + fd.children.mu.RLock() + defer fd.children.mu.RUnlock() + // fd.off accounts for "." and "..", but fd.children do not track + // these. + childIdx := fd.off - 2 + for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() { + inode := it.Dentry.Impl().(*Dentry).inode + stat := inode.Stat(vfsFS) + dirent := vfs.Dirent{ + Name: it.Name, + Type: linux.FileMode(stat.Mode).DirentType(), + Ino: stat.Ino, + NextOff: fd.off + 1, + } + if !cb.Handle(dirent) { + return nil + } + fd.off++ + } + + return nil +} + +// Seek implements vfs.FileDecriptionImpl.Seek. +func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fs := fd.filesystem().Impl().(*Filesystem) + fs.mu.Lock() + defer fs.mu.Unlock() + + switch whence { + case linux.SEEK_SET: + // Use offset as given. + case linux.SEEK_CUR: + offset += fd.off + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. +func (fd *GenericDirectoryFD) StatusFlags(ctx context.Context) (uint32, error) { + return fd.flags, nil +} + +// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. +func (fd *GenericDirectoryFD) SetStatusFlags(ctx context.Context, flags uint32) error { + // None of the flags settable by fcntl(F_SETFL) are supported, so this is a + // no-op. + return nil +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := fd.filesystem() + inode := fd.inode() + return inode.Stat(fs), nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + fs := fd.filesystem() + inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode + return inode.SetStat(fs, opts) +} diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go new file mode 100644 index 000000000..3cbbe4b20 --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -0,0 +1,743 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file implements vfs.FilesystemImpl for kernfs. + +package kernfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// stepExistingLocked resolves rp.Component() in parent directory vfsd. +// +// stepExistingLocked is loosely analogous to fs/namei.c:walk_component(). +// +// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done(). +// +// Postcondition: Caller must call fs.processDeferredDecRefs*. +func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) (*vfs.Dentry, error) { + d := vfsd.Impl().(*Dentry) + if !d.isDir() { + return nil, syserror.ENOTDIR + } + // Directory searchable? + if err := d.inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } +afterSymlink: + d.dirMu.Lock() + nextVFSD, err := rp.ResolveComponent(vfsd) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + if nextVFSD != nil { + // Cached dentry exists, revalidate. + next := nextVFSD.Impl().(*Dentry) + if !next.inode.Valid(ctx) { + d.dirMu.Lock() + rp.VirtualFilesystem().ForceDeleteDentry(nextVFSD) + d.dirMu.Unlock() + fs.deferDecRef(nextVFSD) // Reference from Lookup. + nextVFSD = nil + } + } + if nextVFSD == nil { + // Dentry isn't cached; it either doesn't exist or failed + // revalidation. Attempt to resolve it via Lookup. + name := rp.Component() + var err error + nextVFSD, err = d.inode.Lookup(ctx, name) + // Reference on nextVFSD dropped by a corresponding Valid. + if err != nil { + return nil, err + } + d.InsertChild(name, nextVFSD) + } + next := nextVFSD.Impl().(*Dentry) + + // Resolve any symlink at current path component. + if rp.ShouldFollowSymlink() && d.isSymlink() { + // TODO: VFS2 needs something extra for /proc/[pid]/fd/ "magic symlinks". + target, err := next.inode.Readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + goto afterSymlink + + } + rp.Advance() + return nextVFSD, nil +} + +// walkExistingLocked resolves rp to an existing file. +// +// walkExistingLocked is loosely analogous to Linux's +// fs/namei.c:path_lookupat(). +// +// Preconditions: Filesystem.mu must be locked for at least reading. +// +// Postconditions: Caller must call fs.processDeferredDecRefs*. +func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) { + vfsd := rp.Start() + for !rp.Done() { + var err error + vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd) + if err != nil { + return nil, nil, err + } + } + d := vfsd.Impl().(*Dentry) + if rp.MustBeDir() && !d.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, d.inode, nil +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory. It does not check that the returned directory is +// searchable by the provider of rp. +// +// walkParentDirLocked is loosely analogous to Linux's +// fs/namei.c:path_parentat(). +// +// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done(). +// +// Postconditions: Caller must call fs.processDeferredDecRefs*. +func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) { + vfsd := rp.Start() + for !rp.Final() { + var err error + vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd) + if err != nil { + return nil, nil, err + } + } + d := vfsd.Impl().(*Dentry) + if !d.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, d.inode, nil +} + +// checkCreateLocked checks that a file named rp.Component() may be created in +// directory parentVFSD, then returns rp.Component(). +// +// Preconditions: Filesystem.mu must be locked for at least reading. parentInode +// == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true. +func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) { + if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return "", err + } + pc := rp.Component() + if pc == "." || pc == ".." { + return "", syserror.EEXIST + } + childVFSD, err := rp.ResolveChild(parentVFSD, pc) + if err != nil { + return "", err + } + if childVFSD != nil { + return "", syserror.EEXIST + } + if parentVFSD.IsDisowned() { + return "", syserror.ENOENT + } + return pc, nil +} + +// checkDeleteLocked checks that the file represented by vfsd may be deleted. +// +// Preconditions: Filesystem.mu must be locked for at least reading. +func checkDeleteLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error { + parentVFSD := vfsd.Parent() + if parentVFSD == nil { + return syserror.EBUSY + } + if parentVFSD.IsDisowned() { + return syserror.ENOENT + } + if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + return nil +} + +// checkRenameLocked checks that a rename operation may be performed on the +// target dentry across the given set of parent directories. The target dentry +// may be nil. +// +// Precondition: isDir(dstInode) == true. +func checkRenameLocked(creds *auth.Credentials, src, dstDir *vfs.Dentry, dstInode Inode) error { + srcDir := src.Parent() + if srcDir == nil { + return syserror.EBUSY + } + if srcDir.IsDisowned() { + return syserror.ENOENT + } + if dstDir.IsDisowned() { + return syserror.ENOENT + } + // Check for creation permissions on dst dir. + if err := dstInode.CheckPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + + return nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *Filesystem) Release() { +} + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *Filesystem) Sync(ctx context.Context) error { + // All filesystem state is in-memory. + return nil +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + fs.mu.RLock() + defer fs.processDeferredDecRefs() + defer fs.mu.RUnlock() + vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + if err != nil { + return nil, err + } + + if opts.CheckSearchable { + d := vfsd.Impl().(*Dentry) + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + } + vfsd.IncRef() // Ownership transferred to caller. + return vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if rp.Mount() != vd.Mount() { + return syserror.EXDEV + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + + d := vd.Dentry().Impl().(*Dentry) + if d.isDir() { + return syserror.EPERM + } + + child, err := parentInode.NewLink(ctx, pc, d.inode) + if err != nil { + return err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + return nil +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + child, err := parentInode.NewDir(ctx, pc, opts) + if err != nil { + return err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + return nil +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. +func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + new, err := parentInode.NewNode(ctx, pc, opts) + if err != nil { + return err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, new) + return nil +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // Filter out flags that are not supported by kernfs. O_DIRECTORY and + // O_NOFOLLOW have no effect here (they're handled by VFS by setting + // appropriate bits in rp), but are returned by + // FileDescriptionImpl.StatusFlags(). + opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW + ats := vfs.AccessTypesForOpenFlags(opts.Flags) + + // Do not create new file. + if opts.Flags&linux.O_CREAT == 0 { + fs.mu.RLock() + defer fs.processDeferredDecRefs() + defer fs.mu.RUnlock() + vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + if err != nil { + return nil, err + } + if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + return inode.Open(rp, vfsd, opts.Flags) + } + + // May create new file. + mustCreate := opts.Flags&linux.O_EXCL != 0 + vfsd := rp.Start() + inode := vfsd.Impl().(*Dentry).inode + fs.mu.Lock() + defer fs.mu.Unlock() + if rp.Done() { + if rp.MustBeDir() { + return nil, syserror.EISDIR + } + if mustCreate { + return nil, syserror.EEXIST + } + if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + return inode.Open(rp, vfsd, opts.Flags) + } +afterTrailingSymlink: + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return nil, err + } + // Check for search permission in the parent directory. + if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + // Reject attempts to open directories with O_CREAT. + if rp.MustBeDir() { + return nil, syserror.EISDIR + } + pc := rp.Component() + if pc == "." || pc == ".." { + return nil, syserror.EISDIR + } + // Determine whether or not we need to create a file. + childVFSD, err := rp.ResolveChild(parentVFSD, pc) + if err != nil { + return nil, err + } + if childVFSD == nil { + // Already checked for searchability above; now check for writability. + if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return nil, err + } + defer rp.Mount().EndWrite() + // Create and open the child. + child, err := parentInode.NewFile(ctx, pc, opts) + if err != nil { + return nil, err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + return child.Impl().(*Dentry).inode.Open(rp, child, opts.Flags) + } + // Open existing file or follow symlink. + if mustCreate { + return nil, syserror.EEXIST + } + childDentry := childVFSD.Impl().(*Dentry) + childInode := childDentry.inode + if rp.ShouldFollowSymlink() { + if childDentry.isSymlink() { + target, err := childInode.Readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + // rp.Final() may no longer be true since we now need to resolve the + // symlink target. + goto afterTrailingSymlink + } + } + if err := childInode.CheckPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + return childInode.Open(rp, childVFSD, opts.Flags) +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + fs.mu.RLock() + d, inode, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return "", err + } + if !d.Impl().(*Dentry).isSymlink() { + return "", syserror.EINVAL + } + return inode.Readlink(ctx) +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error { + noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 + exchange := opts.Flags&linux.RENAME_EXCHANGE != 0 + whiteout := opts.Flags&linux.RENAME_WHITEOUT != 0 + if exchange && (noReplace || whiteout) { + // Can't specify RENAME_NOREPLACE or RENAME_WHITEOUT with RENAME_EXCHANGE. + return syserror.EINVAL + } + if exchange || whiteout { + // Exchange and Whiteout flags are not supported on kernfs. + return syserror.EINVAL + } + + fs.mu.Lock() + defer fs.mu.Lock() + + mnt := rp.Mount() + if mnt != vd.Mount() { + return syserror.EXDEV + } + + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + + dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + + srcVFSD := vd.Dentry() + srcDirVFSD := srcVFSD.Parent() + + // Can we remove the src dentry? + if err := checkDeleteLocked(rp, srcVFSD); err != nil { + return err + } + + // Can we create the dst dentry? + var dstVFSD *vfs.Dentry + pc, err := checkCreateLocked(rp, dstDirVFSD, dstDirInode) + switch err { + case nil: + // Ok, continue with rename as replacement. + case syserror.EEXIST: + if noReplace { + // Won't overwrite existing node since RENAME_NOREPLACE was requested. + return syserror.EEXIST + } + dstVFSD, err = rp.ResolveChild(dstDirVFSD, pc) + if err != nil { + panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD)) + } + default: + return err + } + + mntns := vfs.MountNamespaceFromContext(ctx) + virtfs := rp.VirtualFilesystem() + + srcDirDentry := srcDirVFSD.Impl().(*Dentry) + dstDirDentry := dstDirVFSD.Impl().(*Dentry) + + // We can't deadlock here due to lock ordering because we're protected from + // concurrent renames by fs.mu held for writing. + srcDirDentry.dirMu.Lock() + defer srcDirDentry.dirMu.Unlock() + dstDirDentry.dirMu.Lock() + defer dstDirDentry.dirMu.Unlock() + + if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil { + return err + } + srcDirInode := srcDirDentry.inode + replaced, err := srcDirInode.Rename(ctx, srcVFSD.Name(), pc, srcVFSD, dstDirVFSD) + if err != nil { + virtfs.AbortRenameDentry(srcVFSD, dstVFSD) + return err + } + virtfs.CommitRenameReplaceDentry(srcVFSD, dstDirVFSD, pc, replaced) + return nil +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + if err := checkDeleteLocked(rp, vfsd); err != nil { + return err + } + if !vfsd.Impl().(*Dentry).isDir() { + return syserror.ENOTDIR + } + if inode.HasChildren() { + return syserror.ENOTEMPTY + } + virtfs := rp.VirtualFilesystem() + parentDentry := vfsd.Parent().Impl().(*Dentry) + parentDentry.dirMu.Lock() + defer parentDentry.dirMu.Unlock() + if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + return err + } + if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil { + virtfs.AbortDeleteDentry(vfsd) + return err + } + virtfs.CommitDeleteDentry(vfsd) + return nil +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + fs.mu.RLock() + _, inode, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return err + } + if opts.Stat.Mask == 0 { + return nil + } + return inode.SetStat(fs.VFSFilesystem(), opts) +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + fs.mu.RLock() + _, inode, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return linux.Statx{}, err + } + return inode.Stat(fs.VFSFilesystem()), nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return linux.Statfs{}, err + } + // TODO: actually implement statfs + return linux.Statfs{}, syserror.ENOSYS +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + child, err := parentInode.NewSymlink(ctx, pc, target) + if err != nil { + return err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + return nil +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + vfsd, _, err := fs.walkExistingLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + if err := checkDeleteLocked(rp, vfsd); err != nil { + return err + } + if vfsd.Impl().(*Dentry).isDir() { + return syserror.EISDIR + } + virtfs := rp.VirtualFilesystem() + parentDentry := vfsd.Parent().Impl().(*Dentry) + parentDentry.dirMu.Lock() + defer parentDentry.dirMu.Unlock() + if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + return err + } + if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil { + virtfs.AbortDeleteDentry(vfsd) + return err + } + virtfs.CommitDeleteDentry(vfsd) + return nil +} + +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return nil, err + } + // kernfs currently does not support extended attributes. + return nil, syserror.ENOTSUP +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return "", err + } + // kernfs currently does not support extended attributes. + return "", syserror.ENOTSUP +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return err + } + // kernfs currently does not support extended attributes. + return syserror.ENOTSUP +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. +func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return err + } + // kernfs currently does not support extended attributes. + return syserror.ENOTSUP +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + return vfs.GenericPrependPath(vfsroot, vd, b) +} diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go new file mode 100644 index 000000000..7b45b702a --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -0,0 +1,492 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "fmt" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// InodeNoopRefCount partially implements the Inode interface, specifically the +// inodeRefs sub interface. InodeNoopRefCount implements a simple reference +// count for inodes, performing no extra actions when references are obtained or +// released. This is suitable for simple file inodes that don't reference any +// resources. +type InodeNoopRefCount struct { +} + +// IncRef implements Inode.IncRef. +func (n *InodeNoopRefCount) IncRef() { +} + +// DecRef implements Inode.DecRef. +func (n *InodeNoopRefCount) DecRef() { +} + +// TryIncRef implements Inode.TryIncRef. +func (n *InodeNoopRefCount) TryIncRef() bool { + return true +} + +// Destroy implements Inode.Destroy. +func (n *InodeNoopRefCount) Destroy() { +} + +// InodeDirectoryNoNewChildren partially implements the Inode interface. +// InodeDirectoryNoNewChildren represents a directory inode which does not +// support creation of new children. +type InodeDirectoryNoNewChildren struct{} + +// NewFile implements Inode.NewFile. +func (*InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// NewDir implements Inode.NewDir. +func (*InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// NewLink implements Inode.NewLink. +func (*InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// NewSymlink implements Inode.NewSymlink. +func (*InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// NewNode implements Inode.NewNode. +func (*InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// InodeNotDirectory partially implements the Inode interface, specifically the +// inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not +// represent directories can embed this to provide no-op implementations for +// directory-related functions. +type InodeNotDirectory struct { +} + +// HasChildren implements Inode.HasChildren. +func (*InodeNotDirectory) HasChildren() bool { + return false +} + +// NewFile implements Inode.NewFile. +func (*InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { + panic("NewFile called on non-directory inode") +} + +// NewDir implements Inode.NewDir. +func (*InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { + panic("NewDir called on non-directory inode") +} + +// NewLink implements Inode.NewLinkink. +func (*InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { + panic("NewLink called on non-directory inode") +} + +// NewSymlink implements Inode.NewSymlink. +func (*InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { + panic("NewSymlink called on non-directory inode") +} + +// NewNode implements Inode.NewNode. +func (*InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { + panic("NewNode called on non-directory inode") +} + +// Unlink implements Inode.Unlink. +func (*InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error { + panic("Unlink called on non-directory inode") +} + +// RmDir implements Inode.RmDir. +func (*InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error { + panic("RmDir called on non-directory inode") +} + +// Rename implements Inode.Rename. +func (*InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) { + panic("Rename called on non-directory inode") +} + +// Lookup implements Inode.Lookup. +func (*InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + panic("Lookup called on non-directory inode") +} + +// Valid implements Inode.Valid. +func (*InodeNotDirectory) Valid(context.Context) bool { + return true +} + +// InodeNoDynamicLookup partially implements the Inode interface, specifically +// the inodeDynamicLookup sub interface. Directory inodes that do not support +// dymanic entries (i.e. entries that are not "hashed" into the +// vfs.Dentry.children) can embed this to provide no-op implementations for +// functions related to dynamic entries. +type InodeNoDynamicLookup struct{} + +// Lookup implements Inode.Lookup. +func (*InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + return nil, syserror.ENOENT +} + +// Valid implements Inode.Valid. +func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool { + return true +} + +// InodeNotSymlink partially implements the Inode interface, specifically the +// inodeSymlink sub interface. All inodes that are not symlinks may embed this +// to return the appropriate errors from symlink-related functions. +type InodeNotSymlink struct{} + +// Readlink implements Inode.Readlink. +func (*InodeNotSymlink) Readlink(context.Context) (string, error) { + return "", syserror.EINVAL +} + +// InodeAttrs partially implements the Inode interface, specifically the +// inodeMetadata sub interface. InodeAttrs provides functionality related to +// inode attributes. +// +// Must be initialized by Init prior to first use. +type InodeAttrs struct { + ino uint64 + mode uint32 + uid uint32 + gid uint32 + nlink uint32 +} + +// Init initializes this InodeAttrs. +func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMode) { + if mode.FileType() == 0 { + panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode)) + } + + nlink := uint32(1) + if mode.FileType() == linux.ModeDirectory { + nlink = 2 + } + atomic.StoreUint64(&a.ino, ino) + atomic.StoreUint32(&a.mode, uint32(mode)) + atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID)) + atomic.StoreUint32(&a.gid, uint32(creds.EffectiveKGID)) + atomic.StoreUint32(&a.nlink, nlink) +} + +// Mode implements Inode.Mode. +func (a *InodeAttrs) Mode() linux.FileMode { + return linux.FileMode(atomic.LoadUint32(&a.mode)) +} + +// Stat partially implements Inode.Stat. Note that this function doesn't provide +// all the stat fields, and the embedder should consider extending the result +// with filesystem-specific fields. +func (a *InodeAttrs) Stat(*vfs.Filesystem) linux.Statx { + var stat linux.Statx + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK + stat.Ino = atomic.LoadUint64(&a.ino) + stat.Mode = uint16(a.Mode()) + stat.UID = atomic.LoadUint32(&a.uid) + stat.GID = atomic.LoadUint32(&a.gid) + stat.Nlink = atomic.LoadUint32(&a.nlink) + + // TODO: Implement other stat fields like timestamps. + + return stat +} + +// SetStat implements Inode.SetStat. +func (a *InodeAttrs) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { + stat := opts.Stat + if stat.Mask&linux.STATX_MODE != 0 { + for { + old := atomic.LoadUint32(&a.mode) + new := old | uint32(stat.Mode & ^uint16(linux.S_IFMT)) + if swapped := atomic.CompareAndSwapUint32(&a.mode, old, new); swapped { + break + } + } + } + + if stat.Mask&linux.STATX_UID != 0 { + atomic.StoreUint32(&a.uid, stat.UID) + } + if stat.Mask&linux.STATX_GID != 0 { + atomic.StoreUint32(&a.gid, stat.GID) + } + + // Note that not all fields are modifiable. For example, the file type and + // inode numbers are immutable after node creation. + + // TODO: Implement other stat fields like timestamps. + + return nil +} + +// CheckPermissions implements Inode.CheckPermissions. +func (a *InodeAttrs) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + mode := a.Mode() + return vfs.GenericCheckPermissions( + creds, + ats, + mode.FileType() == linux.ModeDirectory, + uint16(mode), + auth.KUID(atomic.LoadUint32(&a.uid)), + auth.KGID(atomic.LoadUint32(&a.gid)), + ) +} + +// IncLinks implements Inode.IncLinks. +func (a *InodeAttrs) IncLinks(n uint32) { + if atomic.AddUint32(&a.nlink, n) <= n { + panic("InodeLink.IncLinks called with no existing links") + } +} + +// DecLinks implements Inode.DecLinks. +func (a *InodeAttrs) DecLinks() { + if nlink := atomic.AddUint32(&a.nlink, ^uint32(0)); nlink == ^uint32(0) { + // Negative overflow + panic("Inode.DecLinks called at 0 links") + } +} + +type slot struct { + Name string + Dentry *vfs.Dentry + slotEntry +} + +// OrderedChildrenOptions contains initialization options for OrderedChildren. +type OrderedChildrenOptions struct { + // Writable indicates whether vfs.FilesystemImpl methods implemented by + // OrderedChildren may modify the tracked children. This applies to + // operations related to rename, unlink and rmdir. If an OrderedChildren is + // not writable, these operations all fail with EPERM. + Writable bool +} + +// OrderedChildren partially implements the Inode interface. OrderedChildren can +// be embedded in directory inodes to keep track of the children in the +// directory, and can then be used to implement a generic directory FD -- see +// GenericDirectoryFD. OrderedChildren is not compatible with dynamic +// directories. +// +// Must be initialize with Init before first use. +type OrderedChildren struct { + refs.AtomicRefCount + + // Can children be modified by user syscalls? It set to false, interface + // methods that would modify the children return EPERM. Immutable. + writable bool + + mu sync.RWMutex + order slotList + set map[string]*slot +} + +// Init initializes an OrderedChildren. +func (o *OrderedChildren) Init(opts OrderedChildrenOptions) { + o.writable = opts.Writable + o.set = make(map[string]*slot) +} + +// DecRef implements Inode.DecRef. +func (o *OrderedChildren) DecRef() { + o.AtomicRefCount.DecRefWithDestructor(o.Destroy) +} + +// Destroy cleans up resources referenced by this OrderedChildren. +func (o *OrderedChildren) Destroy() { + o.mu.Lock() + defer o.mu.Unlock() + o.order.Reset() + o.set = nil +} + +// Populate inserts children into this OrderedChildren, and d's dentry +// cache. Populate returns the number of directories inserted, which the caller +// may use to update the link count for the parent directory. +// +// Precondition: d.Impl() must be a kernfs Dentry. d must represent a directory +// inode. children must not contain any conflicting entries already in o. +func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 { + var links uint32 + for name, child := range children { + if child.isDir() { + links++ + } + if err := o.Insert(name, child.VFSDentry()); err != nil { + panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d)) + } + d.InsertChild(name, child.VFSDentry()) + } + return links +} + +// HasChildren implements Inode.HasChildren. +func (o *OrderedChildren) HasChildren() bool { + o.mu.RLock() + defer o.mu.RUnlock() + return len(o.set) > 0 +} + +// Insert inserts child into o. This ignores the writability of o, as this is +// not part of the vfs.FilesystemImpl interface, and is a lower-level operation. +func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error { + o.mu.Lock() + defer o.mu.Unlock() + if _, ok := o.set[name]; ok { + return syserror.EEXIST + } + s := &slot{ + Name: name, + Dentry: child, + } + o.order.PushBack(s) + o.set[name] = s + return nil +} + +// Precondition: caller must hold o.mu for writing. +func (o *OrderedChildren) removeLocked(name string) { + if s, ok := o.set[name]; ok { + delete(o.set, name) + o.order.Remove(s) + } +} + +// Precondition: caller must hold o.mu for writing. +func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry { + if s, ok := o.set[name]; ok { + // Existing slot with given name, simply replace the dentry. + var old *vfs.Dentry + old, s.Dentry = s.Dentry, new + return old + } + + // No existing slot with given name, create and hash new slot. + s := &slot{ + Name: name, + Dentry: new, + } + o.order.PushBack(s) + o.set[name] = s + return nil +} + +// Precondition: caller must hold o.mu for reading or writing. +func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error { + s, ok := o.set[name] + if !ok { + return syserror.ENOENT + } + if s.Dentry != child { + panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! OrderedChild: %+v, vfs: %+v", s.Dentry, child)) + } + return nil +} + +// Unlink implements Inode.Unlink. +func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error { + if !o.writable { + return syserror.EPERM + } + o.mu.Lock() + defer o.mu.Unlock() + if err := o.checkExistingLocked(name, child); err != nil { + return err + } + o.removeLocked(name) + return nil +} + +// Rmdir implements Inode.Rmdir. +func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error { + // We're not responsible for checking that child is a directory, that it's + // empty, or updating any link counts; so this is the same as unlink. + return o.Unlink(ctx, name, child) +} + +type renameAcrossDifferentImplementationsError struct{} + +func (renameAcrossDifferentImplementationsError) Error() string { + return "rename across inodes with different implementations" +} + +// Rename implements Inode.Rename. +// +// Precondition: Rename may only be called across two directory inodes with +// identical implementations of Rename. Practically, this means filesystems that +// implement Rename by embedding OrderedChildren for any directory +// implementation must use OrderedChildren for all directory implementations +// that will support Rename. +// +// Postcondition: reference on any replaced dentry transferred to caller. +func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) { + dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren) + if !ok { + return nil, renameAcrossDifferentImplementationsError{} + } + if !o.writable || !dst.writable { + return nil, syserror.EPERM + } + // Note: There's a potential deadlock below if concurrent calls to Rename + // refer to the same src and dst directories in reverse. We avoid any + // ordering issues because the caller is required to serialize concurrent + // calls to Rename in accordance with the interface declaration. + o.mu.Lock() + defer o.mu.Unlock() + if dst != o { + dst.mu.Lock() + defer dst.mu.Unlock() + } + if err := o.checkExistingLocked(oldname, child); err != nil { + return nil, err + } + replaced := dst.replaceChildLocked(newname, child) + return replaced, nil +} + +// nthLocked returns an iterator to the nth child tracked by this object. The +// iterator is valid until the caller releases o.mu. Returns nil if the +// requested index falls out of bounds. +// +// Preconditon: Caller must hold o.mu for reading. +func (o *OrderedChildren) nthLocked(i int64) *slot { + for it := o.order.Front(); it != nil && i >= 0; it = it.Next() { + if i == 0 { + return it + } + i-- + } + return nil +} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go new file mode 100644 index 000000000..bb01c3d01 --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -0,0 +1,405 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package kernfs provides the tools to implement inode-based filesystems. +// Kernfs has two main features: +// +// 1. The Inode interface, which maps VFS2's path-based filesystem operations to +// specific filesystem nodes. Kernfs uses the Inode interface to provide a +// blanket implementation for the vfs.FilesystemImpl. Kernfs also serves as +// the synchronization mechanism for all filesystem operations by holding a +// filesystem-wide lock across all operations. +// +// 2. Various utility types which provide generic implementations for various +// parts of the Inode and vfs.FileDescription interfaces. Client filesystems +// based on kernfs can embed the appropriate set of these to avoid having to +// reimplement common filesystem operations. See inode_impl_util.go and +// fd_impl_util.go. +// +// Reference Model: +// +// Kernfs dentries represents named pointers to inodes. Dentries and inode have +// independent lifetimes and reference counts. A child dentry unconditionally +// holds a reference on its parent directory's dentry. A dentry also holds a +// reference on the inode it points to. Multiple dentries can point to the same +// inode (for example, in the case of hardlinks). File descriptors hold a +// reference to the dentry they're opened on. +// +// Dentries are guaranteed to exist while holding Filesystem.mu for +// reading. Dropping dentries require holding Filesystem.mu for writing. To +// queue dentries for destruction from a read critical section, see +// Filesystem.deferDecRef. +// +// Lock ordering: +// +// kernfs.Filesystem.mu +// kernfs.Dentry.dirMu +// vfs.VirtualFilesystem.mountMu +// vfs.Dentry.mu +// kernfs.Filesystem.droppedDentriesMu +// (inode implementation locks, if any) +package kernfs + +import ( + "fmt" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory +// filesystem. Concrete implementations are expected to embed this in their own +// Filesystem type. +type Filesystem struct { + vfsfs vfs.Filesystem + + droppedDentriesMu sync.Mutex + + // droppedDentries is a list of dentries waiting to be DecRef()ed. This is + // used to defer dentry destruction until mu can be acquired for + // writing. Protected by droppedDentriesMu. + droppedDentries []*vfs.Dentry + + // mu synchronizes the lifetime of Dentries on this filesystem. Holding it + // for reading guarantees continued existence of any resolved dentries, but + // the dentry tree may be modified. + // + // Kernfs dentries can only be DecRef()ed while holding mu for writing. For + // example: + // + // fs.mu.Lock() + // defer fs.mu.Unlock() + // ... + // dentry1.DecRef() + // defer dentry2.DecRef() // Ok, will run before Unlock. + // + // If discarding dentries in a read context, use Filesystem.deferDecRef. For + // example: + // + // fs.mu.RLock() + // fs.mu.processDeferredDecRefs() + // defer fs.mu.RUnlock() + // ... + // fs.deferDecRef(dentry) + mu sync.RWMutex + + // nextInoMinusOne is used to to allocate inode numbers on this + // filesystem. Must be accessed by atomic operations. + nextInoMinusOne uint64 +} + +// deferDecRef defers dropping a dentry ref until the next call to +// processDeferredDecRefs{,Locked}. See comment on Filesystem.mu. +// +// Precondition: d must not already be pending destruction. +func (fs *Filesystem) deferDecRef(d *vfs.Dentry) { + fs.droppedDentriesMu.Lock() + fs.droppedDentries = append(fs.droppedDentries, d) + fs.droppedDentriesMu.Unlock() +} + +// processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the +// droppedDentries list. See comment on Filesystem.mu. +func (fs *Filesystem) processDeferredDecRefs() { + fs.mu.Lock() + fs.processDeferredDecRefsLocked() + fs.mu.Unlock() +} + +// Precondition: fs.mu must be held for writing. +func (fs *Filesystem) processDeferredDecRefsLocked() { + fs.droppedDentriesMu.Lock() + for _, d := range fs.droppedDentries { + d.DecRef() + } + fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse. + fs.droppedDentriesMu.Unlock() +} + +// Init initializes a kernfs filesystem. This should be called from during +// vfs.FilesystemType.NewFilesystem for the concrete filesystem embedding +// kernfs. +func (fs *Filesystem) Init(vfsObj *vfs.VirtualFilesystem) { + fs.vfsfs.Init(vfsObj, fs) +} + +// VFSFilesystem returns the generic vfs filesystem object. +func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem { + return &fs.vfsfs +} + +// NextIno allocates a new inode number on this filesystem. +func (fs *Filesystem) NextIno() uint64 { + return atomic.AddUint64(&fs.nextInoMinusOne, 1) +} + +// These consts are used in the Dentry.flags field. +const ( + // Dentry points to a directory inode. + dflagsIsDir = 1 << iota + + // Dentry points to a symlink inode. + dflagsIsSymlink +) + +// Dentry implements vfs.DentryImpl. +// +// A kernfs dentry is similar to a dentry in a traditional filesystem: it's a +// named reference to an inode. A dentry generally lives as long as it's part of +// a mounted filesystem tree. Kernfs doesn't cache dentries once all references +// to them are removed. Dentries hold a single reference to the inode they point +// to, and child dentries hold a reference on their parent. +// +// Must be initialized by Init prior to first use. +type Dentry struct { + refs.AtomicRefCount + + vfsd vfs.Dentry + inode Inode + + refs uint64 + + // flags caches useful information about the dentry from the inode. See the + // dflags* consts above. Must be accessed by atomic ops. + flags uint32 + + // dirMu protects vfsd.children for directory dentries. + dirMu sync.Mutex +} + +// Init initializes this dentry. +// +// Precondition: Caller must hold a reference on inode. +// +// Postcondition: Caller's reference on inode is transferred to the dentry. +func (d *Dentry) Init(inode Inode) { + d.vfsd.Init(d) + d.inode = inode + ftype := inode.Mode().FileType() + if ftype == linux.ModeDirectory { + d.flags |= dflagsIsDir + } + if ftype == linux.ModeSymlink { + d.flags |= dflagsIsSymlink + } +} + +// VFSDentry returns the generic vfs dentry for this kernfs dentry. +func (d *Dentry) VFSDentry() *vfs.Dentry { + return &d.vfsd +} + +// isDir checks whether the dentry points to a directory inode. +func (d *Dentry) isDir() bool { + return atomic.LoadUint32(&d.flags)&dflagsIsDir != 0 +} + +// isSymlink checks whether the dentry points to a symlink inode. +func (d *Dentry) isSymlink() bool { + return atomic.LoadUint32(&d.flags)&dflagsIsSymlink != 0 +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *Dentry) DecRef() { + d.AtomicRefCount.DecRefWithDestructor(d.destroy) +} + +// Precondition: Dentry must be removed from VFS' dentry cache. +func (d *Dentry) destroy() { + d.inode.DecRef() // IncRef from Init. + d.inode = nil + if parent := d.vfsd.Parent(); parent != nil { + parent.DecRef() // IncRef from Dentry.InsertChild. + } +} + +// InsertChild inserts child into the vfs dentry cache with the given name under +// this dentry. This does not update the directory inode, so calling this on +// it's own isn't sufficient to insert a child into a directory. InsertChild +// updates the link count on d if required. +// +// Precondition: d must represent a directory inode. +func (d *Dentry) InsertChild(name string, child *vfs.Dentry) { + if !d.isDir() { + panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d)) + } + vfsDentry := d.VFSDentry() + vfsDentry.IncRef() // DecRef in child's Dentry.destroy. + d.dirMu.Lock() + vfsDentry.InsertChild(child, name) + d.dirMu.Unlock() +} + +// The Inode interface maps filesystem-level operations that operate on paths to +// equivalent operations on specific filesystem nodes. +// +// The interface methods are groups into logical categories as sub interfaces +// below. Generally, an implementation for each sub interface can be provided by +// embedding an appropriate type from inode_impl_utils.go. The sub interfaces +// are purely organizational. Methods declared directly in the main interface +// have no generic implementations, and should be explicitly provided by the +// client filesystem. +// +// Generally, implementations are not responsible for tasks that are common to +// all filesystems. These include: +// +// - Checking that dentries passed to methods are of the appropriate file type. +// - Checking permissions. +// - Updating link and reference counts. +// +// Specific responsibilities of implementations are documented below. +type Inode interface { + // Methods related to reference counting. A generic implementation is + // provided by InodeNoopRefCount. These methods are generally called by the + // equivalent Dentry methods. + inodeRefs + + // Methods related to node metadata. A generic implementation is provided by + // InodeAttrs. + inodeMetadata + + // Method for inodes that represent symlink. InodeNotSymlink provides a + // blanket implementation for all non-symlink inodes. + inodeSymlink + + // Method for inodes that represent directories. InodeNotDirectory provides + // a blanket implementation for all non-directory inodes. + inodeDirectory + + // Method for inodes that represent dynamic directories and their + // children. InodeNoDynamicLookup provides a blanket implementation for all + // non-dynamic-directory inodes. + inodeDynamicLookup + + // Open creates a file description for the filesystem object represented by + // this inode. The returned file description should hold a reference on the + // inode for its lifetime. + // + // Precondition: !rp.Done(). vfsd.Impl() must be a kernfs Dentry. + Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) +} + +type inodeRefs interface { + IncRef() + DecRef() + TryIncRef() bool + // Destroy is called when the inode reaches zero references. Destroy release + // all resources (references) on objects referenced by the inode, including + // any child dentries. + Destroy() +} + +type inodeMetadata interface { + // CheckPermissions checks that creds may access this inode for the + // requested access type, per the the rules of + // fs/namei.c:generic_permission(). + CheckPermissions(creds *auth.Credentials, atx vfs.AccessTypes) error + + // Mode returns the (struct stat)::st_mode value for this inode. This is + // separated from Stat for performance. + Mode() linux.FileMode + + // Stat returns the metadata for this inode. This corresponds to + // vfs.FilesystemImpl.StatAt. + Stat(fs *vfs.Filesystem) linux.Statx + + // SetStat updates the metadata for this inode. This corresponds to + // vfs.FilesystemImpl.SetStatAt. + SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error +} + +// Precondition: All methods in this interface may only be called on directory +// inodes. +type inodeDirectory interface { + // The New{File,Dir,Node,Symlink} methods below should return a new inode + // hashed into this inode. + // + // These inode constructors are inode-level operations rather than + // filesystem-level operations to allow client filesystems to mix different + // implementations based on the new node's location in the + // filesystem. + + // HasChildren returns true if the directory inode has any children. + HasChildren() bool + + // NewFile creates a new regular file inode. + NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) + + // NewDir creates a new directory inode. + NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) + + // NewLink creates a new hardlink to a specified inode in this + // directory. Implementations should create a new kernfs Dentry pointing to + // target, and update target's link count. + NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error) + + // NewSymlink creates a new symbolic link inode. + NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) + + // NewNode creates a new filesystem node for a mknod syscall. + NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) + + // Unlink removes a child dentry from this directory inode. + Unlink(ctx context.Context, name string, child *vfs.Dentry) error + + // RmDir removes an empty child directory from this directory + // inode. Implementations must update the parent directory's link count, + // if required. Implementations are not responsible for checking that child + // is a directory, checking for an empty directory. + RmDir(ctx context.Context, name string, child *vfs.Dentry) error + + // Rename is called on the source directory containing an inode being + // renamed. child should point to the resolved child in the source + // directory. If Rename replaces a dentry in the destination directory, it + // should return the replaced dentry or nil otherwise. + // + // Precondition: Caller must serialize concurrent calls to Rename. + Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error) +} + +type inodeDynamicLookup interface { + // Lookup should return an appropriate dentry if name should resolve to a + // child of this dynamic directory inode. This gives the directory an + // opportunity on every lookup to resolve additional entries that aren't + // hashed into the directory. This is only called when the inode is a + // directory. If the inode is not a directory, or if the directory only + // contains a static set of children, the implementer can unconditionally + // return an appropriate error (ENOTDIR and ENOENT respectively). + // + // The child returned by Lookup will be hashed into the VFS dentry tree. Its + // lifetime can be controlled by the filesystem implementation with an + // appropriate implementation of Valid. + // + // Lookup returns the child with an extra reference and the caller owns this + // reference. + Lookup(ctx context.Context, name string) (*vfs.Dentry, error) + + // Valid should return true if this inode is still valid, or needs to + // be resolved again by a call to Lookup. + Valid(ctx context.Context) bool +} + +type inodeSymlink interface { + // Readlink resolves the target of a symbolic link. If an inode is not a + // symlink, the implementation should return EINVAL. + Readlink(ctx context.Context) (string, error) +} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go new file mode 100644 index 000000000..f78bb7b04 --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -0,0 +1,423 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs_test + +import ( + "bytes" + "fmt" + "io" + "runtime" + "sync" + "testing" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +const defaultMode linux.FileMode = 01777 +const staticFileContent = "This is sample content for a static test file." + +// RootDentryFn is a generator function for creating the root dentry of a test +// filesystem. See newTestSystem. +type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry + +// TestSystem represents the context for a single test. +type TestSystem struct { + t *testing.T + ctx context.Context + creds *auth.Credentials + vfs *vfs.VirtualFilesystem + mns *vfs.MountNamespace + root vfs.VirtualDentry +} + +// newTestSystem sets up a minimal environment for running a test, including an +// instance of a test filesystem. Tests can control the contents of the +// filesystem by providing an appropriate rootFn, which should return a +// pre-populated root dentry. +func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem { + ctx := contexttest.Context(t) + creds := auth.CredentialsFromContext(ctx) + v := vfs.New() + v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}) + mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{}) + if err != nil { + t.Fatalf("Failed to create testfs root mount: %v", err) + } + + s := &TestSystem{ + t: t, + ctx: ctx, + creds: creds, + vfs: v, + mns: mns, + root: mns.Root(), + } + runtime.SetFinalizer(s, func(s *TestSystem) { s.root.DecRef() }) + return s +} + +// PathOpAtRoot constructs a vfs.PathOperation for a path from the +// root of the test filesystem. +// +// Precondition: path should be relative path. +func (s *TestSystem) PathOpAtRoot(path string) vfs.PathOperation { + return vfs.PathOperation{ + Root: s.root, + Start: s.root, + Pathname: path, + } +} + +// GetDentryOrDie attempts to resolve a dentry referred to by the +// provided path operation. If unsuccessful, the test fails. +func (s *TestSystem) GetDentryOrDie(pop vfs.PathOperation) vfs.VirtualDentry { + vd, err := s.vfs.GetDentryAt(s.ctx, s.creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + s.t.Fatalf("GetDentryAt(pop:%+v) failed: %v", pop, err) + } + return vd +} + +func (s *TestSystem) ReadToEnd(fd *vfs.FileDescription) (string, error) { + buf := make([]byte, usermem.PageSize) + bufIOSeq := usermem.BytesIOSequence(buf) + opts := vfs.ReadOptions{} + + var content bytes.Buffer + for { + n, err := fd.Impl().Read(s.ctx, bufIOSeq, opts) + if n == 0 || err != nil { + if err == io.EOF { + err = nil + } + return content.String(), err + } + content.Write(buf[:n]) + } +} + +type fsType struct { + rootFn RootDentryFn +} + +type filesystem struct { + kernfs.Filesystem +} + +type file struct { + kernfs.DynamicBytesFile + content string +} + +func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry { + f := &file{} + f.content = content + f.DynamicBytesFile.Init(creds, fs.NextIno(), f) + + d := &kernfs.Dentry{} + d.Init(f) + return d +} + +func (f *file) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%s", f.content) + return nil +} + +type attrs struct { + kernfs.InodeAttrs +} + +func (a *attrs) SetStat(fs *vfs.Filesystem, opt vfs.SetStatOptions) error { + return syserror.EPERM +} + +type readonlyDir struct { + attrs + kernfs.InodeNotSymlink + kernfs.InodeNoDynamicLookup + kernfs.InodeDirectoryNoNewChildren + + kernfs.OrderedChildren + dentry kernfs.Dentry +} + +func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { + dir := &readonlyDir{} + dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode) + dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + dir.dentry.Init(dir) + + dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents)) + + return &dir.dentry +} + +func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags) + return fd.VFSFileDescription(), nil +} + +type dir struct { + attrs + kernfs.InodeNotSymlink + kernfs.InodeNoDynamicLookup + + fs *filesystem + dentry kernfs.Dentry + kernfs.OrderedChildren +} + +func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { + dir := &dir{} + dir.fs = fs + dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode) + dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) + dir.dentry.Init(dir) + + dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents)) + + return &dir.dentry +} + +func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags) + return fd.VFSFileDescription(), nil +} + +func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) { + creds := auth.CredentialsFromContext(ctx) + dir := d.fs.newDir(creds, opts.Mode, nil) + dirVFSD := dir.VFSDentry() + if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil { + dir.DecRef() + return nil, err + } + d.IncLinks(1) + return dirVFSD, nil +} + +func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) { + creds := auth.CredentialsFromContext(ctx) + f := d.fs.newFile(creds, "") + fVFSD := f.VFSDentry() + if err := d.OrderedChildren.Insert(name, fVFSD); err != nil { + f.DecRef() + return nil, err + } + return fVFSD, nil +} + +func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +func (fst *fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + fs := &filesystem{} + fs.Init(vfsObj) + root := fst.rootFn(creds, fs) + return fs.VFSFilesystem(), root.VFSDentry(), nil +} + +// -------------------- Remainder of the file are test cases -------------------- + +func TestBasic(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + "file1": fs.newFile(creds, staticFileContent), + }) + }) + sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef() +} + +func TestMkdirGetDentry(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + "dir1": fs.newDir(creds, 0755, nil), + }) + }) + + pop := sys.PathOpAtRoot("dir1/a new directory") + if err := sys.vfs.MkdirAt(sys.ctx, sys.creds, &pop, &vfs.MkdirOptions{Mode: 0755}); err != nil { + t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err) + } + sys.GetDentryOrDie(pop).DecRef() +} + +func TestReadStaticFile(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + "file1": fs.newFile(creds, staticFileContent), + }) + }) + + pop := sys.PathOpAtRoot("file1") + fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{}) + if err != nil { + sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) + } + defer fd.DecRef() + + content, err := sys.ReadToEnd(fd) + if err != nil { + sys.t.Fatalf("Read failed: %v", err) + } + if diff := cmp.Diff(staticFileContent, content); diff != "" { + sys.t.Fatalf("Read returned unexpected data:\n--- want\n+++ got\n%v", diff) + } +} + +func TestCreateNewFileInStaticDir(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + "dir1": fs.newDir(creds, 0755, nil), + }) + }) + + pop := sys.PathOpAtRoot("dir1/newfile") + opts := &vfs.OpenOptions{Flags: linux.O_CREAT | linux.O_EXCL, Mode: defaultMode} + fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, opts) + if err != nil { + sys.t.Fatalf("OpenAt(pop:%+v, opts:%+v) failed: %v", pop, opts, err) + } + + // Close the file. The file should persist. + fd.DecRef() + + fd, err = sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{}) + if err != nil { + sys.t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) + } + fd.DecRef() +} + +// direntCollector provides an implementation for vfs.IterDirentsCallback for +// testing. It simply iterates to the end of a given directory FD and collects +// all dirents emitted by the callback. +type direntCollector struct { + mu sync.Mutex + dirents map[string]vfs.Dirent +} + +// Handle implements vfs.IterDirentsCallback.Handle. +func (d *direntCollector) Handle(dirent vfs.Dirent) bool { + d.mu.Lock() + if d.dirents == nil { + d.dirents = make(map[string]vfs.Dirent) + } + d.dirents[dirent.Name] = dirent + d.mu.Unlock() + return true +} + +// count returns the number of dirents currently in the collector. +func (d *direntCollector) count() int { + d.mu.Lock() + defer d.mu.Unlock() + return len(d.dirents) +} + +// contains checks whether the collector has a dirent with the given name and +// type. +func (d *direntCollector) contains(name string, typ uint8) error { + d.mu.Lock() + defer d.mu.Unlock() + dirent, ok := d.dirents[name] + if !ok { + return fmt.Errorf("No dirent named %q found", name) + } + if dirent.Type != typ { + return fmt.Errorf("Dirent named %q found, but was expecting type %d, got: %+v", name, typ, dirent) + } + return nil +} + +func TestDirFDReadWrite(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, nil) + }) + + pop := sys.PathOpAtRoot("/") + fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{}) + if err != nil { + sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) + } + defer fd.DecRef() + + // Read/Write should fail for directory FDs. + if _, err := fd.Read(sys.ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR { + sys.t.Fatalf("Read for directory FD failed with unexpected error: %v", err) + } + if _, err := fd.Write(sys.ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EISDIR { + sys.t.Fatalf("Wrire for directory FD failed with unexpected error: %v", err) + } +} + +func TestDirFDIterDirents(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + // Fill root with nodes backed by various inode implementations. + "dir1": fs.newReadonlyDir(creds, 0755, nil), + "dir2": fs.newDir(creds, 0755, map[string]*kernfs.Dentry{ + "dir3": fs.newDir(creds, 0755, nil), + }), + "file1": fs.newFile(creds, staticFileContent), + }) + }) + + pop := sys.PathOpAtRoot("/") + fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{}) + if err != nil { + sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) + } + defer fd.DecRef() + + collector := &direntCollector{} + if err := fd.IterDirents(sys.ctx, collector); err != nil { + sys.t.Fatalf("IterDirent failed: %v", err) + } + + // Root directory should contain ".", ".." and 3 children: + if collector.count() != 5 { + sys.t.Fatalf("IterDirent returned too many dirents") + } + for _, dirName := range []string{".", "..", "dir1", "dir2"} { + if err := collector.contains(dirName, linux.DT_DIR); err != nil { + sys.t.Fatalf("IterDirent had unexpected results: %v", err) + } + } + if err := collector.contains("file1", linux.DT_REG); err != nil { + sys.t.Fatalf("IterDirent had unexpected results: %v", err) + } + +} diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD index d2450e810..0cc751eb8 100644 --- a/pkg/sentry/fsimpl/memfs/BUILD +++ b/pkg/sentry/fsimpl/memfs/BUILD @@ -1,4 +1,5 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") package(licenses = ["notice"]) @@ -23,14 +24,19 @@ go_library( "directory.go", "filesystem.go", "memfs.go", + "named_pipe.go", "regular_file.go", "symlink.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs", deps = [ "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/fspath", + "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", @@ -44,6 +50,7 @@ go_test( deps = [ ":memfs", "//pkg/abi/linux", + "//pkg/refs", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", @@ -53,3 +60,19 @@ go_test( "//pkg/syserror", ], ) + +go_test( + name = "memfs_test", + size = "small", + srcs = ["pipe_test.go"], + embed = [":memfs"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/usermem", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go index a94b17db6..4a7a94a52 100644 --- a/pkg/sentry/fsimpl/memfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go @@ -21,6 +21,7 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -160,6 +161,8 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { b.Fatalf("stat(%q) failed: %v", filePath, err) } } + // Don't include deferred cleanup in benchmark time. + b.StopTimer() }) } } @@ -173,10 +176,11 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { // Create VFS. vfsObj := vfs.New() vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } + defer mntns.DecRef(vfsObj) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') @@ -186,7 +190,6 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { defer root.DecRef() vd := root vd.IncRef() - defer vd.DecRef() for i := depth; i > 0; i-- { name := fmt.Sprintf("%d", i) pop := vfs.PathOperation{ @@ -219,6 +222,8 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, Mode: 0644, }) + vd.DecRef() + vd = vfs.VirtualDentry{} if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } @@ -243,6 +248,8 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { b.Fatalf("got wrong permissions (%0o)", stat.Mode) } } + // Don't include deferred cleanup in benchmark time. + b.StopTimer() }) } } @@ -343,6 +350,8 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { b.Fatalf("stat(%q) failed: %v", filePath, err) } } + // Don't include deferred cleanup in benchmark time. + b.StopTimer() }) } } @@ -356,10 +365,11 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { // Create VFS. vfsObj := vfs.New() vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } + defer mntns.DecRef(vfsObj) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') @@ -384,7 +394,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { } defer mountPoint.DecRef() // Create and mount the submount. - if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.NewFilesystemOptions{}); err != nil { + if err := vfsObj.MountAt(ctx, creds, "", &pop, "memfs", &vfs.MountOptions{}); err != nil { b.Fatalf("failed to mount tmpfs submount: %v", err) } filePathBuilder.WriteString(mountPointName) @@ -395,7 +405,6 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount root: %v", err) } - defer vd.DecRef() for i := depth; i > 0; i-- { name := fmt.Sprintf("%d", i) pop := vfs.PathOperation{ @@ -435,6 +444,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, Mode: 0644, }) + vd.DecRef() if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } @@ -459,6 +469,14 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { b.Fatalf("got wrong permissions (%0o)", stat.Mode) } } + // Don't include deferred cleanup in benchmark time. + b.StopTimer() }) } } + +func init() { + // Turn off reference leak checking for a fair comparison between vfs1 and + // vfs2. + refs.SetLeakMode(refs.NoLeakChecking) +} diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/memfs/directory.go index c52dc781c..0bd82e480 100644 --- a/pkg/sentry/fsimpl/memfs/directory.go +++ b/pkg/sentry/fsimpl/memfs/directory.go @@ -32,7 +32,7 @@ type directory struct { childList dentryList } -func (fs *filesystem) newDirectory(creds *auth.Credentials, mode uint16) *inode { +func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode { dir := &directory{} dir.inode.init(dir, fs, creds, mode) dir.inode.nlink = 2 // from "." and parent directory or ".." for root @@ -75,10 +75,10 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba if fd.off == 0 { if !cb.Handle(vfs.Dirent{ - Name: ".", - Type: linux.DT_DIR, - Ino: vfsd.Impl().(*dentry).inode.ino, - Off: 0, + Name: ".", + Type: linux.DT_DIR, + Ino: vfsd.Impl().(*dentry).inode.ino, + NextOff: 1, }) { return nil } @@ -87,10 +87,10 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba if fd.off == 1 { parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode if !cb.Handle(vfs.Dirent{ - Name: "..", - Type: parentInode.direntType(), - Ino: parentInode.ino, - Off: 1, + Name: "..", + Type: parentInode.direntType(), + Ino: parentInode.ino, + NextOff: 2, }) { return nil } @@ -112,10 +112,10 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba // Skip other directoryFD iterators. if child.inode != nil { if !cb.Handle(vfs.Dirent{ - Name: child.vfsd.Name(), - Type: child.inode.direntType(), - Ino: child.inode.ino, - Off: fd.off, + Name: child.vfsd.Name(), + Type: child.inode.direntType(), + Ino: child.inode.ino, + NextOff: fd.off + 1, }) { dir.childList.InsertBefore(child, fd.iter) return nil diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go index f79e2d9c8..22f1e811f 100644 --- a/pkg/sentry/fsimpl/memfs/filesystem.go +++ b/pkg/sentry/fsimpl/memfs/filesystem.go @@ -19,6 +19,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" @@ -159,7 +160,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op return nil, err } } - inode.incRef() // vfsd.IncRef(&fs.vfsfs) + inode.incRef() return vfsd, nil } @@ -233,7 +234,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v if err != nil { return err } - _, err = checkCreateLocked(rp, parentVFSD, parentInode) + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) if err != nil { return err } @@ -241,8 +242,40 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } defer rp.Mount().EndWrite() - // TODO: actually implement mknod - return syserror.EPERM + + switch opts.Mode.FileType() { + case 0: + // "Zero file type is equivalent to type S_IFREG." - mknod(2) + fallthrough + case linux.ModeRegular: + // TODO(b/138862511): Implement. + return syserror.EINVAL + + case linux.ModeNamedPipe: + child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode)) + parentVFSD.InsertChild(&child.vfsd, pc) + parentInode.impl.(*directory).childList.PushBack(child) + return nil + + case linux.ModeSocket: + // TODO(b/138862511): Implement. + return syserror.EINVAL + + case linux.ModeCharacterDevice: + fallthrough + case linux.ModeBlockDevice: + // TODO(b/72101894): We don't support creating block or character + // devices at the moment. + // + // When we start supporting block and character devices, we'll + // need to check for CAP_MKNOD here. + return syserror.EPERM + + default: + // "EINVAL - mode requested creation of something other than a + // regular file, device special file, FIFO or socket." - mknod(2) + return syserror.EINVAL + } } // OpenAt implements vfs.FilesystemImpl.OpenAt. @@ -250,8 +283,9 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // Filter out flags that are not supported by memfs. O_DIRECTORY and // O_NOFOLLOW have no effect here (they're handled by VFS by setting // appropriate bits in rp), but are returned by - // FileDescriptionImpl.StatusFlags(). - opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW + // FileDescriptionImpl.StatusFlags(). O_NONBLOCK is supported only by + // pipes. + opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK if opts.Flags&linux.O_CREAT == 0 { fs.mu.RLock() @@ -260,7 +294,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err != nil { return nil, err } - return inode.open(rp, vfsd, opts.Flags, false) + return inode.open(ctx, rp, vfsd, opts.Flags, false) } mustCreate := opts.Flags&linux.O_EXCL != 0 @@ -275,7 +309,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if mustCreate { return nil, syserror.EEXIST } - return inode.open(rp, vfsd, opts.Flags, false) + return inode.open(ctx, rp, vfsd, opts.Flags, false) } afterTrailingSymlink: // Walk to the parent directory of the last path component. @@ -320,7 +354,7 @@ afterTrailingSymlink: child := fs.newDentry(childInode) vfsd.InsertChild(&child.vfsd, pc) inode.impl.(*directory).childList.PushBack(child) - return childInode.open(rp, &child.vfsd, opts.Flags, true) + return childInode.open(ctx, rp, &child.vfsd, opts.Flags, true) } // Open existing file or follow symlink. if mustCreate { @@ -336,16 +370,17 @@ afterTrailingSymlink: // symlink target. goto afterTrailingSymlink } - return childInode.open(rp, childVFSD, opts.Flags, false) + return childInode.open(ctx, rp, childVFSD, opts.Flags, false) } -func (i *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) { +func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(flags) if !afterCreate { if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil { return nil, err } } + mnt := rp.Mount() switch impl := i.impl.(type) { case *regularFile: var fd regularFileFD @@ -353,12 +388,14 @@ func (i *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afte fd.readable = vfs.MayReadFileWithOpenFlags(flags) fd.writable = vfs.MayWriteFileWithOpenFlags(flags) if fd.writable { - if err := rp.Mount().CheckBeginWrite(); err != nil { + if err := mnt.CheckBeginWrite(); err != nil { return nil, err } - // Mount.EndWrite() is called by regularFileFD.Release(). + // mnt.EndWrite() is called by regularFileFD.Release(). } - fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + mnt.IncRef() + vfsd.IncRef() + fd.vfsfd.Init(&fd, mnt, vfsd) if flags&linux.O_TRUNC != 0 { impl.mu.Lock() impl.data = impl.data[:0] @@ -372,12 +409,16 @@ func (i *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afte return nil, syserror.EISDIR } var fd directoryFD - fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + mnt.IncRef() + vfsd.IncRef() + fd.vfsfd.Init(&fd, mnt, vfsd) fd.flags = flags return &fd.vfsfd, nil case *symlink: // Can't open symlinks without O_PATH (which is unimplemented). return nil, syserror.ELOOP + case *namedPipe: + return newNamedPipeFD(ctx, impl, rp, vfsd, flags) default: panic(fmt.Sprintf("unknown inode type: %T", i.impl)) } @@ -542,3 +583,58 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error inode.decLinksLocked() return nil } + +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, _, err := walkExistingLocked(rp) + if err != nil { + return nil, err + } + // TODO(b/127675828): support extended attributes + return nil, syserror.ENOTSUP +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, _, err := walkExistingLocked(rp) + if err != nil { + return "", err + } + // TODO(b/127675828): support extended attributes + return "", syserror.ENOTSUP +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, _, err := walkExistingLocked(rp) + if err != nil { + return err + } + // TODO(b/127675828): support extended attributes + return syserror.ENOTSUP +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. +func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, _, err := walkExistingLocked(rp) + if err != nil { + return err + } + // TODO(b/127675828): support extended attributes + return syserror.ENOTSUP +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + return vfs.GenericPrependPath(vfsroot, vd, b) +} diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go index 45cd42b3e..4cb2a4e0f 100644 --- a/pkg/sentry/fsimpl/memfs/memfs.go +++ b/pkg/sentry/fsimpl/memfs/memfs.go @@ -52,10 +52,10 @@ type filesystem struct { nextInoMinusOne uint64 // accessed using atomic memory operations } -// NewFilesystem implements vfs.FilesystemType.NewFilesystem. -func (fstype FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { var fs filesystem - fs.vfsfs.Init(&fs) + fs.vfsfs.Init(vfsObj, &fs) root := fs.newDentry(fs.newDirectory(creds, 01777)) return &fs.vfsfs, &root.vfsd, nil } @@ -99,17 +99,17 @@ func (fs *filesystem) newDentry(inode *inode) *dentry { } // IncRef implements vfs.DentryImpl.IncRef. -func (d *dentry) IncRef(vfsfs *vfs.Filesystem) { +func (d *dentry) IncRef() { d.inode.incRef() } // TryIncRef implements vfs.DentryImpl.TryIncRef. -func (d *dentry) TryIncRef(vfsfs *vfs.Filesystem) bool { +func (d *dentry) TryIncRef() bool { return d.inode.tryIncRef() } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef(vfsfs *vfs.Filesystem) { +func (d *dentry) DecRef() { d.inode.decRef() } @@ -137,7 +137,7 @@ type inode struct { impl interface{} // immutable } -func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode uint16) { +func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) { i.refs = 1 i.mode = uint32(mode) i.uid = uint32(creds.EffectiveKUID) @@ -227,6 +227,8 @@ func (i *inode) statTo(stat *linux.Statx) { stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS stat.Size = uint64(len(impl.target)) stat.Blocks = allocatedBlocksForSize(stat.Size) + case *namedPipe: + stat.Mode |= linux.S_IFIFO default: panic(fmt.Sprintf("unknown inode type: %T", i.impl)) } @@ -264,11 +266,11 @@ type fileDescription struct { } func (fd *fileDescription) filesystem() *filesystem { - return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem) + return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) } func (fd *fileDescription) inode() *inode { - return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode + return fd.vfsfd.Dentry().Impl().(*dentry).inode } // StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/memfs/named_pipe.go new file mode 100644 index 000000000..91cb4b1fc --- /dev/null +++ b/pkg/sentry/fsimpl/memfs/named_pipe.go @@ -0,0 +1,62 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +type namedPipe struct { + inode inode + + pipe *pipe.VFSPipe +} + +// Preconditions: +// * fs.mu must be locked. +// * rp.Mount().CheckBeginWrite() has been called successfully. +func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode { + file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)} + file.inode.init(file, fs, creds, mode) + file.inode.nlink = 1 // Only the parent has a link. + return &file.inode +} + +// namedPipeFD implements vfs.FileDescriptionImpl. Methods are implemented +// entirely via struct embedding. +type namedPipeFD struct { + fileDescription + + *pipe.VFSPipeFD +} + +func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + var err error + var fd namedPipeFD + fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, rp, vfsd, &fd.vfsfd, flags) + if err != nil { + return nil, err + } + mnt := rp.Mount() + mnt.IncRef() + vfsd.IncRef() + fd.vfsfd.Init(&fd, mnt, vfsd) + return &fd.vfsfd, nil +} diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go new file mode 100644 index 000000000..5bf527c80 --- /dev/null +++ b/pkg/sentry/fsimpl/memfs/pipe_test.go @@ -0,0 +1,233 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memfs + +import ( + "bytes" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +const fileName = "mypipe" + +func TestSeparateFDs(t *testing.T) { + ctx, creds, vfsObj, root := setup(t) + defer root.DecRef() + + // Open the read side. This is done in a concurrently because opening + // One end the pipe blocks until the other end is opened. + pop := vfs.PathOperation{ + Root: root, + Start: root, + Pathname: fileName, + FollowFinalSymlink: true, + } + rfdchan := make(chan *vfs.FileDescription) + go func() { + openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY} + rfd, _ := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + rfdchan <- rfd + }() + + // Open the write side. + openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY} + wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + if err != nil { + t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) + } + defer wfd.DecRef() + + rfd, ok := <-rfdchan + if !ok { + t.Fatalf("failed to open pipe for reading %q", fileName) + } + defer rfd.DecRef() + + const msg = "vamos azul" + checkEmpty(ctx, t, rfd) + checkWrite(ctx, t, wfd, msg) + checkRead(ctx, t, rfd, msg) +} + +func TestNonblockingRead(t *testing.T) { + ctx, creds, vfsObj, root := setup(t) + defer root.DecRef() + + // Open the read side as nonblocking. + pop := vfs.PathOperation{ + Root: root, + Start: root, + Pathname: fileName, + FollowFinalSymlink: true, + } + openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_NONBLOCK} + rfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + if err != nil { + t.Fatalf("failed to open pipe for reading %q: %v", fileName, err) + } + defer rfd.DecRef() + + // Open the write side. + openOpts = vfs.OpenOptions{Flags: linux.O_WRONLY} + wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + if err != nil { + t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) + } + defer wfd.DecRef() + + const msg = "geh blau" + checkEmpty(ctx, t, rfd) + checkWrite(ctx, t, wfd, msg) + checkRead(ctx, t, rfd, msg) +} + +func TestNonblockingWriteError(t *testing.T) { + ctx, creds, vfsObj, root := setup(t) + defer root.DecRef() + + // Open the write side as nonblocking, which should return ENXIO. + pop := vfs.PathOperation{ + Root: root, + Start: root, + Pathname: fileName, + FollowFinalSymlink: true, + } + openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK} + _, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + if err != syserror.ENXIO { + t.Fatalf("expected ENXIO, but got error: %v", err) + } +} + +func TestSingleFD(t *testing.T) { + ctx, creds, vfsObj, root := setup(t) + defer root.DecRef() + + // Open the pipe as readable and writable. + pop := vfs.PathOperation{ + Root: root, + Start: root, + Pathname: fileName, + FollowFinalSymlink: true, + } + openOpts := vfs.OpenOptions{Flags: linux.O_RDWR} + fd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + if err != nil { + t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) + } + defer fd.DecRef() + + const msg = "forza blu" + checkEmpty(ctx, t, fd) + checkWrite(ctx, t, fd, msg) + checkRead(ctx, t, fd, msg) +} + +// setup creates a VFS with a pipe in the root directory at path fileName. The +// returned VirtualDentry must be DecRef()'d be the caller. It calls t.Fatal +// upon failure. +func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesystem, vfs.VirtualDentry) { + ctx := contexttest.Context(t) + creds := auth.CredentialsFromContext(ctx) + + // Create VFS. + vfsObj := vfs.New() + vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{}) + if err != nil { + t.Fatalf("failed to create tmpfs root mount: %v", err) + } + + // Create the pipe. + root := mntns.Root() + pop := vfs.PathOperation{ + Root: root, + Start: root, + Pathname: fileName, + FollowFinalSymlink: true, + } + mknodOpts := vfs.MknodOptions{Mode: linux.ModeNamedPipe | 0644} + if err := vfsObj.MknodAt(ctx, creds, &pop, &mknodOpts); err != nil { + t.Fatalf("failed to create file %q: %v", fileName, err) + } + + // Sanity check: the file pipe exists and has the correct mode. + stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Pathname: fileName, + FollowFinalSymlink: true, + }, &vfs.StatOptions{}) + if err != nil { + t.Fatalf("stat(%q) failed: %v", fileName, err) + } + if stat.Mode&^linux.S_IFMT != 0644 { + t.Errorf("got wrong permissions (%0o)", stat.Mode) + } + if stat.Mode&linux.S_IFMT != linux.ModeNamedPipe { + t.Errorf("got wrong file type (%0o)", stat.Mode) + } + + return ctx, creds, vfsObj, root +} + +// checkEmpty calls t.Fatal if the pipe in fd is not empty. +func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) { + readData := make([]byte, 1) + dst := usermem.BytesIOSequence(readData) + bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{}) + if err != syserror.ErrWouldBlock { + t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err) + } + if bytesRead != 0 { + t.Fatalf("expected to read 0 bytes, but got %d", bytesRead) + } +} + +// checkWrite calls t.Fatal if it fails to write all of msg to fd. +func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) { + writeData := []byte(msg) + src := usermem.BytesIOSequence(writeData) + bytesWritten, err := fd.Write(ctx, src, vfs.WriteOptions{}) + if err != nil { + t.Fatalf("error writing to pipe %q: %v", fileName, err) + } + if bytesWritten != int64(len(writeData)) { + t.Fatalf("expected to write %d bytes, but wrote %d", len(writeData), bytesWritten) + } +} + +// checkRead calls t.Fatal if it fails to read msg from fd. +func checkRead(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) { + readData := make([]byte, len(msg)) + dst := usermem.BytesIOSequence(readData) + bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{}) + if err != nil { + t.Fatalf("error reading from pipe %q: %v", fileName, err) + } + if bytesRead != int64(len(msg)) { + t.Fatalf("expected to read %d bytes, but got %d", len(msg), bytesRead) + } + if !bytes.Equal(readData, []byte(msg)) { + t.Fatalf("expected to read %q from pipe, but got %q", msg, string(readData)) + } +} diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go index 55f869798..b7f4853b3 100644 --- a/pkg/sentry/fsimpl/memfs/regular_file.go +++ b/pkg/sentry/fsimpl/memfs/regular_file.go @@ -37,7 +37,7 @@ type regularFile struct { dataLen int64 } -func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *inode { +func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode { file := ®ularFile{} file.inode.init(file, fs, creds, mode) file.inode.nlink = 1 // from parent directory diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 3d8a4deaf..ade6ac946 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -1,4 +1,5 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") package(licenses = ["notice"]) diff --git a/pkg/sentry/fsimpl/proc/filesystems.go b/pkg/sentry/fsimpl/proc/filesystems.go index c36c4aff5..0e016bca5 100644 --- a/pkg/sentry/fsimpl/proc/filesystems.go +++ b/pkg/sentry/fsimpl/proc/filesystems.go @@ -19,7 +19,7 @@ package proc // +stateify savable type filesystemsData struct{} -// TODO(b/138862512): Implement vfs.DynamicBytesSource.Generate for +// TODO(gvisor.dev/issue/1195): Implement vfs.DynamicBytesSource.Generate for // filesystemsData. We would need to retrive filesystem names from // vfs.VirtualFilesystem. Also needs vfs replacement for // fs.Filesystem.AllowUserList() and fs.FilesystemRequiresDev. diff --git a/pkg/sentry/fsimpl/proc/mounts.go b/pkg/sentry/fsimpl/proc/mounts.go index e81b1e910..8683cf677 100644 --- a/pkg/sentry/fsimpl/proc/mounts.go +++ b/pkg/sentry/fsimpl/proc/mounts.go @@ -16,7 +16,7 @@ package proc import "gvisor.dev/gvisor/pkg/sentry/kernel" -// TODO(b/138862512): Implement mountInfoFile and mountsFile. +// TODO(gvisor.dev/issue/1195): Implement mountInfoFile and mountsFile. // mountInfoFile implements vfs.DynamicBytesSource for /proc/[pid]/mountinfo. // diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index c46e05c3a..0d87be52b 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -229,6 +229,10 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps) fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode()) + // We unconditionally report a single NUMA node. See + // pkg/sentry/syscalls/linux/sys_mempolicy.go. + fmt.Fprintf(buf, "Mems_allowed:\t1\n") + fmt.Fprintf(buf, "Mems_allowed_list:\t0\n") return nil } diff --git a/pkg/sentry/hostcpu/BUILD b/pkg/sentry/hostcpu/BUILD index f989f2f8b..359468ccc 100644 --- a/pkg/sentry/hostcpu/BUILD +++ b/pkg/sentry/hostcpu/BUILD @@ -1,4 +1,5 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") package(licenses = ["notice"]) @@ -6,6 +7,7 @@ go_library( name = "hostcpu", srcs = [ "getcpu_amd64.s", + "getcpu_arm64.s", "hostcpu.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/hostcpu", diff --git a/pkg/sentry/hostcpu/getcpu_arm64.s b/pkg/sentry/hostcpu/getcpu_arm64.s new file mode 100644 index 000000000..caf9abb89 --- /dev/null +++ b/pkg/sentry/hostcpu/getcpu_arm64.s @@ -0,0 +1,28 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// GetCPU makes the getcpu(unsigned *cpu, unsigned *node, NULL) syscall for +// the lack of an optimazed way of getting the current CPU number on arm64. + +// func GetCPU() (cpu uint32) +TEXT ·GetCPU(SB), NOSPLIT, $0-4 + MOVW ZR, cpu+0(FP) + MOVD $cpu+0(FP), R0 + MOVD $0x0, R1 // unused + MOVD $0x0, R2 // unused + MOVD $0xA8, R8 // SYS_GETCPU + SVC + RET diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD index 184b566d9..8d60ad4ad 100644 --- a/pkg/sentry/inet/BUILD +++ b/pkg/sentry/inet/BUILD @@ -1,10 +1,10 @@ +load("//tools/go_stateify:defs.bzl", "go_library") + package( default_visibility = ["//:sandbox"], licenses = ["notice"], ) -load("//tools/go_stateify:defs.bzl", "go_library") - go_library( name = "inet", srcs = [ @@ -13,5 +13,8 @@ go_library( "test_stack.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/inet", - deps = ["//pkg/sentry/context"], + deps = [ + "//pkg/sentry/context", + "//pkg/tcpip/stack", + ], ) diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index 80f227dbe..a7dfb78a7 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -15,6 +15,8 @@ // Package inet defines semantics for IP stacks. package inet +import "gvisor.dev/gvisor/pkg/tcpip/stack" + // Stack represents a TCP/IP stack. type Stack interface { // Interfaces returns all network interfaces as a mapping from interface @@ -58,6 +60,16 @@ type Stack interface { // Resume restarts the network stack after restore. Resume() + + // RegisteredEndpoints returns all endpoints which are currently registered. + RegisteredEndpoints() []stack.TransportEndpoint + + // CleanupEndpoints returns endpoints currently in the cleanup state. + CleanupEndpoints() []stack.TransportEndpoint + + // RestoreCleanupEndpoints adds endpoints to cleanup tracking. This is useful + // for restoring a stack after a save. + RestoreCleanupEndpoints([]stack.TransportEndpoint) } // Interface contains information about a network interface. @@ -153,3 +165,23 @@ type Route struct { // GatewayAddr is the route gateway address (RTA_GATEWAY). GatewayAddr []byte } + +// Below SNMP metrics are from Linux/usr/include/linux/snmp.h. + +// StatSNMPIP describes Ip line of /proc/net/snmp. +type StatSNMPIP [19]uint64 + +// StatSNMPICMP describes Icmp line of /proc/net/snmp. +type StatSNMPICMP [27]uint64 + +// StatSNMPICMPMSG describes IcmpMsg line of /proc/net/snmp. +type StatSNMPICMPMSG [512]uint64 + +// StatSNMPTCP describes Tcp line of /proc/net/snmp. +type StatSNMPTCP [15]uint64 + +// StatSNMPUDP describes Udp line of /proc/net/snmp. +type StatSNMPUDP [8]uint64 + +// StatSNMPUDPLite describes UdpLite line of /proc/net/snmp. +type StatSNMPUDPLite [8]uint64 diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go index b9eed7c3a..dcfcbd97e 100644 --- a/pkg/sentry/inet/test_stack.go +++ b/pkg/sentry/inet/test_stack.go @@ -14,6 +14,8 @@ package inet +import "gvisor.dev/gvisor/pkg/tcpip/stack" + // TestStack is a dummy implementation of Stack for tests. type TestStack struct { InterfacesMap map[int32]Interface @@ -94,5 +96,17 @@ func (s *TestStack) RouteTable() []Route { } // Resume implements Stack.Resume. -func (s *TestStack) Resume() { +func (s *TestStack) Resume() {} + +// RegisteredEndpoints implements inet.Stack.RegisteredEndpoints. +func (s *TestStack) RegisteredEndpoints() []stack.TransportEndpoint { + return nil } + +// CleanupEndpoints implements inet.Stack.CleanupEndpoints. +func (s *TestStack) CleanupEndpoints() []stack.TransportEndpoint { + return nil +} + +// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints. +func (s *TestStack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index e61d39c82..2706927ff 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -1,10 +1,11 @@ load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("@rules_cc//cc:defs.bzl", "cc_proto_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) -load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") - go_template_instance( name = "pending_signals_list", out = "pending_signals_list.go", @@ -34,7 +35,7 @@ go_template_instance( out = "seqatomic_taskgoroutineschedinfo_unsafe.go", package = "kernel", suffix = "TaskGoroutineSchedInfo", - template = "//third_party/gvsync:generic_seqatomic", + template = "//pkg/syncutil:generic_seqatomic", types = { "Value": "TaskGoroutineSchedInfo", }, @@ -83,6 +84,12 @@ proto_library( deps = ["//pkg/sentry/arch:registers_proto"], ) +cc_proto_library( + name = "uncaught_signal_cc_proto", + visibility = ["//visibility:public"], + deps = [":uncaught_signal_proto"], +) + go_proto_library( name = "uncaught_signal_go_proto", importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto", @@ -144,6 +151,7 @@ go_library( "threads.go", "timekeeper.go", "timekeeper_state.go", + "tty.go", "uts_namespace.go", "vdso.go", "version.go", @@ -201,12 +209,12 @@ go_library( "//pkg/sentry/usermem", "//pkg/state", "//pkg/state/statefile", + "//pkg/syncutil", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/stack", "//pkg/waiter", - "//third_party/gvsync", ], ) diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 1d00a6310..04c244447 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -1,14 +1,14 @@ -package(licenses = ["notice"]) - load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_template_instance( name = "atomicptr_credentials", out = "atomicptr_credentials_unsafe.go", package = "auth", suffix = "Credentials", - template = "//third_party/gvsync:generic_atomicptr", + template = "//pkg/syncutil:generic_atomicptr", types = { "Value": "Credentials", }, diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index e3f5b0d83..3c9dceaba 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -15,6 +15,8 @@ package kernel import ( + "time" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" ) @@ -97,6 +99,21 @@ func TaskFromContext(ctx context.Context) *Task { return nil } +// Deadline implements context.Context.Deadline. +func (*Task) Deadline() (time.Time, bool) { + return time.Time{}, false +} + +// Done implements context.Context.Done. +func (*Task) Done() <-chan struct{} { + return nil +} + +// Err implements context.Context.Err. +func (*Task) Err() error { + return nil +} + // AsyncContext returns a context.Context that may be used by goroutines that // do work on behalf of t and therefore share its contextual values, but are // not t's task goroutine (e.g. asynchronous I/O). @@ -129,6 +146,21 @@ func (ctx taskAsyncContext) IsLogging(level log.Level) bool { return ctx.t.IsLogging(level) } +// Deadline implements context.Context.Deadline. +func (ctx taskAsyncContext) Deadline() (time.Time, bool) { + return ctx.t.Deadline() +} + +// Done implements context.Context.Done. +func (ctx taskAsyncContext) Done() <-chan struct{} { + return ctx.t.Done() +} + +// Err implements context.Context.Err. +func (ctx taskAsyncContext) Err() error { + return ctx.t.Err() +} + // Value implements context.Context.Value. func (ctx taskAsyncContext) Value(key interface{}) interface{} { return ctx.t.Value(key) diff --git a/pkg/sentry/kernel/contexttest/BUILD b/pkg/sentry/kernel/contexttest/BUILD index bec13a3d9..3a88a585c 100644 --- a/pkg/sentry/kernel/contexttest/BUILD +++ b/pkg/sentry/kernel/contexttest/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "contexttest", testonly = 1, diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index f46c43128..3361e8b7d 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -1,7 +1,8 @@ -package(licenses = ["notice"]) - +load("@io_bazel_rules_go//go:def.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) go_template_instance( name = "epoll_list", diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index 1c5f979d4..e65b961e8 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -1,6 +1,7 @@ -package(licenses = ["notice"]) +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) go_library( name = "eventfd", diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index 5eddca115..49d81b712 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "fasync", srcs = ["fasync.go"], diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index cc3f43a45..11f613a11 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -81,6 +81,9 @@ type FDTable struct { // mu protects below. mu sync.Mutex `state:"nosave"` + // next is start position to find fd. + next int32 + // used contains the number of non-nil entries. It must be accessed // atomically. It may be read atomically without holding mu (but not // written). @@ -226,6 +229,11 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags f.mu.Lock() defer f.mu.Unlock() + // From f.next to find available fd. + if fd < f.next { + fd = f.next + } + // Install all entries. for i := fd; i < end && len(fds) < len(files); i++ { if d, _, _ := f.get(i); d == nil { @@ -242,6 +250,11 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags return nil, syscall.EMFILE } + if fd == f.next { + // Update next search start position. + f.next = fds[len(fds)-1] + 1 + } + return fds, nil } @@ -361,6 +374,11 @@ func (f *FDTable) Remove(fd int32) *fs.File { f.mu.Lock() defer f.mu.Unlock() + // Update current available position. + if fd < f.next { + f.next = fd + } + orig, _, _ := f.get(fd) if orig != nil { orig.IncRef() // Reference for caller. @@ -377,6 +395,10 @@ func (f *FDTable) RemoveIf(cond func(*fs.File, FDFlags) bool) { f.forEach(func(fd int32, file *fs.File, flags FDFlags) { if cond(file, flags) { f.set(fd, nil, FDFlags{}) // Clear from table. + // Update current available position. + if fd < f.next { + f.next = fd + } } }) } diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go index 2413788e7..2bcb6216a 100644 --- a/pkg/sentry/kernel/fd_table_test.go +++ b/pkg/sentry/kernel/fd_table_test.go @@ -70,6 +70,42 @@ func TestFDTableMany(t *testing.T) { if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil { t.Fatalf("fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) } + + i := int32(2) + fdTable.Remove(i) + if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != i { + t.Fatalf("Allocated %v FDs but wanted to allocate %v: %v", i, maxFD, err) + } + }) +} + +func TestFDTableOverLimit(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + if _, err := fdTable.NewFDs(ctx, maxFD, []*fs.File{file}, FDFlags{}); err == nil { + t.Fatalf("fdTable.NewFDs(maxFD, f): got nil, wanted error") + } + + if _, err := fdTable.NewFDs(ctx, maxFD-2, []*fs.File{file, file, file}, FDFlags{}); err == nil { + t.Fatalf("fdTable.NewFDs(maxFD-2, {f,f,f}): got nil, wanted error") + } + + if fds, err := fdTable.NewFDs(ctx, maxFD-3, []*fs.File{file, file, file}, FDFlags{}); err != nil { + t.Fatalf("fdTable.NewFDs(maxFD-3, {f,f,f}): got %v, wanted nil", err) + } else { + for _, fd := range fds { + fdTable.Remove(fd) + } + } + + if fds, err := fdTable.NewFDs(ctx, maxFD-1, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != maxFD-1 { + t.Fatalf("fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } + + if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Adding an FD to a resized map: got %v, want nil", err) + } else if len(fds) != 1 || fds[0] != 0 { + t.Fatalf("Added an FD to a resized map: got %v, want {1}", fds) + } }) } diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index 6a31dc044..75ec31761 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -1,14 +1,15 @@ -package(licenses = ["notice"]) - +load("@io_bazel_rules_go//go:def.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) go_template_instance( name = "atomicptr_bucket", out = "atomicptr_bucket_unsafe.go", package = "futex", suffix = "Bucket", - template = "//third_party/gvsync:generic_atomicptr", + template = "//pkg/syncutil:generic_atomicptr", types = { "Value": "bucket", }, diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 8c1f79ab5..bd3fb4c03 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -24,6 +24,7 @@ // TaskSet.mu // SignalHandlers.mu // Task.mu +// runningTasksMu // // Locking SignalHandlers.mu in multiple SignalHandlers requires locking // TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same @@ -135,6 +136,22 @@ type Kernel struct { // syslog is the kernel log. syslog syslog + // runningTasksMu synchronizes disable/enable of cpuClockTicker when + // the kernel is idle (runningTasks == 0). + // + // runningTasksMu is used to exclude critical sections when the timer + // disables itself and when the first active task enables the timer, + // ensuring that tasks always see a valid cpuClock value. + runningTasksMu sync.Mutex `state:"nosave"` + + // runningTasks is the total count of tasks currently in + // TaskGoroutineRunningSys or TaskGoroutineRunningApp. i.e., they are + // not blocked or stopped. + // + // runningTasks must be accessed atomically. Increments from 0 to 1 are + // further protected by runningTasksMu (see incRunningTasks). + runningTasks int64 + // cpuClock is incremented every linux.ClockTick. cpuClock is used to // measure task CPU usage, since sampling monotonicClock twice on every // syscall turns out to be unreasonably expensive. This is similar to how @@ -150,6 +167,22 @@ type Kernel struct { // cpuClockTicker increments cpuClock. cpuClockTicker *ktime.Timer `state:"nosave"` + // cpuClockTickerDisabled indicates that cpuClockTicker has been + // disabled because no tasks are running. + // + // cpuClockTickerDisabled is protected by runningTasksMu. + cpuClockTickerDisabled bool + + // cpuClockTickerSetting is the ktime.Setting of cpuClockTicker at the + // point it was disabled. It is cached here to avoid a lock ordering + // violation with cpuClockTicker.mu when runningTaskMu is held. + // + // cpuClockTickerSetting is only valid when cpuClockTickerDisabled is + // true. + // + // cpuClockTickerSetting is protected by runningTasksMu. + cpuClockTickerSetting ktime.Setting + // fdMapUids is an ever-increasing counter for generating FDTable uids. // // fdMapUids is mutable, and is accessed using atomic memory operations. @@ -358,7 +391,7 @@ func (k *Kernel) SaveTo(w io.Writer) error { // // N.B. This will also be saved along with the full kernel save below. cpuidStart := time.Now() - if err := state.Save(w, k.FeatureSet(), nil); err != nil { + if err := state.Save(k.SupervisorContext(), w, k.FeatureSet(), nil); err != nil { return err } log.Infof("CPUID save took [%s].", time.Since(cpuidStart)) @@ -366,7 +399,7 @@ func (k *Kernel) SaveTo(w io.Writer) error { // Save the kernel state. kernelStart := time.Now() var stats state.Stats - if err := state.Save(w, k, &stats); err != nil { + if err := state.Save(k.SupervisorContext(), w, k, &stats); err != nil { return err } log.Infof("Kernel save stats: %s", &stats) @@ -374,7 +407,7 @@ func (k *Kernel) SaveTo(w io.Writer) error { // Save the memory file's state. memoryStart := time.Now() - if err := k.mf.SaveTo(w); err != nil { + if err := k.mf.SaveTo(k.SupervisorContext(), w); err != nil { return err } log.Infof("Memory save took [%s].", time.Since(memoryStart)) @@ -509,7 +542,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) // don't need to explicitly install it in the Kernel. cpuidStart := time.Now() var features cpuid.FeatureSet - if err := state.Load(r, &features, nil); err != nil { + if err := state.Load(k.SupervisorContext(), r, &features, nil); err != nil { return err } log.Infof("CPUID load took [%s].", time.Since(cpuidStart)) @@ -525,7 +558,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) // Load the kernel state. kernelStart := time.Now() var stats state.Stats - if err := state.Load(r, k, &stats); err != nil { + if err := state.Load(k.SupervisorContext(), r, k, &stats); err != nil { return err } log.Infof("Kernel load stats: %s", &stats) @@ -533,7 +566,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) // Load the memory file's state. memoryStart := time.Now() - if err := k.mf.LoadFrom(r); err != nil { + if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil { return err } log.Infof("Memory load took [%s].", time.Since(memoryStart)) @@ -771,8 +804,21 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, // Create a fresh task context. remainingTraversals = uint(args.MaxSymlinkTraversals) + loadArgs := loader.LoadArgs{ + Mounts: mounts, + Root: root, + WorkingDirectory: wd, + RemainingTraversals: &remainingTraversals, + ResolveFinal: true, + Filename: args.Filename, + File: args.File, + CloseOnExec: false, + Argv: args.Argv, + Envv: args.Envv, + Features: k.featureSet, + } - tc, se := k.LoadTaskImage(ctx, mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, k.featureSet) + tc, se := k.LoadTaskImage(ctx, loadArgs) if se != nil { return nil, 0, errors.New(se.String()) } @@ -795,9 +841,11 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, AbstractSocketNamespace: args.AbstractSocketNamespace, ContainerID: args.ContainerID, } - if _, err := k.tasks.NewTask(config); err != nil { + t, err := k.tasks.NewTask(config) + if err != nil { return nil, 0, err } + t.traceExecEvent(tc) // Simulate exec for tracing. // Success. tgid := k.tasks.Root.IDOfThreadGroup(tg) @@ -912,6 +960,102 @@ func (k *Kernel) resumeTimeLocked() { } } +func (k *Kernel) incRunningTasks() { + for { + tasks := atomic.LoadInt64(&k.runningTasks) + if tasks != 0 { + // Standard case. Simply increment. + if !atomic.CompareAndSwapInt64(&k.runningTasks, tasks, tasks+1) { + continue + } + return + } + + // Transition from 0 -> 1. Synchronize with other transitions and timer. + k.runningTasksMu.Lock() + tasks = atomic.LoadInt64(&k.runningTasks) + if tasks != 0 { + // We're no longer the first task, no need to + // re-enable. + atomic.AddInt64(&k.runningTasks, 1) + k.runningTasksMu.Unlock() + return + } + + if !k.cpuClockTickerDisabled { + // Timer was never disabled. + atomic.StoreInt64(&k.runningTasks, 1) + k.runningTasksMu.Unlock() + return + } + + // We need to update cpuClock for all of the ticks missed while we + // slept, and then re-enable the timer. + // + // The Notify in Swap isn't sufficient. kernelCPUClockTicker.Notify + // always increments cpuClock by 1 regardless of the number of + // expirations as a heuristic to avoid over-accounting in cases of CPU + // throttling. + // + // We want to cover the normal case, when all time should be accounted, + // so we increment for all expirations. Throttling is less concerning + // here because the ticker is only disabled from Notify. This means + // that Notify must schedule and compensate for the throttled period + // before the timer is disabled. Throttling while the timer is disabled + // doesn't matter, as nothing is running or reading cpuClock anyways. + // + // S/R also adds complication, as there are two cases. Recall that + // monotonicClock will jump forward on restore. + // + // 1. If the ticker is enabled during save, then on Restore Notify is + // called with many expirations, covering the time jump, but cpuClock + // is only incremented by 1. + // + // 2. If the ticker is disabled during save, then after Restore the + // first wakeup will call this function and cpuClock will be + // incremented by the number of expirations across the S/R. + // + // These cause very different value of cpuClock. But again, since + // nothing was running while the ticker was disabled, those differences + // don't matter. + setting, exp := k.cpuClockTickerSetting.At(k.monotonicClock.Now()) + if exp > 0 { + atomic.AddUint64(&k.cpuClock, exp) + } + + // Now that cpuClock is updated it is safe to allow other tasks to + // transition to running. + atomic.StoreInt64(&k.runningTasks, 1) + + // N.B. we must unlock before calling Swap to maintain lock ordering. + // + // cpuClockTickerDisabled need not wait until after Swap to become + // true. It is sufficient that the timer *will* be enabled. + k.cpuClockTickerDisabled = false + k.runningTasksMu.Unlock() + + // This won't call Notify (unless it's been ClockTick since setting.At + // above). This means we skip the thread group work in Notify. However, + // since nothing was running while we were disabled, none of the timers + // could have expired. + k.cpuClockTicker.Swap(setting) + + return + } +} + +func (k *Kernel) decRunningTasks() { + tasks := atomic.AddInt64(&k.runningTasks, -1) + if tasks < 0 { + panic(fmt.Sprintf("Invalid running count %d", tasks)) + } + + // Nothing to do. The next CPU clock tick will disable the timer if + // there is still nothing running. This provides approximately one tick + // of slack in which we can switch back and forth between idle and + // active without an expensive transition. +} + // WaitExited blocks until all tasks in k have exited. func (k *Kernel) WaitExited() { k.tasks.liveGoroutines.Wait() @@ -976,6 +1120,22 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { return lastErr } +// RebuildTraceContexts rebuilds the trace context for all tasks. +// +// Unfortunately, if these are built while tracing is not enabled, then we will +// not have meaningful trace data. Rebuilding here ensures that we can do so +// after tracing has been enabled. +func (k *Kernel) RebuildTraceContexts() { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + + for t, tid := range k.tasks.Root.tids { + t.rebuildTraceContext(tid) + } +} + // FeatureSet returns the FeatureSet. func (k *Kernel) FeatureSet() *cpuid.FeatureSet { return k.featureSet @@ -1180,6 +1340,7 @@ func (k *Kernel) ListSockets() []*SocketEntry { return socks } +// supervisorContext is a privileged context. type supervisorContext struct { context.NoopSleeper log.Logger diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD index ebcfaa619..d7a7d1169 100644 --- a/pkg/sentry/kernel/memevent/BUILD +++ b/pkg/sentry/kernel/memevent/BUILD @@ -1,5 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +load("@rules_cc//cc:defs.bzl", "cc_proto_library") package(licenses = ["notice"]) @@ -24,6 +25,12 @@ proto_library( visibility = ["//visibility:public"], ) +cc_proto_library( + name = "memory_events_cc_proto", + visibility = ["//visibility:public"], + deps = [":memory_events_proto"], +) + go_proto_library( name = "memory_events_go_proto", importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto", diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 4d15cca85..9d34f6d4d 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -1,7 +1,8 @@ -package(licenses = ["notice"]) - +load("@io_bazel_rules_go//go:def.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) go_template_instance( name = "buffer_list", @@ -23,8 +24,10 @@ go_library( "device.go", "node.go", "pipe.go", + "pipe_util.go", "reader.go", "reader_writer.go", + "vfs.go", "writer.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/pipe", @@ -39,6 +42,7 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/safemem", "//pkg/sentry/usermem", + "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go index 69ef2a720..95bee2d37 100644 --- a/pkg/sentry/kernel/pipe/buffer.go +++ b/pkg/sentry/kernel/pipe/buffer.go @@ -15,6 +15,7 @@ package pipe import ( + "io" "sync" "gvisor.dev/gvisor/pkg/sentry/safemem" @@ -67,6 +68,17 @@ func (b *buffer) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { return n, err } +// WriteFromReader writes to the buffer from an io.Reader. +func (b *buffer) WriteFromReader(r io.Reader, count int64) (int64, error) { + dst := b.data[b.write:] + if count < int64(len(dst)) { + dst = b.data[b.write:][:count] + } + n, err := r.Read(dst) + b.write += n + return int64(n), err +} + // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (b *buffer) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.read:b.write])) @@ -75,6 +87,19 @@ func (b *buffer) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { return n, err } +// ReadToWriter reads from the buffer into an io.Writer. +func (b *buffer) ReadToWriter(w io.Writer, count int64, dup bool) (int64, error) { + src := b.data[b.read:b.write] + if count < int64(len(src)) { + src = b.data[b.read:][:count] + } + n, err := w.Write(src) + if !dup { + b.read += n + } + return int64(n), err +} + // bufferPool is a pool for buffers. var bufferPool = sync.Pool{ New: func() interface{} { diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index a2dc72204..4a19ab7ce 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -18,7 +18,6 @@ import ( "sync" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -91,10 +90,10 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi switch { case flags.Read && !flags.Write: // O_RDONLY. r := i.p.Open(ctx, d, flags) - i.newHandleLocked(&i.rWakeup) + newHandleLocked(&i.rWakeup) if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() { - if !i.waitFor(&i.wWakeup, ctx) { + if !waitFor(&i.mu, &i.wWakeup, ctx) { r.DecRef() return nil, syserror.ErrInterrupted } @@ -107,7 +106,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi case flags.Write && !flags.Read: // O_WRONLY. w := i.p.Open(ctx, d, flags) - i.newHandleLocked(&i.wWakeup) + newHandleLocked(&i.wWakeup) if i.p.isNamed && !i.p.HasReaders() { // On a nonblocking, write-only open, the open fails with ENXIO if the @@ -117,7 +116,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi return nil, syserror.ENXIO } - if !i.waitFor(&i.rWakeup, ctx) { + if !waitFor(&i.mu, &i.rWakeup, ctx) { w.DecRef() return nil, syserror.ErrInterrupted } @@ -127,8 +126,8 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi case flags.Read && flags.Write: // O_RDWR. // Pipes opened for read-write always succeeds without blocking. rw := i.p.Open(ctx, d, flags) - i.newHandleLocked(&i.rWakeup) - i.newHandleLocked(&i.wWakeup) + newHandleLocked(&i.rWakeup) + newHandleLocked(&i.wWakeup) return rw, nil default: @@ -136,65 +135,6 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi } } -// waitFor blocks until the underlying pipe has at least one reader/writer is -// announced via 'wakeupChan', or until 'sleeper' is cancelled. Any call to this -// function will block for either readers or writers, depending on where -// 'wakeupChan' points. -// -// f.mu must be held by the caller. waitFor returns with f.mu held, but it will -// drop f.mu before blocking for any reader/writers. -func (i *inodeOperations) waitFor(wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool { - // Ideally this function would simply use a condition variable. However, the - // wait needs to be interruptible via 'sleeper', so we must sychronize via a - // channel. The synchronization below relies on the fact that closing a - // channel unblocks all receives on the channel. - - // Does an appropriate wakeup channel already exist? If not, create a new - // one. This is all done under f.mu to avoid races. - if *wakeupChan == nil { - *wakeupChan = make(chan struct{}) - } - - // Grab a local reference to the wakeup channel since it may disappear as - // soon as we drop f.mu. - wakeup := *wakeupChan - - // Drop the lock and prepare to sleep. - i.mu.Unlock() - cancel := sleeper.SleepStart() - - // Wait for either a new reader/write to be signalled via 'wakeup', or - // for the sleep to be cancelled. - select { - case <-wakeup: - sleeper.SleepFinish(true) - case <-cancel: - sleeper.SleepFinish(false) - } - - // Take the lock and check if we were woken. If we were woken and - // interrupted, the former takes priority. - i.mu.Lock() - select { - case <-wakeup: - return true - default: - return false - } -} - -// newHandleLocked signals a new pipe reader or writer depending on where -// 'wakeupChan' points. This unblocks any corresponding reader or writer -// waiting for the other end of the channel to be opened, see Fifo.waitFor. -// -// i.mu must be held. -func (*inodeOperations) newHandleLocked(wakeupChan *chan struct{}) { - if *wakeupChan != nil { - close(*wakeupChan) - *wakeupChan = nil - } -} - func (*inodeOperations) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { return syserror.EPIPE } diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index adbad7764..16fa80abe 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -85,11 +85,11 @@ func testOpen(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs. } func newNamedPipe(t *testing.T) *Pipe { - return NewPipe(contexttest.Context(t), true, DefaultPipeSize, usermem.PageSize) + return NewPipe(true, DefaultPipeSize, usermem.PageSize) } func newAnonPipe(t *testing.T) *Pipe { - return NewPipe(contexttest.Context(t), false, DefaultPipeSize, usermem.PageSize) + return NewPipe(false, DefaultPipeSize, usermem.PageSize) } // assertRecvBlocks ensures that a recv attempt on c blocks for at least diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 247e2928e..1a1b38f83 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -99,7 +98,7 @@ type Pipe struct { // NewPipe initializes and returns a pipe. // // N.B. The size and atomicIOBytes will be bounded. -func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int64) *Pipe { +func NewPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *Pipe { if sizeBytes < MinimumPipeSize { sizeBytes = MinimumPipeSize } @@ -112,17 +111,33 @@ func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int64) if atomicIOBytes > sizeBytes { atomicIOBytes = sizeBytes } - return &Pipe{ - isNamed: isNamed, - max: sizeBytes, - atomicIOBytes: atomicIOBytes, + var p Pipe + initPipe(&p, isNamed, sizeBytes, atomicIOBytes) + return &p +} + +func initPipe(pipe *Pipe, isNamed bool, sizeBytes, atomicIOBytes int64) { + if sizeBytes < MinimumPipeSize { + sizeBytes = MinimumPipeSize + } + if sizeBytes > MaximumPipeSize { + sizeBytes = MaximumPipeSize + } + if atomicIOBytes <= 0 { + atomicIOBytes = 1 } + if atomicIOBytes > sizeBytes { + atomicIOBytes = sizeBytes + } + pipe.isNamed = isNamed + pipe.max = sizeBytes + pipe.atomicIOBytes = atomicIOBytes } // NewConnectedPipe initializes a pipe and returns a pair of objects // representing the read and write ends of the pipe. func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.File, *fs.File) { - p := NewPipe(ctx, false /* isNamed */, sizeBytes, atomicIOBytes) + p := NewPipe(false /* isNamed */, sizeBytes, atomicIOBytes) // Build an fs.Dirent for the pipe which will be shared by both // returned files. @@ -173,13 +188,24 @@ func (p *Pipe) Open(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) *fs.F } } +type readOps struct { + // left returns the bytes remaining. + left func() int64 + + // limit limits subsequence reads. + limit func(int64) + + // read performs the actual read operation. + read func(*buffer) (int64, error) +} + // read reads data from the pipe into dst and returns the number of bytes // read, or returns ErrWouldBlock if the pipe is empty. // // Precondition: this pipe must have readers. -func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) { +func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) { // Don't block for a zero-length read even if the pipe is empty. - if dst.NumBytes() == 0 { + if ops.left() == 0 { return 0, nil } @@ -196,12 +222,12 @@ func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) } // Limit how much we consume. - if dst.NumBytes() > p.size { - dst = dst.TakeFirst64(p.size) + if ops.left() > p.size { + ops.limit(p.size) } done := int64(0) - for dst.NumBytes() > 0 { + for ops.left() > 0 { // Pop the first buffer. first := p.data.Front() if first == nil { @@ -209,10 +235,9 @@ func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) } // Copy user data. - n, err := dst.CopyOutFrom(ctx, first) + n, err := ops.read(first) done += int64(n) p.size -= n - dst = dst.DropFirst64(n) // Empty buffer? if first.Empty() { @@ -230,12 +255,57 @@ func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) return done, nil } +// dup duplicates all data from this pipe into the given writer. +// +// There is no blocking behavior implemented here. The writer may propagate +// some blocking error. All the writes must be complete writes. +func (p *Pipe) dup(ctx context.Context, ops readOps) (int64, error) { + p.mu.Lock() + defer p.mu.Unlock() + + // Is the pipe empty? + if p.size == 0 { + if !p.HasWriters() { + // See above. + return 0, nil + } + return 0, syserror.ErrWouldBlock + } + + // Limit how much we consume. + if ops.left() > p.size { + ops.limit(p.size) + } + + done := int64(0) + for buf := p.data.Front(); buf != nil; buf = buf.Next() { + n, err := ops.read(buf) + done += n + if err != nil { + return done, err + } + } + + return done, nil +} + +type writeOps struct { + // left returns the bytes remaining. + left func() int64 + + // limit should limit subsequent writes. + limit func(int64) + + // write should write to the provided buffer. + write func(*buffer) (int64, error) +} + // write writes data from sv into the pipe and returns the number of bytes // written. If no bytes are written because the pipe is full (or has less than // atomicIOBytes free capacity), write returns ErrWouldBlock. // // Precondition: this pipe must have writers. -func (p *Pipe) write(ctx context.Context, src usermem.IOSequence) (int64, error) { +func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) { p.mu.Lock() defer p.mu.Unlock() @@ -246,17 +316,16 @@ func (p *Pipe) write(ctx context.Context, src usermem.IOSequence) (int64, error) // POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be // atomic, but requires no atomicity for writes larger than this. - wanted := src.NumBytes() + wanted := ops.left() if avail := p.max - p.size; wanted > avail { if wanted <= p.atomicIOBytes { return 0, syserror.ErrWouldBlock } - // Limit to the available capacity. - src = src.TakeFirst64(avail) + ops.limit(avail) } done := int64(0) - for src.NumBytes() > 0 { + for ops.left() > 0 { // Need a new buffer? last := p.data.Back() if last == nil || last.Full() { @@ -266,10 +335,9 @@ func (p *Pipe) write(ctx context.Context, src usermem.IOSequence) (int64, error) } // Copy user data. - n, err := src.CopyInTo(ctx, last) + n, err := ops.write(last) done += int64(n) p.size += n - src = src.DropFirst64(n) // Handle errors. if err != nil { diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go new file mode 100644 index 000000000..ef9641e6a --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -0,0 +1,213 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "io" + "math" + "sync" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/amutex" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// This file contains Pipe file functionality that is tied to neither VFS nor +// the old fs architecture. + +// Release cleans up the pipe's state. +func (p *Pipe) Release() { + p.rClose() + p.wClose() + + // Wake up readers and writers. + p.Notify(waiter.EventIn | waiter.EventOut) +} + +// Read reads from the Pipe into dst. +func (p *Pipe) Read(ctx context.Context, dst usermem.IOSequence) (int64, error) { + n, err := p.read(ctx, readOps{ + left: func() int64 { + return dst.NumBytes() + }, + limit: func(l int64) { + dst = dst.TakeFirst64(l) + }, + read: func(buf *buffer) (int64, error) { + n, err := dst.CopyOutFrom(ctx, buf) + dst = dst.DropFirst64(n) + return n, err + }, + }) + if n > 0 { + p.Notify(waiter.EventOut) + } + return n, err +} + +// WriteTo writes to w from the Pipe. +func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool) (int64, error) { + ops := readOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + read: func(buf *buffer) (int64, error) { + n, err := buf.ReadToWriter(w, count, dup) + count -= n + return n, err + }, + } + if dup { + // There is no notification for dup operations. + return p.dup(ctx, ops) + } + n, err := p.read(ctx, ops) + if n > 0 { + p.Notify(waiter.EventOut) + } + return n, err +} + +// Write writes to the Pipe from src. +func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error) { + n, err := p.write(ctx, writeOps{ + left: func() int64 { + return src.NumBytes() + }, + limit: func(l int64) { + src = src.TakeFirst64(l) + }, + write: func(buf *buffer) (int64, error) { + n, err := src.CopyInTo(ctx, buf) + src = src.DropFirst64(n) + return n, err + }, + }) + if n > 0 { + p.Notify(waiter.EventIn) + } + return n, err +} + +// ReadFrom reads from r to the Pipe. +func (p *Pipe) ReadFrom(ctx context.Context, r io.Reader, count int64) (int64, error) { + n, err := p.write(ctx, writeOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + write: func(buf *buffer) (int64, error) { + n, err := buf.WriteFromReader(r, count) + count -= n + return n, err + }, + }) + if n > 0 { + p.Notify(waiter.EventIn) + } + return n, err +} + +// Readiness returns the ready events in the underlying pipe. +func (p *Pipe) Readiness(mask waiter.EventMask) waiter.EventMask { + return p.rwReadiness() & mask +} + +// Ioctl implements ioctls on the Pipe. +func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Switch on ioctl request. + switch int(args[1].Int()) { + case linux.FIONREAD: + v := p.queued() + if v > math.MaxInt32 { + v = math.MaxInt32 // Silently truncate. + } + // Copy result to user-space. + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + default: + return 0, syscall.ENOTTY + } +} + +// waitFor blocks until the underlying pipe has at least one reader/writer is +// announced via 'wakeupChan', or until 'sleeper' is cancelled. Any call to this +// function will block for either readers or writers, depending on where +// 'wakeupChan' points. +// +// mu must be held by the caller. waitFor returns with mu held, but it will +// drop mu before blocking for any reader/writers. +func waitFor(mu *sync.Mutex, wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool { + // Ideally this function would simply use a condition variable. However, the + // wait needs to be interruptible via 'sleeper', so we must sychronize via a + // channel. The synchronization below relies on the fact that closing a + // channel unblocks all receives on the channel. + + // Does an appropriate wakeup channel already exist? If not, create a new + // one. This is all done under f.mu to avoid races. + if *wakeupChan == nil { + *wakeupChan = make(chan struct{}) + } + + // Grab a local reference to the wakeup channel since it may disappear as + // soon as we drop f.mu. + wakeup := *wakeupChan + + // Drop the lock and prepare to sleep. + mu.Unlock() + cancel := sleeper.SleepStart() + + // Wait for either a new reader/write to be signalled via 'wakeup', or + // for the sleep to be cancelled. + select { + case <-wakeup: + sleeper.SleepFinish(true) + case <-cancel: + sleeper.SleepFinish(false) + } + + // Take the lock and check if we were woken. If we were woken and + // interrupted, the former takes priority. + mu.Lock() + select { + case <-wakeup: + return true + default: + return false + } +} + +// newHandleLocked signals a new pipe reader or writer depending on where +// 'wakeupChan' points. This unblocks any corresponding reader or writer +// waiting for the other end of the channel to be opened, see Fifo.waitFor. +// +// Precondition: the mutex protecting wakeupChan must be held. +func newHandleLocked(wakeupChan *chan struct{}) { + if *wakeupChan != nil { + close(*wakeupChan) + *wakeupChan = nil + } +} diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index f69dbf27b..b4d29fc77 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -15,16 +15,13 @@ package pipe import ( - "math" - "syscall" + "io" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/waiter" ) // ReaderWriter satisfies the FileOperations interface and services both @@ -44,53 +41,27 @@ type ReaderWriter struct { *Pipe } -// Release implements fs.FileOperations.Release. -func (rw *ReaderWriter) Release() { - rw.Pipe.rClose() - rw.Pipe.wClose() - - // Wake up readers and writers. - rw.Pipe.Notify(waiter.EventIn | waiter.EventOut) -} - // Read implements fs.FileOperations.Read. func (rw *ReaderWriter) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { - n, err := rw.Pipe.read(ctx, dst) - if n > 0 { - rw.Pipe.Notify(waiter.EventOut) - } - return n, err + return rw.Pipe.Read(ctx, dst) +} + +// WriteTo implements fs.FileOperations.WriteTo. +func (rw *ReaderWriter) WriteTo(ctx context.Context, _ *fs.File, w io.Writer, count int64, dup bool) (int64, error) { + return rw.Pipe.WriteTo(ctx, w, count, dup) } // Write implements fs.FileOperations.Write. func (rw *ReaderWriter) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { - n, err := rw.Pipe.write(ctx, src) - if n > 0 { - rw.Pipe.Notify(waiter.EventIn) - } - return n, err + return rw.Pipe.Write(ctx, src) } -// Readiness returns the ready events in the underlying pipe. -func (rw *ReaderWriter) Readiness(mask waiter.EventMask) waiter.EventMask { - return rw.Pipe.rwReadiness() & mask +// ReadFrom implements fs.FileOperations.WriteTo. +func (rw *ReaderWriter) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) { + return rw.Pipe.ReadFrom(ctx, r, count) } // Ioctl implements fs.FileOperations.Ioctl. func (rw *ReaderWriter) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - // Switch on ioctl request. - switch int(args[1].Int()) { - case linux.FIONREAD: - v := rw.queued() - if v > math.MaxInt32 { - v = math.MaxInt32 // Silently truncate. - } - // Copy result to user-space. - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ - AddressSpaceActive: true, - }) - return 0, err - default: - return 0, syscall.ENOTTY - } + return rw.Pipe.Ioctl(ctx, io, args) } diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go new file mode 100644 index 000000000..6416e0dd8 --- /dev/null +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -0,0 +1,220 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// This file contains types enabling the pipe package to be used with the vfs +// package. + +// VFSPipe represents the actual pipe, analagous to an inode. VFSPipes should +// not be copied. +type VFSPipe struct { + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // pipe is the underlying pipe. + pipe Pipe + + // Channels for synchronizing the creation of new readers and writers + // of this fifo. See waitFor and newHandleLocked. + // + // These are not saved/restored because all waiters are unblocked on + // save, and either automatically restart (via ERESTARTSYS) or return + // EINTR on resume. On restarts via ERESTARTSYS, the appropriate + // channel will be recreated. + rWakeup chan struct{} `state:"nosave"` + wWakeup chan struct{} `state:"nosave"` +} + +// NewVFSPipe returns an initialized VFSPipe. +func NewVFSPipe(sizeBytes, atomicIOBytes int64) *VFSPipe { + var vp VFSPipe + initPipe(&vp.pipe, true /* isNamed */, sizeBytes, atomicIOBytes) + return &vp +} + +// NewVFSPipeFD opens a named pipe. Named pipes have special blocking semantics +// during open: +// +// "Normally, opening the FIFO blocks until the other end is opened also. A +// process can open a FIFO in nonblocking mode. In this case, opening for +// read-only will succeed even if no-one has opened on the write side yet, +// opening for write-only will fail with ENXIO (no such device or address) +// unless the other end has already been opened. Under Linux, opening a FIFO +// for read and write will succeed both in blocking and nonblocking mode. POSIX +// leaves this behavior undefined. This can be used to open a FIFO for writing +// while there are no readers available." - fifo(7) +func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { + vp.mu.Lock() + defer vp.mu.Unlock() + + readable := vfs.MayReadFileWithOpenFlags(flags) + writable := vfs.MayWriteFileWithOpenFlags(flags) + if !readable && !writable { + return nil, syserror.EINVAL + } + + vfd, err := vp.open(rp, vfsd, vfsfd, flags) + if err != nil { + return nil, err + } + + switch { + case readable && writable: + // Pipes opened for read-write always succeed without blocking. + newHandleLocked(&vp.rWakeup) + newHandleLocked(&vp.wWakeup) + + case readable: + newHandleLocked(&vp.rWakeup) + // If this pipe is being opened as nonblocking and there's no + // writer, we have to wait for a writer to open the other end. + if flags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) { + return nil, syserror.EINTR + } + + case writable: + newHandleLocked(&vp.wWakeup) + + if !vp.pipe.HasReaders() { + // Nonblocking, write-only opens fail with ENXIO when + // the read side isn't open yet. + if flags&linux.O_NONBLOCK != 0 { + return nil, syserror.ENXIO + } + // Wait for a reader to open the other end. + if !waitFor(&vp.mu, &vp.rWakeup, ctx) { + return nil, syserror.EINTR + } + } + + default: + panic("invalid pipe flags: must be readable, writable, or both") + } + + return vfd, nil +} + +// Preconditions: vp.mu must be held. +func (vp *VFSPipe) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { + var fd VFSPipeFD + fd.flags = flags + fd.readable = vfs.MayReadFileWithOpenFlags(flags) + fd.writable = vfs.MayWriteFileWithOpenFlags(flags) + fd.vfsfd = vfsfd + fd.pipe = &vp.pipe + if fd.writable { + // The corresponding Mount.EndWrite() is in VFSPipe.Release(). + if err := rp.Mount().CheckBeginWrite(); err != nil { + return nil, err + } + } + + switch { + case fd.readable && fd.writable: + vp.pipe.rOpen() + vp.pipe.wOpen() + case fd.readable: + vp.pipe.rOpen() + case fd.writable: + vp.pipe.wOpen() + default: + panic("invalid pipe flags: must be readable, writable, or both") + } + + return &fd, nil +} + +// VFSPipeFD implements a subset of vfs.FileDescriptionImpl for pipes. It is +// expected that filesystesm will use this in a struct implementing +// vfs.FileDescriptionImpl. +type VFSPipeFD struct { + pipe *Pipe + flags uint32 + readable bool + writable bool + vfsfd *vfs.FileDescription +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *VFSPipeFD) Release() { + var event waiter.EventMask + if fd.readable { + fd.pipe.rClose() + event |= waiter.EventIn + } + if fd.writable { + fd.pipe.wClose() + event |= waiter.EventOut + } + if event == 0 { + panic("invalid pipe flags: must be readable, writable, or both") + } + + if fd.writable { + fd.vfsfd.VirtualDentry().Mount().EndWrite() + } + + fd.pipe.Notify(event) +} + +// OnClose implements vfs.FileDescriptionImpl.OnClose. +func (fd *VFSPipeFD) OnClose(_ context.Context) error { + return nil +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *VFSPipeFD) PRead(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *VFSPipeFD) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + if !fd.readable { + return 0, syserror.EINVAL + } + + return fd.pipe.Read(ctx, dst) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *VFSPipeFD) PWrite(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { + if !fd.writable { + return 0, syserror.EINVAL + } + + return fd.pipe.Write(ctx, src) +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return fd.pipe.Ioctl(ctx, uio, args) +} diff --git a/pkg/sentry/kernel/posixtimer.go b/pkg/sentry/kernel/posixtimer.go index c5d095af7..2e861a5a8 100644 --- a/pkg/sentry/kernel/posixtimer.go +++ b/pkg/sentry/kernel/posixtimer.go @@ -117,9 +117,9 @@ func (it *IntervalTimer) signalRejectedLocked() { } // Notify implements ktime.TimerListener.Notify. -func (it *IntervalTimer) Notify(exp uint64) { +func (it *IntervalTimer) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { if it.target == nil { - return + return ktime.Setting{}, false } it.target.tg.pidns.owner.mu.RLock() @@ -129,7 +129,7 @@ func (it *IntervalTimer) Notify(exp uint64) { if it.sigpending { it.overrunCur += exp - return + return ktime.Setting{}, false } // sigpending must be set before sendSignalTimerLocked() so that it can be @@ -148,6 +148,8 @@ func (it *IntervalTimer) Notify(exp uint64) { if err := it.target.sendSignalTimerLocked(si, it.group, it); err != nil { it.signalRejectedLocked() } + + return ktime.Setting{}, false } // Destroy implements ktime.TimerListener.Destroy. Users of Timer should call diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go index 0acdf769d..61e412911 100644 --- a/pkg/sentry/kernel/ptrace_arm64.go +++ b/pkg/sentry/kernel/ptrace_arm64.go @@ -17,7 +17,6 @@ package kernel import ( - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD index 1725b8562..98ea7a0d8 100644 --- a/pkg/sentry/kernel/sched/BUILD +++ b/pkg/sentry/kernel/sched/BUILD @@ -1,4 +1,5 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") package(licenses = ["notice"]) diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index 36edf10f3..f4c00cd86 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -1,7 +1,8 @@ -package(licenses = ["notice"]) - +load("@io_bazel_rules_go//go:def.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) go_template_instance( name = "waiter_list", diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 93fe68a3e..de9617e9d 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -302,7 +302,7 @@ func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Cred return syserror.ERANGE } - // TODO(b/29354920): Clear undo entries in all processes + // TODO(gvisor.dev/issue/137): Clear undo entries in all processes. sem.value = val sem.pid = pid s.changeTime = ktime.NowFromContext(ctx) @@ -336,7 +336,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti for i, val := range vals { sem := &s.sems[i] - // TODO(b/29354920): Clear undo entries in all processes + // TODO(gvisor.dev/issue/137): Clear undo entries in all processes. sem.value = int16(val) sem.pid = pid sem.wakeWaiters() @@ -481,7 +481,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch } // All operations succeeded, apply them. - // TODO(b/29354920): handle undo operations. + // TODO(gvisor.dev/issue/137): handle undo operations. for i, v := range tmpVals { s.sems[i].value = v s.sems[i].wakeWaiters() diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 81fcd8258..047b5214d 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -47,6 +47,11 @@ type Session struct { // The id is immutable. id SessionID + // foreground is the foreground process group. + // + // This is protected by TaskSet.mu. + foreground *ProcessGroup + // ProcessGroups is a list of process groups in this Session. This is // protected by TaskSet.mu. processGroups processGroupList @@ -260,12 +265,14 @@ func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error { func (tg *ThreadGroup) CreateSession() error { tg.pidns.owner.mu.Lock() defer tg.pidns.owner.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() return tg.createSession() } // createSession creates a new session for a threadgroup. // -// Precondition: callers must hold TaskSet.mu for writing. +// Precondition: callers must hold TaskSet.mu and the signal mutex for writing. func (tg *ThreadGroup) createSession() error { // Get the ID for this thread in the current namespace. id := tg.pidns.tgids[tg] @@ -321,8 +328,14 @@ func (tg *ThreadGroup) createSession() error { childTG.processGroup.incRefWithParent(pg) childTG.processGroup.decRefWithParent(oldParentPG) }) - tg.processGroup.decRefWithParent(oldParentPG) + // If tg.processGroup is an orphan, decRefWithParent will lock + // the signal mutex of each thread group in tg.processGroup. + // However, tg's signal mutex may already be locked at this + // point. We change tg's process group before calling + // decRefWithParent to avoid locking tg's signal mutex twice. + oldPG := tg.processGroup tg.processGroup = pg + oldPG.decRefWithParent(oldParentPG) } else { // The current process group may be nil only in the case of an // unparented thread group (i.e. the init process). This would @@ -346,6 +359,9 @@ func (tg *ThreadGroup) createSession() error { ns.processGroups[ProcessGroupID(local)] = pg } + // Disconnect from the controlling terminal. + tg.tty = nil + return nil } diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index aa7471eb6..cd48945e6 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "shm", srcs = [ diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD new file mode 100644 index 000000000..9f7e19b4d --- /dev/null +++ b/pkg/sentry/kernel/signalfd/BUILD @@ -0,0 +1,22 @@ +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "signalfd", + srcs = ["signalfd.go"], + importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/signalfd", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel", + "//pkg/sentry/usermem", + "//pkg/syserror", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go new file mode 100644 index 000000000..4b08d7d72 --- /dev/null +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -0,0 +1,140 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package signalfd provides an implementation of signal file descriptors. +package signalfd + +import ( + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/anon" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SignalOperations represent a file with signalfd semantics. +// +// +stateify savable +type SignalOperations struct { + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + // target is the original task target. + // + // The semantics here are a bit broken. Linux will always use current + // for all reads, regardless of where the signalfd originated. We can't + // do exactly that because we need to plumb the context through + // EventRegister in order to support proper blocking behavior. This + // will undoubtedly become very complicated quickly. + target *kernel.Task + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // mask is the signal mask. Protected by mu. + mask linux.SignalSet +} + +// New creates a new signalfd object with the supplied mask. +func New(ctx context.Context, mask linux.SignalSet) (*fs.File, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // No task context? Not valid. + return nil, syserror.EINVAL + } + // name matches fs/signalfd.c:signalfd4. + dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[signalfd]") + return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &SignalOperations{ + target: t, + mask: mask, + }), nil +} + +// Release implements fs.FileOperations.Release. +func (s *SignalOperations) Release() {} + +// Mask returns the signal mask. +func (s *SignalOperations) Mask() linux.SignalSet { + s.mu.Lock() + mask := s.mask + s.mu.Unlock() + return mask +} + +// SetMask sets the signal mask. +func (s *SignalOperations) SetMask(mask linux.SignalSet) { + s.mu.Lock() + s.mask = mask + s.mu.Unlock() +} + +// Read implements fs.FileOperations.Read. +func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + // Attempt to dequeue relevant signals. + info, err := s.target.Sigtimedwait(s.Mask(), 0) + if err != nil { + // There must be no signal available. + return 0, syserror.ErrWouldBlock + } + + // Copy out the signal info using the specified format. + var buf [128]byte + binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{ + Signo: uint32(info.Signo), + Errno: info.Errno, + Code: info.Code, + PID: uint32(info.Pid()), + UID: uint32(info.Uid()), + Status: info.Status(), + Overrun: uint32(info.Overrun()), + Addr: info.Addr(), + }) + n, err := dst.CopyOut(ctx, buf[:]) + return int64(n), err +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *SignalOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + if mask&waiter.EventIn != 0 && s.target.PendingSignals()&s.Mask() != 0 { + return waiter.EventIn // Pending signals. + } + return 0 +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *SignalOperations) EventRegister(entry *waiter.Entry, _ waiter.EventMask) { + // Register for the signal set; ignore the passed events. + s.target.SignalRegister(entry, waiter.EventMask(s.Mask())) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *SignalOperations) EventUnregister(entry *waiter.Entry) { + // Unregister the original entry. + s.target.SignalUnregister(entry) +} diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 220fa73a2..2fdee0282 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -339,6 +339,14 @@ func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn { return nil } +// LookupName looks up a syscall name. +func (s *SyscallTable) LookupName(sysno uintptr) string { + if sc, ok := s.Table[sysno]; ok { + return sc.Name + } + return fmt.Sprintf("sys_%d", sysno) // Unlikely. +} + // LookupEmulate looks up an emulation syscall number. func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { sysno, ok := s.Emulate[addr] diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index e91f82bb3..ab0c6c4aa 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -15,6 +15,8 @@ package kernel import ( + gocontext "context" + "runtime/trace" "sync" "sync/atomic" @@ -35,7 +37,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/third_party/gvsync" + "gvisor.dev/gvisor/pkg/syncutil" + "gvisor.dev/gvisor/pkg/waiter" ) // Task represents a thread of execution in the untrusted app. It @@ -82,7 +85,7 @@ type Task struct { // // gosched is protected by goschedSeq. gosched is owned by the task // goroutine. - goschedSeq gvsync.SeqCount `state:"nosave"` + goschedSeq syncutil.SeqCount `state:"nosave"` gosched TaskGoroutineSchedInfo // yieldCount is the number of times the task goroutine has called @@ -133,6 +136,13 @@ type Task struct { // signalStack is exclusive to the task goroutine. signalStack arch.SignalStack + // signalQueue is a set of registered waiters for signal-related events. + // + // signalQueue is protected by the signalMutex. Note that the task does + // not implement all queue methods, specifically the readiness checks. + // The task only broadcast a notification on signal delivery. + signalQueue waiter.Queue `state:"zerovalue"` + // If groupStopPending is true, the task should participate in a group // stop in the interrupt path. // @@ -382,7 +392,14 @@ type Task struct { // logPrefix is a string containing the task's thread ID in the root PID // namespace, and is prepended to log messages emitted by Task.Infof etc. - logPrefix atomic.Value `state:".(string)"` + logPrefix atomic.Value `state:"nosave"` + + // traceContext and traceTask are both used for tracing, and are + // updated along with the logPrefix in updateInfoLocked. + // + // These are exclusive to the task goroutine. + traceContext gocontext.Context `state:"nosave"` + traceTask *trace.Task `state:"nosave"` // creds is the task's credentials. // @@ -520,14 +537,6 @@ func (t *Task) loadPtraceTracer(tracer *Task) { t.ptraceTracer.Store(tracer) } -func (t *Task) saveLogPrefix() string { - return t.logPrefix.Load().(string) -} - -func (t *Task) loadLogPrefix(prefix string) { - t.logPrefix.Store(prefix) -} - func (t *Task) saveSyscallFilters() []bpf.Program { if f := t.syscallFilters.Load(); f != nil { return f.([]bpf.Program) @@ -541,6 +550,7 @@ func (t *Task) loadSyscallFilters(filters []bpf.Program) { // afterLoad is invoked by stateify. func (t *Task) afterLoad() { + t.updateInfoLocked() t.interruptChan = make(chan struct{}, 1) t.gosched.State = TaskGoroutineNonexistent if t.stop != nil { @@ -701,9 +711,9 @@ func (t *Task) FDTable() *FDTable { return t.fdTable } -// GetFile is a convenience wrapper t.FDTable().GetFile. +// GetFile is a convenience wrapper for t.FDTable().Get. // -// Precondition: same as FDTable. +// Precondition: same as FDTable.Get. func (t *Task) GetFile(fd int32) *fs.File { f, _ := t.fdTable.Get(fd) return f diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index dd69939f9..4a4a69ee2 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -16,6 +16,7 @@ package kernel import ( "runtime" + "runtime/trace" "time" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -133,19 +134,24 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error { runtime.Gosched() } + region := trace.StartRegion(t.traceContext, blockRegion) select { case <-C: + region.End() t.SleepFinish(true) + // Woken by event. return nil case <-interrupt: + region.End() t.SleepFinish(false) // Return the indicated error on interrupt. return syserror.ErrInterrupted case <-timerChan: - // We've timed out. + region.End() t.SleepFinish(true) + // We've timed out. return syserror.ETIMEDOUT } } diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 0916fd658..3eadfedb4 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -299,6 +299,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // nt that it must receive before its task goroutine starts running. tid := nt.k.tasks.Root.IDOfTask(nt) defer nt.Start(tid) + t.traceCloneEvent(tid) // "If fork/clone and execve are allowed by @prog, any child processes will // be constrained to the same filters and system call ABI as the parent." - diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 8639d379f..bb5560acf 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -18,10 +18,8 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/mm" @@ -132,30 +130,21 @@ func (t *Task) Stack() *arch.Stack { return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())} } -// LoadTaskImage loads filename into a new TaskContext. +// LoadTaskImage loads a specified file into a new TaskContext. // -// It takes several arguments: -// * mounts: MountNamespace to lookup filename in -// * root: Root to lookup filename under -// * wd: Working directory to lookup filename under -// * maxTraversals: maximum number of symlinks to follow -// * filename: path to binary to load -// * file: an open fs.File object of the binary to load. If set, -// file will be loaded and not filename. -// * argv: Binary argv -// * envv: Binary envv -// * fs: Binary FeatureSet -func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, file *fs.File, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) { - // If File is not nil, we should load that instead of resolving filename. - if file != nil { - filename = file.MappedName(ctx) +// args.MemoryManager does not need to be set by the caller. +func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) { + // If File is not nil, we should load that instead of resolving Filename. + if args.File != nil { + args.Filename = args.File.MappedName(ctx) } // Prepare a new user address space to load into. m := mm.NewMemoryManager(k, k) defer m.DecUsers(ctx) + args.MemoryManager = m - os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv, envv, k.extraAuxv, k.vdso) + os, ac, name, err := loader.Load(ctx, args, k.extraAuxv, k.vdso) if err != nil { return nil, err } diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 17a089b90..90a6190f1 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -129,6 +129,7 @@ type runSyscallAfterExecStop struct { } func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { + t.traceExecEvent(r.tc) t.tg.pidns.owner.mu.Lock() t.tg.execing = nil if t.killed() { @@ -253,7 +254,7 @@ func (t *Task) promoteLocked() { t.tg.leader = t t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t]) - t.updateLogPrefixLocked() + t.updateInfoLocked() // Reap the original leader. If it has a tracer, detach it instead of // waiting for it to acknowledge the original leader's death. oldLeader.exitParentNotified = true diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 535f03e50..435761e5a 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -236,6 +236,7 @@ func (*runExit) execute(t *Task) taskRunState { type runExitMain struct{} func (*runExitMain) execute(t *Task) taskRunState { + t.traceExitEvent() lastExiter := t.exitThreadGroup() // If the task has a cleartid, and the thread group wasn't killed by a diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index 78ff14b20..ce3e6ef28 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -465,8 +465,8 @@ func (t *Task) SetKeepCaps(k bool) { // disables the features we don't support anyway, is always set. This // drastically simplifies this function. // -// - We don't implement AT_SECURE, because no_new_privs always being set means -// that the conditions that require AT_SECURE never arise. (Compare Linux's +// - We don't set AT_SECURE = 1, because no_new_privs always being set means +// that the conditions that require AT_SECURE = 1 never arise. (Compare Linux's // security/commoncap.c:cap_bprm_set_creds() and cap_bprm_secureexec().) // // - We don't check for CAP_SYS_ADMIN in prctl(PR_SET_SECCOMP), since diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index a29e9b9eb..0fb3661de 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -16,6 +16,7 @@ package kernel import ( "fmt" + "runtime/trace" "sort" "gvisor.dev/gvisor/pkg/log" @@ -127,11 +128,88 @@ func (t *Task) debugDumpStack() { } } -// updateLogPrefix updates the task's cached log prefix to reflect its -// current thread ID. +// trace definitions. +// +// Note that all region names are prefixed by ':' in order to ensure that they +// are lexically ordered before all system calls, which use the naked system +// call name (e.g. "read") for maximum clarity. +const ( + traceCategory = "task" + runRegion = ":run" + blockRegion = ":block" + cpuidRegion = ":cpuid" + faultRegion = ":fault" +) + +// updateInfoLocked updates the task's cached log prefix and tracing +// information to reflect its current thread ID. // // Preconditions: The task's owning TaskSet.mu must be locked. -func (t *Task) updateLogPrefixLocked() { +func (t *Task) updateInfoLocked() { // Use the task's TID in the root PID namespace for logging. - t.logPrefix.Store(fmt.Sprintf("[% 4d] ", t.tg.pidns.owner.Root.tids[t])) + tid := t.tg.pidns.owner.Root.tids[t] + t.logPrefix.Store(fmt.Sprintf("[% 4d] ", tid)) + t.rebuildTraceContext(tid) +} + +// rebuildTraceContext rebuilds the trace context. +// +// Precondition: the passed tid must be the tid in the root namespace. +func (t *Task) rebuildTraceContext(tid ThreadID) { + // Re-initialize the trace context. + if t.traceTask != nil { + t.traceTask.End() + } + + // Note that we define the "task type" to be the dynamic TID. This does + // not align perfectly with the documentation for "tasks" in the + // tracing package. Tasks may be assumed to be bounded by analysis + // tools. However, if we just use a generic "task" type here, then the + // "user-defined tasks" page on the tracing dashboard becomes nearly + // unusable, as it loads all traces from all tasks. + // + // We can assume that the number of tasks in the system is not + // arbitrarily large (in general it won't be, especially for cases + // where we're collecting a brief profile), so using the TID is a + // reasonable compromise in this case. + t.traceContext, t.traceTask = trace.NewTask(t, fmt.Sprintf("tid:%d", tid)) +} + +// traceCloneEvent is called when a new task is spawned. +// +// ntid must be the new task's ThreadID in the root namespace. +func (t *Task) traceCloneEvent(ntid ThreadID) { + if !trace.IsEnabled() { + return + } + trace.Logf(t.traceContext, traceCategory, "spawn: %d", ntid) +} + +// traceExitEvent is called when a task exits. +func (t *Task) traceExitEvent() { + if !trace.IsEnabled() { + return + } + trace.Logf(t.traceContext, traceCategory, "exit status: 0x%x", t.exitStatus.Status()) +} + +// traceExecEvent is called when a task calls exec. +func (t *Task) traceExecEvent(tc *TaskContext) { + if !trace.IsEnabled() { + return + } + d := tc.MemoryManager.Executable() + if d == nil { + trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") + return + } + defer d.DecRef() + root := t.fsContext.RootDirectory() + if root == nil { + trace.Logf(t.traceContext, traceCategory, "exec: << no root directory >>") + return + } + defer root.DecRef() + n, _ := d.FullName(root) + trace.Logf(t.traceContext, traceCategory, "exec: %s", n) } diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index c92266c59..d97f8c189 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -17,6 +17,7 @@ package kernel import ( "bytes" "runtime" + "runtime/trace" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -205,9 +206,11 @@ func (*runApp) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.RUnlock() } + region := trace.StartRegion(t.traceContext, runRegion) t.accountTaskGoroutineEnter(TaskGoroutineRunningApp) info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU) t.accountTaskGoroutineLeave(TaskGoroutineRunningApp) + region.End() if clearSinglestep { t.Arch().ClearSingleStep() @@ -225,6 +228,7 @@ func (*runApp) execute(t *Task) taskRunState { case platform.ErrContextSignalCPUID: // Is this a CPUID instruction? + region := trace.StartRegion(t.traceContext, cpuidRegion) expected := arch.CPUIDInstruction[:] found := make([]byte, len(expected)) _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found) @@ -232,10 +236,12 @@ func (*runApp) execute(t *Task) taskRunState { // Skip the cpuid instruction. t.Arch().CPUIDEmulate(t) t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected))) + region.End() // Resume execution. return (*runApp)(nil) } + region.End() // Not an actual CPUID, but required copy-in. // The instruction at the given RIP was not a CPUID, and we // fallthrough to the default signal deliver behavior below. @@ -251,8 +257,10 @@ func (*runApp) execute(t *Task) taskRunState { // an application-generated signal and we should continue execution // normally. if at.Any() { + region := trace.StartRegion(t.traceContext, faultRegion) addr := usermem.Addr(info.Addr()) err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack())) + region.End() if err == nil { // The fault was handled appropriately. // We can resume running the application. @@ -260,6 +268,12 @@ func (*runApp) execute(t *Task) taskRunState { } // Is this a vsyscall that we need emulate? + // + // Note that we don't track vsyscalls as part of a + // specific trace region. This is because regions don't + // stack, and the actual system call will count as a + // region. We should be able to easily identify + // vsyscalls by having a <fault><syscall> pair. if at.Execute { if sysno, ok := t.tc.st.LookupEmulate(addr); ok { return t.doVsyscall(addr, sysno) diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index e76c069b0..8b148db35 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -126,12 +126,22 @@ func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) { t.gosched.Timestamp = now t.gosched.State = state t.goschedSeq.EndWrite() + + if state != TaskGoroutineRunningApp { + // Task is blocking/stopping. + t.k.decRunningTasks() + } } // Preconditions: The caller must be running on the task goroutine, and leaving // a state indicated by a previous call to // t.accountTaskGoroutineEnter(state). func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) { + if state != TaskGoroutineRunningApp { + // Task is unblocking/continuing. + t.k.incRunningTasks() + } + now := t.k.CPUClockNow() if t.gosched.State != state { panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys)) @@ -330,7 +340,7 @@ func newKernelCPUClockTicker(k *Kernel) *kernelCPUClockTicker { } // Notify implements ktime.TimerListener.Notify. -func (ticker *kernelCPUClockTicker) Notify(exp uint64) { +func (ticker *kernelCPUClockTicker) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { // Only increment cpuClock by 1 regardless of the number of expirations. // This approximately compensates for cases where thread throttling or bad // Go runtime scheduling prevents the kernelCPUClockTicker goroutine, and @@ -426,6 +436,27 @@ func (ticker *kernelCPUClockTicker) Notify(exp uint64) { tgs[i] = nil } ticker.tgs = tgs[:0] + + // If nothing is running, we can disable the timer. + tasks := atomic.LoadInt64(&ticker.k.runningTasks) + if tasks == 0 { + ticker.k.runningTasksMu.Lock() + defer ticker.k.runningTasksMu.Unlock() + tasks := atomic.LoadInt64(&ticker.k.runningTasks) + if tasks != 0 { + // Raced with a 0 -> 1 transition. + return setting, false + } + + // Stop the timer. We must cache the current setting so the + // kernel can access it without violating the lock order. + ticker.k.cpuClockTickerSetting = setting + ticker.k.cpuClockTickerDisabled = true + setting.Enabled = false + return setting, true + } + + return setting, false } // Destroy implements ktime.TimerListener.Destroy. diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 266959a07..39cd1340d 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -28,6 +28,7 @@ import ( ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" ) // SignalAction is an internal signal action. @@ -497,6 +498,9 @@ func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) { // // Preconditions: The signal mutex must be locked. func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool { + // Notify that the signal is queued. + t.signalQueue.Notify(waiter.EventMask(linux.MakeSignalSet(sig))) + // - Do not choose tasks that are blocking the signal. if linux.SignalSetOf(sig)&t.signalMask != 0 { return false @@ -1108,3 +1112,17 @@ func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { t.tg.signalHandlers.mu.Unlock() return t.deliverSignal(info, act) } + +// SignalRegister registers a waiter for pending signals. +func (t *Task) SignalRegister(e *waiter.Entry, mask waiter.EventMask) { + t.tg.signalHandlers.mu.Lock() + t.signalQueue.EventRegister(e, mask) + t.tg.signalHandlers.mu.Unlock() +} + +// SignalUnregister unregisters a waiter for pending signals. +func (t *Task) SignalUnregister(e *waiter.Entry) { + t.tg.signalHandlers.mu.Lock() + t.signalQueue.EventUnregister(e) + t.tg.signalHandlers.mu.Unlock() +} diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index d60cd62c7..3522a4ae5 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -154,10 +154,10 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { // Below this point, newTask is expected not to fail (there is no rollback // of assignTIDsLocked or any of the following). - // Logging on t's behalf will panic if t.logPrefix hasn't been initialized. - // This is the earliest point at which we can do so (since t now has thread - // IDs). - t.updateLogPrefixLocked() + // Logging on t's behalf will panic if t.logPrefix hasn't been + // initialized. This is the earliest point at which we can do so + // (since t now has thread IDs). + t.updateInfoLocked() if cfg.InheritParent != nil { t.parent = cfg.InheritParent.parent @@ -172,9 +172,10 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { if parentPG := tg.parentPG(); parentPG == nil { tg.createSession() } else { - // Inherit the process group. + // Inherit the process group and terminal. parentPG.incRefWithParent(parentPG) tg.processGroup = parentPG + tg.tty = t.parent.tg.tty } } tg.tasks.PushBack(t) diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index b543d536a..3180f5560 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -17,6 +17,7 @@ package kernel import ( "fmt" "os" + "runtime/trace" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -160,6 +161,10 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u ctrl = ctrlStopAndReinvokeSyscall } else { fn := s.Lookup(sysno) + var region *trace.Region // Only non-nil if tracing == true. + if trace.IsEnabled() { + region = trace.StartRegion(t.traceContext, s.LookupName(sysno)) + } if fn != nil { // Call our syscall implementation. rval, ctrl, err = fn(t, args) @@ -167,6 +172,9 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u // Use the missing function if not found. rval, err = t.SyscallTable().Missing(t, sysno, args) } + if region != nil { + region.End() + } } if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) { diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 2a97e3e8e..72568d296 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -19,10 +19,13 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/syserror" ) // A ThreadGroup is a logical grouping of tasks that has widespread @@ -245,6 +248,12 @@ type ThreadGroup struct { // // mounts is immutable. mounts *fs.MountNamespace + + // tty is the thread group's controlling terminal. If nil, there is no + // controlling terminal. + // + // tty is protected by the signal mutex. + tty *TTY } // newThreadGroup returns a new, empty thread group in PID namespace ns. The @@ -324,6 +333,176 @@ func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) { } } +// SetControllingTTY sets tty as the controlling terminal of tg. +func (tg *ThreadGroup) SetControllingTTY(tty *TTY, arg int32) error { + tty.mu.Lock() + defer tty.mu.Unlock() + + // We might be asked to set the controlling terminal of multiple + // processes, so we lock both the TaskSet and SignalHandlers. + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + + // "The calling process must be a session leader and not have a + // controlling terminal already." - tty_ioctl(4) + if tg.processGroup.session.leader != tg || tg.tty != nil { + return syserror.EINVAL + } + + // "If this terminal is already the controlling terminal of a different + // session group, then the ioctl fails with EPERM, unless the caller + // has the CAP_SYS_ADMIN capability and arg equals 1, in which case the + // terminal is stolen, and all processes that had it as controlling + // terminal lose it." - tty_ioctl(4) + if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session { + if !auth.CredentialsFromContext(tg.leader).HasCapability(linux.CAP_SYS_ADMIN) || arg != 1 { + return syserror.EPERM + } + // Steal the TTY away. Unlike TIOCNOTTY, don't send signals. + for othertg := range tg.pidns.owner.Root.tgids { + // This won't deadlock by locking tg.signalHandlers + // because at this point: + // - We only lock signalHandlers if it's in the same + // session as the tty's controlling thread group. + // - We know that the calling thread group is not in + // the same session as the tty's controlling thread + // group. + if othertg.processGroup.session == tty.tg.processGroup.session { + othertg.signalHandlers.mu.Lock() + othertg.tty = nil + othertg.signalHandlers.mu.Unlock() + } + } + } + + // Set the controlling terminal and foreground process group. + tg.tty = tty + tg.processGroup.session.foreground = tg.processGroup + // Set this as the controlling process of the terminal. + tty.tg = tg + + return nil +} + +// ReleaseControllingTTY gives up tty as the controlling tty of tg. +func (tg *ThreadGroup) ReleaseControllingTTY(tty *TTY) error { + tty.mu.Lock() + defer tty.mu.Unlock() + + // We might be asked to set the controlling terminal of multiple + // processes, so we lock both the TaskSet and SignalHandlers. + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + + // Just below, we may re-lock signalHandlers in order to send signals. + // Thus we can't defer Unlock here. + tg.signalHandlers.mu.Lock() + + if tg.tty == nil || tg.tty != tty { + tg.signalHandlers.mu.Unlock() + return syserror.ENOTTY + } + + // "If the process was session leader, then send SIGHUP and SIGCONT to + // the foreground process group and all processes in the current + // session lose their controlling terminal." - tty_ioctl(4) + // Remove tty as the controlling tty for each process in the session, + // then send them SIGHUP and SIGCONT. + + // If we're not the session leader, we don't have to do much. + if tty.tg != tg { + tg.tty = nil + tg.signalHandlers.mu.Unlock() + return nil + } + + tg.signalHandlers.mu.Unlock() + + // We're the session leader. SIGHUP and SIGCONT the foreground process + // group and remove all controlling terminals in the session. + var lastErr error + for othertg := range tg.pidns.owner.Root.tgids { + if othertg.processGroup.session == tg.processGroup.session { + othertg.signalHandlers.mu.Lock() + othertg.tty = nil + if othertg.processGroup == tg.processGroup.session.foreground { + if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGHUP)}, true /* group */); err != nil { + lastErr = err + } + if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGCONT)}, true /* group */); err != nil { + lastErr = err + } + } + othertg.signalHandlers.mu.Unlock() + } + } + + return lastErr +} + +// ForegroundProcessGroup returns the process group ID of the foreground +// process group. +func (tg *ThreadGroup) ForegroundProcessGroup(tty *TTY) (int32, error) { + tty.mu.Lock() + defer tty.mu.Unlock() + + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + + // "When fd does not refer to the controlling terminal of the calling + // process, -1 is returned" - tcgetpgrp(3) + if tg.tty != tty { + return -1, syserror.ENOTTY + } + + return int32(tg.processGroup.session.foreground.id), nil +} + +// SetForegroundProcessGroup sets the foreground process group of tty to pgid. +func (tg *ThreadGroup) SetForegroundProcessGroup(tty *TTY, pgid ProcessGroupID) (int32, error) { + tty.mu.Lock() + defer tty.mu.Unlock() + + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + + // TODO(b/129283598): "If tcsetpgrp() is called by a member of a + // background process group in its session, and the calling process is + // not blocking or ignoring SIGTTOU, a SIGTTOU signal is sent to all + // members of this background process group." + + // tty must be the controlling terminal. + if tg.tty != tty { + return -1, syserror.ENOTTY + } + + // pgid must be positive. + if pgid < 0 { + return -1, syserror.EINVAL + } + + // pg must not be empty. Empty process groups are removed from their + // pid namespaces. + pg, ok := tg.pidns.processGroups[pgid] + if !ok { + return -1, syserror.ESRCH + } + + // pg must be part of this process's session. + if tg.processGroup.session != pg.session { + return -1, syserror.EPERM + } + + tg.processGroup.session.foreground.id = pgid + return 0, nil +} + // itimerRealListener implements ktime.Listener for ITIMER_REAL expirations. // // +stateify savable @@ -332,8 +511,9 @@ type itimerRealListener struct { } // Notify implements ktime.TimerListener.Notify. -func (l *itimerRealListener) Notify(exp uint64) { +func (l *itimerRealListener) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { l.tg.SendSignal(SignalInfoPriv(linux.SIGALRM)) + return ktime.Setting{}, false } // Destroy implements ktime.TimerListener.Destroy. diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index 9beae4b31..31847e1df 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "time", srcs = [ diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index aa6c75d25..107394183 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -280,13 +280,16 @@ func (ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask { // A TimerListener receives expirations from a Timer. type TimerListener interface { // Notify is called when its associated Timer expires. exp is the number of - // expirations. + // expirations. setting is the next timer Setting. // // Notify is called with the associated Timer's mutex locked, so Notify // must not take any locks that precede Timer.mu in lock order. // + // If Notify returns true, the timer will use the returned setting + // rather than the passed one. + // // Preconditions: exp > 0. - Notify(exp uint64) + Notify(exp uint64, setting Setting) (newSetting Setting, update bool) // Destroy is called when the timer is destroyed. Destroy() @@ -533,7 +536,9 @@ func (t *Timer) Tick() { s, exp := t.setting.At(now) t.setting = s if exp > 0 { - t.listener.Notify(exp) + if newS, ok := t.listener.Notify(exp, t.setting); ok { + t.setting = newS + } } t.resetKickerLocked(now) } @@ -588,7 +593,9 @@ func (t *Timer) Get() (Time, Setting) { s, exp := t.setting.At(now) t.setting = s if exp > 0 { - t.listener.Notify(exp) + if newS, ok := t.listener.Notify(exp, t.setting); ok { + t.setting = newS + } } t.resetKickerLocked(now) return now, s @@ -620,7 +627,9 @@ func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { } oldS, oldExp := t.setting.At(now) if oldExp > 0 { - t.listener.Notify(oldExp) + t.listener.Notify(oldExp, oldS) + // N.B. The returned Setting doesn't matter because we're about + // to overwrite. } if f != nil { f() @@ -628,7 +637,9 @@ func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { newS, newExp := s.At(now) t.setting = newS if newExp > 0 { - t.listener.Notify(newExp) + if newS, ok := t.listener.Notify(newExp, t.setting); ok { + t.setting = newS + } } t.resetKickerLocked(now) return now, oldS @@ -683,11 +694,13 @@ func NewChannelNotifier() (TimerListener, <-chan struct{}) { } // Notify implements ktime.TimerListener.Notify. -func (c *ChannelNotifier) Notify(uint64) { +func (c *ChannelNotifier) Notify(uint64, Setting) (Setting, bool) { select { case c.tchan <- struct{}{}: default: } + + return Setting{}, false } // Destroy implements ktime.TimerListener.Destroy and will close the channel. diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go new file mode 100644 index 000000000..048de26dc --- /dev/null +++ b/pkg/sentry/kernel/tty.go @@ -0,0 +1,39 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import "sync" + +// TTY defines the relationship between a thread group and its controlling +// terminal. +// +// +stateify savable +type TTY struct { + // Index is the terminal index. It is immutable. + Index uint32 + + mu sync.Mutex `state:"nosave"` + + // tg is protected by mu. + tg *ThreadGroup +} + +// TTY returns the thread group's controlling terminal. If nil, there is no +// controlling terminal. +func (tg *ThreadGroup) TTY() *TTY { + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + return tg.tty +} diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD index 40025d62d..156e67bf8 100644 --- a/pkg/sentry/limits/BUILD +++ b/pkg/sentry/limits/BUILD @@ -1,6 +1,7 @@ -package(licenses = ["notice"]) +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) go_library( name = "limits", diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 3b322f5f3..2890393bd 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -1,9 +1,8 @@ load("@io_bazel_rules_go//go:def.bzl", "go_embed_data") +load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) -load("//tools/go_stateify:defs.bzl", "go_library") - go_embed_data( name = "vdso_bin", src = "//vdso:vdso.so", diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index ba9c9ce12..6299a3e2f 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -323,18 +323,22 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf. return syserror.ENOEXEC } + // N.B. Linux uses vm_brk_flags to map these pages, which only + // honors the X bit, always mapping at least RW. ignoring These + // pages are not included in the final brk region. + prot := usermem.ReadWrite + if phdr.Flags&elf.PF_X == elf.PF_X { + prot.Execute = true + } + if _, err := m.MMap(ctx, memmap.MMapOpts{ Length: uint64(anonSize), Addr: anonAddr, // Fixed without Unmap will fail the mmap if something is // already at addr. - Fixed: true, - Private: true, - // N.B. Linux uses vm_brk to map these pages, ignoring - // the segment protections, instead always mapping RW. - // These pages are not included in the final brk - // region. - Perms: usermem.ReadWrite, + Fixed: true, + Private: true, + Perms: prot, MaxPerms: usermem.AnyAccess, }); err != nil { ctx.Infof("Error mapping PT_LOAD segment %v anonymous memory: %v", phdr, err) @@ -404,6 +408,8 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el start = vaddr } if vaddr < end { + // NOTE(b/37474556): Linux allows out-of-order + // segments, in violation of the spec. ctx.Infof("PT_LOAD headers out-of-order. %#x < %#x", vaddr, end) return loadedELF{}, syserror.ENOEXEC } @@ -620,15 +626,15 @@ func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, in return loadParsedELF(ctx, m, f, info, 0) } -// loadELF loads f into the Task address space. +// loadELF loads args.File into the Task address space. // // If loadELF returns ErrSwitchFile it should be called again with the returned // path and argv. // // Preconditions: -// * f is an ELF file -func loadELF(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) { - bin, ac, err := loadInitialELF(ctx, m, fs, f) +// * args.File is an ELF file +func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error) { + bin, ac, err := loadInitialELF(ctx, args.MemoryManager, args.Features, args.File) if err != nil { ctx.Infof("Error loading binary: %v", err) return loadedELF{}, nil, err @@ -636,7 +642,14 @@ func loadELF(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace var interp loadedELF if bin.interpreter != "" { - d, i, err := openPath(ctx, mounts, root, wd, maxTraversals, bin.interpreter) + // Even if we do not allow the final link of the script to be + // resolved, the interpreter should still be resolved if it is + // a symlink. + args.ResolveFinal = true + // Refresh the traversal limit. + *args.RemainingTraversals = linux.MaxSymlinkTraversals + args.Filename = bin.interpreter + d, i, err := openPath(ctx, args) if err != nil { ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err) return loadedELF{}, nil, err @@ -645,7 +658,7 @@ func loadELF(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace // We don't need the Dirent. d.DecRef() - interp, err = loadInterpreterELF(ctx, m, i, bin) + interp, err = loadInterpreterELF(ctx, args.MemoryManager, i, bin) if err != nil { ctx.Infof("Error loading interpreter: %v", err) return loadedELF{}, nil, err diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index f6f1ae762..b03eeb005 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package loader loads a binary into a MemoryManager. +// Package loader loads an executable file into a MemoryManager. package loader import ( @@ -20,6 +20,7 @@ import ( "fmt" "io" "path" + "strings" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" @@ -35,6 +36,54 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// LoadArgs holds specifications for an executable file to be loaded. +type LoadArgs struct { + // MemoryManager is the memory manager to load the executable into. + MemoryManager *mm.MemoryManager + + // Mounts is the mount namespace in which to look up Filename. + Mounts *fs.MountNamespace + + // Root is the root directory under which to look up Filename. + Root *fs.Dirent + + // WorkingDirectory is the working directory under which to look up + // Filename. + WorkingDirectory *fs.Dirent + + // RemainingTraversals is the maximum number of symlinks to follow to + // resolve Filename. This counter is passed by reference to keep it + // updated throughout the call stack. + RemainingTraversals *uint + + // ResolveFinal indicates whether the final link of Filename should be + // resolved, if it is a symlink. + ResolveFinal bool + + // Filename is the path for the executable. + Filename string + + // File is an open fs.File object of the executable. If File is not + // nil, then File will be loaded and Filename will be ignored. + File *fs.File + + // CloseOnExec indicates that the executable (or one of its parent + // directories) was opened with O_CLOEXEC. If the executable is an + // interpreter script, then cause an ENOENT error to occur, since the + // script would otherwise be inaccessible to the interpreter. + CloseOnExec bool + + // Argv is the vector of arguments to pass to the executable. + Argv []string + + // Envv is the vector of environment variables to pass to the + // executable. + Envv []string + + // Features specifies the CPU feature set for the executable. + Features *cpuid.FeatureSet +} + // readFull behaves like io.ReadFull for an *fs.File. func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { var total int64 @@ -51,80 +100,82 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in return total, nil } -// openPath opens name for loading. +// openPath opens args.Filename and checks that it is valid for loading. // -// openPath returns the fs.Dirent and an *fs.File for name, which is not +// openPath returns an *fs.Dirent and *fs.File for args.Filename, which is not // installed in the Task FDTable. The caller takes ownership of both. // -// name must be a readable, executable, regular file. -func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, name string) (*fs.Dirent, *fs.File, error) { - if name == "" { +// args.Filename must be a readable, executable, regular file. +func openPath(ctx context.Context, args LoadArgs) (*fs.Dirent, *fs.File, error) { + if args.Filename == "" { ctx.Infof("cannot open empty name") return nil, nil, syserror.ENOENT } - d, err := mm.FindInode(ctx, root, wd, name, maxTraversals) + var d *fs.Dirent + var err error + if args.ResolveFinal { + d, err = args.Mounts.FindInode(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals) + } else { + d, err = args.Mounts.FindLink(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals) + } if err != nil { return nil, nil, err } - - // Open file will take a reference to Dirent, so destroy this one. + // Defer a DecRef for the sake of failure cases. defer d.DecRef() - return openFile(ctx, nil, d, name) -} + if !args.ResolveFinal && fs.IsSymlink(d.Inode.StableAttr) { + return nil, nil, syserror.ELOOP + } -// openFile performs checks on a file to be executed. If provided a *fs.File, -// openFile takes that file's Dirent and performs checks on it. If provided a -// *fs.Dirent and not a *fs.File, it creates a *fs.File object from the Dirent's -// Inode and performs checks on that. -// -// openFile returns an *fs.File and *fs.Dirent, and the caller takes ownership -// of both. -// -// "dirent" and "file" must not both be nil and point to a readable, executable, regular file. -func openFile(ctx context.Context, file *fs.File, dirent *fs.Dirent, name string) (*fs.Dirent, *fs.File, error) { - // file and dirent must not be nil. - if dirent == nil && file == nil { - ctx.Infof("dirent and file cannot both be nil.") - return nil, nil, syserror.ENOENT + if err := checkPermission(ctx, d); err != nil { + return nil, nil, err } - if file != nil { - dirent = file.Dirent + // If they claim it's a directory, then make sure. + // + // N.B. we reject directories below, but we must first reject + // non-directories passed as directories. + if strings.HasSuffix(args.Filename, "/") && !fs.IsDir(d.Inode.StableAttr) { + return nil, nil, syserror.ENOTDIR } - // Perform permissions checks on the file. - if err := checkFile(ctx, dirent, name); err != nil { + if err := checkIsRegularFile(ctx, d, args.Filename); err != nil { return nil, nil, err } - if file == nil { - var ferr error - if file, ferr = dirent.Inode.GetFile(ctx, dirent, fs.FileFlags{Read: true}); ferr != nil { - return nil, nil, ferr - } - } else { - // GetFile takes a reference to the created file, so make one in the case - // that the file reference already existed. - file.IncRef() + f, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) + if err != nil { + return nil, nil, err + } + // Defer a DecRef for the sake of failure cases. + defer f.DecRef() + + if err := checkPread(ctx, f, args.Filename); err != nil { + return nil, nil, err + } + + d.IncRef() + f.IncRef() + return d, f, err +} + +// checkFile performs checks on a file to be executed. +func checkFile(ctx context.Context, f *fs.File, filename string) error { + if err := checkPermission(ctx, f.Dirent); err != nil { + return err } - // We must be able to read at arbitrary offsets. - if !file.Flags().Pread { - file.DecRef() - ctx.Infof("%s cannot be read at an offset: %+v", file.MappedName(ctx), file.Flags()) - return nil, nil, syserror.EACCES + if err := checkIsRegularFile(ctx, f.Dirent, filename); err != nil { + return err } - // Grab reference for caller. - dirent.IncRef() - return dirent, file, nil + return checkPread(ctx, f, filename) } -// checkFile performs file permissions checks for binaries called in openPath -// and openFile -func checkFile(ctx context.Context, d *fs.Dirent, name string) error { +// checkPermission checks whether the file is readable and executable. +func checkPermission(ctx context.Context, d *fs.Dirent) error { perms := fs.PermMask{ // TODO(gvisor.dev/issue/160): Linux requires only execute // permission, not read. However, our backing filesystems may @@ -135,26 +186,26 @@ func checkFile(ctx context.Context, d *fs.Dirent, name string) error { Read: true, Execute: true, } - if err := d.Inode.CheckPermission(ctx, perms); err != nil { - return err - } + return d.Inode.CheckPermission(ctx, perms) +} - // If they claim it's a directory, then make sure. - // - // N.B. we reject directories below, but we must first reject - // non-directories passed as directories. - if len(name) > 0 && name[len(name)-1] == '/' && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR +// checkIsRegularFile prevents us from trying to execute a directory, pipe, etc. +func checkIsRegularFile(ctx context.Context, d *fs.Dirent, filename string) error { + attr := d.Inode.StableAttr + if !fs.IsRegular(attr) { + ctx.Infof("%s is not regular: %v", filename, attr) + return syserror.EACCES } + return nil +} - // No exec-ing directories, pipes, etc! - if !fs.IsRegular(d.Inode.StableAttr) { - ctx.Infof("%s is not regular: %v", name, d.Inode.StableAttr) +// checkPread checks whether we can read the file at arbitrary offsets. +func checkPread(ctx context.Context, f *fs.File, filename string) error { + if !f.Flags().Pread { + ctx.Infof("%s cannot be read at an offset: %+v", filename, f.Flags()) return syserror.EACCES } - return nil - } // allocStack allocates and maps a stack in to any available part of the address space. @@ -173,45 +224,49 @@ const ( maxLoaderAttempts = 6 ) -// loadBinary loads a binary that is pointed to by "file". If nil, the path -// "filename" is resolved and loaded. +// loadExecutable loads an executable that is pointed to by args.File. If nil, +// the path args.Filename is resolved and loaded. If the executable is an +// interpreter script rather than an ELF, the binary of the corresponding +// interpreter will be loaded. // // It returns: // * loadedELF, description of the loaded binary // * arch.Context matching the binary arch // * fs.Dirent of the binary file -// * Possibly updated argv -func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, features *cpuid.FeatureSet, filename string, passedFile *fs.File, argv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) { +// * Possibly updated args.Argv +func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, *fs.Dirent, []string, error) { for i := 0; i < maxLoaderAttempts; i++ { var ( d *fs.Dirent - f *fs.File err error ) - if passedFile == nil { - d, f, err = openPath(ctx, mounts, root, wd, remainingTraversals, filename) - + if args.File == nil { + d, args.File, err = openPath(ctx, args) + // We will return d in the successful case, but defer a DecRef for the + // sake of intermediate loops and failure cases. + if d != nil { + defer d.DecRef() + } + if args.File != nil { + defer args.File.DecRef() + } } else { - d, f, err = openFile(ctx, passedFile, nil, "") - // Set to nil in case we loop on a Interpreter Script. - passedFile = nil + d = args.File.Dirent + d.IncRef() + defer d.DecRef() + err = checkFile(ctx, args.File, args.Filename) } - if err != nil { - ctx.Infof("Error opening %s: %v", filename, err) + ctx.Infof("Error opening %s: %v", args.Filename, err) return loadedELF{}, nil, nil, nil, err } - defer f.DecRef() - // We will return d in the successful case, but defer a DecRef - // for intermediate loops and failure cases. - defer d.DecRef() // Check the header. Is this an ELF or interpreter script? var hdr [4]uint8 // N.B. We assume that reading from a regular file cannot block. - _, err = readFull(ctx, f, usermem.BytesIOSequence(hdr[:]), 0) - // Allow unexpected EOF, as a valid executable could be only three - // bytes (e.g., #!a). + _, err = readFull(ctx, args.File, usermem.BytesIOSequence(hdr[:]), 0) + // Allow unexpected EOF, as a valid executable could be only three bytes + // (e.g., #!a). if err != nil && err != io.ErrUnexpectedEOF { if err == io.EOF { err = syserror.ENOEXEC @@ -221,33 +276,38 @@ func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamesp switch { case bytes.Equal(hdr[:], []byte(elfMagic)): - loaded, ac, err := loadELF(ctx, m, mounts, root, wd, remainingTraversals, features, f) + loaded, ac, err := loadELF(ctx, args) if err != nil { ctx.Infof("Error loading ELF: %v", err) return loadedELF{}, nil, nil, nil, err } // An ELF is always terminal. Hold on to d. d.IncRef() - return loaded, ac, d, argv, err + return loaded, ac, d, args.Argv, err case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)): - newpath, newargv, err := parseInterpreterScript(ctx, filename, f, argv) + if args.CloseOnExec { + return loadedELF{}, nil, nil, nil, syserror.ENOENT + } + args.Filename, args.Argv, err = parseInterpreterScript(ctx, args.Filename, args.File, args.Argv) if err != nil { ctx.Infof("Error loading interpreter script: %v", err) return loadedELF{}, nil, nil, nil, err } - filename = newpath - argv = newargv + // Refresh the traversal limit for the interpreter. + *args.RemainingTraversals = linux.MaxSymlinkTraversals default: ctx.Infof("Unknown magic: %v", hdr) return loadedELF{}, nil, nil, nil, syserror.ENOEXEC } + // Set to nil in case we loop on a Interpreter Script. + args.File = nil } return loadedELF{}, nil, nil, nil, syserror.ELOOP } -// Load loads "file" into a MemoryManager. If file is nil, the path "filename" -// is resolved and loaded instead. +// Load loads args.File into a MemoryManager. If args.File is nil, the path +// args.Filename is resolved and loaded instead. // // If Load returns ErrSwitchFile it should be called again with the returned // path and argv. @@ -255,37 +315,37 @@ func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamesp // Preconditions: // * The Task MemoryManager is empty. // * Load is called on the Task goroutine. -func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, file *fs.File, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) { - // Load the binary itself. - loaded, ac, d, argv, err := loadBinary(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv) +func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) { + // Load the executable itself. + loaded, ac, d, newArgv, err := loadExecutable(ctx, args) if err != nil { - return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", filename, err), syserr.FromError(err).ToLinux()) + return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux()) } defer d.DecRef() // Load the VDSO. - vdsoAddr, err := loadVDSO(ctx, m, vdso, loaded) + vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded) if err != nil { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Error loading VDSO: %v", err), syserr.FromError(err).ToLinux()) } // Setup the heap. brk starts at the next page after the end of the - // binary. Userspace can assume that the remainer of the page after + // executable. Userspace can assume that the remainer of the page after // loaded.end is available for its use. e, ok := loaded.end.RoundUp() if !ok { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("brk overflows: %#x", loaded.end), linux.ENOEXEC) } - m.BrkSetup(ctx, e) + args.MemoryManager.BrkSetup(ctx, e) // Allocate our stack. - stack, err := allocStack(ctx, m, ac) + stack, err := allocStack(ctx, args.MemoryManager, ac) if err != nil { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to allocate stack: %v", err), syserr.FromError(err).ToLinux()) } // Push the original filename to the stack, for AT_EXECFN. - execfn, err := stack.Push(filename) + execfn, err := stack.Push(args.Filename) if err != nil { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to push exec filename: %v", err), syserr.FromError(err).ToLinux()) } @@ -308,6 +368,9 @@ func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, r arch.AuxEntry{linux.AT_EUID, usermem.Addr(c.EffectiveKUID.In(c.UserNamespace).OrOverflow())}, arch.AuxEntry{linux.AT_GID, usermem.Addr(c.RealKGID.In(c.UserNamespace).OrOverflow())}, arch.AuxEntry{linux.AT_EGID, usermem.Addr(c.EffectiveKGID.In(c.UserNamespace).OrOverflow())}, + // The conditions that require AT_SECURE = 1 never arise. See + // kernel.Task.updateCredsForExecLocked. + arch.AuxEntry{linux.AT_SECURE, 0}, arch.AuxEntry{linux.AT_CLKTCK, linux.CLOCKS_PER_SEC}, arch.AuxEntry{linux.AT_EXECFN, execfn}, arch.AuxEntry{linux.AT_RANDOM, random}, @@ -316,11 +379,12 @@ func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, r }...) auxv = append(auxv, extraAuxv...) - sl, err := stack.Load(argv, envv, auxv) + sl, err := stack.Load(newArgv, args.Envv, auxv) if err != nil { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load stack: %v", err), syserr.FromError(err).ToLinux()) } + m := args.MemoryManager m.SetArgvStart(sl.ArgvStart) m.SetArgvEnd(sl.ArgvEnd) m.SetEnvvStart(sl.EnvvStart) @@ -331,7 +395,7 @@ func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, r ac.SetIP(uintptr(loaded.entry)) ac.SetStack(uintptr(stack.Bottom)) - name := path.Base(filename) + name := path.Base(args.Filename) if len(name) > linux.TASK_COMM_LEN-1 { name = name[:linux.TASK_COMM_LEN-1] } diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index ada28aea3..df8a81907 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -268,6 +268,8 @@ func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, er // some applications may not be able to handle multiple [vdso] // hints. vdso: mm.NewSpecialMappable("", mfp, vdso), + os: info.os, + arch: info.arch, phdrs: info.phdrs, }, nil } diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index 29c14ec56..112794e9c 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -1,7 +1,8 @@ -package(licenses = ["notice"]) - +load("@io_bazel_rules_go//go:def.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) go_template_instance( name = "mappable_range", @@ -40,7 +41,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/log", - "//pkg/refs", "//pkg/sentry/context", "//pkg/sentry/platform", "//pkg/sentry/usermem", diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 03b99aaea..16a722a13 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -18,7 +18,6 @@ package memmap import ( "fmt" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -235,8 +234,11 @@ type InvalidateOpts struct { // coincidental; fs.File implements MappingIdentity, and some // fs.InodeOperations implement Mappable.) type MappingIdentity interface { - // MappingIdentity is reference-counted. - refs.RefCounter + // IncRef increments the MappingIdentity's reference count. + IncRef() + + // DecRef decrements the MappingIdentity's reference count. + DecRef() // MappedName returns the application-visible name shown in // /proc/[pid]/maps. diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 072745a08..839931f67 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -1,7 +1,8 @@ -package(licenses = ["notice"]) - +load("@io_bazel_rules_go//go:def.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) go_template_instance( name = "file_refcount_set", @@ -117,9 +118,9 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/syncutil", "//pkg/syserror", "//pkg/tcpip/buffer", - "//third_party/gvsync", ], ) diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index f350e0109..58a5c186d 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -44,7 +44,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/third_party/gvsync" + "gvisor.dev/gvisor/pkg/syncutil" ) // MemoryManager implements a virtual address space. @@ -82,7 +82,7 @@ type MemoryManager struct { users int32 // mappingMu is analogous to Linux's struct mm_struct::mmap_sem. - mappingMu gvsync.DowngradableRWMutex `state:"nosave"` + mappingMu syncutil.DowngradableRWMutex `state:"nosave"` // vmas stores virtual memory areas. Since vmas are stored by value, // clients should usually use vmaIterator.ValuePtr() instead of @@ -125,7 +125,7 @@ type MemoryManager struct { // activeMu is loosely analogous to Linux's struct // mm_struct::page_table_lock. - activeMu gvsync.DowngradableRWMutex `state:"nosave"` + activeMu syncutil.DowngradableRWMutex `state:"nosave"` // pmas stores platform mapping areas used to implement vmas. Since pmas // are stored by value, clients should usually use pmaIterator.ValuePtr() diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index 858f895f2..f404107af 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -1,7 +1,8 @@ -package(licenses = ["notice"]) - +load("@io_bazel_rules_go//go:def.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) go_template_instance( name = "evictable_range", diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go index 1effc7735..aafce1d00 100644 --- a/pkg/sentry/pgalloc/save_restore.go +++ b/pkg/sentry/pgalloc/save_restore.go @@ -16,6 +16,7 @@ package pgalloc import ( "bytes" + "context" "fmt" "io" "runtime" @@ -29,7 +30,7 @@ import ( ) // SaveTo writes f's state to the given stream. -func (f *MemoryFile) SaveTo(w io.Writer) error { +func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error { // Wait for reclaim. f.mu.Lock() defer f.mu.Unlock() @@ -78,10 +79,10 @@ func (f *MemoryFile) SaveTo(w io.Writer) error { } // Save metadata. - if err := state.Save(w, &f.fileSize, nil); err != nil { + if err := state.Save(ctx, w, &f.fileSize, nil); err != nil { return err } - if err := state.Save(w, &f.usage, nil); err != nil { + if err := state.Save(ctx, w, &f.usage, nil); err != nil { return err } @@ -114,9 +115,9 @@ func (f *MemoryFile) SaveTo(w io.Writer) error { } // LoadFrom loads MemoryFile state from the given stream. -func (f *MemoryFile) LoadFrom(r io.Reader) error { +func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader) error { // Load metadata. - if err := state.Load(r, &f.fileSize, nil); err != nil { + if err := state.Load(ctx, r, &f.fileSize, nil); err != nil { return err } if err := f.file.Truncate(f.fileSize); err != nil { @@ -124,7 +125,7 @@ func (f *MemoryFile) LoadFrom(r io.Reader) error { } newMappings := make([]uintptr, f.fileSize>>chunkShift) f.mappings.Store(newMappings) - if err := state.Load(r, &f.usage, nil); err != nil { + if err := state.Load(ctx, r, &f.usage, nil); err != nil { return err } diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index 9aa6ec507..157bffa81 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) - load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_template_instance( name = "file_range", out = "file_range.go", diff --git a/pkg/sentry/platform/interrupt/BUILD b/pkg/sentry/platform/interrupt/BUILD index eeb634644..b6d008dbe 100644 --- a/pkg/sentry/platform/interrupt/BUILD +++ b/pkg/sentry/platform/interrupt/BUILD @@ -1,4 +1,5 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") package(licenses = ["notice"]) diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 2046df525..f3afd98da 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -1,4 +1,5 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") package(licenses = ["notice"]) @@ -11,17 +12,26 @@ go_library( "bluepill_amd64.go", "bluepill_amd64.s", "bluepill_amd64_unsafe.go", + "bluepill_arm64.go", + "bluepill_arm64.s", + "bluepill_arm64_unsafe.go", "bluepill_fault.go", "bluepill_unsafe.go", "context.go", - "filters.go", + "filters_amd64.go", + "filters_arm64.go", "kvm.go", "kvm_amd64.go", "kvm_amd64_unsafe.go", + "kvm_arm64.go", + "kvm_arm64_unsafe.go", "kvm_const.go", + "kvm_const_arm64.go", "machine.go", "machine_amd64.go", "machine_amd64_unsafe.go", + "machine_arm64.go", + "machine_arm64_unsafe.go", "machine_unsafe.go", "physical_map.go", "physical_map_amd64.go", diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index acd41f73d..ea8b9632e 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -127,7 +127,7 @@ func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.Ac // not have physical mappings, the KVM module may inject // spurious exceptions when emulation fails (i.e. it tries to // emulate because the RIP is pointed at those pages). - as.machine.mapPhysical(physical, length) + as.machine.mapPhysical(physical, length, physicalRegions, _KVM_MEM_FLAGS_NONE) // Install the page table mappings. Note that the ordering is // important; if the pagetable mappings were installed before diff --git a/pkg/sentry/platform/kvm/allocator.go b/pkg/sentry/platform/kvm/allocator.go index 80942e9c9..3f35414bb 100644 --- a/pkg/sentry/platform/kvm/allocator.go +++ b/pkg/sentry/platform/kvm/allocator.go @@ -54,7 +54,7 @@ func (a allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr { // //go:nosplit func (a allocator) LookupPTEs(physical uintptr) *pagetables.PTEs { - virtualStart, physicalStart, _, ok := calculateBluepillFault(physical) + virtualStart, physicalStart, _, ok := calculateBluepillFault(physical, physicalRegions) if !ok { panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical)) } diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go index 043de51b3..30dbb74d6 100644 --- a/pkg/sentry/platform/kvm/bluepill.go +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -20,6 +20,7 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/safecopy" ) @@ -36,6 +37,18 @@ func sighandler() func dieTrampoline() var ( + // bounceSignal is the signal used for bouncing KVM. + // + // We use SIGCHLD because it is not masked by the runtime, and + // it will be ignored properly by other parts of the kernel. + bounceSignal = syscall.SIGCHLD + + // bounceSignalMask has only bounceSignal set. + bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1)) + + // bounce is the interrupt vector used to return to the kernel. + bounce = uint32(ring0.VirtualizationException) + // savedHandler is a pointer to the previous handler. // // This is called by bluepillHandler. @@ -45,6 +58,13 @@ var ( dieTrampolineAddr uintptr ) +// redpill invokes a syscall with -1. +// +//go:nosplit +func redpill() { + syscall.RawSyscall(^uintptr(0), 0, 0, 0) +} + // dieHandler is called by dieTrampoline. // //go:nosplit @@ -73,8 +93,8 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) { func init() { // Install the handler. - if err := safecopy.ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil { - panic(fmt.Sprintf("Unable to set handler for signal %d: %v", syscall.SIGSEGV, err)) + if err := safecopy.ReplaceSignalHandler(bluepillSignal, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil { + panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err)) } // Extract the address for the trampoline. diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index 421c88220..133c2203d 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -24,26 +24,10 @@ import ( ) var ( - // bounceSignal is the signal used for bouncing KVM. - // - // We use SIGCHLD because it is not masked by the runtime, and - // it will be ignored properly by other parts of the kernel. - bounceSignal = syscall.SIGCHLD - - // bounceSignalMask has only bounceSignal set. - bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1)) - - // bounce is the interrupt vector used to return to the kernel. - bounce = uint32(ring0.VirtualizationException) + // The action for bluepillSignal is changed by sigaction(). + bluepillSignal = syscall.SIGSEGV ) -// redpill on amd64 invokes a syscall with -1. -// -//go:nosplit -func redpill() { - syscall.RawSyscall(^uintptr(0), 0, 0, 0) -} - // bluepillArchEnter is called during bluepillEnter. // //go:nosplit diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go index 9d8af143e..a63a6a071 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -23,13 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform/ring0" ) -// bluepillArchContext returns the arch-specific context. -// -//go:nosplit -func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { - return &((*arch.UContext64)(context).MContext) -} - // dieArchSetup initializes the state for dieTrampoline. // // The amd64 dieTrampoline requires the vCPU to be set in BX, and the last RIP diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go new file mode 100644 index 000000000..552341721 --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_arm64.go @@ -0,0 +1,79 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package kvm + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0" +) + +var ( + // The action for bluepillSignal is changed by sigaction(). + bluepillSignal = syscall.SIGILL +) + +// bluepillArchEnter is called during bluepillEnter. +// +//go:nosplit +func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) { + c = vCPUPtr(uintptr(context.Regs[8])) + regs := c.CPU.Registers() + regs.Regs = context.Regs + regs.Sp = context.Sp + regs.Pc = context.Pc + regs.Pstate = context.Pstate + regs.Pstate &^= uint64(ring0.KernelFlagsClear) + regs.Pstate |= ring0.KernelFlagsSet + return +} + +// bluepillArchExit is called during bluepillEnter. +// +//go:nosplit +func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { + regs := c.CPU.Registers() + context.Regs = regs.Regs + context.Sp = regs.Sp + context.Pc = regs.Pc + context.Pstate = regs.Pstate + context.Pstate &^= uint64(ring0.UserFlagsClear) + context.Pstate |= ring0.UserFlagsSet +} + +// KernelSyscall handles kernel syscalls. +// +//go:nosplit +func (c *vCPU) KernelSyscall() { + regs := c.Registers() + if regs.Regs[8] != ^uint64(0) { + regs.Pc -= 4 // Rewind. + } + ring0.Halt() +} + +// KernelException handles kernel exceptions. +// +//go:nosplit +func (c *vCPU) KernelException(vector ring0.Vector) { + regs := c.Registers() + if vector == ring0.Vector(bounce) { + regs.Pc = 0 + } + ring0.Halt() +} diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.s b/pkg/sentry/platform/kvm/bluepill_arm64.s new file mode 100644 index 000000000..c61700892 --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_arm64.s @@ -0,0 +1,87 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// VCPU_CPU is the location of the CPU in the vCPU struct. +// +// This is guaranteed to be zero. +#define VCPU_CPU 0x0 + +// CPU_SELF is the self reference in ring0's percpu. +// +// This is guaranteed to be zero. +#define CPU_SELF 0x0 + +// Context offsets. +// +// Only limited use of the context is done in the assembly stub below, most is +// done in the Go handlers. +#define SIGINFO_SIGNO 0x0 +#define CONTEXT_PC 0x1B8 +#define CONTEXT_R0 0xB8 + +// See bluepill.go. +TEXT ·bluepill(SB),NOSPLIT,$0 +begin: + MOVD vcpu+0(FP), R8 + MOVD $VCPU_CPU(R8), R9 + ORR $0xffff000000000000, R9, R9 + // Trigger sigill. + // In ring0.Start(), the value of R8 will be stored into tpidr_el1. + // When the context was loaded into vcpu successfully, + // we will check if the value of R10 and R9 are the same. + WORD $0xd538d08a // MRS TPIDR_EL1, R10 +check_vcpu: + CMP R10, R9 + BEQ right_vCPU +wrong_vcpu: + CALL ·redpill(SB) + B begin +right_vCPU: + RET + +// sighandler: see bluepill.go for documentation. +// +// The arguments are the following: +// +// R0 - The signal number. +// R1 - Pointer to siginfo_t structure. +// R2 - Pointer to ucontext structure. +// +TEXT ·sighandler(SB),NOSPLIT,$0 + // si_signo should be sigill. + MOVD SIGINFO_SIGNO(R1), R7 + CMPW $4, R7 + BNE fallback + + MOVD CONTEXT_PC(R2), R7 + CMPW $0, R7 + BEQ fallback + + MOVD R2, 8(RSP) + BL ·bluepillHandler(SB) // Call the handler. + + RET + +fallback: + // Jump to the previous signal handler. + MOVD ·savedHandler(SB), R7 + B (R7) + +// dieTrampoline: see bluepill.go, bluepill_arm64_unsafe.go for documentation. +TEXT ·dieTrampoline(SB),NOSPLIT,$0 + // TODO(gvisor.dev/issue/1249): dieTrampoline supporting for Arm64. + MOVD R9, 8(RSP) + BL ·dieHandler(SB) diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go new file mode 100644 index 000000000..e5fac0d6a --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go @@ -0,0 +1,28 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package kvm + +import ( + "unsafe" + + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +//go:nosplit +func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) { + // TODO(gvisor.dev/issue/1249): dieTrampoline supporting for Arm64. +} diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go index b97476053..f6459cda9 100644 --- a/pkg/sentry/platform/kvm/bluepill_fault.go +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -46,9 +46,9 @@ func yield() { // calculateBluepillFault calculates the fault address range. // //go:nosplit -func calculateBluepillFault(physical uintptr) (virtualStart, physicalStart, length uintptr, ok bool) { +func calculateBluepillFault(physical uintptr, phyRegions []physicalRegion) (virtualStart, physicalStart, length uintptr, ok bool) { alignedPhysical := physical &^ uintptr(usermem.PageSize-1) - for _, pr := range physicalRegions { + for _, pr := range phyRegions { end := pr.physical + pr.length if physical < pr.physical || physical >= end { continue @@ -77,12 +77,12 @@ func calculateBluepillFault(physical uintptr) (virtualStart, physicalStart, leng // The corresponding virtual address is returned. This may throw on error. // //go:nosplit -func handleBluepillFault(m *machine, physical uintptr) (uintptr, bool) { +func handleBluepillFault(m *machine, physical uintptr, phyRegions []physicalRegion, flags uint32) (uintptr, bool) { // Paging fault: we need to map the underlying physical pages for this // fault. This all has to be done in this function because we're in a // signal handler context. (We can't call any functions that might // split the stack.) - virtualStart, physicalStart, length, ok := calculateBluepillFault(physical) + virtualStart, physicalStart, length, ok := calculateBluepillFault(physical, phyRegions) if !ok { return 0, false } @@ -96,7 +96,7 @@ func handleBluepillFault(m *machine, physical uintptr) (uintptr, bool) { yield() // Race with another call. slot = atomic.SwapUint32(&m.nextSlot, ^uint32(0)) } - errno := m.setMemoryRegion(int(slot), physicalStart, length, virtualStart) + errno := m.setMemoryRegion(int(slot), physicalStart, length, virtualStart, flags) if errno == 0 { // Successfully added region; we can increment nextSlot and // allow another set to proceed here. diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 7e8e9f42a..9add7c944 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.14 +// +build !go1.15 // Check go:linkname function signatures when updating Go version. @@ -23,6 +23,8 @@ import ( "sync/atomic" "syscall" "unsafe" + + "gvisor.dev/gvisor/pkg/sentry/arch" ) //go:linkname throw runtime.throw @@ -49,6 +51,13 @@ func uintptrValue(addr *byte) uintptr { return (uintptr)(unsafe.Pointer(addr)) } +// bluepillArchContext returns the UContext64. +// +//go:nosplit +func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { + return &((*arch.UContext64)(context).MContext) +} + // bluepillHandler is called from the signal stub. // // The world may be stopped while this is executing, and it executes on the @@ -80,13 +89,17 @@ func bluepillHandler(context unsafe.Pointer) { // interrupted KVM. Since we're in a signal handler // currently, all signals are masked and the signal // must have been delivered directly to this thread. + timeout := syscall.Timespec{} sig, _, errno := syscall.RawSyscall6( syscall.SYS_RT_SIGTIMEDWAIT, uintptr(unsafe.Pointer(&bounceSignalMask)), - 0, // siginfo. - 0, // timeout. - 8, // sigset size. + 0, // siginfo. + uintptr(unsafe.Pointer(&timeout)), // timeout. + 8, // sigset size. 0, 0) + if errno == syscall.EAGAIN { + continue + } if errno != 0 { throw("error waiting for pending signal") } @@ -162,7 +175,7 @@ func bluepillHandler(context unsafe.Pointer) { // For MMIO, the physical address is the first data item. physical := uintptr(c.runData.data[0]) - virtual, ok := handleBluepillFault(c.machine, physical) + virtual, ok := handleBluepillFault(c.machine, physical, physicalRegions, _KVM_MEM_FLAGS_NONE) if !ok { c.die(bluepillArchContext(context), "invalid physical address") return diff --git a/pkg/sentry/platform/kvm/filters.go b/pkg/sentry/platform/kvm/filters_amd64.go index 7d949f1dd..7d949f1dd 100644 --- a/pkg/sentry/platform/kvm/filters.go +++ b/pkg/sentry/platform/kvm/filters_amd64.go diff --git a/pkg/sentry/platform/kvm/filters_arm64.go b/pkg/sentry/platform/kvm/filters_arm64.go new file mode 100644 index 000000000..9245d07c2 --- /dev/null +++ b/pkg/sentry/platform/kvm/filters_arm64.go @@ -0,0 +1,32 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + +// SyscallFilters returns syscalls made exclusively by the KVM platform. +func (*KVM) SyscallFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_IOCTL: {}, + syscall.SYS_MMAP: {}, + syscall.SYS_RT_SIGSUSPEND: {}, + syscall.SYS_RT_SIGTIMEDWAIT: {}, + 0xffffffffffffffff: {}, // KVM uses syscall -1 to transition to host. + } +} diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index ee4cd2f4d..f2c2c059e 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -21,7 +21,6 @@ import ( "sync" "syscall" - "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" @@ -56,9 +55,7 @@ func New(deviceFile *os.File) (*KVM, error) { // Ensure global initialization is done. globalOnce.Do(func() { - physicalInit() - globalErr = updateSystemValues(int(fd)) - ring0.Init(cpuid.HostFeatureSet()) + globalErr = updateGlobalOnce(int(fd)) }) if globalErr != nil { return nil, globalErr diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go index 5d8ef4761..c5a6f9c7d 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64.go +++ b/pkg/sentry/platform/kvm/kvm_amd64.go @@ -17,6 +17,7 @@ package kvm import ( + "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" ) @@ -211,3 +212,11 @@ type cpuidEntries struct { _ uint32 entries [_KVM_NR_CPUID_ENTRIES]cpuidEntry } + +// updateGlobalOnce does global initialization. It has to be called only once. +func updateGlobalOnce(fd int) error { + physicalInit() + err := updateSystemValues(int(fd)) + ring0.Init(cpuid.HostFeatureSet()) + return err +} diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go new file mode 100644 index 000000000..2319c86d3 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_arm64.go @@ -0,0 +1,83 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package kvm + +import ( + "syscall" +) + +// userMemoryRegion is a region of physical memory. +// +// This mirrors kvm_memory_region. +type userMemoryRegion struct { + slot uint32 + flags uint32 + guestPhysAddr uint64 + memorySize uint64 + userspaceAddr uint64 +} + +type kvmOneReg struct { + id uint64 + addr uint64 +} + +const KVM_NR_SPSR = 5 + +type userFpsimdState struct { + vregs [64]uint64 + fpsr uint32 + fpcr uint32 + reserved [2]uint32 +} + +type userRegs struct { + Regs syscall.PtraceRegs + sp_el1 uint64 + elr_el1 uint64 + spsr [KVM_NR_SPSR]uint64 + fpRegs userFpsimdState +} + +// runData is the run structure. This may be mapped for synchronous register +// access (although that doesn't appear to be supported by my kernel at least). +// +// This mirrors kvm_run. +type runData struct { + requestInterruptWindow uint8 + _ [7]uint8 + + exitReason uint32 + readyForInterruptInjection uint8 + ifFlag uint8 + _ [2]uint8 + + cr8 uint64 + apicBase uint64 + + // This is the union data for exits. Interpretation depends entirely on + // the exitReason above (see vCPU code for more information). + data [32]uint64 +} + +// updateGlobalOnce does global initialization. It has to be called only once. +func updateGlobalOnce(fd int) error { + physicalInit() + err := updateSystemValues(int(fd)) + updateVectorTable() + return err +} diff --git a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go new file mode 100644 index 000000000..6531bae1d --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go @@ -0,0 +1,39 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package kvm + +import ( + "fmt" + "syscall" +) + +var ( + runDataSize int +) + +func updateSystemValues(fd int) error { + // Extract the mmap size. + sz, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(fd), _KVM_GET_VCPU_MMAP_SIZE, 0) + if errno != 0 { + return fmt.Errorf("getting VCPU mmap size: %v", errno) + } + // Save the data. + runDataSize = int(sz) + + // Success. + return nil +} diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go index d05f05c29..1d5c77ff4 100644 --- a/pkg/sentry/platform/kvm/kvm_const.go +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -49,11 +49,13 @@ const ( _KVM_EXIT_SHUTDOWN = 0x8 _KVM_EXIT_FAIL_ENTRY = 0x9 _KVM_EXIT_INTERNAL_ERROR = 0x11 + _KVM_EXIT_SYSTEM_EVENT = 0x18 ) // KVM capability options. const ( - _KVM_CAP_MAX_VCPUS = 0x42 + _KVM_CAP_MAX_VCPUS = 0x42 + _KVM_CAP_ARM_VM_IPA_SIZE = 0xa5 ) // KVM limits. @@ -62,3 +64,10 @@ const ( _KVM_NR_INTERRUPTS = 0x100 _KVM_NR_CPUID_ENTRIES = 0x100 ) + +// KVM kvm_memory_region::flags. +const ( + _KVM_MEM_LOG_DIRTY_PAGES = uint32(1) << 0 + _KVM_MEM_READONLY = uint32(1) << 1 + _KVM_MEM_FLAGS_NONE = 0 +) diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go new file mode 100644 index 000000000..5a74c6e36 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go @@ -0,0 +1,132 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +// KVM ioctls for Arm64. +const ( + _KVM_GET_ONE_REG = 0x4010aeab + _KVM_SET_ONE_REG = 0x4010aeac + + _KVM_ARM_PREFERRED_TARGET = 0x8020aeaf + _KVM_ARM_VCPU_INIT = 0x4020aeae + _KVM_ARM64_REGS_PSTATE = 0x6030000000100042 + _KVM_ARM64_REGS_SP_EL1 = 0x6030000000100044 + _KVM_ARM64_REGS_R0 = 0x6030000000100000 + _KVM_ARM64_REGS_R1 = 0x6030000000100002 + _KVM_ARM64_REGS_R2 = 0x6030000000100004 + _KVM_ARM64_REGS_R3 = 0x6030000000100006 + _KVM_ARM64_REGS_R8 = 0x6030000000100010 + _KVM_ARM64_REGS_R18 = 0x6030000000100024 + _KVM_ARM64_REGS_PC = 0x6030000000100040 + _KVM_ARM64_REGS_MAIR_EL1 = 0x603000000013c510 + _KVM_ARM64_REGS_TCR_EL1 = 0x603000000013c102 + _KVM_ARM64_REGS_TTBR0_EL1 = 0x603000000013c100 + _KVM_ARM64_REGS_TTBR1_EL1 = 0x603000000013c101 + _KVM_ARM64_REGS_SCTLR_EL1 = 0x603000000013c080 + _KVM_ARM64_REGS_CPACR_EL1 = 0x603000000013c082 + _KVM_ARM64_REGS_VBAR_EL1 = 0x603000000013c600 +) + +// Arm64: Architectural Feature Access Control Register EL1. +const ( + _FPEN_NOTRAP = 0x3 + _FPEN_SHIFT = 0x20 +) + +// Arm64: System Control Register EL1. +const ( + _SCTLR_M = 1 << 0 + _SCTLR_C = 1 << 2 + _SCTLR_I = 1 << 12 +) + +// Arm64: Translation Control Register EL1. +const ( + _TCR_IPS_40BITS = 2 << 32 // PA=40 + _TCR_IPS_48BITS = 5 << 32 // PA=48 + + _TCR_T0SZ_OFFSET = 0 + _TCR_T1SZ_OFFSET = 16 + _TCR_IRGN0_SHIFT = 8 + _TCR_IRGN1_SHIFT = 24 + _TCR_ORGN0_SHIFT = 10 + _TCR_ORGN1_SHIFT = 26 + _TCR_SH0_SHIFT = 12 + _TCR_SH1_SHIFT = 28 + _TCR_TG0_SHIFT = 14 + _TCR_TG1_SHIFT = 30 + + _TCR_T0SZ_VA48 = 64 - 48 // VA=48 + _TCR_T1SZ_VA48 = 64 - 48 // VA=48 + + _TCR_ASID16 = 1 << 36 + _TCR_TBI0 = 1 << 37 + + _TCR_TXSZ_VA48 = (_TCR_T0SZ_VA48 << _TCR_T0SZ_OFFSET) | (_TCR_T1SZ_VA48 << _TCR_T1SZ_OFFSET) + + _TCR_TG0_4K = 0 << _TCR_TG0_SHIFT // 4K + _TCR_TG0_64K = 1 << _TCR_TG0_SHIFT // 64K + + _TCR_TG1_4K = 2 << _TCR_TG1_SHIFT + + _TCR_TG_FLAGS = _TCR_TG0_4K | _TCR_TG1_4K + + _TCR_IRGN0_WBWA = 1 << _TCR_IRGN0_SHIFT + _TCR_IRGN1_WBWA = 1 << _TCR_IRGN1_SHIFT + _TCR_IRGN_WBWA = _TCR_IRGN0_WBWA | _TCR_IRGN1_WBWA + + _TCR_ORGN0_WBWA = 1 << _TCR_ORGN0_SHIFT + _TCR_ORGN1_WBWA = 1 << _TCR_ORGN1_SHIFT + + _TCR_ORGN_WBWA = _TCR_ORGN0_WBWA | _TCR_ORGN1_WBWA + + _TCR_SHARED = (3 << _TCR_SH0_SHIFT) | (3 << _TCR_SH1_SHIFT) + + _TCR_CACHE_FLAGS = _TCR_IRGN_WBWA | _TCR_ORGN_WBWA +) + +// Arm64: Memory Attribute Indirection Register EL1. +const ( + _MT_DEVICE_nGnRnE = 0 + _MT_DEVICE_nGnRE = 1 + _MT_DEVICE_GRE = 2 + _MT_NORMAL_NC = 3 + _MT_NORMAL = 4 + _MT_NORMAL_WT = 5 + _MT_EL1_INIT = (0 << _MT_DEVICE_nGnRnE) | (0x4 << _MT_DEVICE_nGnRE * 8) | (0xc << _MT_DEVICE_GRE * 8) | (0x44 << _MT_NORMAL_NC * 8) | (0xff << _MT_NORMAL * 8) | (0xbb << _MT_NORMAL_WT * 8) +) + +const ( + _KVM_ARM_VCPU_POWER_OFF = 0 // CPU is started in OFF state + _KVM_ARM_VCPU_PSCI_0_2 = 2 // CPU uses PSCI v0.2 +) + +// Arm64: Exception Syndrome Register EL1. +const ( + _ESR_ELx_FSC = 0x3F + + _ESR_SEGV_MAPERR_L0 = 0x4 + _ESR_SEGV_MAPERR_L1 = 0x5 + _ESR_SEGV_MAPERR_L2 = 0x6 + _ESR_SEGV_MAPERR_L3 = 0x7 + + _ESR_SEGV_ACCERR_L1 = 0x9 + _ESR_SEGV_ACCERR_L2 = 0xa + _ESR_SEGV_ACCERR_L3 = 0xb + + _ESR_SEGV_PEMERR_L1 = 0xd + _ESR_SEGV_PEMERR_L2 = 0xe + _ESR_SEGV_PEMERR_L3 = 0xf +) diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index cc6c138b2..7d02ebf19 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -215,6 +215,17 @@ func newMachine(vm int) (*machine, error) { return true // Keep iterating. }) + var physicalRegionsReadOnly []physicalRegion + var physicalRegionsAvailable []physicalRegion + + physicalRegionsReadOnly = rdonlyRegionsForSetMem() + physicalRegionsAvailable = availableRegionsForSetMem() + + // Map all read-only regions. + for _, r := range physicalRegionsReadOnly { + m.mapPhysical(r.physical, r.length, physicalRegionsReadOnly, _KVM_MEM_READONLY) + } + // Ensure that the currently mapped virtual regions are actually // available in the VM. Note that this doesn't guarantee no future // faults, however it should guarantee that everything is available to @@ -223,6 +234,13 @@ func newMachine(vm int) (*machine, error) { if excludeVirtualRegion(vr) { return // skip region. } + + for _, r := range physicalRegionsReadOnly { + if vr.virtual == r.virtual { + return + } + } + for virtual := vr.virtual; virtual < vr.virtual+vr.length; { physical, length, ok := translateToPhysical(virtual) if !ok { @@ -236,7 +254,7 @@ func newMachine(vm int) (*machine, error) { } // Ensure the physical range is mapped. - m.mapPhysical(physical, length) + m.mapPhysical(physical, length, physicalRegionsAvailable, _KVM_MEM_FLAGS_NONE) virtual += length } }) @@ -256,9 +274,9 @@ func newMachine(vm int) (*machine, error) { // not available. This attempts to be efficient for calls in the hot path. // // This panics on error. -func (m *machine) mapPhysical(physical, length uintptr) { +func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion, flags uint32) { for end := physical + length; physical < end; { - _, physicalStart, length, ok := calculateBluepillFault(physical) + _, physicalStart, length, ok := calculateBluepillFault(physical, phyRegions) if !ok { // Should never happen. panic("mapPhysical on unknown physical address") @@ -266,7 +284,7 @@ func (m *machine) mapPhysical(physical, length uintptr) { if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok { // Not present in the cache; requires setting the slot. - if _, ok := handleBluepillFault(m, physical); !ok { + if _, ok := handleBluepillFault(m, physical, phyRegions, flags); !ok { panic("handleBluepillFault failed") } } diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index c1cbe33be..b99fe425e 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -355,3 +355,13 @@ func (m *machine) retryInGuest(fn func()) { } } } + +// On x86 platform, the flags for "setMemoryRegion" can always be set as 0. +// There is no need to return read-only physicalRegions. +func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) { + return nil +} + +func availableRegionsForSetMem() (phyRegions []physicalRegion) { + return physicalRegions +} diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 506ec9af1..7156c245f 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -26,30 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/time" ) -// setMemoryRegion initializes a region. -// -// This may be called from bluepillHandler, and therefore returns an errno -// directly (instead of wrapping in an error) to avoid allocations. -// -//go:nosplit -func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno { - userRegion := userMemoryRegion{ - slot: uint32(slot), - flags: 0, - guestPhysAddr: uint64(physical), - memorySize: uint64(length), - userspaceAddr: uint64(virtual), - } - - // Set the region. - _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(m.fd), - _KVM_SET_USER_MEMORY_REGION, - uintptr(unsafe.Pointer(&userRegion))) - return errno -} - // loadSegments copies the current segments. // // This may be called from within the signal context and throws on error. @@ -159,3 +135,43 @@ func (c *vCPU) setSignalMask() error { } return nil } + +// setUserRegisters sets user registers in the vCPU. +func (c *vCPU) setUserRegisters(uregs *userRegs) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_REGS, + uintptr(unsafe.Pointer(uregs))); errno != 0 { + return fmt.Errorf("error setting user registers: %v", errno) + } + return nil +} + +// getUserRegisters reloads user registers in the vCPU. +// +// This is safe to call from a nosplit context. +// +//go:nosplit +func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_GET_REGS, + uintptr(unsafe.Pointer(uregs))); errno != 0 { + return errno + } + return 0 +} + +// setSystemRegisters sets system registers. +func (c *vCPU) setSystemRegisters(sregs *systemRegs) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_SREGS, + uintptr(unsafe.Pointer(sregs))); errno != 0 { + return fmt.Errorf("error setting system registers: %v", errno) + } + return nil +} diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go new file mode 100644 index 000000000..7ae47f291 --- /dev/null +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -0,0 +1,183 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package kvm + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +type vCPUArchState struct { + // PCIDs is the set of PCIDs for this vCPU. + // + // This starts above fixedKernelPCID. + PCIDs *pagetables.PCIDs +} + +const ( + // fixedKernelPCID is a fixed kernel PCID used for the kernel page + // tables. We must start allocating user PCIDs above this in order to + // avoid any conflict (see below). + fixedKernelPCID = 1 + + // poolPCIDs is the number of PCIDs to record in the database. As this + // grows, assignment can take longer, since it is a simple linear scan. + // Beyond a relatively small number, there are likely few perform + // benefits, since the TLB has likely long since lost any translations + // from more than a few PCIDs past. + poolPCIDs = 8 +) + +// Get all read-only physicalRegions. +func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) { + var rdonlyRegions []region + + applyVirtualRegions(func(vr virtualRegion) { + if excludeVirtualRegion(vr) { + return + } + + if !vr.accessType.Write && vr.accessType.Read { + rdonlyRegions = append(rdonlyRegions, vr.region) + } + }) + + for _, r := range rdonlyRegions { + physical, _, ok := translateToPhysical(r.virtual) + if !ok { + continue + } + + phyRegions = append(phyRegions, physicalRegion{ + region: region{ + virtual: r.virtual, + length: r.length, + }, + physical: physical, + }) + } + + return phyRegions +} + +// Get all available physicalRegions. +func availableRegionsForSetMem() (phyRegions []physicalRegion) { + var excludeRegions []region + applyVirtualRegions(func(vr virtualRegion) { + if !vr.accessType.Write { + excludeRegions = append(excludeRegions, vr.region) + } + }) + + phyRegions = computePhysicalRegions(excludeRegions) + + return phyRegions +} + +// dropPageTables drops cached page table entries. +func (m *machine) dropPageTables(pt *pagetables.PageTables) { + m.mu.Lock() + defer m.mu.Unlock() + + // Clear from all PCIDs. + for _, c := range m.vCPUs { + c.PCIDs.Drop(pt) + } +} + +// nonCanonical generates a canonical address return. +// +//go:nosplit +func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (usermem.AccessType, error) { + *info = arch.SignalInfo{ + Signo: signal, + Code: arch.SignalInfoKernel, + } + info.SetAddr(addr) // Include address. + return usermem.NoAccess, platform.ErrContextSignal +} + +// fault generates an appropriate fault return. +// +//go:nosplit +func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (usermem.AccessType, error) { + faultAddr := c.GetFaultAddr() + code, user := c.ErrorCode() + + // Reset the pointed SignalInfo. + *info = arch.SignalInfo{Signo: signal} + info.SetAddr(uint64(faultAddr)) + + read := true + write := false + execute := true + + ret := code & _ESR_ELx_FSC + switch ret { + case _ESR_SEGV_MAPERR_L0, _ESR_SEGV_MAPERR_L1, _ESR_SEGV_MAPERR_L2, _ESR_SEGV_MAPERR_L3: + info.Code = 1 //SEGV_MAPERR + read = false + write = true + execute = false + case _ESR_SEGV_ACCERR_L1, _ESR_SEGV_ACCERR_L2, _ESR_SEGV_ACCERR_L3, _ESR_SEGV_PEMERR_L1, _ESR_SEGV_PEMERR_L2, _ESR_SEGV_PEMERR_L3: + info.Code = 2 // SEGV_ACCERR. + read = true + write = false + execute = false + default: + info.Code = 2 + } + + if !user { + read = true + write = false + execute = true + + } + accessType := usermem.AccessType{ + Read: read, + Write: write, + Execute: execute, + } + + return accessType, platform.ErrContextSignal +} + +// retryInGuest runs the given function in guest mode. +// +// If the function does not complete in guest mode (due to execution of a +// system call due to a GC stall, for example), then it will be retried. The +// given function must be idempotent as a result of the retry mechanism. +func (m *machine) retryInGuest(fn func()) { + c := m.Get() + defer m.Put(c) + for { + c.ClearErrorCode() // See below. + bluepill(c) // Force guest mode. + fn() // Execute the given function. + _, user := c.ErrorCode() + if user { + // If user is set, then we haven't bailed back to host + // mode via a kernel exception or system call. We + // consider the full function to have executed in guest + // mode and we can return. + break + } + } +} diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go new file mode 100644 index 000000000..3f2f97a6b --- /dev/null +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -0,0 +1,362 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package kvm + +import ( + "fmt" + "reflect" + "sync/atomic" + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0" + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +// setMemoryRegion initializes a region. +// +// This may be called from bluepillHandler, and therefore returns an errno +// directly (instead of wrapping in an error) to avoid allocations. +// +//go:nosplit +func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno { + userRegion := userMemoryRegion{ + slot: uint32(slot), + flags: 0, + guestPhysAddr: uint64(physical), + memorySize: uint64(length), + userspaceAddr: uint64(virtual), + } + + // Set the region. + _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(m.fd), + _KVM_SET_USER_MEMORY_REGION, + uintptr(unsafe.Pointer(&userRegion))) + return errno +} + +type kvmVcpuInit struct { + target uint32 + features [7]uint32 +} + +var vcpuInit kvmVcpuInit + +// initArchState initializes architecture-specific state. +func (m *machine) initArchState() error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(m.fd), + _KVM_ARM_PREFERRED_TARGET, + uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 { + panic(fmt.Sprintf("error setting KVM_ARM_PREFERRED_TARGET failed: %v", errno)) + } + return nil +} + +func getPageWithReflect(p uintptr) []byte { + return (*(*[0xFFFFFF]byte)(unsafe.Pointer(p & ^uintptr(syscall.Getpagesize()-1))))[:syscall.Getpagesize()] +} + +// Work around: move ring0.Vectors() into a specific address with 11-bits alignment. +// +// According to the design documentation of Arm64, +// the start address of exception vector table should be 11-bits aligned. +// Please see the code in linux kernel as reference: arch/arm64/kernel/entry.S +// But, we can't align a function's start address to a specific address by using golang. +// We have raised this question in golang community: +// https://groups.google.com/forum/m/#!topic/golang-dev/RPj90l5x86I +// This function will be removed when golang supports this feature. +// +// There are 2 jobs were implemented in this function: +// 1, move the start address of exception vector table into the specific address. +// 2, modify the offset of each instruction. +func updateVectorTable() { + fromLocation := reflect.ValueOf(ring0.Vectors).Pointer() + offset := fromLocation & (1<<11 - 1) + if offset != 0 { + offset = 1<<11 - offset + } + + toLocation := fromLocation + offset + page := getPageWithReflect(toLocation) + if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC); err != nil { + panic(err) + } + + page = getPageWithReflect(toLocation + 4096) + if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC); err != nil { + panic(err) + } + + // Move exception-vector-table into the specific address. + var entry *uint32 + var entryFrom *uint32 + for i := 1; i <= 0x800; i++ { + entry = (*uint32)(unsafe.Pointer(toLocation + 0x800 - uintptr(i))) + entryFrom = (*uint32)(unsafe.Pointer(fromLocation + 0x800 - uintptr(i))) + *entry = *entryFrom + } + + // The offset from the address of each unconditionally branch is changed. + // We should modify the offset of each instruction. + nums := []uint32{0x0, 0x80, 0x100, 0x180, 0x200, 0x280, 0x300, 0x380, 0x400, 0x480, 0x500, 0x580, 0x600, 0x680, 0x700, 0x780} + for _, num := range nums { + entry = (*uint32)(unsafe.Pointer(toLocation + uintptr(num))) + *entry = *entry - (uint32)(offset/4) + } + + page = getPageWithReflect(toLocation) + if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_EXEC); err != nil { + panic(err) + } + + page = getPageWithReflect(toLocation + 4096) + if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_EXEC); err != nil { + panic(err) + } +} + +// initArchState initializes architecture-specific state. +func (c *vCPU) initArchState() error { + var ( + reg kvmOneReg + data uint64 + regGet kvmOneReg + dataGet uint64 + ) + + reg.addr = uint64(reflect.ValueOf(&data).Pointer()) + regGet.addr = uint64(reflect.ValueOf(&dataGet).Pointer()) + + vcpuInit.features[0] |= (1 << _KVM_ARM_VCPU_PSCI_0_2) + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_ARM_VCPU_INIT, + uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 { + panic(fmt.Sprintf("error setting KVM_ARM_VCPU_INIT failed: %v", errno)) + } + + // cpacr_el1 + reg.id = _KVM_ARM64_REGS_CPACR_EL1 + data = (_FPEN_NOTRAP << _FPEN_SHIFT) + if err := c.setOneRegister(®); err != nil { + return err + } + + // sctlr_el1 + regGet.id = _KVM_ARM64_REGS_SCTLR_EL1 + if err := c.getOneRegister(®Get); err != nil { + return err + } + + dataGet |= (_SCTLR_M | _SCTLR_C | _SCTLR_I) + data = dataGet + reg.id = _KVM_ARM64_REGS_SCTLR_EL1 + if err := c.setOneRegister(®); err != nil { + return err + } + + // tcr_el1 + data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS + reg.id = _KVM_ARM64_REGS_TCR_EL1 + if err := c.setOneRegister(®); err != nil { + return err + } + + // mair_el1 + data = _MT_EL1_INIT + reg.id = _KVM_ARM64_REGS_MAIR_EL1 + if err := c.setOneRegister(®); err != nil { + return err + } + + // ttbr0_el1 + data = c.machine.kernel.PageTables.TTBR0_EL1(false, 0) + + reg.id = _KVM_ARM64_REGS_TTBR0_EL1 + if err := c.setOneRegister(®); err != nil { + return err + } + + c.SetTtbr0Kvm(uintptr(data)) + + // ttbr1_el1 + data = c.machine.kernel.PageTables.TTBR1_EL1(false, 0) + + reg.id = _KVM_ARM64_REGS_TTBR1_EL1 + if err := c.setOneRegister(®); err != nil { + return err + } + + // sp_el1 + data = c.CPU.StackTop() + reg.id = _KVM_ARM64_REGS_SP_EL1 + if err := c.setOneRegister(®); err != nil { + return err + } + + // pc + reg.id = _KVM_ARM64_REGS_PC + data = uint64(reflect.ValueOf(ring0.Start).Pointer()) + if err := c.setOneRegister(®); err != nil { + return err + } + + // r8 + reg.id = _KVM_ARM64_REGS_R8 + data = uint64(reflect.ValueOf(&c.CPU).Pointer()) + if err := c.setOneRegister(®); err != nil { + return err + } + + // vbar_el1 + reg.id = _KVM_ARM64_REGS_VBAR_EL1 + + fromLocation := reflect.ValueOf(ring0.Vectors).Pointer() + offset := fromLocation & (1<<11 - 1) + if offset != 0 { + offset = 1<<11 - offset + } + + toLocation := fromLocation + offset + data = uint64(ring0.KernelStartAddress | toLocation) + if err := c.setOneRegister(®); err != nil { + return err + } + + data = ring0.PsrDefaultSet | ring0.KernelFlagsSet + reg.id = _KVM_ARM64_REGS_PSTATE + if err := c.setOneRegister(®); err != nil { + return err + } + + return nil +} + +//go:nosplit +func (c *vCPU) loadSegments(tid uint64) { + // TODO(gvisor.dev/issue/1238): TLS is not supported. + // Get TLS from tpidr_el0. + atomic.StoreUint64(&c.tid, tid) +} + +func (c *vCPU) setOneRegister(reg *kvmOneReg) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_ONE_REG, + uintptr(unsafe.Pointer(reg))); errno != 0 { + return fmt.Errorf("error setting one register: %v", errno) + } + return nil +} + +func (c *vCPU) getOneRegister(reg *kvmOneReg) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_GET_ONE_REG, + uintptr(unsafe.Pointer(reg))); errno != 0 { + return fmt.Errorf("error setting one register: %v", errno) + } + return nil +} + +// setCPUID sets the CPUID to be used by the guest. +func (c *vCPU) setCPUID() error { + return nil +} + +// setSystemTime sets the TSC for the vCPU. +func (c *vCPU) setSystemTime() error { + return nil +} + +// setSignalMask sets the vCPU signal mask. +// +// This must be called prior to running the vCPU. +func (c *vCPU) setSignalMask() error { + // The layout of this structure implies that it will not necessarily be + // the same layout chosen by the Go compiler. It gets fudged here. + var data struct { + length uint32 + mask1 uint32 + mask2 uint32 + _ uint32 + } + data.length = 8 // Fixed sigset size. + data.mask1 = ^uint32(bounceSignalMask & 0xffffffff) + data.mask2 = ^uint32(bounceSignalMask >> 32) + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_SIGNAL_MASK, + uintptr(unsafe.Pointer(&data))); errno != 0 { + return fmt.Errorf("error setting signal mask: %v", errno) + } + + return nil +} + +// SwitchToUser unpacks architectural-details. +func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) { + // Check for canonical addresses. + if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Pc) { + return nonCanonical(regs.Pc, int32(syscall.SIGSEGV), info) + } else if !ring0.IsCanonical(regs.Sp) { + return nonCanonical(regs.Sp, int32(syscall.SIGBUS), info) + } + + var vector ring0.Vector + ttbr0App := switchOpts.PageTables.TTBR0_EL1(false, 0) + c.SetTtbr0App(uintptr(ttbr0App)) + + // TODO(gvisor.dev/issue/1238): full context-switch supporting for Arm64. + // The Arm64 user-mode execution state consists of: + // x0-x30 + // PC, SP, PSTATE + // V0-V31: 32 128-bit registers for floating point, and simd + // FPSR + // TPIDR_EL0, used for TLS + appRegs := switchOpts.Registers + c.SetAppAddr(ring0.KernelStartAddress | uintptr(unsafe.Pointer(appRegs))) + + entersyscall() + bluepill(c) + vector = c.CPU.SwitchToUser(switchOpts) + exitsyscall() + + switch vector { + case ring0.Syscall: + // Fast path: system call executed. + return usermem.NoAccess, nil + + case ring0.PageFault: + return c.fault(int32(syscall.SIGSEGV), info) + case 0xaa: + return usermem.NoAccess, nil + default: + return usermem.NoAccess, platform.ErrContextSignal + } + +} diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index 405e00292..f04be2ab5 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.14 +// +build !go1.15 // Check go:linkname function signatures when updating Go version. @@ -35,6 +35,30 @@ func entersyscall() //go:linkname exitsyscall runtime.exitsyscall func exitsyscall() +// setMemoryRegion initializes a region. +// +// This may be called from bluepillHandler, and therefore returns an errno +// directly (instead of wrapping in an error) to avoid allocations. +// +//go:nosplit +func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr, flags uint32) syscall.Errno { + userRegion := userMemoryRegion{ + slot: uint32(slot), + flags: uint32(flags), + guestPhysAddr: uint64(physical), + memorySize: uint64(length), + userspaceAddr: uint64(virtual), + } + + // Set the region. + _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(m.fd), + _KVM_SET_USER_MEMORY_REGION, + uintptr(unsafe.Pointer(&userRegion))) + return errno +} + // mapRunData maps the vCPU run data. func mapRunData(fd int) (*runData, error) { r, _, errno := syscall.RawSyscall6( @@ -63,46 +87,6 @@ func unmapRunData(r *runData) error { return nil } -// setUserRegisters sets user registers in the vCPU. -func (c *vCPU) setUserRegisters(uregs *userRegs) error { - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_SET_REGS, - uintptr(unsafe.Pointer(uregs))); errno != 0 { - return fmt.Errorf("error setting user registers: %v", errno) - } - return nil -} - -// getUserRegisters reloads user registers in the vCPU. -// -// This is safe to call from a nosplit context. -// -//go:nosplit -func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno { - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_GET_REGS, - uintptr(unsafe.Pointer(uregs))); errno != 0 { - return errno - } - return 0 -} - -// setSystemRegisters sets system registers. -func (c *vCPU) setSystemRegisters(sregs *systemRegs) error { - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_SET_SREGS, - uintptr(unsafe.Pointer(sregs))); errno != 0 { - return fmt.Errorf("error setting system registers: %v", errno) - } - return nil -} - // atomicAddressSpace is an atomic address space pointer. type atomicAddressSpace struct { pointer unsafe.Pointer diff --git a/pkg/sentry/platform/kvm/testutil/BUILD b/pkg/sentry/platform/kvm/testutil/BUILD index 77a449a8b..b0e45f159 100644 --- a/pkg/sentry/platform/kvm/testutil/BUILD +++ b/pkg/sentry/platform/kvm/testutil/BUILD @@ -9,6 +9,8 @@ go_library( "testutil.go", "testutil_amd64.go", "testutil_amd64.s", + "testutil_arm64.go", + "testutil_arm64.s", ], importpath = "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil", visibility = ["//pkg/sentry/platform/kvm:__pkg__"], diff --git a/pkg/sentry/platform/kvm/testutil/testutil.go b/pkg/sentry/platform/kvm/testutil/testutil.go index 6cf2359a3..5c1efa0fd 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil.go +++ b/pkg/sentry/platform/kvm/testutil/testutil.go @@ -41,9 +41,6 @@ func TwiddleRegsFault() // TwiddleRegsSyscall twiddles registers then executes a syscall. func TwiddleRegsSyscall() -// TwiddleSegments reads segments into known registers. -func TwiddleSegments() - // FloatingPointWorks is a floating point test. // // It returns true or false. diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go index 203d71528..4c108abbf 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go @@ -21,6 +21,9 @@ import ( "syscall" ) +// TwiddleSegments reads segments into known registers. +func TwiddleSegments() + // SetTestTarget sets the rip appropriately. func SetTestTarget(regs *syscall.PtraceRegs, fn func()) { regs.Rip = uint64(reflect.ValueOf(fn).Pointer()) diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go new file mode 100644 index 000000000..40b2e4acc --- /dev/null +++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go @@ -0,0 +1,59 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package testutil + +import ( + "fmt" + "reflect" + "syscall" +) + +// SetTestTarget sets the rip appropriately. +func SetTestTarget(regs *syscall.PtraceRegs, fn func()) { + regs.Pc = uint64(reflect.ValueOf(fn).Pointer()) +} + +// SetTouchTarget sets rax appropriately. +func SetTouchTarget(regs *syscall.PtraceRegs, target *uintptr) { + if target != nil { + regs.Regs[8] = uint64(reflect.ValueOf(target).Pointer()) + } else { + regs.Regs[8] = 0 + } +} + +// RewindSyscall rewinds a syscall RIP. +func RewindSyscall(regs *syscall.PtraceRegs) { + regs.Pc -= 4 +} + +// SetTestRegs initializes registers to known values. +func SetTestRegs(regs *syscall.PtraceRegs) { + for i := 0; i <= 30; i++ { + regs.Regs[i] = uint64(i) + 1 + } +} + +// CheckTestRegs checks that registers were twiddled per TwiddleRegs. +func CheckTestRegs(regs *syscall.PtraceRegs, full bool) (err error) { + for i := 0; i <= 30; i++ { + if need := ^uint64(i + 1); regs.Regs[i] != need { + err = addRegisterMismatch(err, fmt.Sprintf("R%d", i), regs.Regs[i], need) + } + } + return +} diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s new file mode 100644 index 000000000..0bebee852 --- /dev/null +++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s @@ -0,0 +1,106 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +// test_util_arm64.s provides ARM64 test functions. + +#include "funcdata.h" +#include "textflag.h" + +#define SYS_GETPID 172 + +// This function simulates the getpid syscall. +TEXT ·Getpid(SB),NOSPLIT,$0 + NO_LOCAL_POINTERS + MOVD $SYS_GETPID, R8 + SVC + RET + +TEXT ·Touch(SB),NOSPLIT,$0 +start: + MOVD 0(R8), R1 + MOVD $SYS_GETPID, R8 // getpid + SVC + B start + +TEXT ·HaltLoop(SB),NOSPLIT,$0 +start: + HLT + B start + +// This function simulates a loop of syscall. +TEXT ·SyscallLoop(SB),NOSPLIT,$0 +start: + SVC + B start + +TEXT ·SpinLoop(SB),NOSPLIT,$0 +start: + B start + +TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8 + NO_LOCAL_POINTERS + FMOVD $(9.9), F0 + MOVD $SYS_GETPID, R8 // getpid + SVC + FMOVD $(9.9), F1 + FCMPD F0, F1 + BNE isNaN + MOVD $1, R0 + MOVD R0, ret+0(FP) + RET +isNaN: + MOVD $0, ret+0(FP) + RET + +// MVN: bitwise logical NOT +// This case simulates an application that modified R0-R30. +#define TWIDDLE_REGS() \ + MVN R0, R0; \ + MVN R1, R1; \ + MVN R2, R2; \ + MVN R3, R3; \ + MVN R4, R4; \ + MVN R5, R5; \ + MVN R6, R6; \ + MVN R7, R7; \ + MVN R8, R8; \ + MVN R9, R9; \ + MVN R10, R10; \ + MVN R11, R11; \ + MVN R12, R12; \ + MVN R13, R13; \ + MVN R14, R14; \ + MVN R15, R15; \ + MVN R16, R16; \ + MVN R17, R17; \ + MVN R18_PLATFORM, R18_PLATFORM; \ + MVN R19, R19; \ + MVN R20, R20; \ + MVN R21, R21; \ + MVN R22, R22; \ + MVN R23, R23; \ + MVN R24, R24; \ + MVN R25, R25; \ + MVN R26, R26; \ + MVN R27, R27; \ + MVN g, g; \ + MVN R29, R29; \ + MVN R30, R30; + +TEXT ·TwiddleRegsSyscall(SB),NOSPLIT,$0 + TWIDDLE_REGS() + SVC + RET // never reached diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index ebcc8c098..0df8cfa0f 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -28,6 +28,7 @@ go_library( "//pkg/procid", "//pkg/seccomp", "//pkg/sentry/arch", + "//pkg/sentry/hostcpu", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", "//pkg/sentry/platform/safecopy", diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go index 47957bb3b..72c7ec564 100644 --- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -154,3 +154,19 @@ func (t *thread) clone() (*thread, error) { cpu: ^uint32(0), }, nil } + +// getEventMessage retrieves a message about the ptrace event that just happened. +func (t *thread) getEventMessage() (uintptr, error) { + var msg uintptr + _, _, errno := syscall.RawSyscall6( + syscall.SYS_PTRACE, + syscall.PTRACE_GETEVENTMSG, + uintptr(t.tid), + 0, + uintptr(unsafe.Pointer(&msg)), + 0, 0) + if errno != 0 { + return msg, errno + } + return msg, nil +} diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 6bf7cd097..ddb1f41e3 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -267,7 +267,7 @@ func (s *subprocess) newThread() *thread { // attach attaches to the thread. func (t *thread) attach() { - if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_ATTACH, uintptr(t.tid), 0); errno != 0 { + if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_ATTACH, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("unable to attach: %v", errno)) } @@ -327,6 +327,20 @@ func (t *thread) dumpAndPanic(message string) { panic(message) } +func (t *thread) unexpectedStubExit() { + msg, err := t.getEventMessage() + status := syscall.WaitStatus(msg) + if status.Signaled() && status.Signal() == syscall.SIGKILL { + // SIGKILL can be only sent by an user or OOM-killer. In both + // these cases, we don't need to panic. There is no reasons to + // think that something wrong in gVisor. + log.Warningf("The ptrace stub process %v has been killed by SIGKILL.", t.tgid) + pid := os.Getpid() + syscall.Tgkill(pid, pid, syscall.Signal(syscall.SIGKILL)) + } + t.dumpAndPanic(fmt.Sprintf("wait failed: the process %d:%d exited: %x (err %v)", t.tgid, t.tid, msg, err)) +} + // wait waits for a stop event. // // Precondition: outcome is a valid waitOutcome. @@ -355,7 +369,7 @@ func (t *thread) wait(outcome waitOutcome) syscall.Signal { } if stopSig == syscall.SIGTRAP { if status.TrapCause() == syscall.PTRACE_EVENT_EXIT { - t.dumpAndPanic("wait failed: the process exited") + t.unexpectedStubExit() } // Re-encode the trap cause the way it's expected. return stopSig | syscall.Signal(status.TrapCause()<<8) @@ -416,7 +430,7 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) { for { // Execute the syscall instruction. - if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0); errno != 0 { + if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) } @@ -426,12 +440,15 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) { break } else { // Some other signal caused a thread stop; ignore. + if sig != syscall.SIGSTOP && sig != syscall.SIGCHLD { + log.Warningf("The thread %d:%d has been interrupted by %d", t.tgid, t.tid, sig) + } continue } } // Complete the actual system call. - if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0); errno != 0 { + if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) } @@ -522,17 +539,17 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { for { // Start running until the next system call. if isSingleStepping(regs) { - if _, _, errno := syscall.RawSyscall( + if _, _, errno := syscall.RawSyscall6( syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU_SINGLESTEP, - uintptr(t.tid), 0); errno != 0 { + uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace sysemu failed: %v", errno)) } } else { - if _, _, errno := syscall.RawSyscall( + if _, _, errno := syscall.RawSyscall6( syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, - uintptr(t.tid), 0); errno != 0 { + uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace sysemu failed: %v", errno)) } } diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index 4649a94a7..606dc2b1d 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -21,6 +21,8 @@ import ( "strings" "syscall" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" ) @@ -143,3 +145,49 @@ func (t *thread) adjustInitRegsRip() { func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) { initregs.R15 = uint64(ppid) } + +// patchSignalInfo patches the signal info to account for hitting the seccomp +// filters from vsyscall emulation, specified below. We allow for SIGSYS as a +// synchronous trap, but patch the structure to appear like a SIGSEGV with the +// Rip as the faulting address. +// +// Note that this should only be called after verifying that the signalInfo has +// been generated by the kernel. +func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) { + if linux.Signal(signalInfo.Signo) == linux.SIGSYS { + signalInfo.Signo = int32(linux.SIGSEGV) + + // Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered + // with the si_call_addr field pointing to the current RIP. This field + // aligns with the si_addr field for a SIGSEGV, so we don't need to touch + // anything there. We do need to unwind emulation however, so we set the + // instruction pointer to the faulting value, and "unpop" the stack. + regs.Rip = signalInfo.Addr() + regs.Rsp -= 8 + } +} + +// enableCpuidFault enables cpuid-faulting. +// +// This may fail on older kernels or hardware, so we just disregard the result. +// Host CPUID will be enabled. +// +// This is safe to call in an afterFork context. +// +//go:nosplit +func enableCpuidFault() { + syscall.RawSyscall6(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0, 0, 0, 0) +} + +// appendArchSeccompRules append architecture specific seccomp rules when creating BPF program. +// Ref attachedThread() for more detail. +func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet { + return append(rules, seccomp.RuleSet{ + Rules: seccomp.SyscallRules{ + syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ + {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)}, + }, + }, + Action: linux.SECCOMP_RET_ALLOW, + }) +} diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go index bec884ba5..62a686ee7 100644 --- a/pkg/sentry/platform/ptrace/subprocess_arm64.go +++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go @@ -17,8 +17,12 @@ package ptrace import ( + "fmt" + "strings" "syscall" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" ) @@ -37,7 +41,7 @@ const ( // resetSysemuRegs sets up emulation registers. // // This should be called prior to calling sysemu. -func (s *subprocess) resetSysemuRegs(regs *syscall.PtraceRegs) { +func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) { } // createSyscallRegs sets up syscall registers. @@ -124,3 +128,36 @@ func (t *thread) adjustInitRegsRip() { func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) { initregs.Regs[7] = uint64(ppid) } + +// patchSignalInfo patches the signal info to account for hitting the seccomp +// filters from vsyscall emulation, specified below. We allow for SIGSYS as a +// synchronous trap, but patch the structure to appear like a SIGSEGV with the +// Rip as the faulting address. +// +// Note that this should only be called after verifying that the signalInfo has +// been generated by the kernel. +func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) { + if linux.Signal(signalInfo.Signo) == linux.SIGSYS { + signalInfo.Signo = int32(linux.SIGSEGV) + + // Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered + // with the si_call_addr field pointing to the current RIP. This field + // aligns with the si_addr field for a SIGSEGV, so we don't need to touch + // anything there. We do need to unwind emulation however, so we set the + // instruction pointer to the faulting value, and "unpop" the stack. + regs.Pc = signalInfo.Addr() + regs.Sp -= 8 + } +} + +// Noop on arm64. +// +//go:nosplit +func enableCpuidFault() { +} + +// appendArchSeccompRules append architecture specific seccomp rules when creating BPF program. +// Ref attachedThread() for more detail. +func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet { + return rules +} diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index f09b0b3d0..cf13ea5e4 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -20,6 +20,7 @@ import ( "fmt" "syscall" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/procid" @@ -53,7 +54,7 @@ func probeSeccomp() bool { for { // Attempt an emulation. - if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0); errno != 0 { + if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) } @@ -77,27 +78,6 @@ func probeSeccomp() bool { } } -// patchSignalInfo patches the signal info to account for hitting the seccomp -// filters from vsyscall emulation, specified below. We allow for SIGSYS as a -// synchronous trap, but patch the structure to appear like a SIGSEGV with the -// Rip as the faulting address. -// -// Note that this should only be called after verifying that the signalInfo has -// been generated by the kernel. -func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) { - if linux.Signal(signalInfo.Signo) == linux.SIGSYS { - signalInfo.Signo = int32(linux.SIGSEGV) - - // Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered - // with the si_call_addr field pointing to the current RIP. This field - // aligns with the si_addr field for a SIGSEGV, so we don't need to touch - // anything there. We do need to unwind emulation however, so we set the - // instruction pointer to the faulting value, and "unpop" the stack. - regs.Rip = signalInfo.Addr() - regs.Rsp -= 8 - } -} - // createStub creates a fresh stub processes. // // Precondition: the runtime OS thread must be locked. @@ -129,6 +109,9 @@ func createStub() (*thread, error) { // transitively) will be killed as well. It's simply not possible to // safely handle a single stub getting killed: the exact state of // execution is unknown and not recoverable. + // + // In addition, we set the PTRACE_O_TRACEEXIT option to log more + // information about a stub process when it receives a fatal signal. return attachedThread(uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, defaultAction) } @@ -146,7 +129,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro Rules: seccomp.SyscallRules{ syscall.SYS_GETTIMEOFDAY: {}, syscall.SYS_TIME: {}, - 309: {}, // SYS_GETCPU. + unix.SYS_GETCPU: {}, // SYS_GETCPU was not defined in package syscall on amd64. }, Action: linux.SECCOMP_RET_TRAP, Vsyscall: true, @@ -170,10 +153,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro // For the initial process creation. syscall.SYS_WAIT4: {}, - syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ - {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)}, - }, - syscall.SYS_EXIT: {}, + syscall.SYS_EXIT: {}, // For the stub prctl dance (all). syscall.SYS_PRCTL: []seccomp.Rule{ @@ -193,6 +173,8 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro }, Action: linux.SECCOMP_RET_ALLOW, }) + + rules = appendArchSeccompRules(rules) } instrs, err := seccomp.BuildProgram(rules, defaultAction) if err != nil { @@ -264,9 +246,8 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0) } - // Enable cpuid-faulting; this may fail on older kernels or hardware, - // so we just disregard the result. Host CPUID will be enabled. - syscall.RawSyscall(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0) + // Enable cpuid-faulting. + enableCpuidFault() // Call the stub; should not return. stubCall(stubStart, ppid) diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go index de6783fb0..2e6fbe488 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go @@ -25,6 +25,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/hostcpu" ) // maskPool contains reusable CPU masks for setting affinity. Unfortunately, @@ -49,20 +50,6 @@ func unmaskAllSignals() syscall.Errno { return errno } -// getCPU gets the current CPU. -// -// Precondition: the current runtime thread should be locked. -func getCPU() (uint32, error) { - var cpu uintptr - if _, _, errno := syscall.RawSyscall( - unix.SYS_GETCPU, - uintptr(unsafe.Pointer(&cpu)), - 0, 0); errno != 0 { - return 0, errno - } - return uint32(cpu), nil -} - // setCPU sets the CPU affinity. func (t *thread) setCPU(cpu uint32) error { mask := maskPool.Get().([]uintptr) @@ -93,10 +80,8 @@ func (t *thread) setCPU(cpu uint32) error { // // Precondition: the current runtime thread should be locked. func (t *thread) bind() { - currentCPU, err := getCPU() - if err != nil { - return - } + currentCPU := hostcpu.GetCPU() + if oldCPU := atomic.SwapUint32(&t.cpu, currentCPU); oldCPU != currentCPU { // Set the affinity on the thread and save the CPU for next // round; we don't expect CPUs to bounce around too frequently. diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go index b80a3604d..2ae6b9f9d 100644 --- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.14 +// +build !go1.15 // Check go:linkname function signatures when updating Go version. diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD index 8ed6c7652..87f4552b5 100644 --- a/pkg/sentry/platform/ring0/BUILD +++ b/pkg/sentry/platform/ring0/BUILD @@ -1,11 +1,10 @@ load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") package(licenses = ["notice"]) -load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") - go_template( - name = "defs", + name = "defs_amd64", srcs = [ "defs.go", "defs_amd64.go", @@ -15,11 +14,29 @@ go_template( visibility = [":__subpackages__"], ) +go_template( + name = "defs_arm64", + srcs = [ + "aarch64.go", + "defs.go", + "defs_arm64.go", + "offsets_arm64.go", + ], + visibility = [":__subpackages__"], +) + +go_template_instance( + name = "defs_impl_amd64", + out = "defs_impl_amd64.go", + package = "ring0", + template = ":defs_amd64", +) + go_template_instance( - name = "defs_impl", - out = "defs_impl.go", + name = "defs_impl_arm64", + out = "defs_impl_arm64.go", package = "ring0", - template = ":defs", + template = ":defs_arm64", ) genrule( @@ -30,17 +47,31 @@ genrule( tools = ["//pkg/sentry/platform/ring0/gen_offsets"], ) +genrule( + name = "entry_impl_arm64", + srcs = ["entry_arm64.s"], + outs = ["entry_impl_arm64.s"], + cmd = "(echo -e '// build +arm64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(SRCS)) > $@", + tools = ["//pkg/sentry/platform/ring0/gen_offsets"], +) + go_library( name = "ring0", srcs = [ - "defs_impl.go", + "defs_impl_amd64.go", + "defs_impl_arm64.go", "entry_amd64.go", + "entry_arm64.go", "entry_impl_amd64.s", + "entry_impl_arm64.s", "kernel.go", "kernel_amd64.go", + "kernel_arm64.go", "kernel_unsafe.go", "lib_amd64.go", "lib_amd64.s", + "lib_arm64.go", + "lib_arm64.s", "ring0.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ring0", diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go new file mode 100644 index 000000000..6b078cd1e --- /dev/null +++ b/pkg/sentry/platform/ring0/aarch64.go @@ -0,0 +1,109 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package ring0 + +// Useful bits. +const ( + _PGD_PGT_BASE = 0x1000 + _PGD_PGT_SIZE = 0x1000 + _PUD_PGT_BASE = 0x2000 + _PUD_PGT_SIZE = 0x1000 + _PMD_PGT_BASE = 0x3000 + _PMD_PGT_SIZE = 0x4000 + _PTE_PGT_BASE = 0x7000 + _PTE_PGT_SIZE = 0x1000 + + _PSR_MODE_EL0t = 0x0 + _PSR_MODE_EL1t = 0x4 + _PSR_MODE_EL1h = 0x5 + _PSR_EL_MASK = 0xf + + _PSR_D_BIT = 0x200 + _PSR_A_BIT = 0x100 + _PSR_I_BIT = 0x80 + _PSR_F_BIT = 0x40 +) + +const ( + // KernelFlagsSet should always be set in the kernel. + KernelFlagsSet = _PSR_MODE_EL1h + + // UserFlagsSet are always set in userspace. + UserFlagsSet = _PSR_MODE_EL0t + + KernelFlagsClear = _PSR_EL_MASK + UserFlagsClear = _PSR_EL_MASK + + PsrDefaultSet = _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT +) + +// Vector is an exception vector. +type Vector uintptr + +// Exception vectors. +const ( + El1SyncInvalid = iota + El1IrqInvalid + El1FiqInvalid + El1ErrorInvalid + El1Sync + El1Irq + El1Fiq + El1Error + El0Sync + El0Irq + El0Fiq + El0Error + El0Sync_invalid + El0Irq_invalid + El0Fiq_invalid + El0Error_invalid + El1Sync_da + El1Sync_ia + El1Sync_sp_pc + El1Sync_undef + El1Sync_dbg + El1Sync_inv + El0Sync_svc + El0Sync_da + El0Sync_ia + El0Sync_fpsimd_acc + El0Sync_sve_acc + El0Sync_sys + El0Sync_sp_pc + El0Sync_undef + El0Sync_dbg + El0Sync_inv + VirtualizationException + _NR_INTERRUPTS +) + +// System call vectors. +const ( + Syscall Vector = El0Sync_svc + PageFault Vector = El0Sync_da +) + +// VirtualAddressBits returns the number bits available for virtual addresses. +func VirtualAddressBits() uint32 { + return 48 +} + +// PhysicalAddressBits returns the number of bits available for physical addresses. +func PhysicalAddressBits() uint32 { + return 40 +} diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go index 076063f85..3f094c2a7 100644 --- a/pkg/sentry/platform/ring0/defs.go +++ b/pkg/sentry/platform/ring0/defs.go @@ -20,17 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usermem" ) -var ( - // UserspaceSize is the total size of userspace. - UserspaceSize = uintptr(1) << (VirtualAddressBits() - 1) - - // MaximumUserAddress is the largest possible user address. - MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1) - - // KernelStartAddress is the starting kernel address. - KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1) -) - // Kernel is a global kernel object. // // This contains global state, shared by multiple CPUs. diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go index 7206322b1..10dbd381f 100644 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -20,6 +20,17 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" ) +var ( + // UserspaceSize is the total size of userspace. + UserspaceSize = uintptr(1) << (VirtualAddressBits() - 1) + + // MaximumUserAddress is the largest possible user address. + MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1) + + // KernelStartAddress is the starting kernel address. + KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1) +) + // Segment indices and Selectors. const ( // Index into GDT array. diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go new file mode 100644 index 000000000..dc0eeec01 --- /dev/null +++ b/pkg/sentry/platform/ring0/defs_arm64.go @@ -0,0 +1,136 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package ring0 + +import ( + "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" +) + +var ( + // UserspaceSize is the total size of userspace. + UserspaceSize = uintptr(1) << (VirtualAddressBits()) + + // MaximumUserAddress is the largest possible user address. + MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1) + + // KernelStartAddress is the starting kernel address. + KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1) +) + +// KernelOpts has initialization options for the kernel. +type KernelOpts struct { + // PageTables are the kernel pagetables; this must be provided. + PageTables *pagetables.PageTables +} + +// KernelArchState contains architecture-specific state. +type KernelArchState struct { + KernelOpts +} + +// CPUArchState contains CPU-specific arch state. +type CPUArchState struct { + // stack is the stack used for interrupts on this CPU. + stack [512]byte + + // errorCode is the error code from the last exception. + errorCode uintptr + + // errorType indicates the type of error code here, it is always set + // along with the errorCode value above. + // + // It will either by 1, which indicates a user error, or 0 indicating a + // kernel error. If the error code below returns false (kernel error), + // then it cannot provide relevant information about the last + // exception. + errorType uintptr + + // faultAddr is the value of far_el1. + faultAddr uintptr + + // ttbr0Kvm is the value of ttbr0_el1 for sentry. + ttbr0Kvm uintptr + + // ttbr0App is the value of ttbr0_el1 for applicaton. + ttbr0App uintptr + + // exception vector. + vecCode Vector + + // application context pointer. + appAddr uintptr + + // lazyVFP is the value of cpacr_el1. + lazyVFP uintptr +} + +// ErrorCode returns the last error code. +// +// The returned boolean indicates whether the error code corresponds to the +// last user error or not. If it does not, then fault information must be +// ignored. This is generally the result of a kernel fault while servicing a +// user fault. +// +//go:nosplit +func (c *CPU) ErrorCode() (value uintptr, user bool) { + return c.errorCode, c.errorType != 0 +} + +// ClearErrorCode resets the error code. +// +//go:nosplit +func (c *CPU) ClearErrorCode() { + c.errorCode = 0 // No code. + c.errorType = 1 // User mode. +} + +//go:nosplit +func (c *CPU) GetFaultAddr() (value uintptr) { + return c.faultAddr +} + +//go:nosplit +func (c *CPU) SetTtbr0Kvm(value uintptr) { + c.ttbr0Kvm = value +} + +//go:nosplit +func (c *CPU) SetTtbr0App(value uintptr) { + c.ttbr0App = value +} + +//go:nosplit +func (c *CPU) GetVector() (value Vector) { + return c.vecCode +} + +//go:nosplit +func (c *CPU) SetAppAddr(value uintptr) { + c.appAddr = value +} + +// SwitchArchOpts are embedded in SwitchOpts. +type SwitchArchOpts struct { + // UserASID indicates that the application ASID to be used on switch, + UserASID uint16 + + // KernelASID indicates that the kernel ASID to be used on return, + KernelASID uint16 +} + +func init() { +} diff --git a/pkg/sentry/platform/ring0/entry_arm64.go b/pkg/sentry/platform/ring0/entry_arm64.go new file mode 100644 index 000000000..0dfa42c36 --- /dev/null +++ b/pkg/sentry/platform/ring0/entry_arm64.go @@ -0,0 +1,60 @@ +// Copyright 2019 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package ring0 + +// This is an assembly function. +// +// The sysenter function is invoked in two situations: +// +// (1) The guest kernel has executed a system call. +// (2) The guest application has executed a system call. +// +// The interrupt flag is examined to determine whether the system call was +// executed from kernel mode or not and the appropriate stub is called. + +func El1_sync_invalid() +func El1_irq_invalid() +func El1_fiq_invalid() +func El1_error_invalid() + +func El1_sync() +func El1_irq() +func El1_fiq() +func El1_error() + +func El0_sync() +func El0_irq() +func El0_fiq() +func El0_error() + +func El0_sync_invalid() +func El0_irq_invalid() +func El0_fiq_invalid() +func El0_error_invalid() + +func Vectors() + +// Start is the CPU entrypoint. +// +// The CPU state will be set to c.Registers(). +func Start() +func kernelExitToEl1() + +func kernelExitToEl0() + +// Shutdown execution +func Shutdown() diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s new file mode 100644 index 000000000..add2c3e08 --- /dev/null +++ b/pkg/sentry/platform/ring0/entry_arm64.s @@ -0,0 +1,591 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "funcdata.h" +#include "textflag.h" + +// NB: Offsets are programatically generated (see BUILD). +// +// This file is concatenated with the definitions. + +// Saves a register set. +// +// This is a macro because it may need to executed in contents where a stack is +// not available for calls. +// + +#define ERET() \ + WORD $0xd69f03e0 + +#define RSV_REG R18_PLATFORM +#define RSV_REG_APP R9 + +#define FPEN_NOTRAP 0x3 +#define FPEN_SHIFT 20 + +#define FPEN_ENABLE (FPEN_NOTRAP << FPEN_SHIFT) + +#define REGISTERS_SAVE(reg, offset) \ + MOVD R0, offset+PTRACE_R0(reg); \ + MOVD R1, offset+PTRACE_R1(reg); \ + MOVD R2, offset+PTRACE_R2(reg); \ + MOVD R3, offset+PTRACE_R3(reg); \ + MOVD R4, offset+PTRACE_R4(reg); \ + MOVD R5, offset+PTRACE_R5(reg); \ + MOVD R6, offset+PTRACE_R6(reg); \ + MOVD R7, offset+PTRACE_R7(reg); \ + MOVD R8, offset+PTRACE_R8(reg); \ + MOVD R10, offset+PTRACE_R10(reg); \ + MOVD R11, offset+PTRACE_R11(reg); \ + MOVD R12, offset+PTRACE_R12(reg); \ + MOVD R13, offset+PTRACE_R13(reg); \ + MOVD R14, offset+PTRACE_R14(reg); \ + MOVD R15, offset+PTRACE_R15(reg); \ + MOVD R16, offset+PTRACE_R16(reg); \ + MOVD R17, offset+PTRACE_R17(reg); \ + MOVD R19, offset+PTRACE_R19(reg); \ + MOVD R20, offset+PTRACE_R20(reg); \ + MOVD R21, offset+PTRACE_R21(reg); \ + MOVD R22, offset+PTRACE_R22(reg); \ + MOVD R23, offset+PTRACE_R23(reg); \ + MOVD R24, offset+PTRACE_R24(reg); \ + MOVD R25, offset+PTRACE_R25(reg); \ + MOVD R26, offset+PTRACE_R26(reg); \ + MOVD R27, offset+PTRACE_R27(reg); \ + MOVD g, offset+PTRACE_R28(reg); \ + MOVD R29, offset+PTRACE_R29(reg); \ + MOVD R30, offset+PTRACE_R30(reg); + +#define REGISTERS_LOAD(reg, offset) \ + MOVD offset+PTRACE_R0(reg), R0; \ + MOVD offset+PTRACE_R1(reg), R1; \ + MOVD offset+PTRACE_R2(reg), R2; \ + MOVD offset+PTRACE_R3(reg), R3; \ + MOVD offset+PTRACE_R4(reg), R4; \ + MOVD offset+PTRACE_R5(reg), R5; \ + MOVD offset+PTRACE_R6(reg), R6; \ + MOVD offset+PTRACE_R7(reg), R7; \ + MOVD offset+PTRACE_R8(reg), R8; \ + MOVD offset+PTRACE_R10(reg), R10; \ + MOVD offset+PTRACE_R11(reg), R11; \ + MOVD offset+PTRACE_R12(reg), R12; \ + MOVD offset+PTRACE_R13(reg), R13; \ + MOVD offset+PTRACE_R14(reg), R14; \ + MOVD offset+PTRACE_R15(reg), R15; \ + MOVD offset+PTRACE_R16(reg), R16; \ + MOVD offset+PTRACE_R17(reg), R17; \ + MOVD offset+PTRACE_R19(reg), R19; \ + MOVD offset+PTRACE_R20(reg), R20; \ + MOVD offset+PTRACE_R21(reg), R21; \ + MOVD offset+PTRACE_R22(reg), R22; \ + MOVD offset+PTRACE_R23(reg), R23; \ + MOVD offset+PTRACE_R24(reg), R24; \ + MOVD offset+PTRACE_R25(reg), R25; \ + MOVD offset+PTRACE_R26(reg), R26; \ + MOVD offset+PTRACE_R27(reg), R27; \ + MOVD offset+PTRACE_R28(reg), g; \ + MOVD offset+PTRACE_R29(reg), R29; \ + MOVD offset+PTRACE_R30(reg), R30; + +//NOP +#define nop31Instructions() \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; + +#define ESR_ELx_EC_UNKNOWN (0x00) +#define ESR_ELx_EC_WFx (0x01) +/* Unallocated EC: 0x02 */ +#define ESR_ELx_EC_CP15_32 (0x03) +#define ESR_ELx_EC_CP15_64 (0x04) +#define ESR_ELx_EC_CP14_MR (0x05) +#define ESR_ELx_EC_CP14_LS (0x06) +#define ESR_ELx_EC_FP_ASIMD (0x07) +#define ESR_ELx_EC_CP10_ID (0x08) /* EL2 only */ +#define ESR_ELx_EC_PAC (0x09) /* EL2 and above */ +/* Unallocated EC: 0x0A - 0x0B */ +#define ESR_ELx_EC_CP14_64 (0x0C) +/* Unallocated EC: 0x0d */ +#define ESR_ELx_EC_ILL (0x0E) +/* Unallocated EC: 0x0F - 0x10 */ +#define ESR_ELx_EC_SVC32 (0x11) +#define ESR_ELx_EC_HVC32 (0x12) /* EL2 only */ +#define ESR_ELx_EC_SMC32 (0x13) /* EL2 and above */ +/* Unallocated EC: 0x14 */ +#define ESR_ELx_EC_SVC64 (0x15) +#define ESR_ELx_EC_HVC64 (0x16) /* EL2 and above */ +#define ESR_ELx_EC_SMC64 (0x17) /* EL2 and above */ +#define ESR_ELx_EC_SYS64 (0x18) +#define ESR_ELx_EC_SVE (0x19) +/* Unallocated EC: 0x1A - 0x1E */ +#define ESR_ELx_EC_IMP_DEF (0x1f) /* EL3 only */ +#define ESR_ELx_EC_IABT_LOW (0x20) +#define ESR_ELx_EC_IABT_CUR (0x21) +#define ESR_ELx_EC_PC_ALIGN (0x22) +/* Unallocated EC: 0x23 */ +#define ESR_ELx_EC_DABT_LOW (0x24) +#define ESR_ELx_EC_DABT_CUR (0x25) +#define ESR_ELx_EC_SP_ALIGN (0x26) +/* Unallocated EC: 0x27 */ +#define ESR_ELx_EC_FP_EXC32 (0x28) +/* Unallocated EC: 0x29 - 0x2B */ +#define ESR_ELx_EC_FP_EXC64 (0x2C) +/* Unallocated EC: 0x2D - 0x2E */ +#define ESR_ELx_EC_SERROR (0x2F) +#define ESR_ELx_EC_BREAKPT_LOW (0x30) +#define ESR_ELx_EC_BREAKPT_CUR (0x31) +#define ESR_ELx_EC_SOFTSTP_LOW (0x32) +#define ESR_ELx_EC_SOFTSTP_CUR (0x33) +#define ESR_ELx_EC_WATCHPT_LOW (0x34) +#define ESR_ELx_EC_WATCHPT_CUR (0x35) +/* Unallocated EC: 0x36 - 0x37 */ +#define ESR_ELx_EC_BKPT32 (0x38) +/* Unallocated EC: 0x39 */ +#define ESR_ELx_EC_VECTOR32 (0x3A) /* EL2 only */ +/* Unallocted EC: 0x3B */ +#define ESR_ELx_EC_BRK64 (0x3C) +/* Unallocated EC: 0x3D - 0x3F */ +#define ESR_ELx_EC_MAX (0x3F) + +#define ESR_ELx_EC_SHIFT (26) +#define ESR_ELx_EC_MASK (UL(0x3F) << ESR_ELx_EC_SHIFT) +#define ESR_ELx_EC(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) + +#define ESR_ELx_IL_SHIFT (25) +#define ESR_ELx_IL (UL(1) << ESR_ELx_IL_SHIFT) +#define ESR_ELx_ISS_MASK (ESR_ELx_IL - 1) + +/* ISS field definitions shared by different classes */ +#define ESR_ELx_WNR_SHIFT (6) +#define ESR_ELx_WNR (UL(1) << ESR_ELx_WNR_SHIFT) + +/* Asynchronous Error Type */ +#define ESR_ELx_IDS_SHIFT (24) +#define ESR_ELx_IDS (UL(1) << ESR_ELx_IDS_SHIFT) +#define ESR_ELx_AET_SHIFT (10) +#define ESR_ELx_AET (UL(0x7) << ESR_ELx_AET_SHIFT) + +#define ESR_ELx_AET_UC (UL(0) << ESR_ELx_AET_SHIFT) +#define ESR_ELx_AET_UEU (UL(1) << ESR_ELx_AET_SHIFT) +#define ESR_ELx_AET_UEO (UL(2) << ESR_ELx_AET_SHIFT) +#define ESR_ELx_AET_UER (UL(3) << ESR_ELx_AET_SHIFT) +#define ESR_ELx_AET_CE (UL(6) << ESR_ELx_AET_SHIFT) + +/* Shared ISS field definitions for Data/Instruction aborts */ +#define ESR_ELx_SET_SHIFT (11) +#define ESR_ELx_SET_MASK (UL(3) << ESR_ELx_SET_SHIFT) +#define ESR_ELx_FnV_SHIFT (10) +#define ESR_ELx_FnV (UL(1) << ESR_ELx_FnV_SHIFT) +#define ESR_ELx_EA_SHIFT (9) +#define ESR_ELx_EA (UL(1) << ESR_ELx_EA_SHIFT) +#define ESR_ELx_S1PTW_SHIFT (7) +#define ESR_ELx_S1PTW (UL(1) << ESR_ELx_S1PTW_SHIFT) + +/* Shared ISS fault status code(IFSC/DFSC) for Data/Instruction aborts */ +#define ESR_ELx_FSC (0x3F) +#define ESR_ELx_FSC_TYPE (0x3C) +#define ESR_ELx_FSC_EXTABT (0x10) +#define ESR_ELx_FSC_SERROR (0x11) +#define ESR_ELx_FSC_ACCESS (0x08) +#define ESR_ELx_FSC_FAULT (0x04) +#define ESR_ELx_FSC_PERM (0x0C) + +/* ISS field definitions for Data Aborts */ +#define ESR_ELx_ISV_SHIFT (24) +#define ESR_ELx_ISV (UL(1) << ESR_ELx_ISV_SHIFT) +#define ESR_ELx_SAS_SHIFT (22) +#define ESR_ELx_SAS (UL(3) << ESR_ELx_SAS_SHIFT) +#define ESR_ELx_SSE_SHIFT (21) +#define ESR_ELx_SSE (UL(1) << ESR_ELx_SSE_SHIFT) +#define ESR_ELx_SRT_SHIFT (16) +#define ESR_ELx_SRT_MASK (UL(0x1F) << ESR_ELx_SRT_SHIFT) +#define ESR_ELx_SF_SHIFT (15) +#define ESR_ELx_SF (UL(1) << ESR_ELx_SF_SHIFT) +#define ESR_ELx_AR_SHIFT (14) +#define ESR_ELx_AR (UL(1) << ESR_ELx_AR_SHIFT) +#define ESR_ELx_CM_SHIFT (8) +#define ESR_ELx_CM (UL(1) << ESR_ELx_CM_SHIFT) + +/* ISS field definitions for exceptions taken in to Hyp */ +#define ESR_ELx_CV (UL(1) << 24) +#define ESR_ELx_COND_SHIFT (20) +#define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT) +#define ESR_ELx_WFx_ISS_TI (UL(1) << 0) +#define ESR_ELx_WFx_ISS_WFI (UL(0) << 0) +#define ESR_ELx_WFx_ISS_WFE (UL(1) << 0) +#define ESR_ELx_xVC_IMM_MASK ((1UL << 16) - 1) + +#define LOAD_KERNEL_ADDRESS(from, to) \ + MOVD from, to; \ + ORR $0xffff000000000000, to, to; + +// LOAD_KERNEL_STACK loads the kernel temporary stack. +#define LOAD_KERNEL_STACK(from) \ + LOAD_KERNEL_ADDRESS(CPU_SELF(from), RSV_REG); \ + MOVD $CPU_STACK_TOP(RSV_REG), RSV_REG; \ + MOVD RSV_REG, RSP; \ + ISB $15; \ + DSB $15; + +#define SWITCH_TO_APP_PAGETABLE(from) \ + MOVD CPU_TTBR0_APP(from), RSV_REG; \ + WORD $0xd5182012; \ // MSR R18, TTBR0_EL1 + ISB $15; \ + DSB $15; + +#define SWITCH_TO_KVM_PAGETABLE(from) \ + MOVD CPU_TTBR0_KVM(from), RSV_REG; \ + WORD $0xd5182012; \ // MSR R18, TTBR0_EL1 + ISB $15; \ + DSB $15; + +#define IRQ_ENABLE \ + MSR $2, DAIFSet; + +#define IRQ_DISABLE \ + MSR $2, DAIFClr; + +#define VFP_ENABLE \ + MOVD $FPEN_ENABLE, R0; \ + WORD $0xd5181040; \ //MSR R0, CPACR_EL1 + ISB $15; + +#define VFP_DISABLE \ + MOVD $0x0, R0; \ + WORD $0xd5181040; \ //MSR R0, CPACR_EL1 + ISB $15; + +#define KERNEL_ENTRY_FROM_EL0 \ + SUB $16, RSP, RSP; \ // step1, save r18, r9 into kernel temporary stack. + STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \ + WORD $0xd538d092; \ //MRS TPIDR_EL1, R18, step2, switch user pagetable. + SWITCH_TO_KVM_PAGETABLE(RSV_REG); \ + WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 + MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP; \ // step3, load app context pointer. + REGISTERS_SAVE(RSV_REG_APP, 0); \ // step4, save app context. + MOVD RSV_REG_APP, R20; \ + LDP 16*0(RSP), (RSV_REG, RSV_REG_APP); \ + ADD $16, RSP, RSP; \ + MOVD RSV_REG, PTRACE_R18(R20); \ + MOVD RSV_REG_APP, PTRACE_R9(R20); \ + MOVD R20, RSV_REG_APP; \ + WORD $0xd5384003; \ // MRS SPSR_EL1, R3 + MOVD R3, PTRACE_PSTATE(RSV_REG_APP); \ + MRS ELR_EL1, R3; \ + MOVD R3, PTRACE_PC(RSV_REG_APP); \ + WORD $0xd5384103; \ // MRS SP_EL0, R3 + MOVD R3, PTRACE_SP(RSV_REG_APP); + +#define KERNEL_ENTRY_FROM_EL1 \ + WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 + REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \ // save sentry context + MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG); \ + WORD $0xd5384004; \ // MRS SPSR_EL1, R4 + MOVD R4, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG); \ + MRS ELR_EL1, R4; \ + MOVD R4, CPU_REGISTERS+PTRACE_PC(RSV_REG); \ + MOVD RSP, R4; \ + MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); + +TEXT ·Halt(SB),NOSPLIT,$0 + // clear bluepill. + WORD $0xd538d092 //MRS TPIDR_EL1, R18 + CMP RSV_REG, R9 + BNE mmio_exit + MOVD $0, CPU_REGISTERS+PTRACE_R9(RSV_REG) +mmio_exit: + // Disable fpsimd. + WORD $0xd5381041 // MRS CPACR_EL1, R1 + MOVD R1, CPU_LAZY_VFP(RSV_REG) + VFP_DISABLE + + // MMIO_EXIT. + MOVD $0, R9 + MOVD R0, 0xffff000000001000(R9) + B ·kernelExitToEl1(SB) + +TEXT ·Shutdown(SB),NOSPLIT,$0 + // PSCI EVENT. + MOVD $0x84000009, R0 + HVC $0 + +// See kernel.go. +TEXT ·Current(SB),NOSPLIT,$0-8 + MOVD CPU_SELF(RSV_REG), R8 + MOVD R8, ret+0(FP) + RET + +#define STACK_FRAME_SIZE 16 + +TEXT ·kernelExitToEl0(SB),NOSPLIT,$0 + ERET() + +TEXT ·kernelExitToEl1(SB),NOSPLIT,$0 + ERET() + +TEXT ·Start(SB),NOSPLIT,$0 + IRQ_DISABLE + MOVD R8, RSV_REG + ORR $0xffff000000000000, RSV_REG, RSV_REG + WORD $0xd518d092 //MSR R18, TPIDR_EL1 + + B ·kernelExitToEl1(SB) + +TEXT ·El1_sync_invalid(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El1_irq_invalid(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El1_fiq_invalid(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El1_error_invalid(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El1_sync(SB),NOSPLIT,$0 + KERNEL_ENTRY_FROM_EL1 + WORD $0xd5385219 // MRS ESR_EL1, R25 + LSR $ESR_ELx_EC_SHIFT, R25, R24 + CMP $ESR_ELx_EC_DABT_CUR, R24 + BEQ el1_da + CMP $ESR_ELx_EC_IABT_CUR, R24 + BEQ el1_ia + CMP $ESR_ELx_EC_SYS64, R24 + BEQ el1_undef + CMP $ESR_ELx_EC_SP_ALIGN, R24 + BEQ el1_sp_pc + CMP $ESR_ELx_EC_PC_ALIGN, R24 + BEQ el1_sp_pc + CMP $ESR_ELx_EC_UNKNOWN, R24 + BEQ el1_undef + CMP $ESR_ELx_EC_SVC64, R24 + BEQ el1_svc + CMP $ESR_ELx_EC_BREAKPT_CUR, R24 + BGE el1_dbg + CMP $ESR_ELx_EC_FP_ASIMD, R24 + BEQ el1_fpsimd_acc + B el1_invalid + +el1_da: + B ·Halt(SB) + +el1_ia: + B ·Halt(SB) + +el1_sp_pc: + B ·Shutdown(SB) + +el1_undef: + B ·Shutdown(SB) + +el1_svc: + B ·Halt(SB) + +el1_dbg: + B ·Shutdown(SB) + +el1_fpsimd_acc: + VFP_ENABLE + B ·kernelExitToEl1(SB) // Resume. + +el1_invalid: + B ·Shutdown(SB) + +TEXT ·El1_irq(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El1_fiq(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El1_error(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El0_sync(SB),NOSPLIT,$0 + KERNEL_ENTRY_FROM_EL0 + WORD $0xd5385219 // MRS ESR_EL1, R25 + LSR $ESR_ELx_EC_SHIFT, R25, R24 + CMP $ESR_ELx_EC_SVC64, R24 + BEQ el0_svc + CMP $ESR_ELx_EC_DABT_LOW, R24 + BEQ el0_da + CMP $ESR_ELx_EC_IABT_LOW, R24 + BEQ el0_ia + CMP $ESR_ELx_EC_FP_ASIMD, R24 + BEQ el0_fpsimd_acc + CMP $ESR_ELx_EC_SVE, R24 + BEQ el0_sve_acc + CMP $ESR_ELx_EC_FP_EXC64, R24 + BEQ el0_fpsimd_exc + CMP $ESR_ELx_EC_SP_ALIGN, R24 + BEQ el0_sp_pc + CMP $ESR_ELx_EC_PC_ALIGN, R24 + BEQ el0_sp_pc + CMP $ESR_ELx_EC_UNKNOWN, R24 + BEQ el0_undef + CMP $ESR_ELx_EC_BREAKPT_LOW, R24 + BGE el0_dbg + B el0_invalid + +el0_svc: + B ·Halt(SB) + +el0_da: + B ·Halt(SB) + +el0_ia: + B ·Shutdown(SB) + +el0_fpsimd_acc: + B ·Shutdown(SB) + +el0_sve_acc: + B ·Shutdown(SB) + +el0_fpsimd_exc: + B ·Shutdown(SB) + +el0_sp_pc: + B ·Shutdown(SB) + +el0_undef: + B ·Shutdown(SB) + +el0_dbg: + B ·Shutdown(SB) + +el0_invalid: + B ·Shutdown(SB) + +TEXT ·El0_irq(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El0_fiq(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El0_error(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El0_sync_invalid(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El0_irq_invalid(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El0_fiq_invalid(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El0_error_invalid(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·Vectors(SB),NOSPLIT,$0 + B ·El1_sync_invalid(SB) + nop31Instructions() + B ·El1_irq_invalid(SB) + nop31Instructions() + B ·El1_fiq_invalid(SB) + nop31Instructions() + B ·El1_error_invalid(SB) + nop31Instructions() + + B ·El1_sync(SB) + nop31Instructions() + B ·El1_irq(SB) + nop31Instructions() + B ·El1_fiq(SB) + nop31Instructions() + B ·El1_error(SB) + nop31Instructions() + + B ·El0_sync(SB) + nop31Instructions() + B ·El0_irq(SB) + nop31Instructions() + B ·El0_fiq(SB) + nop31Instructions() + B ·El0_error(SB) + nop31Instructions() + + B ·El0_sync_invalid(SB) + nop31Instructions() + B ·El0_irq_invalid(SB) + nop31Instructions() + B ·El0_fiq_invalid(SB) + nop31Instructions() + B ·El0_error_invalid(SB) + nop31Instructions() + + WORD $0xd503201f //nop + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD index d7029d5a9..42076fb04 100644 --- a/pkg/sentry/platform/ring0/gen_offsets/BUILD +++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD @@ -1,20 +1,27 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) -load("//tools/go_generics:defs.bzl", "go_template_instance") +go_template_instance( + name = "defs_impl_arm64", + out = "defs_impl_arm64.go", + package = "main", + template = "//pkg/sentry/platform/ring0:defs_arm64", +) go_template_instance( - name = "defs_impl", - out = "defs_impl.go", + name = "defs_impl_amd64", + out = "defs_impl_amd64.go", package = "main", - template = "//pkg/sentry/platform/ring0:defs", + template = "//pkg/sentry/platform/ring0:defs_amd64", ) go_binary( name = "gen_offsets", srcs = [ - "defs_impl.go", + "defs_impl_amd64.go", + "defs_impl_arm64.go", "main.go", ], visibility = ["//pkg/sentry/platform/ring0:__pkg__"], diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go new file mode 100644 index 000000000..ed82a131e --- /dev/null +++ b/pkg/sentry/platform/ring0/kernel_arm64.go @@ -0,0 +1,58 @@ +// Copyright 2019 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package ring0 + +// init initializes architecture-specific state. +func (k *Kernel) init(opts KernelOpts) { + // Save the root page tables. + k.PageTables = opts.PageTables +} + +// init initializes architecture-specific state. +func (c *CPU) init() { + // Set the kernel stack pointer(virtual address). + c.registers.Sp = uint64(c.StackTop()) + +} + +// StackTop returns the kernel's stack address. +// +//go:nosplit +func (c *CPU) StackTop() uint64 { + return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack)) +} + +// IsCanonical indicates whether addr is canonical per the arm64 spec. +// +//go:nosplit +func IsCanonical(addr uint64) bool { + return addr <= 0x0000ffffffffffff || addr > 0xffff000000000000 +} + +//go:nosplit +func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { + // Sanitize registers. + regs := switchOpts.Registers + + regs.Pstate &= ^uint64(UserFlagsClear) + regs.Pstate |= UserFlagsSet + kernelExitToEl0() + vector = c.vecCode + + // Perform the switch. + return +} diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go new file mode 100644 index 000000000..8bcfe1032 --- /dev/null +++ b/pkg/sentry/platform/ring0/lib_arm64.go @@ -0,0 +1,39 @@ +// Copyright 2019 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package ring0 + +// CPACREL1 returns the value of the CPACR_EL1 register. +func CPACREL1() (value uintptr) + +// FPCR returns the value of FPCR register. +func FPCR() (value uintptr) + +// SetFPCR writes the FPCR value. +func SetFPCR(value uintptr) + +// FPSR returns the value of FPSR register. +func FPSR() (value uintptr) + +// SetFPSR writes the FPSR value. +func SetFPSR(value uintptr) + +// SaveVRegs saves V0-V31 registers. +// V0-V31: 32 128-bit registers for floating point and simd. +func SaveVRegs(*byte) + +// LoadVRegs loads V0-V31 registers. +func LoadVRegs(*byte) diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s new file mode 100644 index 000000000..1c9171004 --- /dev/null +++ b/pkg/sentry/platform/ring0/lib_arm64.s @@ -0,0 +1,118 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +TEXT ·CPACREL1(SB),NOSPLIT,$0-8 + WORD $0xd5381041 // MRS CPACR_EL1, R1 + MOVD R1, ret+0(FP) + RET + +TEXT ·FPCR(SB),NOSPLIT,$0-8 + WORD $0xd53b4201 // MRS NZCV, R1 + MOVD R1, ret+0(FP) + RET + +TEXT ·GetFPSR(SB),NOSPLIT,$0-8 + WORD $0xd53b4421 // MRS FPSR, R1 + MOVD R1, ret+0(FP) + RET + +TEXT ·FPCR(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R1 + WORD $0xd51b4201 // MSR R1, NZCV + RET + +TEXT ·SetFPSR(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R1 + WORD $0xd51b4421 // MSR R1, FPSR + RET + +TEXT ·SaveVRegs(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R0 + + // Skip aarch64_ctx, fpsr, fpcr. + FMOVD F0, 16*1(R0) + FMOVD F1, 16*2(R0) + FMOVD F2, 16*3(R0) + FMOVD F3, 16*4(R0) + FMOVD F4, 16*5(R0) + FMOVD F5, 16*6(R0) + FMOVD F6, 16*7(R0) + FMOVD F7, 16*8(R0) + FMOVD F8, 16*9(R0) + FMOVD F9, 16*10(R0) + FMOVD F10, 16*11(R0) + FMOVD F11, 16*12(R0) + FMOVD F12, 16*13(R0) + FMOVD F13, 16*14(R0) + FMOVD F14, 16*15(R0) + FMOVD F15, 16*16(R0) + FMOVD F16, 16*17(R0) + FMOVD F17, 16*18(R0) + FMOVD F18, 16*19(R0) + FMOVD F19, 16*20(R0) + FMOVD F20, 16*21(R0) + FMOVD F21, 16*22(R0) + FMOVD F22, 16*23(R0) + FMOVD F23, 16*24(R0) + FMOVD F24, 16*25(R0) + FMOVD F25, 16*26(R0) + FMOVD F26, 16*27(R0) + FMOVD F27, 16*28(R0) + FMOVD F28, 16*29(R0) + FMOVD F29, 16*30(R0) + FMOVD F30, 16*31(R0) + FMOVD F31, 16*32(R0) + ISB $15 + + RET + +TEXT ·LoadVRegs(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R0 + + // Skip aarch64_ctx, fpsr, fpcr. + FMOVD 16*1(R0), F0 + FMOVD 16*2(R0), F1 + FMOVD 16*3(R0), F2 + FMOVD 16*4(R0), F3 + FMOVD 16*5(R0), F4 + FMOVD 16*6(R0), F5 + FMOVD 16*7(R0), F6 + FMOVD 16*8(R0), F7 + FMOVD 16*9(R0), F8 + FMOVD 16*10(R0), F9 + FMOVD 16*11(R0), F10 + FMOVD 16*12(R0), F11 + FMOVD 16*13(R0), F12 + FMOVD 16*14(R0), F13 + FMOVD 16*15(R0), F14 + FMOVD 16*16(R0), F15 + FMOVD 16*17(R0), F16 + FMOVD 16*18(R0), F17 + FMOVD 16*19(R0), F18 + FMOVD 16*20(R0), F19 + FMOVD 16*21(R0), F20 + FMOVD 16*22(R0), F21 + FMOVD 16*23(R0), F22 + FMOVD 16*24(R0), F23 + FMOVD 16*25(R0), F24 + FMOVD 16*26(R0), F25 + FMOVD 16*27(R0), F26 + FMOVD 16*28(R0), F27 + FMOVD 16*29(R0), F28 + FMOVD 16*30(R0), F29 + FMOVD 16*31(R0), F30 + FMOVD 16*32(R0), F31 + ISB $15 + + RET diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go new file mode 100644 index 000000000..cd2a65f97 --- /dev/null +++ b/pkg/sentry/platform/ring0/offsets_arm64.go @@ -0,0 +1,125 @@ +// Copyright 2019 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package ring0 + +import ( + "fmt" + "io" + "reflect" + "syscall" +) + +// Emit prints architecture-specific offsets. +func Emit(w io.Writer) { + fmt.Fprintf(w, "// Automatically generated, do not edit.\n") + + c := &CPU{} + fmt.Fprintf(w, "\n// CPU offsets.\n") + fmt.Fprintf(w, "#define CPU_SELF 0x%02x\n", reflect.ValueOf(&c.self).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_STACK_TOP 0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack))) + fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_FAULT_ADDR 0x%02x\n", reflect.ValueOf(&c.faultAddr).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_TTBR0_KVM 0x%02x\n", reflect.ValueOf(&c.ttbr0Kvm).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_TTBR0_APP 0x%02x\n", reflect.ValueOf(&c.ttbr0App).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_VECTOR_CODE 0x%02x\n", reflect.ValueOf(&c.vecCode).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_APP_ADDR 0x%02x\n", reflect.ValueOf(&c.appAddr).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_LAZY_VFP 0x%02x\n", reflect.ValueOf(&c.lazyVFP).Pointer()-reflect.ValueOf(c).Pointer()) + + fmt.Fprintf(w, "\n// Bits.\n") + fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet) + + fmt.Fprintf(w, "\n// Vectors.\n") + fmt.Fprintf(w, "#define El1SyncInvalid 0x%02x\n", El1SyncInvalid) + fmt.Fprintf(w, "#define El1IrqInvalid 0x%02x\n", El1IrqInvalid) + fmt.Fprintf(w, "#define El1FiqInvalid 0x%02x\n", El1FiqInvalid) + fmt.Fprintf(w, "#define El1ErrorInvalid 0x%02x\n", El1ErrorInvalid) + + fmt.Fprintf(w, "#define El1Sync 0x%02x\n", El1Sync) + fmt.Fprintf(w, "#define El1Irq 0x%02x\n", El1Irq) + fmt.Fprintf(w, "#define El1Fiq 0x%02x\n", El1Fiq) + fmt.Fprintf(w, "#define El1Error 0x%02x\n", El1Error) + + fmt.Fprintf(w, "#define El0Sync 0x%02x\n", El0Sync) + fmt.Fprintf(w, "#define El0Irq 0x%02x\n", El0Irq) + fmt.Fprintf(w, "#define El0Fiq 0x%02x\n", El0Fiq) + fmt.Fprintf(w, "#define El0Error 0x%02x\n", El0Error) + + fmt.Fprintf(w, "#define El0Sync_invalid 0x%02x\n", El0Sync_invalid) + fmt.Fprintf(w, "#define El0Irq_invalid 0x%02x\n", El0Irq_invalid) + fmt.Fprintf(w, "#define El0Fiq_invalid 0x%02x\n", El0Fiq_invalid) + fmt.Fprintf(w, "#define El0Error_invalid 0x%02x\n", El0Error_invalid) + + fmt.Fprintf(w, "#define El1Sync_da 0x%02x\n", El1Sync_da) + fmt.Fprintf(w, "#define El1Sync_ia 0x%02x\n", El1Sync_ia) + fmt.Fprintf(w, "#define El1Sync_sp_pc 0x%02x\n", El1Sync_sp_pc) + fmt.Fprintf(w, "#define El1Sync_undef 0x%02x\n", El1Sync_undef) + fmt.Fprintf(w, "#define El1Sync_dbg 0x%02x\n", El1Sync_dbg) + fmt.Fprintf(w, "#define El1Sync_inv 0x%02x\n", El1Sync_inv) + + fmt.Fprintf(w, "#define El0Sync_svc 0x%02x\n", El0Sync_svc) + fmt.Fprintf(w, "#define El0Sync_da 0x%02x\n", El0Sync_da) + fmt.Fprintf(w, "#define El0Sync_ia 0x%02x\n", El0Sync_ia) + fmt.Fprintf(w, "#define El0Sync_fpsimd_acc 0x%02x\n", El0Sync_fpsimd_acc) + fmt.Fprintf(w, "#define El0Sync_sve_acc 0x%02x\n", El0Sync_sve_acc) + fmt.Fprintf(w, "#define El0Sync_sys 0x%02x\n", El0Sync_sys) + fmt.Fprintf(w, "#define El0Sync_sp_pc 0x%02x\n", El0Sync_sp_pc) + fmt.Fprintf(w, "#define El0Sync_undef 0x%02x\n", El0Sync_undef) + fmt.Fprintf(w, "#define El0Sync_dbg 0x%02x\n", El0Sync_dbg) + fmt.Fprintf(w, "#define El0Sync_inv 0x%02x\n", El0Sync_inv) + + fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault) + fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall) + + p := &syscall.PtraceRegs{} + fmt.Fprintf(w, "\n// Ptrace registers.\n") + fmt.Fprintf(w, "#define PTRACE_R0 0x%02x\n", reflect.ValueOf(&p.Regs[0]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R1 0x%02x\n", reflect.ValueOf(&p.Regs[1]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R2 0x%02x\n", reflect.ValueOf(&p.Regs[2]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R3 0x%02x\n", reflect.ValueOf(&p.Regs[3]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R4 0x%02x\n", reflect.ValueOf(&p.Regs[4]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R5 0x%02x\n", reflect.ValueOf(&p.Regs[5]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R6 0x%02x\n", reflect.ValueOf(&p.Regs[6]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R7 0x%02x\n", reflect.ValueOf(&p.Regs[7]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R8 0x%02x\n", reflect.ValueOf(&p.Regs[8]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R9 0x%02x\n", reflect.ValueOf(&p.Regs[9]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R10 0x%02x\n", reflect.ValueOf(&p.Regs[10]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R11 0x%02x\n", reflect.ValueOf(&p.Regs[11]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R12 0x%02x\n", reflect.ValueOf(&p.Regs[12]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R13 0x%02x\n", reflect.ValueOf(&p.Regs[13]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R14 0x%02x\n", reflect.ValueOf(&p.Regs[14]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R15 0x%02x\n", reflect.ValueOf(&p.Regs[15]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R16 0x%02x\n", reflect.ValueOf(&p.Regs[16]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R17 0x%02x\n", reflect.ValueOf(&p.Regs[17]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R18 0x%02x\n", reflect.ValueOf(&p.Regs[18]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R19 0x%02x\n", reflect.ValueOf(&p.Regs[19]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R20 0x%02x\n", reflect.ValueOf(&p.Regs[20]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R21 0x%02x\n", reflect.ValueOf(&p.Regs[21]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R22 0x%02x\n", reflect.ValueOf(&p.Regs[22]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R23 0x%02x\n", reflect.ValueOf(&p.Regs[23]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R24 0x%02x\n", reflect.ValueOf(&p.Regs[24]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R25 0x%02x\n", reflect.ValueOf(&p.Regs[25]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R26 0x%02x\n", reflect.ValueOf(&p.Regs[26]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R27 0x%02x\n", reflect.ValueOf(&p.Regs[27]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R28 0x%02x\n", reflect.ValueOf(&p.Regs[28]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R29 0x%02x\n", reflect.ValueOf(&p.Regs[29]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R30 0x%02x\n", reflect.ValueOf(&p.Regs[30]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_SP 0x%02x\n", reflect.ValueOf(&p.Sp).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_PC 0x%02x\n", reflect.ValueOf(&p.Pc).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_PSTATE 0x%02x\n", reflect.ValueOf(&p.Pstate).Pointer()-reflect.ValueOf(p).Pointer()) +} diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index 3b95af617..e2e15ba5c 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -1,14 +1,17 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") package(licenses = ["notice"]) -load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") +config_setting( + name = "aarch64", + constraint_values = ["@bazel_tools//platforms:aarch64"], +) go_template( name = "generic_walker", - srcs = [ - "walker_amd64.go", - ], + srcs = ["walker_amd64.go"], opt_types = [ "Visitor", ], @@ -76,9 +79,13 @@ go_library( "allocator.go", "allocator_unsafe.go", "pagetables.go", + "pagetables_aarch64.go", "pagetables_amd64.go", + "pagetables_arm64.go", "pagetables_x86.go", "pcids_x86.go", + "walker_amd64.go", + "walker_arm64.go", "walker_empty.go", "walker_lookup.go", "walker_map.go", @@ -97,6 +104,7 @@ go_test( size = "small", srcs = [ "pagetables_amd64_test.go", + "pagetables_arm64_test.go", "pagetables_test.go", "walker_check.go", ], diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index 904f1a6de..30c64a372 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -48,15 +48,6 @@ func New(a Allocator) *PageTables { return p } -// Init initializes a set of PageTables. -// -//go:nosplit -func (p *PageTables) Init(allocator Allocator) { - p.Allocator = allocator - p.root = p.Allocator.NewPTEs() - p.rootPhysical = p.Allocator.PhysicalFor(p.root) -} - // mapVisitor is used for map. type mapVisitor struct { target uintptr // Input. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go new file mode 100644 index 000000000..e78424766 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go @@ -0,0 +1,212 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package pagetables + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +// archPageTables is architecture-specific data. +type archPageTables struct { + // root is the pagetable root for kernel space. + root *PTEs + + // rootPhysical is the cached physical address of the root. + // + // This is saved only to prevent constant translation. + rootPhysical uintptr + + asid uint16 +} + +// TTBR0_EL1 returns the translation table base register 0. +// +//go:nosplit +func (p *PageTables) TTBR0_EL1(noFlush bool, asid uint16) uint64 { + return uint64(p.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset +} + +// TTBR1_EL1 returns the translation table base register 1. +// +//go:nosplit +func (p *PageTables) TTBR1_EL1(noFlush bool, asid uint16) uint64 { + return uint64(p.archPageTables.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset +} + +// Bits in page table entries. +const ( + typeTable = 0x3 << 0 + typeSect = 0x1 << 0 + typePage = 0x3 << 0 + pteValid = 0x1 << 0 + pteTableBit = 0x1 << 1 + pteTypeMask = 0x3 << 0 + present = pteValid | pteTableBit + user = 0x1 << 6 /* AP[1] */ + readOnly = 0x1 << 7 /* AP[2] */ + accessed = 0x1 << 10 + dbm = 0x1 << 51 + writable = dbm + cont = 0x1 << 52 + pxn = 0x1 << 53 + xn = 0x1 << 54 + dirty = 0x1 << 55 + nG = 0x1 << 11 + shared = 0x3 << 8 +) + +const ( + mtNormal = 0x4 << 2 +) + +const ( + executeDisable = xn + optionMask = 0xfff | 0xfff<<48 + protDefault = accessed | shared | mtNormal +) + +// MapOpts are x86 options. +type MapOpts struct { + // AccessType defines permissions. + AccessType usermem.AccessType + + // Global indicates the page is globally accessible. + Global bool + + // User indicates the page is a user page. + User bool +} + +// PTE is a page table entry. +type PTE uintptr + +// Clear clears this PTE, including sect page information. +// +//go:nosplit +func (p *PTE) Clear() { + atomic.StoreUintptr((*uintptr)(p), 0) +} + +// Valid returns true iff this entry is valid. +// +//go:nosplit +func (p *PTE) Valid() bool { + return atomic.LoadUintptr((*uintptr)(p))&present != 0 +} + +// Opts returns the PTE options. +// +// These are all options except Valid and Sect. +// +//go:nosplit +func (p *PTE) Opts() MapOpts { + v := atomic.LoadUintptr((*uintptr)(p)) + + return MapOpts{ + AccessType: usermem.AccessType{ + Read: true, + Write: v&readOnly == 0, + Execute: v&xn == 0, + }, + Global: v&nG == 0, + User: v&user != 0, + } +} + +// SetSect sets this page as a sect page. +// +// The page must not be valid or a panic will result. +// +//go:nosplit +func (p *PTE) SetSect() { + if p.Valid() { + // This is not allowed. + panic("SetSect called on valid page!") + } + atomic.StoreUintptr((*uintptr)(p), typeSect) +} + +// IsSect returns true iff this page is a sect page. +// +//go:nosplit +func (p *PTE) IsSect() bool { + return atomic.LoadUintptr((*uintptr)(p))&pteTypeMask == typeSect +} + +// Set sets this PTE value. +// +// This does not change the sect page property. +// +//go:nosplit +func (p *PTE) Set(addr uintptr, opts MapOpts) { + if !opts.AccessType.Any() { + p.Clear() + return + } + v := (addr &^ optionMask) | protDefault | nG | readOnly + + if p.IsSect() { + // Note that this is inherited from the previous instance. Set + // does not change the value of Sect. See above. + v |= typeSect + } else { + v |= typePage + } + + if opts.Global { + v = v &^ nG + } + + if opts.AccessType.Execute { + v = v &^ executeDisable + } else { + v |= executeDisable + } + if opts.AccessType.Write { + v = v &^ readOnly + } + + if opts.User { + v |= user + } else { + v = v &^ user + } + atomic.StoreUintptr((*uintptr)(p), v) +} + +// setPageTable sets this PTE value and forces the write bit and sect bit to +// be cleared. This is used explicitly for breaking sect pages. +// +//go:nosplit +func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) { + addr := pt.Allocator.PhysicalFor(ptes) + if addr&^optionMask != addr { + // This should never happen. + panic("unaligned physical address!") + } + v := addr | typeTable | protDefault + atomic.StoreUintptr((*uintptr)(p), v) +} + +// Address extracts the address. This should only be used if Valid returns true. +// +//go:nosplit +func (p *PTE) Address() uintptr { + return atomic.LoadUintptr((*uintptr)(p)) &^ optionMask +} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go index 7aa6c524e..0c153cf8c 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go @@ -41,5 +41,14 @@ const ( entriesPerPage = 512 ) +// Init initializes a set of PageTables. +// +//go:nosplit +func (p *PageTables) Init(allocator Allocator) { + p.Allocator = allocator + p.root = p.Allocator.NewPTEs() + p.rootPhysical = p.Allocator.PhysicalFor(p.root) +} + // PTEs is a collection of entries. type PTEs [entriesPerPage]PTE diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go new file mode 100644 index 000000000..1a49f12a2 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go @@ -0,0 +1,57 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pagetables + +// Address constraints. +// +// The lowerTop and upperBottom currently apply to four-level pagetables; +// additional refactoring would be necessary to support five-level pagetables. +const ( + lowerTop = 0x0000ffffffffffff + upperBottom = 0xffff000000000000 + pteShift = 12 + pmdShift = 21 + pudShift = 30 + pgdShift = 39 + + pteMask = 0x1ff << pteShift + pmdMask = 0x1ff << pmdShift + pudMask = 0x1ff << pudShift + pgdMask = 0x1ff << pgdShift + + pteSize = 1 << pteShift + pmdSize = 1 << pmdShift + pudSize = 1 << pudShift + pgdSize = 1 << pgdShift + + ttbrASIDOffset = 55 + ttbrASIDMask = 0xff + + entriesPerPage = 512 +) + +// Init initializes a set of PageTables. +// +//go:nosplit +func (p *PageTables) Init(allocator Allocator) { + p.Allocator = allocator + p.root = p.Allocator.NewPTEs() + p.rootPhysical = p.Allocator.PhysicalFor(p.root) + p.archPageTables.root = p.Allocator.NewPTEs() + p.archPageTables.rootPhysical = p.Allocator.PhysicalFor(p.archPageTables.root) +} + +// PTEs is a collection of entries. +type PTEs [entriesPerPage]PTE diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go new file mode 100644 index 000000000..254116233 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go @@ -0,0 +1,80 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package pagetables + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +func Test2MAnd4K(t *testing.T) { + pt := New(NewRuntimeAllocator()) + + // Map a small page and a huge page. + pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite, User: true}, pteSize*42) + pt.Map(0x0000ff0000000000, pmdSize, MapOpts{AccessType: usermem.Read, User: true}, pmdSize*47) + + pt.Map(0xffff000000400000, pteSize, MapOpts{AccessType: usermem.ReadWrite, User: false}, pteSize*42) + pt.Map(0xffffff0000000000, pmdSize, MapOpts{AccessType: usermem.Read, User: false}, pmdSize*47) + + checkMappings(t, pt, []mapping{ + {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite, User: true}}, + {0x0000ff0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: usermem.Read, User: true}}, + {0xffff000000400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite, User: false}}, + {0xffffff0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: usermem.Read, User: false}}, + }) +} + +func Test1GAnd4K(t *testing.T) { + pt := New(NewRuntimeAllocator()) + + // Map a small page and a super page. + pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite, User: true}, pteSize*42) + pt.Map(0x0000ff0000000000, pudSize, MapOpts{AccessType: usermem.Read, User: true}, pudSize*47) + + checkMappings(t, pt, []mapping{ + {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite, User: true}}, + {0x0000ff0000000000, pudSize, pudSize * 47, MapOpts{AccessType: usermem.Read, User: true}}, + }) +} + +func TestSplit1GPage(t *testing.T) { + pt := New(NewRuntimeAllocator()) + + // Map a super page and knock out the middle. + pt.Map(0x0000ff0000000000, pudSize, MapOpts{AccessType: usermem.Read, User: true}, pudSize*42) + pt.Unmap(usermem.Addr(0x0000ff0000000000+pteSize), pudSize-(2*pteSize)) + + checkMappings(t, pt, []mapping{ + {0x0000ff0000000000, pteSize, pudSize * 42, MapOpts{AccessType: usermem.Read, User: true}}, + {0x0000ff0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, MapOpts{AccessType: usermem.Read, User: true}}, + }) +} + +func TestSplit2MPage(t *testing.T) { + pt := New(NewRuntimeAllocator()) + + // Map a huge page and knock out the middle. + pt.Map(0x0000ff0000000000, pmdSize, MapOpts{AccessType: usermem.Read, User: true}, pmdSize*42) + pt.Unmap(usermem.Addr(0x0000ff0000000000+pteSize), pmdSize-(2*pteSize)) + + checkMappings(t, pt, []mapping{ + {0x0000ff0000000000, pteSize, pmdSize * 42, MapOpts{AccessType: usermem.Read, User: true}}, + {0x0000ff0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, MapOpts{AccessType: usermem.Read, User: true}}, + }) +} diff --git a/pkg/sentry/platform/ring0/pagetables/walker_arm64.go b/pkg/sentry/platform/ring0/pagetables/walker_arm64.go new file mode 100644 index 000000000..c261d393a --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/walker_arm64.go @@ -0,0 +1,314 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package pagetables + +// Visitor is a generic type. +type Visitor interface { + // visit is called on each PTE. + visit(start uintptr, pte *PTE, align uintptr) + + // requiresAlloc indicates that new entries should be allocated within + // the walked range. + requiresAlloc() bool + + // requiresSplit indicates that entries in the given range should be + // split if they are huge or jumbo pages. + requiresSplit() bool +} + +// Walker walks page tables. +type Walker struct { + // pageTables are the tables to walk. + pageTables *PageTables + + // Visitor is the set of arguments. + visitor Visitor +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The +// exception is sect pages. If a valid sect page (huge or jumbo) cannot be +// installed, then the walk will continue to individual entries. +// +// This algorithm will attempt to maximize the use of sect pages whenever +// possible. Whether a sect page is provided will be clear through the range +// provided in the callback. +// +// Note that if requiresAlloc is true, then no gaps will be present. However, +// if alloc is not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: start must be page-aligned. +// +// Precondition: start must be less than end. +// +// Precondition: If requiresAlloc is true, then start and end should not span +// non-canonical ranges. If they do, a panic will result. +// +//go:nosplit +func (w *Walker) iterateRange(start, end uintptr) { + if start%pteSize != 0 { + panic("unaligned start") + } + if end < start { + panic("start > end") + } + if start < lowerTop { + if end <= lowerTop { + w.iterateRangeCanonical(start, end) + } else if end > lowerTop && end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + w.iterateRangeCanonical(upperBottom, end) + } + } else if start < upperBottom { + if end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(upperBottom, end) + } + } else { + w.iterateRangeCanonical(start, end) + } +} + +// next returns the next address quantized by the given size. +// +//go:nosplit +func next(start uintptr, size uintptr) uintptr { + start &= ^(size - 1) + start += size + return start +} + +// iterateRangeCanonical walks a canonical range. +// +//go:nosplit +func (w *Walker) iterateRangeCanonical(start, end uintptr) { + pgdEntryIndex := w.pageTables.root + if start >= upperBottom { + pgdEntryIndex = w.pageTables.archPageTables.root + } + + for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + var ( + pgdEntry = &pgdEntryIndex[pgdIndex] + pudEntries *PTEs + ) + if !pgdEntry.Valid() { + if !w.visitor.requiresAlloc() { + // Skip over this entry. + start = next(start, pgdSize) + continue + } + + // Allocate a new pgd. + pudEntries = w.pageTables.Allocator.NewPTEs() + pgdEntry.setPageTable(w.pageTables, pudEntries) + } else { + pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) + } + + // Map the next level. + clearPUDEntries := uint16(0) + + for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + var ( + pudEntry = &pudEntries[pudIndex] + pmdEntries *PTEs + ) + if !pudEntry.Valid() { + if !w.visitor.requiresAlloc() { + // Skip over this entry. + clearPUDEntries++ + start = next(start, pudSize) + continue + } + + // This level has 1-GB sect pages. Is this + // entire region at least as large as a single + // PUD entry? If so, we can skip allocating a + // new page for the pmd. + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSect() + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = next(start, pudSize) + continue + } + } + + // Allocate a new pud. + pmdEntries = w.pageTables.Allocator.NewPTEs() + pudEntry.setPageTable(w.pageTables, pmdEntries) + + } else if pudEntry.IsSect() { + // Does this page need to be split? + if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < next(start, pudSize)) { + // Install the relevant entries. + pmdEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pmdEntries[index].SetSect() + pmdEntries[index].Set( + pudEntry.Address()+(pmdSize*uintptr(index)), + pudEntry.Opts()) + } + pudEntry.setPageTable(w.pageTables, pmdEntries) + } else { + // A sect page to be checked directly. + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + + // Might have been cleared. + if !pudEntry.Valid() { + clearPUDEntries++ + } + + // Note that the sect page was changed. + start = next(start, pudSize) + continue + } + + } else { + pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) + } + + // Map the next level, since this is valid. + clearPMDEntries := uint16(0) + + for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + var ( + pmdEntry = &pmdEntries[pmdIndex] + pteEntries *PTEs + ) + if !pmdEntry.Valid() { + if !w.visitor.requiresAlloc() { + // Skip over this entry. + clearPMDEntries++ + start = next(start, pmdSize) + continue + } + + // This level has 2-MB huge pages. If this + // region is contined in a single PMD entry? + // As above, we can skip allocating a new page. + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSect() + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = next(start, pmdSize) + continue + } + } + + // Allocate a new pmd. + pteEntries = w.pageTables.Allocator.NewPTEs() + pmdEntry.setPageTable(w.pageTables, pteEntries) + + } else if pmdEntry.IsSect() { + // Does this page need to be split? + if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < next(start, pmdSize)) { + // Install the relevant entries. + pteEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pteEntries[index].Set( + pmdEntry.Address()+(pteSize*uintptr(index)), + pmdEntry.Opts()) + } + pmdEntry.setPageTable(w.pageTables, pteEntries) + } else { + // A huge page to be checked directly. + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + + // Might have been cleared. + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + // Note that the huge page was changed. + start = next(start, pmdSize) + continue + } + + } else { + pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) + } + + // Map the next level, since this is valid. + clearPTEEntries := uint16(0) + + for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + var ( + pteEntry = &pteEntries[pteIndex] + ) + if !pteEntry.Valid() && !w.visitor.requiresAlloc() { + clearPTEEntries++ + start += pteSize + continue + } + + // At this point, we are guaranteed that start%pteSize == 0. + w.visitor.visit(uintptr(start), pteEntry, pteSize-1) + if !pteEntry.Valid() { + if w.visitor.requiresAlloc() { + panic("PTE not set after iteration with requiresAlloc!") + } + clearPTEEntries++ + } + + // Note that the pte was changed. + start += pteSize + continue + } + + // Check if we no longer need this page. + if clearPTEEntries == entriesPerPage { + pmdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pteEntries) + clearPMDEntries++ + } + } + + // Check if we no longer need this page. + if clearPMDEntries == entriesPerPage { + pudEntry.Clear() + w.pageTables.Allocator.FreePTEs(pmdEntries) + clearPUDEntries++ + } + } + + // Check if we no longer need this page. + if clearPUDEntries == entriesPerPage { + pgdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pudEntries) + } + } +} diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD index 924d8a6d6..6769cd0a5 100644 --- a/pkg/sentry/platform/safecopy/BUILD +++ b/pkg/sentry/platform/safecopy/BUILD @@ -1,4 +1,5 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") package(licenses = ["notice"]) diff --git a/pkg/sentry/safemem/BUILD b/pkg/sentry/safemem/BUILD index fd6dc8e6e..884020f7b 100644 --- a/pkg/sentry/safemem/BUILD +++ b/pkg/sentry/safemem/BUILD @@ -1,4 +1,5 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") package(licenses = ["notice"]) diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index 2f65db70b..ba1f9043d 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -16,7 +16,6 @@ package sighandling import ( - "fmt" "os" "os/signal" "reflect" @@ -31,37 +30,25 @@ const numSignals = 32 // handleSignals listens for incoming signals and calls the given handler // function. // -// It starts when the start channel is closed, stops when the stop channel -// is closed, and closes done once it will no longer deliver signals to k. -func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), start, stop, done chan struct{}) { +// It stops when the stop channel is closed. The done channel is closed once it +// will no longer deliver signals to k. +func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), stop, done chan struct{}) { // Build a select case. - sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(start)}} + sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(stop)}} for _, sigchan := range sigchans { sc = append(sc, reflect.SelectCase{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(sigchan)}) } - started := false for { // Wait for a notification. index, _, ok := reflect.Select(sc) - // Was it the start / stop channel? + // Was it the stop channel? if index == 0 { if !ok { - if !started { - // start channel; start forwarding and - // swap this case for the stop channel - // to select stop requests. - started = true - sc[0] = reflect.SelectCase{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(stop)} - } else { - // stop channel; stop forwarding and - // clear this case so it is never - // selected again. - started = false - close(done) - sc[0].Chan = reflect.Value{} - } + // Stop forwarding and notify that it's done. + close(done) + return } continue } @@ -73,44 +60,17 @@ func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), start, // Otherwise, it was a signal on channel N. Index 0 represents the stop // channel, so index N represents the channel for signal N. - signal := linux.Signal(index) - - if !started { - // Kernel cannot receive signals, either because it is - // not ready yet or is shutting down. - // - // Kill ourselves if this signal would have killed the - // process before PrepareForwarding was called. i.e., all - // _SigKill signals; see Go - // src/runtime/sigtab_linux_generic.go. - // - // Otherwise ignore the signal. - // - // TODO(b/114489875): Drop in Go 1.12, which uses tgkill - // in runtime.raise. - switch signal { - case linux.SIGHUP, linux.SIGINT, linux.SIGTERM: - dieFromSignal(signal) - panic(fmt.Sprintf("Failed to die from signal %d", signal)) - default: - continue - } - } - - // Pass the signal to the handler. - handler(signal) + handler(linux.Signal(index)) } } -// PrepareHandler ensures that synchronous signals are passed to the given -// handler function and returns a callback that starts signal delivery, which -// itself returns a callback that stops signal handling. +// StartSignalForwarding ensures that synchronous signals are passed to the +// given handler function and returns a callback that stops signal delivery. // // Note that this function permanently takes over signal handling. After the // stop callback, signals revert to the default Go runtime behavior, which // cannot be overridden with external calls to signal.Notify. -func PrepareHandler(handler func(linux.Signal)) func() func() { - start := make(chan struct{}) +func StartSignalForwarding(handler func(linux.Signal)) func() { stop := make(chan struct{}) done := make(chan struct{}) @@ -128,13 +88,10 @@ func PrepareHandler(handler func(linux.Signal)) func() func() { signal.Notify(sigchan, syscall.Signal(sig)) } // Start up our listener. - go handleSignals(sigchans, handler, start, stop, done) // S/R-SAFE: synchronized by Kernel.extMu. + go handleSignals(sigchans, handler, stop, done) // S/R-SAFE: synchronized by Kernel.extMu. - return func() func() { - close(start) - return func() { - close(stop) - <-done - } + return func() { + close(stop) + <-done } } diff --git a/pkg/sentry/sighandling/sighandling_unsafe.go b/pkg/sentry/sighandling/sighandling_unsafe.go index eace3766d..1ebe22d34 100644 --- a/pkg/sentry/sighandling/sighandling_unsafe.go +++ b/pkg/sentry/sighandling/sighandling_unsafe.go @@ -15,15 +15,13 @@ package sighandling import ( - "fmt" - "runtime" "syscall" "unsafe" "gvisor.dev/gvisor/pkg/abi/linux" ) -// TODO(b/34161764): Move to pkg/abi/linux along with definitions in +// FIXME(gvisor.dev/issue/214): Move to pkg/abi/linux along with definitions in // pkg/sentry/arch. type sigaction struct { handler uintptr @@ -48,27 +46,3 @@ func IgnoreChildStop() error { return nil } - -// dieFromSignal kills the current process with sig. -// -// Preconditions: The default action of sig is termination. -func dieFromSignal(sig linux.Signal) { - runtime.LockOSThread() - defer runtime.UnlockOSThread() - - sa := sigaction{handler: linux.SIG_DFL} - if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, linux.SignalSetSize, 0, 0); e != 0 { - panic(fmt.Sprintf("rt_sigaction failed: %v", e)) - } - - set := linux.MakeSignalSet(sig) - if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGPROCMASK, linux.SIG_UNBLOCK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0); e != 0 { - panic(fmt.Sprintf("rt_sigprocmask failed: %v", e)) - } - - if err := syscall.Tgkill(syscall.Getpid(), syscall.Gettid(), syscall.Signal(sig)); err != nil { - panic(fmt.Sprintf("tgkill failed: %v", err)) - } - - panic("failed to die") -} diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 3300f9a6b..26176b10d 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "socket", srcs = ["socket.go"], diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index 81dbd7309..357517ed4 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "control", srcs = ["control.go"], @@ -17,6 +17,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sentry/socket", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserror", diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 4e95101b7..af1a4e95f 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" @@ -194,15 +195,15 @@ func putCmsg(buf []byte, flags int, msgType uint32, align uint, data []int32) ([ // the available space, we must align down. // // align must be >= 4 and each data int32 is 4 bytes. The length of the - // header is already aligned, so if we align to the with of the data there + // header is already aligned, so if we align to the width of the data there // are two cases: // 1. The aligned length is less than the length of the header. The // unaligned length was also less than the length of the header, so we // can't write anything. // 2. The aligned length is greater than or equal to the length of the - // header. We can write the header plus zero or more datas. We can't write - // a partial int32, so the length of the message will be - // min(aligned length, header + datas). + // header. We can write the header plus zero or more bytes of data. We can't + // write a partial int32, so the length of the message will be + // min(aligned length, header + data). if space < linux.SizeOfControlMessageHeader { flags |= linux.MSG_CTRUNC return buf, flags @@ -239,12 +240,12 @@ func putCmsgStruct(buf []byte, msgLevel, msgType uint32, align uint, data interf buf = binary.Marshal(buf, usermem.ByteOrder, data) - // Check if we went over. + // If the control message data brought us over capacity, omit it. if cap(buf) != cap(ob) { return hdrBuf } - // Fix up length. + // Update control message length to include data. putUint64(ob, uint64(len(buf)-len(ob))) return alignSlice(buf, align) @@ -320,35 +321,109 @@ func PackInq(t *kernel.Task, inq int32, buf []byte) []byte { buf, linux.SOL_TCP, linux.TCP_INQ, - 4, + t.Arch().Width(), inq, ) } +// PackTOS packs an IP_TOS socket control message. +func PackTOS(t *kernel.Task, tos int8, buf []byte) []byte { + return putCmsgStruct( + buf, + linux.SOL_IP, + linux.IP_TOS, + t.Arch().Width(), + tos, + ) +} + +// PackTClass packs an IPV6_TCLASS socket control message. +func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte { + return putCmsgStruct( + buf, + linux.SOL_IPV6, + linux.IPV6_TCLASS, + t.Arch().Width(), + tClass, + ) +} + +// PackControlMessages packs control messages into the given buffer. +// +// We skip control messages specific to Unix domain sockets. +// +// Note that some control messages may be truncated if they do not fit under +// the capacity of buf. +func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byte) []byte { + if cmsgs.IP.HasTimestamp { + buf = PackTimestamp(t, cmsgs.IP.Timestamp, buf) + } + + if cmsgs.IP.HasInq { + // In Linux, TCP_CM_INQ is added after SO_TIMESTAMP. + buf = PackInq(t, cmsgs.IP.Inq, buf) + } + + if cmsgs.IP.HasTOS { + buf = PackTOS(t, cmsgs.IP.TOS, buf) + } + + if cmsgs.IP.HasTClass { + buf = PackTClass(t, cmsgs.IP.TClass, buf) + } + + return buf +} + +// cmsgSpace is equivalent to CMSG_SPACE in Linux. +func cmsgSpace(t *kernel.Task, dataLen int) int { + return linux.SizeOfControlMessageHeader + AlignUp(dataLen, t.Arch().Width()) +} + +// CmsgsSpace returns the number of bytes needed to fit the control messages +// represented in cmsgs. +func CmsgsSpace(t *kernel.Task, cmsgs socket.ControlMessages) int { + space := 0 + + if cmsgs.IP.HasTimestamp { + space += cmsgSpace(t, linux.SizeOfTimeval) + } + + if cmsgs.IP.HasInq { + space += cmsgSpace(t, linux.SizeOfControlMessageInq) + } + + if cmsgs.IP.HasTOS { + space += cmsgSpace(t, linux.SizeOfControlMessageTOS) + } + + if cmsgs.IP.HasTClass { + space += cmsgSpace(t, linux.SizeOfControlMessageTClass) + } + + return space +} + // Parse parses a raw socket control message into portable objects. -func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (transport.ControlMessages, error) { +func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.ControlMessages, error) { var ( - fds linux.ControlMessageRights - haveCreds bool - creds linux.ControlMessageCredentials + cmsgs socket.ControlMessages + fds linux.ControlMessageRights ) for i := 0; i < len(buf); { if i+linux.SizeOfControlMessageHeader > len(buf) { - return transport.ControlMessages{}, syserror.EINVAL + return cmsgs, syserror.EINVAL } var h linux.ControlMessageHeader binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageHeader], usermem.ByteOrder, &h) if h.Length < uint64(linux.SizeOfControlMessageHeader) { - return transport.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, syserror.EINVAL } if h.Length > uint64(len(buf)-i) { - return transport.ControlMessages{}, syserror.EINVAL - } - if h.Level != linux.SOL_SOCKET { - return transport.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, syserror.EINVAL } i += linux.SizeOfControlMessageHeader @@ -358,59 +433,79 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (transport. // sizeof(long) in CMSG_ALIGN. width := t.Arch().Width() - switch h.Type { - case linux.SCM_RIGHTS: - rightsSize := AlignDown(length, linux.SizeOfControlMessageRight) - numRights := rightsSize / linux.SizeOfControlMessageRight - - if len(fds)+numRights > linux.SCM_MAX_FD { - return transport.ControlMessages{}, syserror.EINVAL + switch h.Level { + case linux.SOL_SOCKET: + switch h.Type { + case linux.SCM_RIGHTS: + rightsSize := AlignDown(length, linux.SizeOfControlMessageRight) + numRights := rightsSize / linux.SizeOfControlMessageRight + + if len(fds)+numRights > linux.SCM_MAX_FD { + return socket.ControlMessages{}, syserror.EINVAL + } + + for j := i; j < i+rightsSize; j += linux.SizeOfControlMessageRight { + fds = append(fds, int32(usermem.ByteOrder.Uint32(buf[j:j+linux.SizeOfControlMessageRight]))) + } + + i += AlignUp(length, width) + + case linux.SCM_CREDENTIALS: + if length < linux.SizeOfControlMessageCredentials { + return socket.ControlMessages{}, syserror.EINVAL + } + + var creds linux.ControlMessageCredentials + binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageCredentials], usermem.ByteOrder, &creds) + scmCreds, err := NewSCMCredentials(t, creds) + if err != nil { + return socket.ControlMessages{}, err + } + cmsgs.Unix.Credentials = scmCreds + i += AlignUp(length, width) + + default: + // Unknown message type. + return socket.ControlMessages{}, syserror.EINVAL } - - for j := i; j < i+rightsSize; j += linux.SizeOfControlMessageRight { - fds = append(fds, int32(usermem.ByteOrder.Uint32(buf[j:j+linux.SizeOfControlMessageRight]))) + case linux.SOL_IP: + switch h.Type { + case linux.IP_TOS: + cmsgs.IP.HasTOS = true + binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], usermem.ByteOrder, &cmsgs.IP.TOS) + i += AlignUp(length, width) + + default: + return socket.ControlMessages{}, syserror.EINVAL } - - i += AlignUp(length, width) - - case linux.SCM_CREDENTIALS: - if length < linux.SizeOfControlMessageCredentials { - return transport.ControlMessages{}, syserror.EINVAL + case linux.SOL_IPV6: + switch h.Type { + case linux.IPV6_TCLASS: + cmsgs.IP.HasTClass = true + binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTClass], usermem.ByteOrder, &cmsgs.IP.TClass) + i += AlignUp(length, width) + + default: + return socket.ControlMessages{}, syserror.EINVAL } - - binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageCredentials], usermem.ByteOrder, &creds) - haveCreds = true - i += AlignUp(length, width) - default: - // Unknown message type. - return transport.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, syserror.EINVAL } } - var credentials SCMCredentials - if haveCreds { - var err error - if credentials, err = NewSCMCredentials(t, creds); err != nil { - return transport.ControlMessages{}, err - } - } else { - credentials = makeCreds(t, socketOrEndpoint) + if cmsgs.Unix.Credentials == nil { + cmsgs.Unix.Credentials = makeCreds(t, socketOrEndpoint) } - var rights SCMRights if len(fds) > 0 { - var err error - if rights, err = NewSCMRights(t, fds); err != nil { - return transport.ControlMessages{}, err + rights, err := NewSCMRights(t, fds) + if err != nil { + return socket.ControlMessages{}, err } + cmsgs.Unix.Rights = rights } - if credentials == nil && rights == nil { - return transport.ControlMessages{}, nil - } - - return transport.ControlMessages{Credentials: credentials, Rights: rights}, nil + return cmsgs, nil } func makeCreds(t *kernel.Task, socketOrEndpoint interface{}) SCMCredentials { diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index a951f1bb0..4c44c7c0f 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "hostinet", srcs = [ @@ -29,9 +29,12 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", "//pkg/sentry/socket", + "//pkg/sentry/socket/control", "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", + "//pkg/tcpip/stack", "//pkg/waiter", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 92beb1bcf..c957b0f1d 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -18,6 +18,7 @@ import ( "fmt" "syscall" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/fdnotifier" @@ -29,6 +30,7 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" @@ -41,6 +43,10 @@ const ( // sizeofSockaddr is the size in bytes of the largest sockaddr type // supported by this package. sizeofSockaddr = syscall.SizeofSockaddrInet6 // sizeof(sockaddr_in6) > sizeof(sockaddr_in) + + // maxControlLen is the maximum size of a control message buffer used in a + // recvmsg or sendmsg syscall. + maxControlLen = 1024 ) // socketOperations implements fs.FileOperations and socket.Socket for a socket @@ -281,26 +287,32 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt // Whitelist options and constrain option length. var optlen int switch level { - case syscall.SOL_IPV6: + case linux.SOL_IP: + switch name { + case linux.IP_TOS, linux.IP_RECVTOS: + optlen = sizeofInt32 + } + case linux.SOL_IPV6: switch name { - case syscall.IPV6_V6ONLY: + case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY: optlen = sizeofInt32 } - case syscall.SOL_SOCKET: + case linux.SOL_SOCKET: switch name { - case syscall.SO_ERROR, syscall.SO_KEEPALIVE, syscall.SO_SNDBUF, syscall.SO_RCVBUF, syscall.SO_REUSEADDR: + case linux.SO_ERROR, linux.SO_KEEPALIVE, linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR: optlen = sizeofInt32 - case syscall.SO_LINGER: + case linux.SO_LINGER: optlen = syscall.SizeofLinger } - case syscall.SOL_TCP: + case linux.SOL_TCP: switch name { - case syscall.TCP_NODELAY: + case linux.TCP_NODELAY: optlen = sizeofInt32 - case syscall.TCP_INFO: + case linux.TCP_INFO: optlen = int(linux.SizeOfTCPInfo) } } + if optlen == 0 { return nil, syserr.ErrProtocolNotAvailable // ENOPROTOOPT } @@ -320,19 +332,24 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [ // Whitelist options and constrain option length. var optlen int switch level { - case syscall.SOL_IPV6: + case linux.SOL_IP: + switch name { + case linux.IP_TOS, linux.IP_RECVTOS: + optlen = sizeofInt32 + } + case linux.SOL_IPV6: switch name { - case syscall.IPV6_V6ONLY: + case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY: optlen = sizeofInt32 } - case syscall.SOL_SOCKET: + case linux.SOL_SOCKET: switch name { - case syscall.SO_SNDBUF, syscall.SO_RCVBUF, syscall.SO_REUSEADDR: + case linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR: optlen = sizeofInt32 } - case syscall.SOL_TCP: + case linux.SOL_TCP: switch name { - case syscall.TCP_NODELAY: + case linux.TCP_NODELAY: optlen = sizeofInt32 } } @@ -354,11 +371,11 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [ } // RecvMsg implements socket.Socket.RecvMsg. -func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { +func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { // Whitelist flags. // // FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary - // messages that netstack/tcpip/transport/unix doesn't understand. Kill the + // messages that gvisor/pkg/tcpip/transport/unix doesn't understand. Kill the // Socket interface's dependence on netstack. if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_PEEK|syscall.MSG_TRUNC) != 0 { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrInvalidArgument @@ -370,6 +387,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags senderAddrBuf = make([]byte, sizeofSockaddr) } + var controlBuf []byte var msgFlags int recvmsgToBlocks := safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { @@ -384,11 +402,6 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags // We always do a non-blocking recv*(). sysflags := flags | syscall.MSG_DONTWAIT - if dsts.NumBlocks() == 1 { - // Skip allocating []syscall.Iovec. - return recvfrom(s.fd, dsts.Head().ToSlice(), sysflags, &senderAddrBuf) - } - iovs := iovecsFromBlockSeq(dsts) msg := syscall.Msghdr{ Iov: &iovs[0], @@ -398,12 +411,21 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags msg.Name = &senderAddrBuf[0] msg.Namelen = uint32(len(senderAddrBuf)) } + if controlLen > 0 { + if controlLen > maxControlLen { + controlLen = maxControlLen + } + controlBuf = make([]byte, controlLen) + msg.Control = &controlBuf[0] + msg.Controllen = controlLen + } n, err := recvmsg(s.fd, &msg, sysflags) if err != nil { return 0, err } senderAddrBuf = senderAddrBuf[:msg.Namelen] msgFlags = int(msg.Flags) + controlLen = uint64(msg.Controllen) return n, nil }) @@ -429,14 +451,38 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags n, err = dst.CopyOutFrom(t, recvmsgToBlocks) } } - - // We don't allow control messages. - msgFlags &^= linux.MSG_CTRUNC + if err != nil { + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) + } if senderRequested { senderAddr = socket.UnmarshalSockAddr(s.family, senderAddrBuf) } - return int(n), msgFlags, senderAddr, uint32(len(senderAddrBuf)), socket.ControlMessages{}, syserr.FromError(err) + + unixControlMessages, err := unix.ParseSocketControlMessage(controlBuf[:controlLen]) + if err != nil { + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) + } + + controlMessages := socket.ControlMessages{} + for _, unixCmsg := range unixControlMessages { + switch unixCmsg.Header.Level { + case syscall.SOL_IP: + switch unixCmsg.Header.Type { + case syscall.IP_TOS: + controlMessages.IP.HasTOS = true + binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTOS], usermem.ByteOrder, &controlMessages.IP.TOS) + } + case syscall.SOL_IPV6: + switch unixCmsg.Header.Type { + case syscall.IPV6_TCLASS: + controlMessages.IP.HasTClass = true + binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTClass], usermem.ByteOrder, &controlMessages.IP.TClass) + } + } + } + + return int(n), msgFlags, senderAddr, uint32(len(senderAddrBuf)), controlMessages, nil } // SendMsg implements socket.Socket.SendMsg. @@ -446,6 +492,14 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] return 0, syserr.ErrInvalidArgument } + space := uint64(control.CmsgsSpace(t, controlMessages)) + if space > maxControlLen { + space = maxControlLen + } + controlBuf := make([]byte, 0, space) + // PackControlMessages will append up to space bytes to controlBuf. + controlBuf = control.PackControlMessages(t, controlMessages, controlBuf) + sendmsgFromBlocks := safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) { // Refuse to do anything if any part of src.Addrs was unusable. if uint64(src.NumBytes()) != srcs.NumBytes() { @@ -458,7 +512,7 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] // We always do a non-blocking send*(). sysflags := flags | syscall.MSG_DONTWAIT - if srcs.NumBlocks() == 1 { + if srcs.NumBlocks() == 1 && len(controlBuf) == 0 { // Skip allocating []syscall.Iovec. src := srcs.Head() n, _, errno := syscall.Syscall6(syscall.SYS_SENDTO, uintptr(s.fd), src.Addr(), uintptr(src.Len()), uintptr(sysflags), uintptr(firstBytePtr(to)), uintptr(len(to))) @@ -477,6 +531,10 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] msg.Name = &to[0] msg.Namelen = uint32(len(to)) } + if len(controlBuf) != 0 { + msg.Control = &controlBuf[0] + msg.Controllen = uint64(len(controlBuf)) + } return sendmsg(s.fd, &msg, sysflags) }) diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index 3a4fdec47..e67b46c9e 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -16,8 +16,11 @@ package hostinet import ( "fmt" + "io" "io/ioutil" "os" + "reflect" + "strconv" "strings" "syscall" @@ -26,7 +29,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip/stack" ) var defaultRecvBufSize = inet.TCPBufferSize{ @@ -51,6 +56,8 @@ type Stack struct { tcpRecvBufSize inet.TCPBufferSize tcpSendBufSize inet.TCPBufferSize tcpSACKEnabled bool + netDevFile *os.File + netSNMPFile *os.File } // NewStack returns an empty Stack containing no configuration. @@ -98,6 +105,18 @@ func (s *Stack) Configure() error { log.Warningf("Failed to read if TCP SACK if enabled, setting to true") } + if f, err := os.Open("/proc/net/dev"); err != nil { + log.Warningf("Failed to open /proc/net/dev: %v", err) + } else { + s.netDevFile = f + } + + if f, err := os.Open("/proc/net/snmp"); err != nil { + log.Warningf("Failed to open /proc/net/snmp: %v", err) + } else { + s.netSNMPFile = f + } + return nil } @@ -326,9 +345,95 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error { return syserror.EACCES } +// getLine reads one line from proc file, with specified prefix. +// The last argument, withHeader, specifies if it contains line header. +func getLine(f *os.File, prefix string, withHeader bool) string { + data := make([]byte, 4096) + + if _, err := f.Seek(0, 0); err != nil { + return "" + } + + if _, err := io.ReadFull(f, data); err != io.ErrUnexpectedEOF { + return "" + } + + prefix = prefix + ":" + lines := strings.Split(string(data), "\n") + for _, l := range lines { + l = strings.TrimSpace(l) + if strings.HasPrefix(l, prefix) { + if withHeader { + withHeader = false + continue + } + return l + } + } + return "" +} + +func toSlice(i interface{}) []uint64 { + v := reflect.Indirect(reflect.ValueOf(i)) + return v.Slice(0, v.Len()).Interface().([]uint64) +} + // Statistics implements inet.Stack.Statistics. func (s *Stack) Statistics(stat interface{}, arg string) error { - return syserror.EOPNOTSUPP + var ( + snmpTCP bool + rawLine string + sliceStat []uint64 + ) + + switch stat.(type) { + case *inet.StatDev: + if s.netDevFile == nil { + return fmt.Errorf("/proc/net/dev is not opened for hostinet") + } + rawLine = getLine(s.netDevFile, arg, false /* with no header */) + case *inet.StatSNMPIP, *inet.StatSNMPICMP, *inet.StatSNMPICMPMSG, *inet.StatSNMPTCP, *inet.StatSNMPUDP, *inet.StatSNMPUDPLite: + if s.netSNMPFile == nil { + return fmt.Errorf("/proc/net/snmp is not opened for hostinet") + } + rawLine = getLine(s.netSNMPFile, arg, true) + default: + return syserr.ErrEndpointOperation.ToError() + } + + if rawLine == "" { + return fmt.Errorf("Failed to get raw line") + } + + parts := strings.SplitN(rawLine, ":", 2) + if len(parts) != 2 { + return fmt.Errorf("Failed to get prefix from: %q", rawLine) + } + + sliceStat = toSlice(stat) + fields := strings.Fields(strings.TrimSpace(parts[1])) + if len(fields) != len(sliceStat) { + return fmt.Errorf("Failed to parse fields: %q", rawLine) + } + if _, ok := stat.(*inet.StatSNMPTCP); ok { + snmpTCP = true + } + for i := 0; i < len(sliceStat); i++ { + var err error + if snmpTCP && i == 3 { + var tmp int64 + // MaxConn field is signed, RFC 2012. + tmp, err = strconv.ParseInt(fields[i], 10, 64) + sliceStat[i] = uint64(tmp) // Convert back to int before use. + } else { + sliceStat[i], err = strconv.ParseUint(fields[i], 10, 64) + } + if err != nil { + return fmt.Errorf("Failed to parse field %d from: %q, %v", i, rawLine, err) + } + } + + return nil } // RouteTable implements inet.Stack.RouteTable. @@ -338,3 +443,12 @@ func (s *Stack) RouteTable() []inet.Route { // Resume implements inet.Stack.Resume. func (s *Stack) Resume() {} + +// RegisteredEndpoints implements inet.Stack.RegisteredEndpoints. +func (s *Stack) RegisteredEndpoints() []stack.TransportEndpoint { return nil } + +// CleanupEndpoints implements inet.Stack.CleanupEndpoints. +func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { return nil } + +// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints. +func (s *Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {} diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD index 354a0d6ee..5eb06bbf4 100644 --- a/pkg/sentry/socket/netfilter/BUILD +++ b/pkg/sentry/socket/netfilter/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "netfilter", srcs = [ diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 45ebb2a0e..79589e3c8 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "netlink", srcs = [ @@ -20,7 +20,9 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", + "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/socket/netlink/port", "//pkg/sentry/socket/unix", diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD index 9e2e12799..463544c1a 100644 --- a/pkg/sentry/socket/netlink/port/BUILD +++ b/pkg/sentry/socket/netlink/port/BUILD @@ -1,6 +1,7 @@ -package(licenses = ["notice"]) +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) go_library( name = "port", diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index 689cad997..be005df24 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -30,6 +30,13 @@ type Protocol interface { // Protocol returns the Linux netlink protocol value. Protocol() int + // CanSend returns true if this protocol may ever send messages. + // + // TODO(gvisor.dev/issue/1119): This is a workaround to allow + // advertising support for otherwise unimplemented features on sockets + // that will never send messages, thus making those features no-ops. + CanSend() bool + // ProcessMessage processes a single message from userspace. // // If err == nil, any messages added to ms will be sent back to the diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD index 5dc8533ec..1d4912753 100644 --- a/pkg/sentry/socket/netlink/route/BUILD +++ b/pkg/sentry/socket/netlink/route/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "route", srcs = ["protocol.go"], diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index cc70ac237..6b4a0ecf4 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -61,6 +61,11 @@ func (p *Protocol) Protocol() int { return linux.NETLINK_ROUTE } +// CanSend implements netlink.Protocol.CanSend. +func (p *Protocol) CanSend() bool { + return true +} + // dumpLinks handles RTM_GETLINK + NLM_F_DUMP requests. func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { // NLM_F_DUMP + RTM_GETLINK messages are supposed to include an diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index d0aab293d..4a1b87a9a 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -27,7 +27,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port" "gvisor.dev/gvisor/pkg/sentry/socket/unix" @@ -52,6 +54,8 @@ const ( maxSendBufferSize = 4 << 20 // 4MB ) +var errNoFilter = syserr.New("no filter attached", linux.ENOENT) + // netlinkSocketDevice is the netlink socket virtual device. var netlinkSocketDevice = device.NewAnonDevice() @@ -60,7 +64,7 @@ var netlinkSocketDevice = device.NewAnonDevice() // This implementation only supports userspace sending and receiving messages // to/from the kernel. // -// Socket implements socket.Socket. +// Socket implements socket.Socket and transport.Credentialer. // // +stateify savable type Socket struct { @@ -103,9 +107,19 @@ type Socket struct { // sendBufferSize is the send buffer "size". We don't actually have a // fixed buffer but only consume this many bytes. sendBufferSize uint32 + + // passcred indicates if this socket wants SCM credentials. + passcred bool + + // filter indicates that this socket has a BPF filter "installed". + // + // TODO(gvisor.dev/issue/1119): We don't actually support filtering, + // this is just bookkeeping for tracking add/remove. + filter bool } var _ socket.Socket = (*Socket)(nil) +var _ transport.Credentialer = (*Socket)(nil) // NewSocket creates a new Socket. func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socket, *syserr.Error) { @@ -171,6 +185,22 @@ func (s *Socket) EventUnregister(e *waiter.Entry) { s.ep.EventUnregister(e) } +// Passcred implements transport.Credentialer.Passcred. +func (s *Socket) Passcred() bool { + s.mu.Lock() + passcred := s.passcred + s.mu.Unlock() + return passcred +} + +// ConnectedPasscred implements transport.Credentialer.ConnectedPasscred. +func (s *Socket) ConnectedPasscred() bool { + // This socket is connected to the kernel, which doesn't need creds. + // + // This is arbitrary, as ConnectedPasscred on this type has no callers. + return false +} + // Ioctl implements fs.FileOperations.Ioctl. func (*Socket) Ioctl(context.Context, *fs.File, usermem.IO, arch.SyscallArguments) (uintptr, error) { // TODO(b/68878065): no ioctls supported. @@ -308,9 +338,20 @@ func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem. // We don't have limit on receiving size. return int32(math.MaxInt32), nil + case linux.SO_PASSCRED: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + var passcred int32 + if s.Passcred() { + passcred = 1 + } + return passcred, nil + default: socket.GetSockOptEmitUnimplementedEvent(t, name) } + case linux.SOL_NETLINK: switch name { case linux.NETLINK_BROADCAST_ERROR, @@ -347,6 +388,7 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy s.sendBufferSize = size s.mu.Unlock() return nil + case linux.SO_RCVBUF: if len(opt) < sizeOfInt32 { return syserr.ErrInvalidArgument @@ -354,6 +396,52 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy // We don't have limit on receiving size. So just accept anything as // valid for compatibility. return nil + + case linux.SO_PASSCRED: + if len(opt) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + passcred := usermem.ByteOrder.Uint32(opt) + + s.mu.Lock() + s.passcred = passcred != 0 + s.mu.Unlock() + return nil + + case linux.SO_ATTACH_FILTER: + // TODO(gvisor.dev/issue/1119): We don't actually + // support filtering. If this socket can't ever send + // messages, then there is nothing to filter and we can + // advertise support. Otherwise, be conservative and + // return an error. + if s.protocol.CanSend() { + socket.SetSockOptEmitUnimplementedEvent(t, name) + return syserr.ErrProtocolNotAvailable + } + + s.mu.Lock() + s.filter = true + s.mu.Unlock() + return nil + + case linux.SO_DETACH_FILTER: + // TODO(gvisor.dev/issue/1119): See above. + if s.protocol.CanSend() { + socket.SetSockOptEmitUnimplementedEvent(t, name) + return syserr.ErrProtocolNotAvailable + } + + s.mu.Lock() + filter := s.filter + s.filter = false + s.mu.Unlock() + + if !filter { + return errNoFilter + } + + return nil + default: socket.SetSockOptEmitUnimplementedEvent(t, name) } @@ -416,6 +504,24 @@ func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, have Peek: flags&linux.MSG_PEEK != 0, } + // If MSG_TRUNC is set with a zero byte destination then we still need + // to read the message and discard it, or in the case where MSG_PEEK is + // set, leave it be. In both cases the full message length must be + // returned. However, the memory manager for the destination will not read + // the endpoint if the destination is zero length. + // + // In order for the endpoint to be read when the destination size is zero, + // we must cause a read of the endpoint by using a separate fake zero + // length block sequence and calling the EndpointReader directly. + if trunc && dst.Addrs.NumBytes() == 0 { + // Perform a read to a zero byte block sequence. We can ignore the + // original destination since it was zero bytes. The length returned by + // ReadToBlocks is ignored and we return the full message length to comply + // with MSG_TRUNC. + _, err := r.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(make([]byte, 0)))) + return int(r.MsgSize), linux.MSG_TRUNC, from, fromLen, socket.ControlMessages{}, syserr.FromError(err) + } + if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { var mflags int if n < int64(r.MsgSize) { @@ -464,6 +570,26 @@ func (s *Socket) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ }) } +// kernelSCM implements control.SCMCredentials with credentials that represent +// the kernel itself rather than a Task. +// +// +stateify savable +type kernelSCM struct{} + +// Equals implements transport.CredentialsControlMessage.Equals. +func (kernelSCM) Equals(oc transport.CredentialsControlMessage) bool { + _, ok := oc.(kernelSCM) + return ok +} + +// Credentials implements control.SCMCredentials.Credentials. +func (kernelSCM) Credentials(*kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) { + return 0, auth.RootUID, auth.RootGID +} + +// kernelCreds is the concrete version of kernelSCM used in all creds. +var kernelCreds = &kernelSCM{} + // sendResponse sends the response messages in ms back to userspace. func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error { // Linux combines multiple netlink messages into a single datagram. @@ -472,10 +598,15 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error bufs = append(bufs, m.Finalize()) } + // All messages are from the kernel. + cms := transport.ControlMessages{ + Credentials: kernelCreds, + } + if len(bufs) > 0 { // RecvMsg never receives the address, so we don't need to send // one. - _, notify, err := s.connection.Send(bufs, transport.ControlMessages{}, tcpip.FullAddress{}) + _, notify, err := s.connection.Send(bufs, cms, tcpip.FullAddress{}) // If the buffer is full, we simply drop messages, just like // Linux. if err != nil && err != syserr.ErrWouldBlock { @@ -499,7 +630,10 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error PortID: uint32(ms.PortID), }) - _, notify, err := s.connection.Send([][]byte{m.Finalize()}, transport.ControlMessages{}, tcpip.FullAddress{}) + // Add the dump_done_errno payload. + m.Put(int64(0)) + + _, notify, err := s.connection.Send([][]byte{m.Finalize()}, cms, tcpip.FullAddress{}) if err != nil && err != syserr.ErrWouldBlock { return err } diff --git a/pkg/sentry/socket/netlink/uevent/BUILD b/pkg/sentry/socket/netlink/uevent/BUILD new file mode 100644 index 000000000..0777f3baf --- /dev/null +++ b/pkg/sentry/socket/netlink/uevent/BUILD @@ -0,0 +1,17 @@ +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "uevent", + srcs = ["protocol.go"], + importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/context", + "//pkg/sentry/kernel", + "//pkg/sentry/socket/netlink", + "//pkg/syserr", + ], +) diff --git a/pkg/sentry/socket/netlink/uevent/protocol.go b/pkg/sentry/socket/netlink/uevent/protocol.go new file mode 100644 index 000000000..b5d7808d7 --- /dev/null +++ b/pkg/sentry/socket/netlink/uevent/protocol.go @@ -0,0 +1,60 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package uevent provides a NETLINK_KOBJECT_UEVENT socket protocol. +// +// NETLINK_KOBJECT_UEVENT sockets send udev-style device events. gVisor does +// not support any device events, so these sockets never send any messages. +package uevent + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket/netlink" + "gvisor.dev/gvisor/pkg/syserr" +) + +// Protocol implements netlink.Protocol. +// +// +stateify savable +type Protocol struct{} + +var _ netlink.Protocol = (*Protocol)(nil) + +// NewProtocol creates a NETLINK_KOBJECT_UEVENT netlink.Protocol. +func NewProtocol(t *kernel.Task) (netlink.Protocol, *syserr.Error) { + return &Protocol{}, nil +} + +// Protocol implements netlink.Protocol.Protocol. +func (p *Protocol) Protocol() int { + return linux.NETLINK_KOBJECT_UEVENT +} + +// CanSend implements netlink.Protocol.CanSend. +func (p *Protocol) CanSend() bool { + return false +} + +// ProcessMessage implements netlink.Protocol.ProcessMessage. +func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { + // Silently ignore all messages. + return nil +} + +// init registers the NETLINK_KOBJECT_UEVENT provider. +func init() { + netlink.RegisterProvider(linux.NETLINK_KOBJECT_UEVENT, NewProtocol) +} diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/netstack/BUILD index e927821e1..e414d8055 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -1,17 +1,17 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( - name = "epsocket", + name = "netstack", srcs = [ "device.go", - "epsocket.go", + "netstack.go", "provider.go", "save_restore.go", "stack.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/epsocket", + importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netstack", visibility = [ "//pkg/sentry:internal", ], diff --git a/pkg/sentry/socket/epsocket/device.go b/pkg/sentry/socket/netstack/device.go index 85484d5b1..fbeb89fb8 100644 --- a/pkg/sentry/socket/epsocket/device.go +++ b/pkg/sentry/socket/netstack/device.go @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -package epsocket +package netstack import "gvisor.dev/gvisor/pkg/sentry/device" -// epsocketDevice is the endpoint socket virtual device. -var epsocketDevice = device.NewAnonDevice() +// netstackDevice is the endpoint socket virtual device. +var netstackDevice = device.NewAnonDevice() diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/netstack/netstack.go index 0e37ce61b..140851c17 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package epsocket provides an implementation of the socket.Socket interface +// Package netstack provides an implementation of the socket.Socket interface // that is backed by a tcpip.Endpoint. // // It does not depend on any particular endpoint implementation, and thus can @@ -22,10 +22,11 @@ // Lock ordering: netstack => mm: ioSequencePayload copies user memory inside // tcpip.Endpoint.Write(). Netstack is allowed to (and does) hold locks during // this operation. -package epsocket +package netstack import ( "bytes" + "io" "math" "reflect" "sync" @@ -52,6 +53,7 @@ import ( "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" @@ -136,15 +138,21 @@ var Metrics = tcpip.Stats{ }, }, IP: tcpip.IPStats{ - PacketsReceived: mustCreateMetric("/netstack/ip/packets_received", "Total number of IP packets received from the link layer in nic.DeliverNetworkPacket."), - InvalidAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Total number of IP packets received with an unknown or invalid destination address."), - PacketsDelivered: mustCreateMetric("/netstack/ip/packets_delivered", "Total number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."), - PacketsSent: mustCreateMetric("/netstack/ip/packets_sent", "Total number of IP packets sent via WritePacket."), - OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."), + PacketsReceived: mustCreateMetric("/netstack/ip/packets_received", "Total number of IP packets received from the link layer in nic.DeliverNetworkPacket."), + InvalidAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Total number of IP packets received with an unknown or invalid destination address."), + PacketsDelivered: mustCreateMetric("/netstack/ip/packets_delivered", "Total number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."), + PacketsSent: mustCreateMetric("/netstack/ip/packets_sent", "Total number of IP packets sent via WritePacket."), + OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."), + MalformedPacketsReceived: mustCreateMetric("/netstack/ip/malformed_packets_received", "Total number of IP packets which failed IP header validation checks."), + MalformedFragmentsReceived: mustCreateMetric("/netstack/ip/malformed_fragments_received", "Total number of IP fragments which failed IP fragment validation checks."), }, TCP: tcpip.TCPStats{ ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."), PassiveConnectionOpenings: mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."), + CurrentEstablished: mustCreateMetric("/netstack/tcp/current_established", "Number of connections in either ESTABLISHED or CLOSE-WAIT state now."), + EstablishedResets: mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"), + EstablishedClosed: mustCreateMetric("/netstack/tcp/established_closed", "number of times established TCP connections made a transition to CLOSED state."), + EstablishedTimedout: mustCreateMetric("/netstack/tcp/established_timedout", "Number of times an established connection was reset because of keep-alive time out."), ListenOverflowSynDrop: mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."), ListenOverflowAckDrop: mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."), ListenOverflowSynCookieSent: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."), @@ -154,6 +162,7 @@ var Metrics = tcpip.Stats{ ValidSegmentsReceived: mustCreateMetric("/netstack/tcp/valid_segments_received", "Number of TCP segments received that the transport layer successfully parsed."), InvalidSegmentsReceived: mustCreateMetric("/netstack/tcp/invalid_segments_received", "Number of TCP segments received that the transport layer could not parse."), SegmentsSent: mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."), + SegmentSendErrors: mustCreateMetric("/netstack/tcp/segment_send_errors", "Number of TCP segments failed to be sent."), ResetsSent: mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."), ResetsReceived: mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."), Retransmits: mustCreateMetric("/netstack/tcp/retransmits", "Number of TCP segments retransmitted."), @@ -169,13 +178,18 @@ var Metrics = tcpip.Stats{ UnknownPortErrors: mustCreateMetric("/netstack/udp/unknown_port_errors", "Number of incoming UDP datagrams dropped because they did not have a known destination port."), ReceiveBufferErrors: mustCreateMetric("/netstack/udp/receive_buffer_errors", "Number of incoming UDP datagrams dropped due to the receiving buffer being in an invalid state."), MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."), - PacketsSent: mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent via sendUDP."), + PacketsSent: mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."), + PacketSendErrors: mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."), }, } +// DefaultTTL is linux's default TTL. All network protocols in all stacks used +// with this package must have this value set as their default TTL. +const DefaultTTL = 64 + const sizeOfInt32 int = 4 -var errStackType = syserr.New("expected but did not receive an epsocket.Stack", linux.EINVAL) +var errStackType = syserr.New("expected but did not receive a netstack.Stack", linux.EINVAL) // ntohs converts a 16-bit number from network byte order to host byte order. It // assumes that the host is little endian. @@ -208,6 +222,10 @@ type commonEndpoint interface { // transport.Endpoint.SetSockOpt. SetSockOpt(interface{}) *tcpip.Error + // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and + // transport.Endpoint.SetSockOptInt. + SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error + // GetSockOpt implements tcpip.Endpoint.GetSockOpt and // transport.Endpoint.GetSockOpt. GetSockOpt(interface{}) *tcpip.Error @@ -227,7 +245,6 @@ type SocketOperations struct { fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoFsync `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoSplice `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` socket.SendReceiveTimeout *waiter.Queue @@ -258,20 +275,20 @@ type SocketOperations struct { // valid when timestampValid is true. It is protected by readMu. timestampNS int64 - // sockOptInq corresponds to TCP_INQ. It is implemented on the epsocket - // level, because it takes into account data from readView. + // sockOptInq corresponds to TCP_INQ. It is implemented at this level + // because it takes into account data from readView. sockOptInq bool } // New creates a new endpoint socket. func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) { if skType == linux.SOCK_STREAM { - if err := endpoint.SetSockOpt(tcpip.DelayOption(1)); err != nil { + if err := endpoint.SetSockOptInt(tcpip.DelayOption, 1); err != nil { return nil, syserr.TranslateNetstackError(err) } } - dirent := socket.NewDirent(t, epsocketDevice) + dirent := socket.NewDirent(t, netstackDevice) defer dirent.DecRef() return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, &SocketOperations{ Queue: queue, @@ -284,6 +301,7 @@ func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue var sockAddrInetSize = int(binary.Size(linux.SockAddrInet{})) var sockAddrInet6Size = int(binary.Size(linux.SockAddrInet6{})) +var sockAddrLinkSize = int(binary.Size(linux.SockAddrLink{})) // bytesToIPAddress converts an IPv4 or IPv6 address from the user to the // netstack representation taking any addresses into account. @@ -295,12 +313,12 @@ func bytesToIPAddress(addr []byte) tcpip.Address { } // AddressAndFamily reads an sockaddr struct from the given address and -// converts it to the FullAddress format. It supports AF_UNIX, AF_INET and -// AF_INET6 addresses. +// converts it to the FullAddress format. It supports AF_UNIX, AF_INET, +// AF_INET6, and AF_PACKET addresses. // // strict indicates whether addresses with the AF_UNSPEC family are accepted of not. // -// AddressAndFamily returns an address, its family. +// AddressAndFamily returns an address and its family. func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, uint16, *syserr.Error) { // Make sure we have at least 2 bytes for the address family. if len(addr) < 2 { @@ -308,7 +326,7 @@ func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, } family := usermem.ByteOrder.Uint16(addr) - if family != uint16(sfamily) && (!strict && family != linux.AF_UNSPEC) { + if family != uint16(sfamily) && (strict || family != linux.AF_UNSPEC) { return tcpip.FullAddress{}, family, syserr.ErrAddressFamilyNotSupported } @@ -359,6 +377,22 @@ func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, } return out, family, nil + case linux.AF_PACKET: + var a linux.SockAddrLink + if len(addr) < sockAddrLinkSize { + return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument + } + binary.Unmarshal(addr[:sockAddrLinkSize], usermem.ByteOrder, &a) + if a.Family != linux.AF_PACKET || a.HardwareAddrLen != header.EthernetAddressSize { + return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument + } + + // TODO(b/129292371): Return protocol too. + return tcpip.FullAddress{ + NIC: tcpip.NICID(a.InterfaceIndex), + Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]), + }, family, nil + case linux.AF_UNSPEC: return tcpip.FullAddress{}, family, nil @@ -412,17 +446,60 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS return int64(n), nil } -// ioSequencePayload implements tcpip.Payload. It copies user memory bytes on demand -// based on the requested size. +// WriteTo implements fs.FileOperations.WriteTo. +func (s *SocketOperations) WriteTo(ctx context.Context, _ *fs.File, dst io.Writer, count int64, dup bool) (int64, error) { + s.readMu.Lock() + + // Copy as much data as possible. + done := int64(0) + for count > 0 { + // This may return a blocking error. + if err := s.fetchReadView(); err != nil { + s.readMu.Unlock() + return done, err.ToError() + } + + // Write to the underlying file. + n, err := dst.Write(s.readView) + done += int64(n) + count -= int64(n) + if dup { + // That's all we support for dup. This is generally + // supported by any Linux system calls, but the + // expectation is that now a caller will call read to + // actually remove these bytes from the socket. + break + } + + // Drop that part of the view. + s.readView.TrimFront(n) + if err != nil { + s.readMu.Unlock() + return done, err + } + } + + s.readMu.Unlock() + return done, nil +} + +// ioSequencePayload implements tcpip.Payload. +// +// t copies user memory bytes on demand based on the requested size. type ioSequencePayload struct { ctx context.Context src usermem.IOSequence } -// Get implements tcpip.Payload. -func (i *ioSequencePayload) Get(size int) ([]byte, *tcpip.Error) { - if size > i.Size() { - size = i.Size() +// FullPayload implements tcpip.Payloader.FullPayload +func (i *ioSequencePayload) FullPayload() ([]byte, *tcpip.Error) { + return i.Payload(int(i.src.NumBytes())) +} + +// Payload implements tcpip.Payloader.Payload. +func (i *ioSequencePayload) Payload(size int) ([]byte, *tcpip.Error) { + if max := int(i.src.NumBytes()); size > max { + size = max } v := buffer.NewView(size) if _, err := i.src.CopyIn(i.ctx, v); err != nil { @@ -431,11 +508,6 @@ func (i *ioSequencePayload) Get(size int) ([]byte, *tcpip.Error) { return v, nil } -// Size implements tcpip.Payload. -func (i *ioSequencePayload) Size() int { - return int(i.src.NumBytes()) -} - // DropFirst drops the first n bytes from underlying src. func (i *ioSequencePayload) DropFirst(n int) { i.src = i.src.DropFirst(int(n)) @@ -469,6 +541,78 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO return int64(n), nil } +// readerPayload implements tcpip.Payloader. +// +// It allocates a view and reads from a reader on-demand, based on available +// capacity in the endpoint. +type readerPayload struct { + ctx context.Context + r io.Reader + count int64 + err error +} + +// FullPayload implements tcpip.Payloader.FullPayload. +func (r *readerPayload) FullPayload() ([]byte, *tcpip.Error) { + return r.Payload(int(r.count)) +} + +// Payload implements tcpip.Payloader.Payload. +func (r *readerPayload) Payload(size int) ([]byte, *tcpip.Error) { + if size > int(r.count) { + size = int(r.count) + } + v := buffer.NewView(size) + n, err := r.r.Read(v) + if n > 0 { + // We ignore the error here. It may re-occur on subsequent + // reads, but for now we can enqueue some amount of data. + r.count -= int64(n) + return v[:n], nil + } + if err == syserror.ErrWouldBlock { + return nil, tcpip.ErrWouldBlock + } else if err != nil { + r.err = err // Save for propation. + return nil, tcpip.ErrBadAddress + } + + // There is no data and no error. Return an error, which will propagate + // r.err, which will be nil. This is the desired result: (0, nil). + return nil, tcpip.ErrBadAddress +} + +// ReadFrom implements fs.FileOperations.ReadFrom. +func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) { + f := &readerPayload{ctx: ctx, r: r, count: count} + n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{ + // Reads may be destructive but should be very fast, + // so we can't release the lock while copying data. + Atomic: true, + }) + if err == tcpip.ErrWouldBlock { + return 0, syserror.ErrWouldBlock + } + + if resCh != nil { + t := ctx.(*kernel.Task) + if err := t.Block(resCh); err != nil { + return 0, syserr.FromError(err).ToError() + } + + n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{ + Atomic: true, // See above. + }) + } + if err == tcpip.ErrWouldBlock { + return n, syserror.ErrWouldBlock + } else if err != nil { + return int64(n), f.err // Propagate error. + } + + return int64(n), nil +} + // Readiness returns a mask of ready events for socket s. func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { r := s.Endpoint.Readiness(mask) @@ -646,7 +790,7 @@ func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { // tcpip.Endpoint. func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is - // implemented specifically for epsocket.SocketOperations rather than + // implemented specifically for netstack.SocketOperations rather than // commonEndpoint. commonEndpoint should be extended to support socket // options where the implementation is not shared, as unix sockets need // their own support for SO_TIMESTAMP. @@ -719,7 +863,7 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, return getSockOptIPv6(t, ep, name, outLen) case linux.SOL_IP: - return getSockOptIP(t, ep, name, outLen) + return getSockOptIP(t, ep, name, outLen, family) case linux.SOL_UDP, linux.SOL_ICMPV6, @@ -777,8 +921,8 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family return nil, syserr.ErrInvalidArgument } - var size tcpip.SendBufferSizeOption - if err := ep.GetSockOpt(&size); err != nil { + size, err := ep.GetSockOptInt(tcpip.SendBufferSizeOption) + if err != nil { return nil, syserr.TranslateNetstackError(err) } @@ -793,8 +937,8 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family return nil, syserr.ErrInvalidArgument } - var size tcpip.ReceiveBufferSizeOption - if err := ep.GetSockOpt(&size); err != nil { + size, err := ep.GetSockOptInt(tcpip.ReceiveBufferSizeOption) + if err != nil { return nil, syserr.TranslateNetstackError(err) } @@ -828,6 +972,19 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family return int32(v), nil + case linux.SO_BINDTODEVICE: + var v tcpip.BindToDeviceOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + if len(v) == 0 { + return []byte{}, nil + } + if outLen < linux.IFNAMSIZ { + return nil, syserr.ErrInvalidArgument + } + return append([]byte(v), 0), nil + case linux.SO_BROADCAST: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument @@ -900,8 +1057,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return nil, syserr.ErrInvalidArgument } - var v tcpip.DelayOption - if err := ep.GetSockOpt(&v); err != nil { + v, err := ep.GetSockOptInt(tcpip.DelayOption) + if err != nil { return nil, syserr.TranslateNetstackError(err) } @@ -970,6 +1127,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return int32(time.Duration(v) / time.Second), nil + case linux.TCP_USER_TIMEOUT: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.TCPUserTimeoutOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(time.Duration(v) / time.Millisecond), nil + case linux.TCP_INFO: var v tcpip.TCPInfoOption if err := ep.GetSockOpt(&v); err != nil { @@ -1018,6 +1187,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa copy(b, v) return b, nil + case linux.TCP_LINGER2: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.TCPLingerTimeoutOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(time.Duration(v) / time.Second), nil + default: emitUnimplementedEventTCP(t, name) } @@ -1042,6 +1223,25 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf case linux.IPV6_PATHMTU: t.Kernel().EmitUnimplementedEvent(t) + case linux.IPV6_TCLASS: + // Length handling for parity with Linux. + if outLen == 0 { + return make([]byte, 0), nil + } + var v tcpip.IPv6TrafficClassOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + uintv := uint32(v) + // Linux truncates the output binary to outLen. + ib := binary.Marshal(nil, usermem.ByteOrder, &uintv) + // Handle cases where outLen is lesser than sizeOfInt32. + if len(ib) > outLen { + ib = ib[:outLen] + } + return ib, nil + default: emitUnimplementedEventIPv6(t, name) } @@ -1049,8 +1249,25 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf } // getSockOptIP implements GetSockOpt when level is SOL_IP. -func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) { +func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family int) (interface{}, *syserr.Error) { switch name { + case linux.IP_TTL: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.TTLOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + // Fill in the default value, if needed. + if v == 0 { + v = DefaultTTL + } + + return int32(v), nil + case linux.IP_MULTICAST_TTL: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument @@ -1092,6 +1309,20 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfac } return int32(0), nil + case linux.IP_TOS: + // Length handling for parity with Linux. + if outLen == 0 { + return []byte(nil), nil + } + var v tcpip.IPv4TOSOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + if outLen < sizeOfInt32 { + return uint8(v), nil + } + return int32(v), nil + default: emitUnimplementedEventIP(t, name) } @@ -1102,7 +1333,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfac // tcpip.Endpoint. func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is - // implemented specifically for epsocket.SocketOperations rather than + // implemented specifically for netstack.SocketOperations rather than // commonEndpoint. commonEndpoint should be extended to support socket // options where the implementation is not shared, as unix sockets need // their own support for SO_TIMESTAMP. @@ -1165,7 +1396,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.SendBufferSizeOption(v))) + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.SendBufferSizeOption, int(v))) case linux.SO_RCVBUF: if len(optVal) < sizeOfInt32 { @@ -1173,7 +1404,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReceiveBufferSizeOption(v))) + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, int(v))) case linux.SO_REUSEADDR: if len(optVal) < sizeOfInt32 { @@ -1191,6 +1422,13 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i v := usermem.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReusePortOption(v))) + case linux.SO_BINDTODEVICE: + n := bytes.IndexByte(optVal, 0) + if n == -1 { + n = len(optVal) + } + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(optVal[:n]))) + case linux.SO_BROADCAST: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument @@ -1285,11 +1523,11 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * } v := usermem.ByteOrder.Uint32(optVal) - var o tcpip.DelayOption + var o int if v == 0 { o = 1 } - return syserr.TranslateNetstackError(ep.SetSockOpt(o)) + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.DelayOption, o)) case linux.TCP_CORK: if len(optVal) < sizeOfInt32 { @@ -1337,6 +1575,17 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * } return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v)))) + case linux.TCP_USER_TIMEOUT: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := int32(usermem.ByteOrder.Uint32(optVal)) + if v < 0 { + return syserr.ErrInvalidArgument + } + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v)))) + case linux.TCP_CONGESTION: v := tcpip.CongestionControlOption(optVal) if err := ep.SetSockOpt(v); err != nil { @@ -1344,6 +1593,14 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * } return nil + case linux.TCP_LINGER2: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v)))) + case linux.TCP_REPAIR_OPTIONS: t.Kernel().EmitUnimplementedEvent(t) @@ -1383,6 +1640,19 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) t.Kernel().EmitUnimplementedEvent(t) + case linux.IPV6_TCLASS: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + v := int32(usermem.ByteOrder.Uint32(optVal)) + if v < -1 || v > 255 { + return syserr.ErrInvalidArgument + } + if v == -1 { + v = 0 + } + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv6TrafficClassOption(v))) + default: emitUnimplementedEventIPv6(t, name) } @@ -1514,6 +1784,30 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s t.Kernel().EmitUnimplementedEvent(t) return syserr.ErrInvalidArgument + case linux.IP_TTL: + v, err := parseIntOrChar(optVal) + if err != nil { + return err + } + + // -1 means default TTL. + if v == -1 { + v = 0 + } else if v < 1 || v > 255 { + return syserr.ErrInvalidArgument + } + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TTLOption(v))) + + case linux.IP_TOS: + if len(optVal) == 0 { + return nil + } + v, err := parseIntOrChar(optVal) + if err != nil { + return err + } + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv4TOSOption(v))) + case linux.IP_ADD_SOURCE_MEMBERSHIP, linux.IP_BIND_ADDRESS_NO_PORT, linux.IP_BLOCK_SOURCE, @@ -1537,9 +1831,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s linux.IP_RECVTOS, linux.IP_RECVTTL, linux.IP_RETOPTS, - linux.IP_TOS, linux.IP_TRANSPARENT, - linux.IP_TTL, linux.IP_UNBLOCK_SOURCE, linux.IP_UNICAST_IF, linux.IP_XFRM_POLICY, @@ -1724,12 +2016,14 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) return &out, uint32(2 + l) } return &out, uint32(3 + l) + case linux.AF_INET: var out linux.SockAddrInet copy(out.Addr[:], addr.Addr) out.Family = linux.AF_INET out.Port = htons(addr.Port) - return &out, uint32(binary.Size(out)) + return &out, uint32(sockAddrInetSize) + case linux.AF_INET6: var out linux.SockAddrInet6 if len(addr.Addr) == 4 { @@ -1745,7 +2039,17 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) if isLinkLocal(addr.Addr) { out.Scope_id = uint32(addr.NIC) } - return &out, uint32(binary.Size(out)) + return &out, uint32(sockAddrInet6Size) + + case linux.AF_PACKET: + // TODO(b/129292371): Return protocol too. + var out linux.SockAddrLink + out.Family = linux.AF_PACKET + out.InterfaceIndex = int32(addr.NIC) + out.HardwareAddrLen = header.EthernetAddressSize + copy(out.HardwareAddr[:], addr.Addr) + return &out, uint32(sockAddrLinkSize) + default: return nil, 0 } @@ -2060,7 +2364,7 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] n, _, err = s.Endpoint.Write(v, opts) } dontWait := flags&linux.MSG_DONTWAIT != 0 - if err == nil && (n >= int64(v.Size()) || dontWait) { + if err == nil && (n >= v.src.NumBytes() || dontWait) { // Complete write. return int(n), nil } @@ -2085,7 +2389,7 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] return 0, syserr.TranslateNetstackError(err) } - if err == nil && v.Size() == 0 || err != nil && err != tcpip.ErrWouldBlock { + if err == nil && v.src.NumBytes() == 0 || err != nil && err != tcpip.ErrWouldBlock { return int(total), nil } @@ -2101,7 +2405,7 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] // Ioctl implements fs.FileOperations.Ioctl. func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - // SIOCGSTAMP is implemented by epsocket rather than all commonEndpoint + // SIOCGSTAMP is implemented by netstack rather than all commonEndpoint // sockets. // TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP. switch args[1].Int() { @@ -2207,9 +2511,9 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc return 0, err case linux.TIOCOUTQ: - var v tcpip.SendQueueSizeOption - if err := ep.GetSockOpt(&v); err != nil { - return 0, syserr.TranslateNetstackError(err).ToError() + v, terr := ep.GetSockOptInt(tcpip.SendQueueSizeOption) + if terr != nil { + return 0, syserr.TranslateNetstackError(terr).ToError() } if v > math.MaxInt32 { @@ -2404,7 +2708,7 @@ func ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error { // Flag values and meanings are described in greater detail in netdevice(7) in // the SIOCGIFFLAGS section. func interfaceStatusFlags(stack inet.Stack, name string) (uint32, *syserr.Error) { - // epsocket should only ever be passed an epsocket.Stack. + // We should only ever be passed a netstack.Stack. epstack, ok := stack.(*Stack) if !ok { return 0, errStackType diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/netstack/provider.go index 421f93dc4..2d2c1ba2a 100644 --- a/pkg/sentry/socket/epsocket/provider.go +++ b/pkg/sentry/socket/netstack/provider.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package epsocket +package netstack import ( "syscall" @@ -62,10 +62,14 @@ func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol in } case linux.SOCK_RAW: + // TODO(b/142504697): "In order to create a raw socket, a + // process must have the CAP_NET_RAW capability in the user + // namespace that governs its network namespace." - raw(7) + // Raw sockets require CAP_NET_RAW. creds := auth.CredentialsFromContext(ctx) if !creds.HasCapability(linux.CAP_NET_RAW) { - return 0, true, syserr.ErrPermissionDenied + return 0, true, syserr.ErrNotPermitted } switch protocol { @@ -85,7 +89,8 @@ func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol in return 0, true, syserr.ErrProtocolNotSupported } -// Socket creates a new socket object for the AF_INET or AF_INET6 family. +// Socket creates a new socket object for the AF_INET, AF_INET6, or AF_PACKET +// family. func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { // Fail right away if we don't have a stack. stack := t.NetworkContext() @@ -99,6 +104,12 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (* return nil, nil } + // Packet sockets are handled separately, since they are neither INET + // nor INET6 specific. + if p.family == linux.AF_PACKET { + return packetSocket(t, eps, stype, protocol) + } + // Figure out the transport protocol. transProto, associated, err := getTransportProtocol(t, stype, protocol) if err != nil { @@ -121,12 +132,47 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (* return New(t, p.family, stype, int(transProto), wq, ep) } +func packetSocket(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { + // TODO(b/142504697): "In order to create a packet socket, a process + // must have the CAP_NET_RAW capability in the user namespace that + // governs its network namespace." - packet(7) + + // Packet sockets require CAP_NET_RAW. + creds := auth.CredentialsFromContext(t) + if !creds.HasCapability(linux.CAP_NET_RAW) { + return nil, syserr.ErrNotPermitted + } + + // "cooked" packets don't contain link layer information. + var cooked bool + switch stype { + case linux.SOCK_DGRAM: + cooked = true + case linux.SOCK_RAW: + cooked = false + default: + return nil, syserr.ErrProtocolNotSupported + } + + // protocol is passed in network byte order, but netstack wants it in + // host order. + netProto := tcpip.NetworkProtocolNumber(ntohs(uint16(protocol))) + + wq := &waiter.Queue{} + ep, err := epStack.Stack.NewPacketEndpoint(cooked, netProto, wq) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return New(t, linux.AF_PACKET, stype, protocol, wq, ep) +} + // Pair just returns nil sockets (not supported). func (*provider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) { return nil, nil, nil } -// init registers socket providers for AF_INET and AF_INET6. +// init registers socket providers for AF_INET, AF_INET6, and AF_PACKET. func init() { // Providers backed by netstack. p := []provider{ @@ -138,6 +184,9 @@ func init() { family: linux.AF_INET6, netProto: ipv6.ProtocolNumber, }, + { + family: linux.AF_PACKET, + }, } for i := range p { diff --git a/pkg/sentry/socket/epsocket/save_restore.go b/pkg/sentry/socket/netstack/save_restore.go index f7b8c10cc..c7aaf722a 100644 --- a/pkg/sentry/socket/epsocket/save_restore.go +++ b/pkg/sentry/socket/netstack/save_restore.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package epsocket +package netstack import ( "gvisor.dev/gvisor/pkg/tcpip/stack" diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/netstack/stack.go index 7cf7ff735..a0db2d4fd 100644 --- a/pkg/sentry/socket/epsocket/stack.go +++ b/pkg/sentry/socket/netstack/stack.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package epsocket +package netstack import ( "gvisor.dev/gvisor/pkg/abi/linux" @@ -144,7 +144,98 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error { // Statistics implements inet.Stack.Statistics. func (s *Stack) Statistics(stat interface{}, arg string) error { - return syserr.ErrEndpointOperation.ToError() + switch stats := stat.(type) { + case *inet.StatSNMPIP: + ip := Metrics.IP + *stats = inet.StatSNMPIP{ + 0, // TODO(gvisor.dev/issue/969): Support Ip/Forwarding. + 0, // TODO(gvisor.dev/issue/969): Support Ip/DefaultTTL. + ip.PacketsReceived.Value(), // InReceives. + 0, // TODO(gvisor.dev/issue/969): Support Ip/InHdrErrors. + ip.InvalidAddressesReceived.Value(), // InAddrErrors. + 0, // TODO(gvisor.dev/issue/969): Support Ip/ForwDatagrams. + 0, // TODO(gvisor.dev/issue/969): Support Ip/InUnknownProtos. + 0, // TODO(gvisor.dev/issue/969): Support Ip/InDiscards. + ip.PacketsDelivered.Value(), // InDelivers. + ip.PacketsSent.Value(), // OutRequests. + ip.OutgoingPacketErrors.Value(), // OutDiscards. + 0, // TODO(gvisor.dev/issue/969): Support Ip/OutNoRoutes. + 0, // TODO(gvisor.dev/issue/969): Support Ip/ReasmTimeout. + 0, // TODO(gvisor.dev/issue/969): Support Ip/ReasmReqds. + 0, // TODO(gvisor.dev/issue/969): Support Ip/ReasmOKs. + 0, // TODO(gvisor.dev/issue/969): Support Ip/ReasmFails. + 0, // TODO(gvisor.dev/issue/969): Support Ip/FragOKs. + 0, // TODO(gvisor.dev/issue/969): Support Ip/FragFails. + 0, // TODO(gvisor.dev/issue/969): Support Ip/FragCreates. + } + case *inet.StatSNMPICMP: + in := Metrics.ICMP.V4PacketsReceived.ICMPv4PacketStats + out := Metrics.ICMP.V4PacketsSent.ICMPv4PacketStats + *stats = inet.StatSNMPICMP{ + 0, // TODO(gvisor.dev/issue/969): Support Icmp/InMsgs. + Metrics.ICMP.V4PacketsSent.Dropped.Value(), // InErrors. + 0, // TODO(gvisor.dev/issue/969): Support Icmp/InCsumErrors. + in.DstUnreachable.Value(), // InDestUnreachs. + in.TimeExceeded.Value(), // InTimeExcds. + in.ParamProblem.Value(), // InParmProbs. + in.SrcQuench.Value(), // InSrcQuenchs. + in.Redirect.Value(), // InRedirects. + in.Echo.Value(), // InEchos. + in.EchoReply.Value(), // InEchoReps. + in.Timestamp.Value(), // InTimestamps. + in.TimestampReply.Value(), // InTimestampReps. + in.InfoRequest.Value(), // InAddrMasks. + in.InfoReply.Value(), // InAddrMaskReps. + 0, // TODO(gvisor.dev/issue/969): Support Icmp/OutMsgs. + Metrics.ICMP.V4PacketsReceived.Invalid.Value(), // OutErrors. + out.DstUnreachable.Value(), // OutDestUnreachs. + out.TimeExceeded.Value(), // OutTimeExcds. + out.ParamProblem.Value(), // OutParmProbs. + out.SrcQuench.Value(), // OutSrcQuenchs. + out.Redirect.Value(), // OutRedirects. + out.Echo.Value(), // OutEchos. + out.EchoReply.Value(), // OutEchoReps. + out.Timestamp.Value(), // OutTimestamps. + out.TimestampReply.Value(), // OutTimestampReps. + out.InfoRequest.Value(), // OutAddrMasks. + out.InfoReply.Value(), // OutAddrMaskReps. + } + case *inet.StatSNMPTCP: + tcp := Metrics.TCP + // RFC 2012 (updates 1213): SNMPv2-MIB-TCP. + *stats = inet.StatSNMPTCP{ + 1, // RtoAlgorithm. + 200, // RtoMin. + 120000, // RtoMax. + (1<<64 - 1), // MaxConn. + tcp.ActiveConnectionOpenings.Value(), // ActiveOpens. + tcp.PassiveConnectionOpenings.Value(), // PassiveOpens. + tcp.FailedConnectionAttempts.Value(), // AttemptFails. + tcp.EstablishedResets.Value(), // EstabResets. + tcp.CurrentEstablished.Value(), // CurrEstab. + tcp.ValidSegmentsReceived.Value(), // InSegs. + tcp.SegmentsSent.Value(), // OutSegs. + tcp.Retransmits.Value(), // RetransSegs. + tcp.InvalidSegmentsReceived.Value(), // InErrs. + tcp.ResetsSent.Value(), // OutRsts. + tcp.ChecksumErrors.Value(), // InCsumErrors. + } + case *inet.StatSNMPUDP: + udp := Metrics.UDP + *stats = inet.StatSNMPUDP{ + udp.PacketsReceived.Value(), // InDatagrams. + udp.UnknownPortErrors.Value(), // NoPorts. + 0, // TODO(gvisor.dev/issue/969): Support Udp/InErrors. + udp.PacketsSent.Value(), // OutDatagrams. + udp.ReceiveBufferErrors.Value(), // RcvbufErrors. + 0, // TODO(gvisor.dev/issue/969): Support Udp/SndbufErrors. + 0, // TODO(gvisor.dev/issue/969): Support Udp/InCsumErrors. + 0, // TODO(gvisor.dev/issue/969): Support Udp/IgnoredMulti. + } + default: + return syserr.ErrEndpointOperation.ToError() + } + return nil } // RouteTable implements inet.Stack.RouteTable. @@ -200,3 +291,18 @@ func (s *Stack) FillDefaultIPTables() { func (s *Stack) Resume() { s.Stack.Resume() } + +// RegisteredEndpoints implements inet.Stack.RegisteredEndpoints. +func (s *Stack) RegisteredEndpoints() []stack.TransportEndpoint { + return s.Stack.RegisteredEndpoints() +} + +// CleanupEndpoints implements inet.Stack.CleanupEndpoints. +func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { + return s.Stack.CleanupEndpoints() +} + +// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints. +func (s *Stack) RestoreCleanupEndpoints(es []stack.TransportEndpoint) { + s.Stack.RestoreCleanupEndpoints(es) +} diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD index 5061dcbde..4668b87d1 100644 --- a/pkg/sentry/socket/rpcinet/BUILD +++ b/pkg/sentry/socket/rpcinet/BUILD @@ -1,5 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +load("@rules_cc//cc:defs.bzl", "cc_proto_library") package(licenses = ["notice"]) @@ -36,6 +37,7 @@ go_library( "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/buffer", + "//pkg/tcpip/stack", "//pkg/unet", "//pkg/waiter", ], @@ -49,6 +51,14 @@ proto_library( ], ) +cc_proto_library( + name = "syscall_rpc_cc_proto", + visibility = [ + "//visibility:public", + ], + deps = [":syscall_rpc_proto"], +) + go_proto_library( name = "syscall_rpc_go_proto", importpath = "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto", diff --git a/pkg/sentry/socket/rpcinet/stack.go b/pkg/sentry/socket/rpcinet/stack.go index 5dcb6b455..f7878a760 100644 --- a/pkg/sentry/socket/rpcinet/stack.go +++ b/pkg/sentry/socket/rpcinet/stack.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn" "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/notifier" "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/unet" ) @@ -165,3 +166,12 @@ func (s *Stack) RouteTable() []inet.Route { // Resume implements inet.Stack.Resume. func (s *Stack) Resume() {} + +// RegisteredEndpoints implements inet.Stack.RegisteredEndpoints. +func (s *Stack) RegisteredEndpoints() []stack.TransportEndpoint { return nil } + +// CleanupEndpoints implements inet.Stack.CleanupEndpoints. +func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { return nil } + +// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints. +func (s *Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {} diff --git a/pkg/sentry/socket/rpcinet/syscall_rpc.proto b/pkg/sentry/socket/rpcinet/syscall_rpc.proto index 9586f5923..b677e9eb3 100644 --- a/pkg/sentry/socket/rpcinet/syscall_rpc.proto +++ b/pkg/sentry/socket/rpcinet/syscall_rpc.proto @@ -3,7 +3,6 @@ syntax = "proto3"; // package syscall_rpc is a set of networking related system calls that can be // forwarded to a socket gofer. // -// TODO(b/77963526): Document individual RPCs. package syscall_rpc; message SendmsgRequest { diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 8c250c325..2389a9cdb 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -43,6 +43,11 @@ type ControlMessages struct { IP tcpip.ControlMessages } +// Release releases Unix domain socket credentials and rights. +func (c *ControlMessages) Release() { + c.Unix.Release() +} + // Socket is the interface containing socket syscalls used by the syscall layer // to redirect them to the appropriate implementation. type Socket interface { diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index da9977fde..5b6a154f6 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "unix", srcs = [ @@ -24,7 +24,7 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/socket/control", - "//pkg/sentry/socket/epsocket", + "//pkg/sentry/socket/netstack", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserr", diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD index 0b0240336..788ad70d2 100644 --- a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") +package(licenses = ["notice"]) + go_template_instance( name = "transport_message_list", out = "transport_message_list.go", diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index 4bd15808a..dea11e253 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -220,6 +220,11 @@ func (e *connectionedEndpoint) Close() { case e.Connected(): e.connected.CloseSend() e.receiver.CloseRecv() + // Still have unread data? If yes, we set this into the write + // end so that the peer can get ECONNRESET) when it does read. + if e.receiver.RecvQueuedSize() > 0 { + e.connected.CloseUnread() + } c = e.connected r = e.receiver e.connected = nil diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index 0415fae9a..e27b1c714 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -33,6 +33,7 @@ type queue struct { mu sync.Mutex `state:"nosave"` closed bool + unread bool used int64 limit int64 dataList messageList @@ -161,6 +162,9 @@ func (q *queue) Dequeue() (e *message, notify bool, err *syserr.Error) { err := syserr.ErrWouldBlock if q.closed { err = syserr.ErrClosedForReceive + if q.unread { + err = syserr.ErrConnectionReset + } } q.mu.Unlock() @@ -188,7 +192,9 @@ func (q *queue) Peek() (*message, *syserr.Error) { if q.dataList.Front() == nil { err := syserr.ErrWouldBlock if q.closed { - err = syserr.ErrClosedForReceive + if err = syserr.ErrClosedForReceive; q.unread { + err = syserr.ErrConnectionReset + } } return nil, err } @@ -208,3 +214,11 @@ func (q *queue) QueuedSize() int64 { func (q *queue) MaxQueueSize() int64 { return q.limit } + +// CloseUnread sets flag to indicate that the peer is closed (not shutdown) +// with unread data. So if read on this queue shall return ECONNRESET error. +func (q *queue) CloseUnread() { + q.mu.Lock() + defer q.mu.Unlock() + q.unread = true +} diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 2b0ad6395..529a7a7a9 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -175,6 +175,10 @@ type Endpoint interface { // types. SetSockOpt(opt interface{}) *tcpip.Error + // SetSockOptInt sets a socket option for simple cases when a value has + // the int type. + SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error + // GetSockOpt gets a socket option. opt should be a pointer to one of the // tcpip.*Option types. GetSockOpt(opt interface{}) *tcpip.Error @@ -604,6 +608,10 @@ type ConnectedEndpoint interface { // Release releases any resources owned by the ConnectedEndpoint. It should // be called before droping all references to a ConnectedEndpoint. Release() + + // CloseUnread sets the fact that this end is closed with unread data to + // the peer socket. + CloseUnread() } // +stateify savable @@ -707,6 +715,11 @@ func (e *connectedEndpoint) Release() { e.writeQueue.DecRef() } +// CloseUnread implements ConnectedEndpoint.CloseUnread. +func (e *connectedEndpoint) CloseUnread() { + e.writeQueue.CloseUnread() +} + // baseEndpoint is an embeddable unix endpoint base used in both the connected and connectionless // unix domain socket Endpoint implementations. // @@ -838,6 +851,10 @@ func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error { return nil } +func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error { + return nil +} + func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) { switch opt { case tcpip.ReceiveQueueSizeOption: @@ -853,65 +870,63 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) { return -1, tcpip.ErrQueueSizeNotSupported } return v, nil - default: - return -1, tcpip.ErrUnknownProtocolOption - } -} -// GetSockOpt implements tcpip.Endpoint.GetSockOpt. -func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { - switch o := opt.(type) { - case tcpip.ErrorOption: - return nil - - case *tcpip.SendQueueSizeOption: + case tcpip.SendQueueSizeOption: e.Lock() if !e.Connected() { e.Unlock() - return tcpip.ErrNotConnected + return -1, tcpip.ErrNotConnected } - qs := tcpip.SendQueueSizeOption(e.connected.SendQueuedSize()) + v := e.connected.SendQueuedSize() e.Unlock() - if qs < 0 { - return tcpip.ErrQueueSizeNotSupported - } - *o = qs - return nil - - case *tcpip.PasscredOption: - if e.Passcred() { - *o = tcpip.PasscredOption(1) - } else { - *o = tcpip.PasscredOption(0) + if v < 0 { + return -1, tcpip.ErrQueueSizeNotSupported } - return nil + return int(v), nil - case *tcpip.SendBufferSizeOption: + case tcpip.SendBufferSizeOption: e.Lock() if !e.Connected() { e.Unlock() - return tcpip.ErrNotConnected + return -1, tcpip.ErrNotConnected } - qs := tcpip.SendBufferSizeOption(e.connected.SendMaxQueueSize()) + v := e.connected.SendMaxQueueSize() e.Unlock() - if qs < 0 { - return tcpip.ErrQueueSizeNotSupported + if v < 0 { + return -1, tcpip.ErrQueueSizeNotSupported } - *o = qs - return nil + return int(v), nil - case *tcpip.ReceiveBufferSizeOption: + case tcpip.ReceiveBufferSizeOption: e.Lock() if e.receiver == nil { e.Unlock() - return tcpip.ErrNotConnected + return -1, tcpip.ErrNotConnected } - qs := tcpip.ReceiveBufferSizeOption(e.receiver.RecvMaxQueueSize()) + v := e.receiver.RecvMaxQueueSize() e.Unlock() - if qs < 0 { - return tcpip.ErrQueueSizeNotSupported + if v < 0 { + return -1, tcpip.ErrQueueSizeNotSupported + } + return int(v), nil + + default: + return -1, tcpip.ErrUnknownProtocolOption + } +} + +// GetSockOpt implements tcpip.Endpoint.GetSockOpt. +func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { + switch o := opt.(type) { + case tcpip.ErrorOption: + return nil + + case *tcpip.PasscredOption: + if e.Passcred() { + *o = tcpip.PasscredOption(1) + } else { + *o = tcpip.PasscredOption(0) } - *o = qs return nil case *tcpip.KeepaliveEnabledOption: diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 0d0cb68df..885758054 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -31,7 +31,7 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" - "gvisor.dev/gvisor/pkg/sentry/socket/epsocket" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" @@ -40,8 +40,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// SocketOperations is a Unix socket. It is similar to an epsocket, except it -// is backed by a transport.Endpoint instead of a tcpip.Endpoint. +// SocketOperations is a Unix socket. It is similar to a netstack socket, +// except it is backed by a transport.Endpoint instead of a tcpip.Endpoint. // // +stateify savable type SocketOperations struct { @@ -116,8 +116,11 @@ func (s *SocketOperations) Endpoint() transport.Endpoint { // extractPath extracts and validates the address. func extractPath(sockaddr []byte) (string, *syserr.Error) { - addr, _, err := epsocket.AddressAndFamily(linux.AF_UNIX, sockaddr, true /* strict */) + addr, _, err := netstack.AddressAndFamily(linux.AF_UNIX, sockaddr, true /* strict */) if err != nil { + if err == syserr.ErrAddressFamilyNotSupported { + err = syserr.ErrInvalidArgument + } return "", err } @@ -143,7 +146,7 @@ func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, return nil, 0, syserr.TranslateNetstackError(err) } - a, l := epsocket.ConvertAddress(linux.AF_UNIX, addr) + a, l := netstack.ConvertAddress(linux.AF_UNIX, addr) return a, l, nil } @@ -155,19 +158,19 @@ func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, return nil, 0, syserr.TranslateNetstackError(err) } - a, l := epsocket.ConvertAddress(linux.AF_UNIX, addr) + a, l := netstack.ConvertAddress(linux.AF_UNIX, addr) return a, l, nil } // Ioctl implements fs.FileOperations.Ioctl. func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - return epsocket.Ioctl(ctx, s.ep, io, args) + return netstack.Ioctl(ctx, s.ep, io, args) } // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by // a transport.Endpoint. func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { - return epsocket.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen) + return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen) } // Listen implements the linux syscall listen(2) for sockets backed by @@ -474,13 +477,13 @@ func (s *SocketOperations) EventUnregister(e *waiter.Entry) { // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by // a transport.Endpoint. func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { - return epsocket.SetSockOpt(t, s, s.ep, level, name, optVal) + return netstack.SetSockOpt(t, s, s.ep, level, name, optVal) } // Shutdown implements the linux syscall shutdown(2) for sockets backed by // a transport.Endpoint. func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { - f, err := epsocket.ConvertShutdown(how) + f, err := netstack.ConvertShutdown(how) if err != nil { return err } @@ -546,7 +549,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags var from linux.SockAddr var fromLen uint32 if r.From != nil && len([]byte(r.From.Addr)) != 0 { - from, fromLen = epsocket.ConvertAddress(linux.AF_UNIX, *r.From) + from, fromLen = netstack.ConvertAddress(linux.AF_UNIX, *r.From) } if r.ControlTrunc { @@ -581,7 +584,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags var from linux.SockAddr var fromLen uint32 if r.From != nil { - from, fromLen = epsocket.ConvertAddress(linux.AF_UNIX, *r.From) + from, fromLen = netstack.ConvertAddress(linux.AF_UNIX, *r.From) } if r.ControlTrunc { @@ -595,7 +598,8 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags total += n } - if err != nil || !waitAll || isPacket || n >= dst.NumBytes() { + streamPeerClosed := s.stype == linux.SOCK_STREAM && n == 0 && err == nil + if err != nil || !waitAll || isPacket || n >= dst.NumBytes() || streamPeerClosed { if total > 0 { err = nil } diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index 445d25010..d46421199 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -1,5 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +load("@rules_cc//cc:defs.bzl", "cc_proto_library") package(licenses = ["notice"]) @@ -13,6 +14,7 @@ go_library( "open.go", "poll.go", "ptrace.go", + "select.go", "signal.go", "socket.go", "strace.go", @@ -31,8 +33,8 @@ go_library( "//pkg/sentry/arch", "//pkg/sentry/kernel", "//pkg/sentry/socket/control", - "//pkg/sentry/socket/epsocket", "//pkg/sentry/socket/netlink", + "//pkg/sentry/socket/netstack", "//pkg/sentry/syscalls/linux", "//pkg/sentry/usermem", ], @@ -44,6 +46,12 @@ proto_library( visibility = ["//visibility:public"], ) +cc_proto_library( + name = "strace_cc_proto", + visibility = ["//visibility:public"], + deps = [":strace_proto"], +) + go_proto_library( name = "strace_go_proto", importpath = "gvisor.dev/gvisor/pkg/sentry/strace/strace_go_proto", diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go index 3650fd6e1..e603f858f 100644 --- a/pkg/sentry/strace/linux64.go +++ b/pkg/sentry/strace/linux64.go @@ -40,7 +40,7 @@ var linuxAMD64 = SyscallMap{ 20: makeSyscallInfo("writev", FD, WriteIOVec, Hex), 21: makeSyscallInfo("access", Path, Oct), 22: makeSyscallInfo("pipe", PipeFDs), - 23: makeSyscallInfo("select", Hex, Hex, Hex, Hex, Timeval), + 23: makeSyscallInfo("select", Hex, SelectFDSet, SelectFDSet, SelectFDSet, Timeval), 24: makeSyscallInfo("sched_yield"), 25: makeSyscallInfo("mremap", Hex, Hex, Hex, Hex, Hex), 26: makeSyscallInfo("msync", Hex, Hex, Hex), @@ -287,7 +287,7 @@ var linuxAMD64 = SyscallMap{ 267: makeSyscallInfo("readlinkat", FD, Path, ReadBuffer, Hex), 268: makeSyscallInfo("fchmodat", FD, Path, Mode), 269: makeSyscallInfo("faccessat", FD, Path, Oct, Hex), - 270: makeSyscallInfo("pselect6", Hex, Hex, Hex, Hex, Hex, Hex), + 270: makeSyscallInfo("pselect6", Hex, SelectFDSet, SelectFDSet, SelectFDSet, Timespec, SigSet), 271: makeSyscallInfo("ppoll", PollFDs, Hex, Timespec, SigSet, Hex), 272: makeSyscallInfo("unshare", CloneFlags), 273: makeSyscallInfo("set_robust_list", Hex, Hex), @@ -335,4 +335,33 @@ var linuxAMD64 = SyscallMap{ 315: makeSyscallInfo("sched_getattr", Hex, Hex, Hex), 316: makeSyscallInfo("renameat2", FD, Path, Hex, Path, Hex), 317: makeSyscallInfo("seccomp", Hex, Hex, Hex), + 318: makeSyscallInfo("getrandom", Hex, Hex, Hex), + 319: makeSyscallInfo("memfd_create", Path, Hex), // Not quite a path, but close. + 320: makeSyscallInfo("kexec_file_load", FD, FD, Hex, Hex, Hex), + 321: makeSyscallInfo("bpf", Hex, Hex, Hex), + 322: makeSyscallInfo("execveat", FD, Path, ExecveStringVector, ExecveStringVector, Hex), + 323: makeSyscallInfo("userfaultfd", Hex), + 324: makeSyscallInfo("membarrier", Hex, Hex), + 325: makeSyscallInfo("mlock2", Hex, Hex, Hex), + 326: makeSyscallInfo("copy_file_range", FD, Hex, FD, Hex, Hex, Hex), + 327: makeSyscallInfo("preadv2", FD, ReadIOVec, Hex, Hex, Hex), + 328: makeSyscallInfo("pwritev2", FD, WriteIOVec, Hex, Hex, Hex), + 329: makeSyscallInfo("pkey_mprotect", Hex, Hex, Hex, Hex), + 330: makeSyscallInfo("pkey_alloc", Hex, Hex), + 331: makeSyscallInfo("pkey_free", Hex), + 332: makeSyscallInfo("statx", FD, Path, Hex, Hex, Hex), + 333: makeSyscallInfo("io_pgetevents", Hex, Hex, Hex, Hex, Timespec, SigSet), + 334: makeSyscallInfo("rseq", Hex, Hex, Hex, Hex), + 424: makeSyscallInfo("pidfd_send_signal", FD, Signal, Hex, Hex), + 425: makeSyscallInfo("io_uring_setup", Hex, Hex), + 426: makeSyscallInfo("io_uring_enter", FD, Hex, Hex, Hex, SigSet, Hex), + 427: makeSyscallInfo("io_uring_register", FD, Hex, Hex, Hex), + 428: makeSyscallInfo("open_tree", FD, Path, Hex), + 429: makeSyscallInfo("move_mount", FD, Path, FD, Path, Hex), + 430: makeSyscallInfo("fsopen", Path, Hex), // Not quite a path, but close. + 431: makeSyscallInfo("fsconfig", FD, Hex, Hex, Hex, Hex), + 432: makeSyscallInfo("fsmount", FD, Hex, Hex), + 433: makeSyscallInfo("fspick", FD, Path, Hex), + 434: makeSyscallInfo("pidfd_open", Hex, Hex), + 435: makeSyscallInfo("clone3", Hex, Hex), } diff --git a/pkg/sentry/strace/select.go b/pkg/sentry/strace/select.go new file mode 100644 index 000000000..dea309fda --- /dev/null +++ b/pkg/sentry/strace/select.go @@ -0,0 +1,53 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package strace + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +func fdsFromSet(t *kernel.Task, set []byte) []int { + var fds []int + // Append n if the n-th bit is 1. + for i, v := range set { + for j := 0; j < 8; j++ { + if (v>>j)&1 == 1 { + fds = append(fds, i*8+j) + } + } + } + return fds +} + +func fdSet(t *kernel.Task, nfds int, addr usermem.Addr) string { + if addr == 0 { + return "null" + } + + // Calculate the size of the fd set (one bit per fd). + nBytes := (nfds + 7) / 8 + nBitsInLastPartialByte := nfds % 8 + + set, err := linux.CopyInFDSet(t, addr, nBytes, nBitsInLastPartialByte) + if err != nil { + return fmt.Sprintf("%#x (error decoding fdset: %s)", addr, err) + } + + return fmt.Sprintf("%#x %v", addr, fdsFromSet(t, set)) +} diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index f779186ad..51f2efb39 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/control" - "gvisor.dev/gvisor/pkg/sentry/socket/epsocket" "gvisor.dev/gvisor/pkg/sentry/socket/netlink" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/usermem" ) @@ -208,6 +208,15 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) i += linux.SizeOfControlMessageHeader width := t.Arch().Width() length := int(h.Length) - linux.SizeOfControlMessageHeader + if length < 0 { + strs = append(strs, fmt.Sprintf( + "{level=%s, type=%s, length=%d, content too short}", + level, + typ, + h.Length, + )) + break + } if skipData { strs = append(strs, fmt.Sprintf("{level=%s, type=%s, length=%d}", level, typ, h.Length)) @@ -332,7 +341,7 @@ func sockAddr(t *kernel.Task, addr usermem.Addr, length uint32) string { switch family { case linux.AF_INET, linux.AF_INET6, linux.AF_UNIX: - fa, _, err := epsocket.AddressAndFamily(int(family), b, true /* strict */) + fa, _, err := netstack.AddressAndFamily(int(family), b, true /* strict */) if err != nil { return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err) } diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 311389547..629c1f308 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -439,6 +439,8 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo output = append(output, capData(t, args[arg-1].Pointer(), args[arg].Pointer())) case PollFDs: output = append(output, pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), false)) + case SelectFDSet: + output = append(output, fdSet(t, int(args[0].Int()), args[arg].Pointer())) case Oct: output = append(output, "0o"+strconv.FormatUint(args[arg].Uint64(), 8)) case Hex: diff --git a/pkg/sentry/strace/strace.proto b/pkg/sentry/strace/strace.proto index 4b2f73a5f..906c52c51 100644 --- a/pkg/sentry/strace/strace.proto +++ b/pkg/sentry/strace/strace.proto @@ -32,8 +32,7 @@ message Strace { } } -message StraceEnter { -} +message StraceEnter {} message StraceExit { // Return value formatted as string. diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index 3c389d375..e5d486c4e 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -206,6 +206,10 @@ const ( // PollFDs is an array of struct pollfd. The number of entries in the // array is in the next argument. PollFDs + + // SelectFDSet is an fd_set argument in select(2)/pselect(2). The number of + // fds represented must be the first argument. + SelectFDSet ) // defaultFormat is the syscall argument format to use if the actual format is diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 33a40b9c6..6766ba587 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -1,13 +1,15 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "linux", srcs = [ "error.go", "flags.go", "linux64.go", + "linux64_amd64.go", + "linux64_arm64.go", "sigset.go", "sys_aio.go", "sys_capability.go", @@ -47,6 +49,7 @@ go_library( "sys_tls.go", "sys_utsname.go", "sys_write.go", + "sys_xattr.go", "timespec.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/syscalls/linux", @@ -74,8 +77,10 @@ go_library( "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/sched", "//pkg/sentry/kernel/shm", + "//pkg/sentry/kernel/signalfd", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", + "//pkg/sentry/loader", "//pkg/sentry/memmap", "//pkg/sentry/mm", "//pkg/sentry/safemem", diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go index 444f2b004..07961dad9 100644 --- a/pkg/sentry/syscalls/linux/flags.go +++ b/pkg/sentry/syscalls/linux/flags.go @@ -50,5 +50,6 @@ func linuxToFlags(mask uint) fs.FileFlags { Directory: mask&linux.O_DIRECTORY != 0, Async: mask&linux.O_ASYNC != 0, LargeFile: mask&linux.O_LARGEFILE != 0, + Truncate: mask&linux.O_TRUNC != 0, } } diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index ed996ba51..68589a377 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,377 +15,13 @@ // Package linux provides syscall tables for amd64 Linux. package linux -import ( - "gvisor.dev/gvisor/pkg/abi" - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/syscalls" - "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/syserror" -) - -// AUDIT_ARCH_X86_64 identifies the Linux syscall API on AMD64, and is taken -// from <linux/audit.h>. -const _AUDIT_ARCH_X86_64 = 0xc000003e +const ( + // LinuxSysname is the OS name advertised by gVisor. + LinuxSysname = "Linux" -// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall -// numbers from Linux 4.4. -var AMD64 = &kernel.SyscallTable{ - OS: abi.Linux, - Arch: arch.AMD64, - Version: kernel.Version{ - // Version 4.4 is chosen as a stable, longterm version of Linux, which - // guides the interface provided by this syscall table. The build - // version is that for a clean build with default kernel config, at 5 - // minutes after v4.4 was tagged. - Sysname: "Linux", - Release: "4.4", - Version: "#1 SMP Sun Jan 10 15:06:54 PST 2016", - }, - AuditNumber: _AUDIT_ARCH_X86_64, - Table: map[uintptr]kernel.Syscall{ - 0: syscalls.Supported("read", Read), - 1: syscalls.Supported("write", Write), - 2: syscalls.PartiallySupported("open", Open, "Options O_DIRECT, O_NOATIME, O_PATH, O_TMPFILE, O_SYNC are not supported.", nil), - 3: syscalls.Supported("close", Close), - 4: syscalls.Supported("stat", Stat), - 5: syscalls.Supported("fstat", Fstat), - 6: syscalls.Supported("lstat", Lstat), - 7: syscalls.Supported("poll", Poll), - 8: syscalls.Supported("lseek", Lseek), - 9: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil), - 10: syscalls.Supported("mprotect", Mprotect), - 11: syscalls.Supported("munmap", Munmap), - 12: syscalls.Supported("brk", Brk), - 13: syscalls.Supported("rt_sigaction", RtSigaction), - 14: syscalls.Supported("rt_sigprocmask", RtSigprocmask), - 15: syscalls.Supported("rt_sigreturn", RtSigreturn), - 16: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil), - 17: syscalls.Supported("pread64", Pread64), - 18: syscalls.Supported("pwrite64", Pwrite64), - 19: syscalls.Supported("readv", Readv), - 20: syscalls.Supported("writev", Writev), - 21: syscalls.Supported("access", Access), - 22: syscalls.Supported("pipe", Pipe), - 23: syscalls.Supported("select", Select), - 24: syscalls.Supported("sched_yield", SchedYield), - 25: syscalls.Supported("mremap", Mremap), - 26: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil), - 27: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil), - 28: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), - 29: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), - 30: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil), - 31: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), - 32: syscalls.Supported("dup", Dup), - 33: syscalls.Supported("dup2", Dup2), - 34: syscalls.Supported("pause", Pause), - 35: syscalls.Supported("nanosleep", Nanosleep), - 36: syscalls.Supported("getitimer", Getitimer), - 37: syscalls.Supported("alarm", Alarm), - 38: syscalls.Supported("setitimer", Setitimer), - 39: syscalls.Supported("getpid", Getpid), - 40: syscalls.Supported("sendfile", Sendfile), - 41: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil), - 42: syscalls.Supported("connect", Connect), - 43: syscalls.Supported("accept", Accept), - 44: syscalls.Supported("sendto", SendTo), - 45: syscalls.Supported("recvfrom", RecvFrom), - 46: syscalls.Supported("sendmsg", SendMsg), - 47: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil), - 48: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil), - 49: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil), - 50: syscalls.Supported("listen", Listen), - 51: syscalls.Supported("getsockname", GetSockName), - 52: syscalls.Supported("getpeername", GetPeerName), - 53: syscalls.Supported("socketpair", SocketPair), - 54: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil), - 55: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil), - 56: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), - 57: syscalls.Supported("fork", Fork), - 58: syscalls.Supported("vfork", Vfork), - 59: syscalls.Supported("execve", Execve), - 60: syscalls.Supported("exit", Exit), - 61: syscalls.Supported("wait4", Wait4), - 62: syscalls.Supported("kill", Kill), - 63: syscalls.Supported("uname", Uname), - 64: syscalls.Supported("semget", Semget), - 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), - 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), - 67: syscalls.Supported("shmdt", Shmdt), - 68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 70: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 71: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), - 73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), - 74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), - 75: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil), - 76: syscalls.Supported("truncate", Truncate), - 77: syscalls.Supported("ftruncate", Ftruncate), - 78: syscalls.Supported("getdents", Getdents), - 79: syscalls.Supported("getcwd", Getcwd), - 80: syscalls.Supported("chdir", Chdir), - 81: syscalls.Supported("fchdir", Fchdir), - 82: syscalls.Supported("rename", Rename), - 83: syscalls.Supported("mkdir", Mkdir), - 84: syscalls.Supported("rmdir", Rmdir), - 85: syscalls.Supported("creat", Creat), - 86: syscalls.Supported("link", Link), - 87: syscalls.Supported("unlink", Unlink), - 88: syscalls.Supported("symlink", Symlink), - 89: syscalls.Supported("readlink", Readlink), - 90: syscalls.Supported("chmod", Chmod), - 91: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil), - 92: syscalls.Supported("chown", Chown), - 93: syscalls.Supported("fchown", Fchown), - 94: syscalls.Supported("lchown", Lchown), - 95: syscalls.Supported("umask", Umask), - 96: syscalls.Supported("gettimeofday", Gettimeofday), - 97: syscalls.Supported("getrlimit", Getrlimit), - 98: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil), - 99: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), - 100: syscalls.Supported("times", Times), - 101: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil), - 102: syscalls.Supported("getuid", Getuid), - 103: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil), - 104: syscalls.Supported("getgid", Getgid), - 105: syscalls.Supported("setuid", Setuid), - 106: syscalls.Supported("setgid", Setgid), - 107: syscalls.Supported("geteuid", Geteuid), - 108: syscalls.Supported("getegid", Getegid), - 109: syscalls.Supported("setpgid", Setpgid), - 110: syscalls.Supported("getppid", Getppid), - 111: syscalls.Supported("getpgrp", Getpgrp), - 112: syscalls.Supported("setsid", Setsid), - 113: syscalls.Supported("setreuid", Setreuid), - 114: syscalls.Supported("setregid", Setregid), - 115: syscalls.Supported("getgroups", Getgroups), - 116: syscalls.Supported("setgroups", Setgroups), - 117: syscalls.Supported("setresuid", Setresuid), - 118: syscalls.Supported("getresuid", Getresuid), - 119: syscalls.Supported("setresgid", Setresgid), - 120: syscalls.Supported("getresgid", Getresgid), - 121: syscalls.Supported("getpgid", Getpgid), - 122: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 123: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 124: syscalls.Supported("getsid", Getsid), - 125: syscalls.Supported("capget", Capget), - 126: syscalls.Supported("capset", Capset), - 127: syscalls.Supported("rt_sigpending", RtSigpending), - 128: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait), - 129: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo), - 130: syscalls.Supported("rt_sigsuspend", RtSigsuspend), - 131: syscalls.Supported("sigaltstack", Sigaltstack), - 132: syscalls.Supported("utime", Utime), - 133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil), - 134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil), - 135: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), - 136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil), - 137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), - 138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), - 139: syscalls.ErrorWithEvent("sysfs", syserror.ENOSYS, "", []string{"gvisor.dev/issue/165"}), - 140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), - 141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), - 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), - 143: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil), - 144: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil), - 145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), - 146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), - 147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), - 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), - 149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), - 154: syscalls.Error("modify_ldt", syserror.EPERM, "", nil), - 155: syscalls.Error("pivot_root", syserror.EPERM, "", nil), - 156: syscalls.Error("sysctl", syserror.EPERM, "Deprecated. Use /proc/sys instead.", nil), - 157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), - 158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil), - 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), - 160: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil), - 161: syscalls.Supported("chroot", Chroot), - 162: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil), - 163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), - 164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), - 165: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), - 166: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), - 167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), - 168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), - 169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), - 170: syscalls.Supported("sethostname", Sethostname), - 171: syscalls.Supported("setdomainname", Setdomainname), - 172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil), - 173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil), - 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil), - 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), - 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), - 177: syscalls.Error("get_kernel_syms", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), - 178: syscalls.Error("query_module", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), - 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations - 180: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), - 181: syscalls.Error("getpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), - 182: syscalls.Error("putpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), - 183: syscalls.Error("afs_syscall", syserror.ENOSYS, "Not implemented in Linux.", nil), - 184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil), - 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil), - 186: syscalls.Supported("gettid", Gettid), - 187: syscalls.ErrorWithEvent("readahead", syserror.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341) - 188: syscalls.Error("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 200: syscalls.Supported("tkill", Tkill), - 201: syscalls.Supported("time", Time), - 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), - 203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), - 204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), - 205: syscalls.Error("set_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), - 206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 211: syscalls.Error("get_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), - 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), - 213: syscalls.Supported("epoll_create", EpollCreate), - 214: syscalls.ErrorWithEvent("epoll_ctl_old", syserror.ENOSYS, "Deprecated.", nil), - 215: syscalls.ErrorWithEvent("epoll_wait_old", syserror.ENOSYS, "Deprecated.", nil), - 216: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), - 217: syscalls.Supported("getdents64", Getdents64), - 218: syscalls.Supported("set_tid_address", SetTidAddress), - 219: syscalls.Supported("restart_syscall", RestartSyscall), - 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920) - 221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), - 222: syscalls.Supported("timer_create", TimerCreate), - 223: syscalls.Supported("timer_settime", TimerSettime), - 224: syscalls.Supported("timer_gettime", TimerGettime), - 225: syscalls.Supported("timer_getoverrun", TimerGetoverrun), - 226: syscalls.Supported("timer_delete", TimerDelete), - 227: syscalls.Supported("clock_settime", ClockSettime), - 228: syscalls.Supported("clock_gettime", ClockGettime), - 229: syscalls.Supported("clock_getres", ClockGetres), - 230: syscalls.Supported("clock_nanosleep", ClockNanosleep), - 231: syscalls.Supported("exit_group", ExitGroup), - 232: syscalls.Supported("epoll_wait", EpollWait), - 233: syscalls.Supported("epoll_ctl", EpollCtl), - 234: syscalls.Supported("tgkill", Tgkill), - 235: syscalls.Supported("utimes", Utimes), - 236: syscalls.Error("vserver", syserror.ENOSYS, "Not implemented by Linux", nil), - 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), - 238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), - 239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), - 240: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 241: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 242: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 243: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 244: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), - 247: syscalls.Supported("waitid", Waitid), - 248: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), - 249: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), - 250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), - 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) - 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) - 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil), - 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), - 255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), - 256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), - 257: syscalls.Supported("openat", Openat), - 258: syscalls.Supported("mkdirat", Mkdirat), - 259: syscalls.Supported("mknodat", Mknodat), - 260: syscalls.Supported("fchownat", Fchownat), - 261: syscalls.Supported("futimesat", Futimesat), - 262: syscalls.Supported("fstatat", Fstatat), - 263: syscalls.Supported("unlinkat", Unlinkat), - 264: syscalls.Supported("renameat", Renameat), - 265: syscalls.Supported("linkat", Linkat), - 266: syscalls.Supported("symlinkat", Symlinkat), - 267: syscalls.Supported("readlinkat", Readlinkat), - 268: syscalls.Supported("fchmodat", Fchmodat), - 269: syscalls.Supported("faccessat", Faccessat), - 270: syscalls.Supported("pselect", Pselect), - 271: syscalls.Supported("ppoll", Ppoll), - 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), - 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), - 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), - 275: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 276: syscalls.ErrorWithEvent("tee", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), - 278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) - 280: syscalls.Supported("utimensat", Utimensat), - 281: syscalls.Supported("epoll_pwait", EpollPwait), - 282: syscalls.ErrorWithEvent("signalfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) - 283: syscalls.Supported("timerfd_create", TimerfdCreate), - 284: syscalls.Supported("eventfd", Eventfd), - 285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), - 286: syscalls.Supported("timerfd_settime", TimerfdSettime), - 287: syscalls.Supported("timerfd_gettime", TimerfdGettime), - 288: syscalls.Supported("accept4", Accept4), - 289: syscalls.ErrorWithEvent("signalfd4", syserror.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) - 290: syscalls.Supported("eventfd2", Eventfd2), - 291: syscalls.Supported("epoll_create1", EpollCreate1), - 292: syscalls.Supported("dup3", Dup3), - 293: syscalls.Supported("pipe2", Pipe2), - 294: syscalls.Supported("inotify_init1", InotifyInit1), - 295: syscalls.Supported("preadv", Preadv), - 296: syscalls.Supported("pwritev", Pwritev), - 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), - 298: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), - 299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), - 300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 302: syscalls.Supported("prlimit64", Prlimit64), - 303: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), - 304: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), - 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), - 306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), - 307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), - 308: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) - 309: syscalls.Supported("getcpu", Getcpu), - 310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), - 311: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), - 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), - 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), - 314: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 315: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 316: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) - 317: syscalls.Supported("seccomp", Seccomp), - 318: syscalls.Supported("getrandom", GetRandom), - 319: syscalls.Supported("memfd_create", MemfdCreate), - 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil), - 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), - 322: syscalls.ErrorWithEvent("execveat", syserror.ENOSYS, "", []string{"gvisor.dev/issue/265"}), // TODO(b/118901836) - 323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) - 324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(b/118904897) - 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + // LinuxRelease is the Linux release version number advertised by gVisor. + LinuxRelease = "4.4.0" - // Syscalls after 325 are "backports" from versions of Linux after 4.4. - 326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), - 327: syscalls.Supported("preadv2", Preadv2), - 328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), - 332: syscalls.Supported("statx", Statx), - }, - - Emulate: map[usermem.Addr]uintptr{ - 0xffffffffff600000: 96, // vsyscall gettimeofday(2) - 0xffffffffff600400: 201, // vsyscall time(2) - 0xffffffffff600800: 309, // vsyscall getcpu(2) - }, - Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { - t.Kernel().EmitUnimplementedEvent(t) - return 0, syserror.ENOSYS - }, -} + // LinuxVersion is the version info advertised by gVisor. + LinuxVersion = "#1 SMP Sun Jan 10 15:06:54 PST 2016" +) diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go new file mode 100644 index 000000000..272ae9991 --- /dev/null +++ b/pkg/sentry/syscalls/linux/linux64_amd64.go @@ -0,0 +1,406 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" +) + +// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall +// numbers from Linux 4.4. +var AMD64 = &kernel.SyscallTable{ + OS: abi.Linux, + Arch: arch.AMD64, + Version: kernel.Version{ + // Version 4.4 is chosen as a stable, longterm version of Linux, which + // guides the interface provided by this syscall table. The build + // version is that for a clean build with default kernel config, at 5 + // minutes after v4.4 was tagged. + Sysname: LinuxSysname, + Release: LinuxRelease, + Version: LinuxVersion, + }, + AuditNumber: linux.AUDIT_ARCH_X86_64, + Table: map[uintptr]kernel.Syscall{ + 0: syscalls.Supported("read", Read), + 1: syscalls.Supported("write", Write), + 2: syscalls.PartiallySupported("open", Open, "Options O_DIRECT, O_NOATIME, O_PATH, O_TMPFILE, O_SYNC are not supported.", nil), + 3: syscalls.Supported("close", Close), + 4: syscalls.Supported("stat", Stat), + 5: syscalls.Supported("fstat", Fstat), + 6: syscalls.Supported("lstat", Lstat), + 7: syscalls.Supported("poll", Poll), + 8: syscalls.Supported("lseek", Lseek), + 9: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil), + 10: syscalls.Supported("mprotect", Mprotect), + 11: syscalls.Supported("munmap", Munmap), + 12: syscalls.Supported("brk", Brk), + 13: syscalls.Supported("rt_sigaction", RtSigaction), + 14: syscalls.Supported("rt_sigprocmask", RtSigprocmask), + 15: syscalls.Supported("rt_sigreturn", RtSigreturn), + 16: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil), + 17: syscalls.Supported("pread64", Pread64), + 18: syscalls.Supported("pwrite64", Pwrite64), + 19: syscalls.Supported("readv", Readv), + 20: syscalls.Supported("writev", Writev), + 21: syscalls.Supported("access", Access), + 22: syscalls.Supported("pipe", Pipe), + 23: syscalls.Supported("select", Select), + 24: syscalls.Supported("sched_yield", SchedYield), + 25: syscalls.Supported("mremap", Mremap), + 26: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil), + 27: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil), + 28: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), + 29: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), + 30: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil), + 31: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), + 32: syscalls.Supported("dup", Dup), + 33: syscalls.Supported("dup2", Dup2), + 34: syscalls.Supported("pause", Pause), + 35: syscalls.Supported("nanosleep", Nanosleep), + 36: syscalls.Supported("getitimer", Getitimer), + 37: syscalls.Supported("alarm", Alarm), + 38: syscalls.Supported("setitimer", Setitimer), + 39: syscalls.Supported("getpid", Getpid), + 40: syscalls.Supported("sendfile", Sendfile), + 41: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil), + 42: syscalls.Supported("connect", Connect), + 43: syscalls.Supported("accept", Accept), + 44: syscalls.Supported("sendto", SendTo), + 45: syscalls.Supported("recvfrom", RecvFrom), + 46: syscalls.Supported("sendmsg", SendMsg), + 47: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil), + 48: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil), + 49: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil), + 50: syscalls.Supported("listen", Listen), + 51: syscalls.Supported("getsockname", GetSockName), + 52: syscalls.Supported("getpeername", GetPeerName), + 53: syscalls.Supported("socketpair", SocketPair), + 54: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil), + 55: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil), + 56: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), + 57: syscalls.Supported("fork", Fork), + 58: syscalls.Supported("vfork", Vfork), + 59: syscalls.Supported("execve", Execve), + 60: syscalls.Supported("exit", Exit), + 61: syscalls.Supported("wait4", Wait4), + 62: syscalls.Supported("kill", Kill), + 63: syscalls.Supported("uname", Uname), + 64: syscalls.Supported("semget", Semget), + 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), + 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), + 67: syscalls.Supported("shmdt", Shmdt), + 68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 70: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 71: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), + 73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), + 74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), + 75: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil), + 76: syscalls.Supported("truncate", Truncate), + 77: syscalls.Supported("ftruncate", Ftruncate), + 78: syscalls.Supported("getdents", Getdents), + 79: syscalls.Supported("getcwd", Getcwd), + 80: syscalls.Supported("chdir", Chdir), + 81: syscalls.Supported("fchdir", Fchdir), + 82: syscalls.Supported("rename", Rename), + 83: syscalls.Supported("mkdir", Mkdir), + 84: syscalls.Supported("rmdir", Rmdir), + 85: syscalls.Supported("creat", Creat), + 86: syscalls.Supported("link", Link), + 87: syscalls.Supported("unlink", Unlink), + 88: syscalls.Supported("symlink", Symlink), + 89: syscalls.Supported("readlink", Readlink), + 90: syscalls.Supported("chmod", Chmod), + 91: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil), + 92: syscalls.Supported("chown", Chown), + 93: syscalls.Supported("fchown", Fchown), + 94: syscalls.Supported("lchown", Lchown), + 95: syscalls.Supported("umask", Umask), + 96: syscalls.Supported("gettimeofday", Gettimeofday), + 97: syscalls.Supported("getrlimit", Getrlimit), + 98: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil), + 99: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), + 100: syscalls.Supported("times", Times), + 101: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil), + 102: syscalls.Supported("getuid", Getuid), + 103: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil), + 104: syscalls.Supported("getgid", Getgid), + 105: syscalls.Supported("setuid", Setuid), + 106: syscalls.Supported("setgid", Setgid), + 107: syscalls.Supported("geteuid", Geteuid), + 108: syscalls.Supported("getegid", Getegid), + 109: syscalls.Supported("setpgid", Setpgid), + 110: syscalls.Supported("getppid", Getppid), + 111: syscalls.Supported("getpgrp", Getpgrp), + 112: syscalls.Supported("setsid", Setsid), + 113: syscalls.Supported("setreuid", Setreuid), + 114: syscalls.Supported("setregid", Setregid), + 115: syscalls.Supported("getgroups", Getgroups), + 116: syscalls.Supported("setgroups", Setgroups), + 117: syscalls.Supported("setresuid", Setresuid), + 118: syscalls.Supported("getresuid", Getresuid), + 119: syscalls.Supported("setresgid", Setresgid), + 120: syscalls.Supported("getresgid", Getresgid), + 121: syscalls.Supported("getpgid", Getpgid), + 122: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 123: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 124: syscalls.Supported("getsid", Getsid), + 125: syscalls.Supported("capget", Capget), + 126: syscalls.Supported("capset", Capset), + 127: syscalls.Supported("rt_sigpending", RtSigpending), + 128: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait), + 129: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo), + 130: syscalls.Supported("rt_sigsuspend", RtSigsuspend), + 131: syscalls.Supported("sigaltstack", Sigaltstack), + 132: syscalls.Supported("utime", Utime), + 133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil), + 134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil), + 135: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), + 136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil), + 137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), + 138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), + 139: syscalls.ErrorWithEvent("sysfs", syserror.ENOSYS, "", []string{"gvisor.dev/issue/165"}), + 140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), + 141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), + 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), + 143: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil), + 144: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil), + 145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), + 146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), + 147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), + 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), + 149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), + 154: syscalls.Error("modify_ldt", syserror.EPERM, "", nil), + 155: syscalls.Error("pivot_root", syserror.EPERM, "", nil), + 156: syscalls.Error("sysctl", syserror.EPERM, "Deprecated. Use /proc/sys instead.", nil), + 157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), + 158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil), + 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), + 160: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil), + 161: syscalls.Supported("chroot", Chroot), + 162: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil), + 163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), + 164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), + 165: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), + 166: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), + 167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), + 168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), + 169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), + 170: syscalls.Supported("sethostname", Sethostname), + 171: syscalls.Supported("setdomainname", Setdomainname), + 172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil), + 173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil), + 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil), + 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), + 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), + 177: syscalls.Error("get_kernel_syms", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), + 178: syscalls.Error("query_module", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), + 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations + 180: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), + 181: syscalls.Error("getpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), + 182: syscalls.Error("putpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), + 183: syscalls.Error("afs_syscall", syserror.ENOSYS, "Not implemented in Linux.", nil), + 184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil), + 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil), + 186: syscalls.Supported("gettid", Gettid), + 187: syscalls.Supported("readahead", Readahead), + 188: syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil), + 189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 191: syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil), + 192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 200: syscalls.Supported("tkill", Tkill), + 201: syscalls.Supported("time", Time), + 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), + 203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), + 204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), + 205: syscalls.Error("set_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 211: syscalls.Error("get_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), + 213: syscalls.Supported("epoll_create", EpollCreate), + 214: syscalls.ErrorWithEvent("epoll_ctl_old", syserror.ENOSYS, "Deprecated.", nil), + 215: syscalls.ErrorWithEvent("epoll_wait_old", syserror.ENOSYS, "Deprecated.", nil), + 216: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), + 217: syscalls.Supported("getdents64", Getdents64), + 218: syscalls.Supported("set_tid_address", SetTidAddress), + 219: syscalls.Supported("restart_syscall", RestartSyscall), + 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), + 221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), + 222: syscalls.Supported("timer_create", TimerCreate), + 223: syscalls.Supported("timer_settime", TimerSettime), + 224: syscalls.Supported("timer_gettime", TimerGettime), + 225: syscalls.Supported("timer_getoverrun", TimerGetoverrun), + 226: syscalls.Supported("timer_delete", TimerDelete), + 227: syscalls.Supported("clock_settime", ClockSettime), + 228: syscalls.Supported("clock_gettime", ClockGettime), + 229: syscalls.Supported("clock_getres", ClockGetres), + 230: syscalls.Supported("clock_nanosleep", ClockNanosleep), + 231: syscalls.Supported("exit_group", ExitGroup), + 232: syscalls.Supported("epoll_wait", EpollWait), + 233: syscalls.Supported("epoll_ctl", EpollCtl), + 234: syscalls.Supported("tgkill", Tgkill), + 235: syscalls.Supported("utimes", Utimes), + 236: syscalls.Error("vserver", syserror.ENOSYS, "Not implemented by Linux", nil), + 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), + 238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), + 239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), + 240: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 241: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 242: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 243: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 244: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), + 247: syscalls.Supported("waitid", Waitid), + 248: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), + 249: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), + 250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), + 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil), + 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), + 255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), + 256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), + 257: syscalls.Supported("openat", Openat), + 258: syscalls.Supported("mkdirat", Mkdirat), + 259: syscalls.Supported("mknodat", Mknodat), + 260: syscalls.Supported("fchownat", Fchownat), + 261: syscalls.Supported("futimesat", Futimesat), + 262: syscalls.Supported("fstatat", Fstatat), + 263: syscalls.Supported("unlinkat", Unlinkat), + 264: syscalls.Supported("renameat", Renameat), + 265: syscalls.Supported("linkat", Linkat), + 266: syscalls.Supported("symlinkat", Symlinkat), + 267: syscalls.Supported("readlinkat", Readlinkat), + 268: syscalls.Supported("fchmodat", Fchmodat), + 269: syscalls.Supported("faccessat", Faccessat), + 270: syscalls.Supported("pselect", Pselect), + 271: syscalls.Supported("ppoll", Ppoll), + 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), + 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 275: syscalls.Supported("splice", Splice), + 276: syscalls.Supported("tee", Tee), + 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), + 278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) + 280: syscalls.Supported("utimensat", Utimensat), + 281: syscalls.Supported("epoll_pwait", EpollPwait), + 282: syscalls.PartiallySupported("signalfd", Signalfd, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), + 283: syscalls.Supported("timerfd_create", TimerfdCreate), + 284: syscalls.Supported("eventfd", Eventfd), + 285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), + 286: syscalls.Supported("timerfd_settime", TimerfdSettime), + 287: syscalls.Supported("timerfd_gettime", TimerfdGettime), + 288: syscalls.Supported("accept4", Accept4), + 289: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), + 290: syscalls.Supported("eventfd2", Eventfd2), + 291: syscalls.Supported("epoll_create1", EpollCreate1), + 292: syscalls.Supported("dup3", Dup3), + 293: syscalls.Supported("pipe2", Pipe2), + 294: syscalls.Supported("inotify_init1", InotifyInit1), + 295: syscalls.Supported("preadv", Preadv), + 296: syscalls.Supported("pwritev", Pwritev), + 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), + 298: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), + 299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), + 300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 302: syscalls.Supported("prlimit64", Prlimit64), + 303: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 304: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), + 306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), + 307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), + 308: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) + 309: syscalls.Supported("getcpu", Getcpu), + 310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 311: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), + 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), + 314: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 315: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 316: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) + 317: syscalls.Supported("seccomp", Seccomp), + 318: syscalls.Supported("getrandom", GetRandom), + 319: syscalls.Supported("memfd_create", MemfdCreate), + 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil), + 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), + 322: syscalls.Supported("execveat", Execveat), + 323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) + 324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) + 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + + // Syscalls implemented after 325 are "backports" from versions + // of Linux after 4.4. + 326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), + 327: syscalls.Supported("preadv2", Preadv2), + 328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), + 329: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), + 330: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), + 331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), + 332: syscalls.Supported("statx", Statx), + 333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), + 334: syscalls.ErrorWithEvent("rseq", syserror.ENOSYS, "", nil), + + // Linux skips ahead to syscall 424 to sync numbers between arches. + 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), + 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), + 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), + 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), + 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), + 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), + 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), + 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), + 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), + 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), + 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), + 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), + }, + + Emulate: map[usermem.Addr]uintptr{ + 0xffffffffff600000: 96, // vsyscall gettimeofday(2) + 0xffffffffff600400: 201, // vsyscall time(2) + 0xffffffffff600800: 309, // vsyscall getcpu(2) + }, + Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { + t.Kernel().EmitUnimplementedEvent(t) + return 0, syserror.ENOSYS + }, +} diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go new file mode 100644 index 000000000..3b584eed9 --- /dev/null +++ b/pkg/sentry/syscalls/linux/linux64_arm64.go @@ -0,0 +1,332 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" +) + +// ARM64 is a table of Linux arm64 syscall API with the corresponding syscall +// numbers from Linux 4.4. +var ARM64 = &kernel.SyscallTable{ + OS: abi.Linux, + Arch: arch.ARM64, + Version: kernel.Version{ + Sysname: LinuxSysname, + Release: LinuxRelease, + Version: LinuxVersion, + }, + AuditNumber: linux.AUDIT_ARCH_AARCH64, + Table: map[uintptr]kernel.Syscall{ + 0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 5: syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil), + 6: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 7: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 8: syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil), + 9: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 10: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 11: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 12: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 13: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 14: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 15: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 16: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 17: syscalls.Supported("getcwd", Getcwd), + 18: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), + 19: syscalls.Supported("eventfd2", Eventfd2), + 20: syscalls.Supported("epoll_create1", EpollCreate1), + 21: syscalls.Supported("epoll_ctl", EpollCtl), + 22: syscalls.Supported("epoll_pwait", EpollPwait), + 23: syscalls.Supported("dup", Dup), + 24: syscalls.Supported("dup3", Dup3), + 26: syscalls.Supported("inotify_init1", InotifyInit1), + 27: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), + 28: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), + 29: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil), + 30: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 31: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 32: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), + 33: syscalls.Supported("mknodat", Mknodat), + 34: syscalls.Supported("mkdirat", Mkdirat), + 35: syscalls.Supported("unlinkat", Unlinkat), + 36: syscalls.Supported("symlinkat", Symlinkat), + 37: syscalls.Supported("linkat", Linkat), + 38: syscalls.Supported("renameat", Renameat), + 39: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), + 40: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), + 41: syscalls.Error("pivot_root", syserror.EPERM, "", nil), + 42: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), + 44: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), + 46: syscalls.Supported("ftruncate", Ftruncate), + 47: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), + 48: syscalls.Supported("faccessat", Faccessat), + 49: syscalls.Supported("chdir", Chdir), + 50: syscalls.Supported("fchdir", Fchdir), + 51: syscalls.Supported("chroot", Chroot), + 52: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil), + 53: syscalls.Supported("fchmodat", Fchmodat), + 54: syscalls.Supported("fchownat", Fchownat), + 55: syscalls.Supported("fchown", Fchown), + 56: syscalls.Supported("openat", Openat), + 57: syscalls.Supported("close", Close), + 58: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), + 59: syscalls.Supported("pipe2", Pipe2), + 60: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations + 61: syscalls.Supported("getdents64", Getdents64), + 62: syscalls.Supported("lseek", Lseek), + 63: syscalls.Supported("read", Read), + 64: syscalls.Supported("write", Write), + 65: syscalls.Supported("readv", Readv), + 66: syscalls.Supported("writev", Writev), + 67: syscalls.Supported("pread64", Pread64), + 68: syscalls.Supported("pwrite64", Pwrite64), + 69: syscalls.Supported("preadv", Preadv), + 70: syscalls.Supported("pwritev", Pwritev), + 71: syscalls.Supported("sendfile", Sendfile), + 72: syscalls.Supported("pselect", Pselect), + 73: syscalls.Supported("ppoll", Ppoll), + 74: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), + 75: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 76: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 77: syscalls.Supported("tee", Tee), + 78: syscalls.Supported("readlinkat", Readlinkat), + 80: syscalls.Supported("fstat", Fstat), + 81: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil), + 82: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), + 83: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil), + 84: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), + 85: syscalls.Supported("timerfd_create", TimerfdCreate), + 86: syscalls.Supported("timerfd_settime", TimerfdSettime), + 87: syscalls.Supported("timerfd_gettime", TimerfdGettime), + 88: syscalls.Supported("utimensat", Utimensat), + 89: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), + 90: syscalls.Supported("capget", Capget), + 91: syscalls.Supported("capset", Capset), + 92: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), + 93: syscalls.Supported("exit", Exit), + 94: syscalls.Supported("exit_group", ExitGroup), + 95: syscalls.Supported("waitid", Waitid), + 96: syscalls.Supported("set_tid_address", SetTidAddress), + 97: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), + 98: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), + 99: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 100: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 101: syscalls.Supported("nanosleep", Nanosleep), + 102: syscalls.Supported("getitimer", Getitimer), + 103: syscalls.Supported("setitimer", Setitimer), + 104: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), + 105: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), + 106: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), + 107: syscalls.Supported("timer_create", TimerCreate), + 108: syscalls.Supported("timer_gettime", TimerGettime), + 109: syscalls.Supported("timer_getoverrun", TimerGetoverrun), + 110: syscalls.Supported("timer_settime", TimerSettime), + 111: syscalls.Supported("timer_delete", TimerDelete), + 112: syscalls.Supported("clock_settime", ClockSettime), + 113: syscalls.Supported("clock_gettime", ClockGettime), + 114: syscalls.Supported("clock_getres", ClockGetres), + 115: syscalls.Supported("clock_nanosleep", ClockNanosleep), + 116: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil), + 117: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil), + 118: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), + 119: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil), + 120: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), + 121: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil), + 122: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), + 123: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), + 124: syscalls.Supported("sched_yield", SchedYield), + 125: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), + 126: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), + 127: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), + 128: syscalls.Supported("restart_syscall", RestartSyscall), + 129: syscalls.Supported("kill", Kill), + 130: syscalls.Supported("tkill", Tkill), + 131: syscalls.Supported("tgkill", Tgkill), + 132: syscalls.Supported("sigaltstack", Sigaltstack), + 133: syscalls.Supported("rt_sigsuspend", RtSigsuspend), + 134: syscalls.Supported("rt_sigaction", RtSigaction), + 135: syscalls.Supported("rt_sigprocmask", RtSigprocmask), + 136: syscalls.Supported("rt_sigpending", RtSigpending), + 137: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait), + 138: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo), + 139: syscalls.Supported("rt_sigreturn", RtSigreturn), + 140: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), + 141: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), + 142: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), + 143: syscalls.Supported("setregid", Setregid), + 144: syscalls.Supported("setgid", Setgid), + 145: syscalls.Supported("setreuid", Setreuid), + 146: syscalls.Supported("setuid", Setuid), + 147: syscalls.Supported("setresuid", Setresuid), + 148: syscalls.Supported("getresuid", Getresuid), + 149: syscalls.Supported("setresgid", Setresgid), + 150: syscalls.Supported("getresgid", Getresgid), + 151: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 152: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 153: syscalls.Supported("times", Times), + 154: syscalls.Supported("setpgid", Setpgid), + 155: syscalls.Supported("getpgid", Getpgid), + 156: syscalls.Supported("getsid", Getsid), + 157: syscalls.Supported("setsid", Setsid), + 158: syscalls.Supported("getgroups", Getgroups), + 159: syscalls.Supported("setgroups", Setgroups), + 160: syscalls.Supported("uname", Uname), + 161: syscalls.Supported("sethostname", Sethostname), + 162: syscalls.Supported("setdomainname", Setdomainname), + 163: syscalls.Supported("getrlimit", Getrlimit), + 164: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil), + 165: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil), + 166: syscalls.Supported("umask", Umask), + 167: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), + 168: syscalls.Supported("getcpu", Getcpu), + 169: syscalls.Supported("gettimeofday", Gettimeofday), + 170: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), + 171: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), + 172: syscalls.Supported("getpid", Getpid), + 173: syscalls.Supported("getppid", Getppid), + 174: syscalls.Supported("getuid", Getuid), + 175: syscalls.Supported("geteuid", Geteuid), + 176: syscalls.Supported("getgid", Getgid), + 177: syscalls.Supported("getegid", Getegid), + 178: syscalls.Supported("gettid", Gettid), + 179: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), + 180: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 181: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 182: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 183: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 184: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 185: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 186: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 187: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 190: syscalls.Supported("semget", Semget), + 191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), + 192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), + 193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), + 194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), + 195: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), + 196: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil), + 197: syscalls.Supported("shmdt", Shmdt), + 198: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil), + 199: syscalls.Supported("socketpair", SocketPair), + 200: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil), + 201: syscalls.Supported("listen", Listen), + 202: syscalls.Supported("accept", Accept), + 203: syscalls.Supported("connect", Connect), + 204: syscalls.Supported("getsockname", GetSockName), + 205: syscalls.Supported("getpeername", GetPeerName), + 206: syscalls.Supported("sendto", SendTo), + 207: syscalls.Supported("recvfrom", RecvFrom), + 208: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil), + 209: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil), + 210: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil), + 211: syscalls.Supported("sendmsg", SendMsg), + 212: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil), + 213: syscalls.Supported("readahead", Readahead), + 214: syscalls.Supported("brk", Brk), + 215: syscalls.Supported("munmap", Munmap), + 216: syscalls.Supported("mremap", Mremap), + 217: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), + 218: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), + 219: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), + 220: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), + 221: syscalls.Supported("execve", Execve), + 224: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), + 225: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), + 226: syscalls.Supported("mprotect", Mprotect), + 227: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil), + 228: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 229: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 230: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 231: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 232: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil), + 233: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), + 234: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), + 235: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), + 236: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), + 237: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), + 238: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), + 239: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) + 240: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), + 241: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), + 242: syscalls.Supported("accept4", Accept4), + 243: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), + 260: syscalls.Supported("wait4", Wait4), + 261: syscalls.Supported("prlimit64", Prlimit64), + 262: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 263: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 264: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 265: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 266: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), + 267: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), + 268: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) + 269: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), + 270: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 271: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 272: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), + 273: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), + 274: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 275: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 276: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) + 277: syscalls.Supported("seccomp", Seccomp), + 278: syscalls.Supported("getrandom", GetRandom), + 279: syscalls.Supported("memfd_create", MemfdCreate), + 280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), + 281: syscalls.Supported("execveat", Execveat), + 282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) + 283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) + 284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), + 286: syscalls.Supported("preadv2", Preadv2), + 287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), + 288: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), + 289: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), + 290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), + 291: syscalls.Supported("statx", Statx), + 292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), + 293: syscalls.ErrorWithEvent("rseq", syserror.ENOSYS, "", nil), + + // Linux skips ahead to syscall 424 to sync numbers between arches. + 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), + 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), + 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), + 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), + 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), + 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), + 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), + 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), + 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), + 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), + 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), + 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), + }, + Emulate: map[usermem.Addr]uintptr{}, + + Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { + t.Kernel().EmitUnimplementedEvent(t) + return 0, syserror.ENOSYS + }, +} diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 2e00a91ce..9bc2445a5 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -169,10 +169,14 @@ func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uint if dirPath { return syserror.ENOTDIR } - if flags&linux.O_TRUNC != 0 { - if err := d.Inode.Truncate(t, d, 0); err != nil { - return err - } + } + + // Truncate is called when O_TRUNC is specified for any kind of + // existing Dirent. Behavior is delegated to the entry's Truncate + // implementation. + if flags&linux.O_TRUNC != 0 { + if err := d.Inode.Truncate(t, d, 0); err != nil { + return err } } @@ -396,7 +400,9 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l return err } - // Should we truncate the file? + // Truncate is called when O_TRUNC is specified for any kind of + // existing Dirent. Behavior is delegated to the entry's Truncate + // implementation. if flags&linux.O_TRUNC != 0 { if err := found.Inode.Truncate(t, found, 0); err != nil { return err @@ -834,25 +840,42 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC return uintptr(newfd), nil, nil } -func fGetOwn(t *kernel.Task, file *fs.File) int32 { +func fGetOwnEx(t *kernel.Task, file *fs.File) linux.FOwnerEx { ma := file.Async(nil) if ma == nil { - return 0 + return linux.FOwnerEx{} } a := ma.(*fasync.FileAsync) ot, otg, opg := a.Owner() switch { case ot != nil: - return int32(t.PIDNamespace().IDOfTask(ot)) + return linux.FOwnerEx{ + Type: linux.F_OWNER_TID, + PID: int32(t.PIDNamespace().IDOfTask(ot)), + } case otg != nil: - return int32(t.PIDNamespace().IDOfThreadGroup(otg)) + return linux.FOwnerEx{ + Type: linux.F_OWNER_PID, + PID: int32(t.PIDNamespace().IDOfThreadGroup(otg)), + } case opg != nil: - return int32(-t.PIDNamespace().IDOfProcessGroup(opg)) + return linux.FOwnerEx{ + Type: linux.F_OWNER_PGRP, + PID: int32(t.PIDNamespace().IDOfProcessGroup(opg)), + } default: - return 0 + return linux.FOwnerEx{} } } +func fGetOwn(t *kernel.Task, file *fs.File) int32 { + owner := fGetOwnEx(t, file) + if owner.Type == linux.F_OWNER_PGRP { + return -owner.PID + } + return owner.PID +} + // fSetOwn sets the file's owner with the semantics of F_SETOWN in Linux. // // If who is positive, it represents a PID. If negative, it represents a PGID. @@ -895,11 +918,13 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall t.FDTable().SetFlags(fd, kernel.FDFlags{ CloseOnExec: flags&linux.FD_CLOEXEC != 0, }) + return 0, nil, nil case linux.F_GETFL: return uintptr(file.Flags().ToLinux()), nil, nil case linux.F_SETFL: flags := uint(args[2].Uint()) file.SetFlags(linuxToFlags(flags).Settable()) + return 0, nil, nil case linux.F_SETLK, linux.F_SETLKW: // In Linux the file system can choose to provide lock operations for an inode. // Normally pipe and socket types lack lock operations. We diverge and use a heavy @@ -1002,6 +1027,44 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_SETOWN: fSetOwn(t, file, args[2].Int()) return 0, nil, nil + case linux.F_GETOWN_EX: + addr := args[2].Pointer() + owner := fGetOwnEx(t, file) + _, err := t.CopyOut(addr, &owner) + return 0, nil, err + case linux.F_SETOWN_EX: + addr := args[2].Pointer() + var owner linux.FOwnerEx + n, err := t.CopyIn(addr, &owner) + if err != nil { + return 0, nil, err + } + a := file.Async(fasync.New).(*fasync.FileAsync) + switch owner.Type { + case linux.F_OWNER_TID: + task := t.PIDNamespace().TaskWithID(kernel.ThreadID(owner.PID)) + if task == nil { + return 0, nil, syserror.ESRCH + } + a.SetOwnerTask(t, task) + return uintptr(n), nil, nil + case linux.F_OWNER_PID: + tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID)) + if tg == nil { + return 0, nil, syserror.ESRCH + } + a.SetOwnerThreadGroup(t, tg) + return uintptr(n), nil, nil + case linux.F_OWNER_PGRP: + pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID)) + if pg == nil { + return 0, nil, syserror.ESRCH + } + a.SetOwnerProcessGroup(t, pg) + return uintptr(n), nil, nil + default: + return 0, nil, syserror.EINVAL + } case linux.F_GET_SEALS: val, err := tmpfs.GetSeals(file.Dirent.Inode) return uintptr(val), nil, err @@ -1029,7 +1092,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Everything else is not yet supported. return 0, nil, syserror.EINVAL } - return 0, nil, nil } const ( @@ -1423,9 +1485,6 @@ func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error { if err != nil { return err } - if dirPath { - return syserror.ENOENT - } return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { if !fs.IsDir(d.Inode.StableAttr) { @@ -1436,7 +1495,7 @@ func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error { return err } - return d.Remove(t, root, name) + return d.Remove(t, root, name, dirPath) }) } @@ -1486,6 +1545,8 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if fs.IsDir(d.Inode.StableAttr) { return syserror.EISDIR } + // In contrast to open(O_TRUNC), truncate(2) is only valid for file + // types. if !fs.IsFile(d.Inode.StableAttr) { return syserror.EINVAL } @@ -1524,7 +1585,8 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, syserror.EINVAL } - // Note that this is different from truncate(2) above, where a + // In contrast to open(O_TRUNC), truncate(2) is only valid for file + // types. Note that this is different from truncate(2) above, where a // directory returns EISDIR. if !fs.IsFile(file.Dirent.Inode.StableAttr) { return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index 7a13beac2..2b2df989a 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -197,53 +197,51 @@ func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) return remainingTimeout, n, err } +// CopyInFDSet copies an fd set from select(2)/pselect(2). +func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) { + set := make([]byte, nBytes) + + if addr != 0 { + if _, err := t.CopyIn(addr, &set); err != nil { + return nil, err + } + // If we only use part of the last byte, mask out the extraneous bits. + // + // N.B. This only works on little-endian architectures. + if nBitsInLastPartialByte != 0 { + set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte + } + } + return set, nil +} + func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) { if nfds < 0 || nfds > fileCap { return 0, syserror.EINVAL } - // Capture all the provided input vectors. - // - // N.B. This only works on little-endian architectures. - byteCount := (nfds + 7) / 8 - - bitsInLastPartialByte := uint(nfds % 8) - r := make([]byte, byteCount) - w := make([]byte, byteCount) - e := make([]byte, byteCount) + // Calculate the size of the fd sets (one bit per fd). + nBytes := (nfds + 7) / 8 + nBitsInLastPartialByte := nfds % 8 - if readFDs != 0 { - if _, err := t.CopyIn(readFDs, &r); err != nil { - return 0, err - } - // Mask out bits above nfds. - if bitsInLastPartialByte != 0 { - r[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte - } + // Capture all the provided input vectors. + r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err } - - if writeFDs != 0 { - if _, err := t.CopyIn(writeFDs, &w); err != nil { - return 0, err - } - if bitsInLastPartialByte != 0 { - w[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte - } + w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err } - - if exceptFDs != 0 { - if _, err := t.CopyIn(exceptFDs, &e); err != nil { - return 0, err - } - if bitsInLastPartialByte != 0 { - e[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte - } + e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err } // Count how many FDs are actually being requested so that we can build // a PollFD array. fdCount := 0 - for i := 0; i < byteCount; i++ { + for i := 0; i < nBytes; i++ { v := r[i] | w[i] | e[i] for v != 0 { v &= (v - 1) @@ -254,7 +252,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add // Build the PollFD array. pfd := make([]linux.PollFD, 0, fdCount) var fd int32 - for i := 0; i < byteCount; i++ { + for i := 0; i < nBytes; i++ { rV, wV, eV := r[i], w[i], e[i] v := rV | wV | eV m := byte(1) @@ -295,8 +293,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add } // Do the syscall, then count the number of bits set. - _, _, err := pollBlock(t, pfd, timeout) - if err != nil { + if _, _, err = pollBlock(t, pfd, timeout); err != nil { return 0, syserror.ConvertIntr(err, syserror.EINTR) } diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index 3ab54271c..cd31e0649 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -72,6 +72,39 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "read", file) } +// Readahead implements readahead(2). +func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + offset := args[1].Int64() + size := args[2].SizeT() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is readable. + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Check that the size is valid. + if int(size) < 0 { + return 0, nil, syserror.EINVAL + } + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Return EINVAL; if the underlying file type does not support readahead, + // then Linux will return EINVAL to indicate as much. In the future, we + // may extend this function to actually support readahead hints. + return 0, nil, syserror.EINVAL +} + // Pread64 implements linux syscall pread64(2). func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index 0104a94c0..fb6efd5d8 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -20,7 +20,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/signalfd" + "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" ) @@ -506,3 +509,77 @@ func RestartSyscall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne t.Debugf("Restart block missing in restart_syscall(2). Did ptrace inject a return value of ERESTART_RESTARTBLOCK?") return 0, nil, syserror.EINTR } + +// sharedSignalfd is shared between the two calls. +func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) { + // Copy in the signal mask. + mask, err := copyInSigSet(t, sigset, sigsetsize) + if err != nil { + return 0, nil, err + } + + // Always check for valid flags, even if not creating. + if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 { + return 0, nil, syserror.EINVAL + } + + // Is this a change to an existing signalfd? + // + // The spec indicates that this should adjust the mask. + if fd != -1 { + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Is this a signalfd? + if s, ok := file.FileOperations.(*signalfd.SignalOperations); ok { + s.SetMask(mask) + return 0, nil, nil + } + + // Not a signalfd. + return 0, nil, syserror.EINVAL + } + + // Create a new file. + file, err := signalfd.New(t, mask) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + // Set appropriate flags. + file.SetFlags(fs.SettableFileFlags{ + NonBlocking: flags&linux.SFD_NONBLOCK != 0, + }) + + // Create a new descriptor. + fd, err = t.NewFDFrom(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.SFD_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + + // Done. + return uintptr(fd), nil, nil +} + +// Signalfd implements the linux syscall signalfd(2). +func Signalfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + sigset := args[1].Pointer() + sigsetsize := args[2].SizeT() + return sharedSignalfd(t, fd, sigset, sigsetsize, 0) +} + +// Signalfd4 implements the linux syscall signalfd4(2). +func Signalfd4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + sigset := args[1].Pointer() + sigsetsize := args[2].SizeT() + flags := args[3].Int() + return sharedSignalfd(t, fd, sigset, sigsetsize, flags) +} diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 3bac4d90d..4b5aafcc0 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -447,16 +447,13 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, syserror.ENOTSOCK } - // Read the length if present. Reject negative values. + // Read the length. Reject negative values. optLen := int32(0) - if optLenAddr != 0 { - if _, err := t.CopyIn(optLenAddr, &optLen); err != nil { - return 0, nil, err - } - - if optLen < 0 { - return 0, nil, syserror.EINVAL - } + if _, err := t.CopyIn(optLenAddr, &optLen); err != nil { + return 0, nil, err + } + if optLen < 0 { + return 0, nil, syserror.EINVAL } // Call syscall implementation then copy both value and value len out. @@ -465,11 +462,9 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, e.ToError() } - if optLenAddr != 0 { - vLen := int32(binary.Size(v)) - if _, err := t.CopyOut(optLenAddr, vLen); err != nil { - return 0, nil, err - } + vLen := int32(binary.Size(v)) + if _, err := t.CopyOut(optLenAddr, vLen); err != nil { + return 0, nil, err } if v != nil { @@ -531,7 +526,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, syserror.ENOTSOCK } - if optLen <= 0 { + if optLen < 0 { return 0, nil, syserror.EINVAL } if optLen > maxOptLen { @@ -769,7 +764,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i } if !cms.Unix.Empty() { mflags |= linux.MSG_CTRUNC - cms.Unix.Release() + cms.Release() } if int(msg.Flags) != mflags { @@ -789,24 +784,16 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } - defer cms.Unix.Release() + defer cms.Release() controlData := make([]byte, 0, msg.ControlLen) + controlData = control.PackControlMessages(t, cms, controlData) if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() { creds, _ := cms.Unix.Credentials.(control.SCMCredentials) controlData, mflags = control.PackCredentials(t, creds, controlData, mflags) } - if cms.IP.HasTimestamp { - controlData = control.PackTimestamp(t, cms.IP.Timestamp, controlData) - } - - if cms.IP.HasInq { - // In Linux, TCP_CM_INQ is added after SO_TIMESTAMP. - controlData = control.PackInq(t, cms.IP.Inq, controlData) - } - if cms.Unix.Rights != nil { controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags) } @@ -882,7 +869,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag } n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) - cm.Unix.Release() + cm.Release() if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } @@ -1065,7 +1052,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme } // Call the syscall implementation. - n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: controlMessages}) + n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file) if err != nil { controlMessages.Release() diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index 8a98fedcb..dd3a5807f 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -29,9 +29,8 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB total int64 n int64 err error - ch chan struct{} - inW bool - outW bool + inCh chan struct{} + outCh chan struct{} ) for opts.Length > 0 { n, err = fs.Splice(t, outFile, inFile, opts) @@ -43,35 +42,33 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB break } - // Are we a registered waiter? - if ch == nil { - ch = make(chan struct{}, 1) - } - if !inW && !inFile.Flags().NonBlocking { - w, _ := waiter.NewChannelEntry(ch) - inFile.EventRegister(&w, EventMaskRead) - defer inFile.EventUnregister(&w) - inW = true // Registered. - } else if !outW && !outFile.Flags().NonBlocking { - w, _ := waiter.NewChannelEntry(ch) - outFile.EventRegister(&w, EventMaskWrite) - defer outFile.EventUnregister(&w) - outW = true // Registered. - } - - // Was anything registered? If no, everything is non-blocking. - if !inW && !outW { - break - } - - if (!inW || inFile.Readiness(EventMaskRead) != 0) && (!outW || outFile.Readiness(EventMaskWrite) != 0) { - // Something became ready, try again without blocking. - continue + // Note that the blocking behavior here is a bit different than the + // normal pattern. Because we need to have both data to read and data + // to write simultaneously, we actually explicitly block on both of + // these cases in turn before returning to the splice operation. + if inFile.Readiness(EventMaskRead) == 0 { + if inCh == nil { + inCh = make(chan struct{}, 1) + inW, _ := waiter.NewChannelEntry(inCh) + inFile.EventRegister(&inW, EventMaskRead) + defer inFile.EventUnregister(&inW) + continue // Need to refresh readiness. + } + if err = t.Block(inCh); err != nil { + break + } } - - // Block until there's data. - if err = t.Block(ch); err != nil { - break + if outFile.Readiness(EventMaskWrite) == 0 { + if outCh == nil { + outCh = make(chan struct{}, 1) + outW, _ := waiter.NewChannelEntry(outCh) + outFile.EventRegister(&outW, EventMaskWrite) + defer outFile.EventUnregister(&outW) + continue // Need to refresh readiness. + } + if err = t.Block(outCh); err != nil { + break + } } } @@ -149,7 +146,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc Length: count, SrcOffset: true, SrcStart: offset, - }, false) + }, outFile.Flags().NonBlocking) // Copy out the new offset. if _, err := t.CopyOut(offsetAddr, n+offset); err != nil { @@ -159,12 +156,17 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Send data using splice. n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{ Length: count, - }, false) + }, outFile.Flags().NonBlocking) + } + + // Sendfile can't lose any data because inFD is always a regual file. + if n != 0 { + err = nil } // We can only pass a single file to handleIOError, so pick inFile // arbitrarily. This is used only for debugging purposes. - return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "sendfile", inFile) + return uintptr(n), nil, handleIOError(t, false, err, kernel.ERESTARTSYS, "sendfile", inFile) } // Splice implements splice(2). @@ -181,12 +183,6 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, syserror.EINVAL } - // Only non-blocking is meaningful. Note that unlike in Linux, this - // flag is applied consistently. We will have either fully blocking or - // non-blocking behavior below, regardless of the underlying files - // being spliced to. It's unclear if this is a bug or not yet. - nonBlocking := (flags & linux.SPLICE_F_NONBLOCK) != 0 - // Get files. outFile := t.GetFile(outFD) if outFile == nil { @@ -200,6 +196,13 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } defer inFile.DecRef() + // The operation is non-blocking if anything is non-blocking. + // + // N.B. This is a rather simplistic heuristic that avoids some + // poor edge case behavior since the exact semantics here are + // underspecified and vary between versions of Linux itself. + nonBlock := inFile.Flags().NonBlocking || outFile.Flags().NonBlocking || (flags&linux.SPLICE_F_NONBLOCK != 0) + // Construct our options. // // Note that exactly one of the underlying buffers must be a pipe. We @@ -247,17 +250,17 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if inOffset != 0 || outOffset != 0 { return 0, nil, syserror.ESPIPE } - default: - return 0, nil, syserror.EINVAL - } - // We may not refer to the same pipe; otherwise it's a continuous loop. - if inFile.Dirent.Inode.StableAttr.InodeID == outFile.Dirent.Inode.StableAttr.InodeID { + // We may not refer to the same pipe; otherwise it's a continuous loop. + if inFile.Dirent.Inode.StableAttr.InodeID == outFile.Dirent.Inode.StableAttr.InodeID { + return 0, nil, syserror.EINVAL + } + default: return 0, nil, syserror.EINVAL } // Splice data. - n, err := doSplice(t, outFile, inFile, opts, nonBlocking) + n, err := doSplice(t, outFile, inFile, opts, nonBlock) // See above; inFile is chosen arbitrarily here. return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "splice", inFile) @@ -275,9 +278,6 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo return 0, nil, syserror.EINVAL } - // Only non-blocking is meaningful. - nonBlocking := (flags & linux.SPLICE_F_NONBLOCK) != 0 - // Get files. outFile := t.GetFile(outFD) if outFile == nil { @@ -301,12 +301,20 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo return 0, nil, syserror.EINVAL } + // The operation is non-blocking if anything is non-blocking. + nonBlock := inFile.Flags().NonBlocking || outFile.Flags().NonBlocking || (flags&linux.SPLICE_F_NONBLOCK != 0) + // Splice data. n, err := doSplice(t, outFile, inFile, fs.SpliceOpts{ Length: count, Dup: true, - }, nonBlocking) + }, nonBlock) + + // Tee doesn't change a state of inFD, so it can't lose any data. + if n != 0 { + err = nil + } // See above; inFile is chosen arbitrarily here. - return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "tee", inFile) + return uintptr(n), nil, handleIOError(t, false, err, kernel.ERESTARTSYS, "tee", inFile) } diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 8ab7ffa25..4115116ff 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -15,12 +15,15 @@ package linux import ( + "path" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" + "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" ) @@ -67,8 +70,22 @@ func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal argvAddr := args[1].Pointer() envvAddr := args[2].Pointer() - // Extract our arguments. - filename, err := t.CopyInString(filenameAddr, linux.PATH_MAX) + return execveat(t, linux.AT_FDCWD, filenameAddr, argvAddr, envvAddr, 0) +} + +// Execveat implements linux syscall execveat(2). +func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := args[0].Int() + pathnameAddr := args[1].Pointer() + argvAddr := args[2].Pointer() + envvAddr := args[3].Pointer() + flags := args[4].Int() + + return execveat(t, dirFD, pathnameAddr, argvAddr, envvAddr, flags) +} + +func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr usermem.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) { + pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX) if err != nil { return 0, nil, err } @@ -89,14 +106,66 @@ func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } } + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return 0, nil, syserror.EINVAL + } + atEmptyPath := flags&linux.AT_EMPTY_PATH != 0 + if !atEmptyPath && len(pathname) == 0 { + return 0, nil, syserror.ENOENT + } + resolveFinal := flags&linux.AT_SYMLINK_NOFOLLOW == 0 + root := t.FSContext().RootDirectory() defer root.DecRef() - wd := t.FSContext().WorkingDirectory() - defer wd.DecRef() + + var wd *fs.Dirent + var executable *fs.File + var closeOnExec bool + if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) { + // Even if the pathname is absolute, we may still need the wd + // for interpreter scripts if the path of the interpreter is + // relative. + wd = t.FSContext().WorkingDirectory() + } else { + // Need to extract the given FD. + f, fdFlags := t.FDTable().Get(dirFD) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + closeOnExec = fdFlags.CloseOnExec + + if atEmptyPath && len(pathname) == 0 { + executable = f + } else { + wd = f.Dirent + wd.IncRef() + if !fs.IsDir(wd.Inode.StableAttr) { + return 0, nil, syserror.ENOTDIR + } + } + } + if wd != nil { + defer wd.DecRef() + } // Load the new TaskContext. - maxTraversals := uint(linux.MaxSymlinkTraversals) - tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, filename, nil, argv, envv, t.Arch().FeatureSet()) + remainingTraversals := uint(linux.MaxSymlinkTraversals) + loadArgs := loader.LoadArgs{ + Mounts: t.MountNamespace(), + Root: root, + WorkingDirectory: wd, + RemainingTraversals: &remainingTraversals, + ResolveFinal: resolveFinal, + Filename: pathname, + File: executable, + CloseOnExec: closeOnExec, + Argv: argv, + Envv: envv, + Features: t.Arch().FeatureSet(), + } + + tc, se := t.Kernel().LoadTaskImage(t, loadArgs) if se != nil { return 0, nil, se.ToError() } diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go index 4b3f043a2..b887fa9d7 100644 --- a/pkg/sentry/syscalls/linux/sys_time.go +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -15,6 +15,7 @@ package linux import ( + "fmt" "time" "gvisor.dev/gvisor/pkg/abi/linux" @@ -228,41 +229,35 @@ func clockNanosleepFor(t *kernel.Task, c ktime.Clock, dur time.Duration, rem use timer.Destroy() - var remaining time.Duration - // Did we just block for the entire duration? - if err == syserror.ETIMEDOUT { - remaining = 0 - } else { - remaining = dur - after.Sub(start) + switch err { + case syserror.ETIMEDOUT: + // Slept for entire timeout. + return nil + case syserror.ErrInterrupted: + // Interrupted. + remaining := dur - after.Sub(start) if remaining < 0 { remaining = time.Duration(0) } - } - // Copy out remaining time. - if err != nil && rem != usermem.Addr(0) { - timeleft := linux.NsecToTimespec(remaining.Nanoseconds()) - if err := copyTimespecOut(t, rem, &timeleft); err != nil { - return err + // Copy out remaining time. + if rem != 0 { + timeleft := linux.NsecToTimespec(remaining.Nanoseconds()) + if err := copyTimespecOut(t, rem, &timeleft); err != nil { + return err + } } - } - - // Did we just block for the entire duration? - if err == syserror.ETIMEDOUT { - return nil - } - // If interrupted, arrange for a restart with the remaining duration. - if err == syserror.ErrInterrupted { + // Arrange for a restart with the remaining duration. t.SetSyscallRestartBlock(&clockNanosleepRestartBlock{ c: c, duration: remaining, rem: rem, }) return kernel.ERESTART_RESTARTBLOCK + default: + panic(fmt.Sprintf("Impossible BlockWithTimer error %v", err)) } - - return err } // Nanosleep implements linux syscall Nanosleep(2). diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go index 271ace08e..748e8dd8d 100644 --- a/pkg/sentry/syscalls/linux/sys_utsname.go +++ b/pkg/sentry/syscalls/linux/sys_utsname.go @@ -79,11 +79,11 @@ func Sethostname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, syserror.EINVAL } - name, err := t.CopyInString(nameAddr, int(size)) - if err != nil { + name := make([]byte, size) + if _, err := t.CopyInBytes(nameAddr, name); err != nil { return 0, nil, err } - utsns.SetHostName(name) + utsns.SetHostName(string(name)) return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index 27cd2c336..ad4b67806 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -191,7 +191,6 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } // Pwritev2 implements linux syscall pwritev2(2). -// TODO(b/120161091): Implement O_SYNC and D_SYNC functionality. func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // While the syscall is // pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go new file mode 100644 index 000000000..97d9a65ea --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -0,0 +1,169 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "strings" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Getxattr implements linux syscall getxattr(2). +func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + + path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + valueLen := 0 + err = fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + value, err := getxattr(t, d, dirPath, nameAddr) + if err != nil { + return err + } + + valueLen = len(value) + if size == 0 { + return nil + } + if size > linux.XATTR_SIZE_MAX { + size = linux.XATTR_SIZE_MAX + } + if valueLen > int(size) { + return syserror.ERANGE + } + + _, err = t.CopyOutBytes(valueAddr, []byte(value)) + return err + }) + if err != nil { + return 0, nil, err + } + return uintptr(valueLen), nil, nil +} + +// getxattr implements getxattr from the given *fs.Dirent. +func getxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr usermem.Addr) (string, error) { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return "", syserror.ENOTDIR + } + + if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil { + return "", err + } + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return "", err + } + + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return "", syserror.EOPNOTSUPP + } + + return d.Inode.Getxattr(name) +} + +// Setxattr implements linux syscall setxattr(2). +func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + flags := args[4].Uint() + + path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { + return 0, nil, syserror.EINVAL + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + return setxattr(t, d, dirPath, nameAddr, valueAddr, size, flags) + }) +} + +// setxattr implements setxattr from the given *fs.Dirent. +func setxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr usermem.Addr, size uint, flags uint32) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil { + return err + } + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return err + } + + if size > linux.XATTR_SIZE_MAX { + return syserror.E2BIG + } + buf := make([]byte, size) + if _, err = t.CopyInBytes(valueAddr, buf); err != nil { + return err + } + value := string(buf) + + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + + return d.Inode.Setxattr(name, value) +} + +func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) { + name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1) + if err != nil { + if err == syserror.ENAMETOOLONG { + return "", syserror.ERANGE + } + return "", err + } + if len(name) == 0 { + return "", syserror.ERANGE + } + return name, nil +} + +func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error { + // Restrict xattrs to regular files and directories. + // + // In Linux, this restriction technically only applies to xattrs in the + // "user.*" namespace, but we don't allow any other xattr prefixes anyway. + if !fs.IsRegular(i.StableAttr) && !fs.IsDir(i.StableAttr) { + if perms.Write { + return syserror.EPERM + } + return syserror.ENODATA + } + + return i.CheckPermission(t, perms) +} diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index 8aa6a3017..18e212dff 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -1,15 +1,15 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) -load("//tools/go_generics:defs.bzl", "go_template_instance") - go_template_instance( name = "seqatomic_parameters", out = "seqatomic_parameters_unsafe.go", package = "time", suffix = "Parameters", - template = "//third_party/gvsync:generic_seqatomic", + template = "//pkg/syncutil:generic_seqatomic", types = { "Value": "Parameters", }, @@ -36,8 +36,8 @@ go_library( deps = [ "//pkg/log", "//pkg/metric", + "//pkg/syncutil", "//pkg/syserror", - "//third_party/gvsync", ], ) diff --git a/pkg/sentry/unimpl/BUILD b/pkg/sentry/unimpl/BUILD index b69603da3..fc7614fff 100644 --- a/pkg/sentry/unimpl/BUILD +++ b/pkg/sentry/unimpl/BUILD @@ -1,5 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +load("@rules_cc//cc:defs.bzl", "cc_proto_library") package(licenses = ["notice"]) @@ -10,6 +11,12 @@ proto_library( deps = ["//pkg/sentry/arch:registers_proto"], ) +cc_proto_library( + name = "unimplemented_syscall_cc_proto", + visibility = ["//visibility:public"], + deps = [":unimplemented_syscall_proto"], +) + go_proto_library( name = "unimplemented_syscall_go_proto", importpath = "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto", diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD index a34c39540..c32fe3241 100644 --- a/pkg/sentry/usage/BUILD +++ b/pkg/sentry/usage/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "usage", srcs = [ diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD index a5b4206bb..684f59a6b 100644 --- a/pkg/sentry/usermem/BUILD +++ b/pkg/sentry/usermem/BUILD @@ -1,7 +1,8 @@ -package(licenses = ["notice"]) - +load("@io_bazel_rules_go//go:def.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) go_template_instance( name = "addr_range", diff --git a/pkg/sentry/usermem/bytes_io.go b/pkg/sentry/usermem/bytes_io.go index 8d88396ba..7898851b3 100644 --- a/pkg/sentry/usermem/bytes_io.go +++ b/pkg/sentry/usermem/bytes_io.go @@ -102,19 +102,34 @@ func (b *BytesIO) rangeCheck(addr Addr, length int) (int, error) { } func (b *BytesIO) blocksFromAddrRanges(ars AddrRangeSeq) (safemem.BlockSeq, error) { - blocks := make([]safemem.Block, 0, ars.NumRanges()) - for !ars.IsEmpty() { - ar := ars.Head() - n, err := b.rangeCheck(ar.Start, int(ar.Length())) - if n != 0 { - blocks = append(blocks, safemem.BlockFromSafeSlice(b.Bytes[int(ar.Start):int(ar.Start)+n])) + switch ars.NumRanges() { + case 0: + return safemem.BlockSeq{}, nil + case 1: + block, err := b.blockFromAddrRange(ars.Head()) + return safemem.BlockSeqOf(block), err + default: + blocks := make([]safemem.Block, 0, ars.NumRanges()) + for !ars.IsEmpty() { + block, err := b.blockFromAddrRange(ars.Head()) + if block.Len() != 0 { + blocks = append(blocks, block) + } + if err != nil { + return safemem.BlockSeqFromSlice(blocks), err + } + ars = ars.Tail() } - if err != nil { - return safemem.BlockSeqFromSlice(blocks), err - } - ars = ars.Tail() + return safemem.BlockSeqFromSlice(blocks), nil + } +} + +func (b *BytesIO) blockFromAddrRange(ar AddrRange) (safemem.Block, error) { + n, err := b.rangeCheck(ar.Start, int(ar.Length())) + if n == 0 { + return safemem.Block{}, err } - return safemem.BlockSeqFromSlice(blocks), nil + return safemem.BlockFromSafeSlice(b.Bytes[int(ar.Start) : int(ar.Start)+n]), err } // BytesIOSequence returns an IOSequence representing the given byte slice. diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go index 6eced660a..7b1f312b1 100644 --- a/pkg/sentry/usermem/usermem.go +++ b/pkg/sentry/usermem/usermem.go @@ -16,6 +16,7 @@ package usermem import ( + "bytes" "errors" "io" "strconv" @@ -270,11 +271,10 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt n, err := uio.CopyIn(ctx, addr, buf[done:done+readlen], opts) // Look for the terminating zero byte, which may have occurred before // hitting err. - for i, c := range buf[done : done+n] { - if c == 0 { - return stringFromImmutableBytes(buf[:done+i]), nil - } + if i := bytes.IndexByte(buf[done:done+n], byte(0)); i >= 0 { + return stringFromImmutableBytes(buf[:done+i]), nil } + done += n if err != nil { return stringFromImmutableBytes(buf[:done]), err diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 0f247bf77..e3e554b88 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -1,4 +1,5 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") package(licenses = ["notice"]) @@ -11,13 +12,14 @@ go_library( "file_description.go", "file_description_impl_util.go", "filesystem.go", + "filesystem_impl_util.go", "filesystem_type.go", "mount.go", "mount_unsafe.go", "options.go", + "pathname.go", "permissions.go", "resolving_path.go", - "syscalls.go", "testutil.go", "vfs.go", ], @@ -31,9 +33,9 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/usermem", + "//pkg/syncutil", "//pkg/syserror", "//pkg/waiter", - "//third_party/gvsync", ], ) diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md index 7847854bc..9aa133bcb 100644 --- a/pkg/sentry/vfs/README.md +++ b/pkg/sentry/vfs/README.md @@ -39,8 +39,8 @@ Mount references are held by: - Mount: Each referenced Mount holds a reference on its parent, which is the mount containing its mount point. -- VirtualFilesystem: A reference is held on all Mounts that are attached - (reachable by Mount traversal). +- VirtualFilesystem: A reference is held on each Mount that has not been + umounted. MountNamespace and FileDescription references are held by users of VFS. The expectation is that each `kernel.Task` holds a reference on its corresponding diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go index 32cf9151b..705194ebc 100644 --- a/pkg/sentry/vfs/context.go +++ b/pkg/sentry/vfs/context.go @@ -24,6 +24,9 @@ type contextID int const ( // CtxMountNamespace is a Context.Value key for a MountNamespace. CtxMountNamespace contextID = iota + + // CtxRoot is a Context.Value key for a VFS root. + CtxRoot ) // MountNamespaceFromContext returns the MountNamespace used by ctx. It does @@ -35,3 +38,13 @@ func MountNamespaceFromContext(ctx context.Context) *MountNamespace { } return nil } + +// RootFromContext returns the VFS root used by ctx. It takes a reference on +// the returned VirtualDentry. If ctx does not have a specific VFS root, +// RootFromContext returns a zero-value VirtualDentry. +func RootFromContext(ctx context.Context) VirtualDentry { + if v := ctx.Value(CtxRoot); v != nil { + return v.(VirtualDentry) + } + return VirtualDentry{} +} diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 45912fc58..40f4c1d09 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -16,6 +16,7 @@ package vfs import ( "fmt" + "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/syserror" @@ -50,7 +51,7 @@ import ( // and not inodes. Furthermore, when parties outside the scope of VFS can // rename inodes on such filesystems, VFS generally cannot "follow" the rename, // both due to synchronization issues and because it may not even be able to -// name the destination path; this implies that it would in fact be *incorrect* +// name the destination path; this implies that it would in fact be incorrect // for Dentries to be associated with inodes on such filesystems. Consequently, // operations that are inode operations in Linux are FilesystemImpl methods // and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do @@ -84,6 +85,9 @@ type Dentry struct { // mounts is accessed using atomic memory operations. mounts uint32 + // mu synchronizes disowning and mounting over this Dentry. + mu sync.Mutex + // children are child Dentries. children map[string]*Dentry @@ -114,7 +118,7 @@ func (d *Dentry) Impl() DentryImpl { type DentryImpl interface { // IncRef increments the Dentry's reference count. A Dentry with a non-zero // reference count must remain coherent with the state of the filesystem. - IncRef(fs *Filesystem) + IncRef() // TryIncRef increments the Dentry's reference count and returns true. If // the Dentry's reference count is zero, TryIncRef may do nothing and @@ -122,10 +126,10 @@ type DentryImpl interface { // guarantee that the Dentry is coherent with the state of the filesystem.) // // TryIncRef does not require that a reference is held on the Dentry. - TryIncRef(fs *Filesystem) bool + TryIncRef() bool // DecRef decrements the Dentry's reference count. - DecRef(fs *Filesystem) + DecRef() } // IsDisowned returns true if d is disowned. @@ -142,16 +146,20 @@ func (d *Dentry) isMounted() bool { return atomic.LoadUint32(&d.mounts) != 0 } -func (d *Dentry) incRef(fs *Filesystem) { - d.impl.IncRef(fs) +// IncRef increments d's reference count. +func (d *Dentry) IncRef() { + d.impl.IncRef() } -func (d *Dentry) tryIncRef(fs *Filesystem) bool { - return d.impl.TryIncRef(fs) +// TryIncRef increments d's reference count and returns true. If d's reference +// count is zero, TryIncRef may instead do nothing and return false. +func (d *Dentry) TryIncRef() bool { + return d.impl.TryIncRef() } -func (d *Dentry) decRef(fs *Filesystem) { - d.impl.DecRef(fs) +// DecRef decrements d's reference count. +func (d *Dentry) DecRef() { + d.impl.DecRef() } // These functions are exported so that filesystem implementations can use @@ -228,36 +236,48 @@ func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dent panic("d is already disowned") } } - vfs.mountMu.RLock() - if _, ok := mntns.mountpoints[d]; ok { - vfs.mountMu.RUnlock() + vfs.mountMu.Lock() + if mntns.mountpoints[d] != 0 { + vfs.mountMu.Unlock() return syserror.EBUSY } - // Return with vfs.mountMu locked, which will be unlocked by - // AbortDeleteDentry or CommitDeleteDentry. + d.mu.Lock() + vfs.mountMu.Unlock() + // Return with d.mu locked to block attempts to mount over it; it will be + // unlocked by AbortDeleteDentry or CommitDeleteDentry. return nil } // AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion // fails. -func (vfs *VirtualFilesystem) AbortDeleteDentry() { - vfs.mountMu.RUnlock() +func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) { + d.mu.Unlock() } // CommitDeleteDentry must be called after the file represented by d is // deleted, and causes d to become disowned. // +// CommitDeleteDentry is a mutator of d and d.Parent(). +// // Preconditions: PrepareDeleteDentry was previously called on d. func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { - delete(d.parent.children, d.name) + if d.parent != nil { + delete(d.parent.children, d.name) + } d.setDisowned() - // TODO: lazily unmount mounts at d - vfs.mountMu.RUnlock() + d.mu.Unlock() + if d.isMounted() { + vfs.forgetDisownedMountpoint(d) + } } // DeleteDentry combines PrepareDeleteDentry and CommitDeleteDentry, as // appropriate for in-memory filesystems that don't need to ensure that some // external state change succeeds before committing the deletion. +// +// DeleteDentry is a mutator of d and d.Parent(). +// +// Preconditions: d is a child Dentry. func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) error { if err := vfs.PrepareDeleteDentry(mntns, d); err != nil { return err @@ -266,6 +286,27 @@ func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) err return nil } +// ForceDeleteDentry causes d to become disowned. It should only be used in +// cases where VFS has no ability to stop the deletion (e.g. d represents the +// local state of a file on a remote filesystem on which the file has already +// been deleted). +// +// ForceDeleteDentry is a mutator of d and d.Parent(). +// +// Preconditions: d is a child Dentry. +func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) { + if checkInvariants { + if d.parent == nil { + panic("d is independent") + } + if d.IsDisowned() { + panic("d is already disowned") + } + } + d.mu.Lock() + vfs.CommitDeleteDentry(d) +} + // PrepareRenameDentry must be called before attempting to rename the file // represented by from. If to is not nil, it represents the file that will be // replaced or exchanged by the rename. If PrepareRenameDentry succeeds, the @@ -291,18 +332,21 @@ func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, t } } } - vfs.mountMu.RLock() - if _, ok := mntns.mountpoints[from]; ok { - vfs.mountMu.RUnlock() + vfs.mountMu.Lock() + if mntns.mountpoints[from] != 0 { + vfs.mountMu.Unlock() return syserror.EBUSY } if to != nil { - if _, ok := mntns.mountpoints[to]; ok { - vfs.mountMu.RUnlock() + if mntns.mountpoints[to] != 0 { + vfs.mountMu.Unlock() return syserror.EBUSY } + to.mu.Lock() } - // Return with vfs.mountMu locked, which will be unlocked by + from.mu.Lock() + vfs.mountMu.Unlock() + // Return with from.mu and to.mu locked, which will be unlocked by // AbortRenameDentry, CommitRenameReplaceDentry, or // CommitRenameExchangeDentry. return nil @@ -310,38 +354,76 @@ func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, t // AbortRenameDentry must be called after PrepareRenameDentry if the rename // fails. -func (vfs *VirtualFilesystem) AbortRenameDentry() { - vfs.mountMu.RUnlock() +func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { + from.mu.Unlock() + if to != nil { + to.mu.Unlock() + } } // CommitRenameReplaceDentry must be called after the file represented by from // is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file // that was replaced by from. // +// CommitRenameReplaceDentry is a mutator of from, to, from.Parent(), and +// to.Parent(). +// // Preconditions: PrepareRenameDentry was previously called on from and to. // newParent.Child(newName) == to. func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, newName string, to *Dentry) { - if to != nil { - to.setDisowned() - // TODO: lazily unmount mounts at d - } if newParent.children == nil { newParent.children = make(map[string]*Dentry) } newParent.children[newName] = from from.parent = newParent from.name = newName - vfs.mountMu.RUnlock() + from.mu.Unlock() + if to != nil { + to.setDisowned() + to.mu.Unlock() + if to.isMounted() { + vfs.forgetDisownedMountpoint(to) + } + } } // CommitRenameExchangeDentry must be called after the files represented by // from and to are exchanged by rename(RENAME_EXCHANGE). // +// CommitRenameExchangeDentry is a mutator of from, to, from.Parent(), and +// to.Parent(). +// // Preconditions: PrepareRenameDentry was previously called on from and to. func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { from.parent, to.parent = to.parent, from.parent from.name, to.name = to.name, from.name from.parent.children[from.name] = from to.parent.children[to.name] = to - vfs.mountMu.RUnlock() + from.mu.Unlock() + to.mu.Unlock() +} + +// forgetDisownedMountpoint is called when a mount point is deleted to umount +// all mounts using it in all other mount namespaces. +// +// forgetDisownedMountpoint is analogous to Linux's +// fs/namespace.c:__detach_mounts(). +func (vfs *VirtualFilesystem) forgetDisownedMountpoint(d *Dentry) { + var ( + vdsToDecRef []VirtualDentry + mountsToDecRef []*Mount + ) + vfs.mountMu.Lock() + vfs.mounts.seq.BeginWrite() + for mnt := range vfs.mountpoints[d] { + vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(mnt, &umountRecursiveOptions{}, vdsToDecRef, mountsToDecRef) + } + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + for _, vd := range vdsToDecRef { + vd.DecRef() + } + for _, mnt := range mountsToDecRef { + mnt.DecRef() + } } diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 86bde7fb3..c5a9adca3 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -47,15 +48,14 @@ type FileDescription struct { impl FileDescriptionImpl } -// Init must be called before first use of fd. It takes references on mnt and -// d. +// Init must be called before first use of fd. It takes ownership of references +// on mnt and d held by the caller. func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) { fd.refs = 1 fd.vd = VirtualDentry{ mount: mnt, dentry: d, } - fd.vd.IncRef() fd.impl = impl } @@ -64,6 +64,18 @@ func (fd *FileDescription) Impl() FileDescriptionImpl { return fd.impl } +// Mount returns the mount on which fd was opened. It does not take a reference +// on the returned Mount. +func (fd *FileDescription) Mount() *Mount { + return fd.vd.mount +} + +// Dentry returns the dentry at which fd was opened. It does not take a +// reference on the returned Dentry. +func (fd *FileDescription) Dentry() *Dentry { + return fd.vd.dentry +} + // VirtualDentry returns the location at which fd was opened. It does not take // a reference on the returned VirtualDentry. func (fd *FileDescription) VirtualDentry() VirtualDentry { @@ -75,6 +87,22 @@ func (fd *FileDescription) IncRef() { atomic.AddInt64(&fd.refs, 1) } +// TryIncRef increments fd's reference count and returns true. If fd's +// reference count is already zero, TryIncRef does nothing and returns false. +// +// TryIncRef does not require that a reference is held on fd. +func (fd *FileDescription) TryIncRef() bool { + for { + refs := atomic.LoadInt64(&fd.refs) + if refs <= 0 { + return false + } + if atomic.CompareAndSwapInt64(&fd.refs, refs, refs+1) { + return true + } + } +} + // DecRef decrements fd's reference count. func (fd *FileDescription) DecRef() { if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { @@ -102,7 +130,7 @@ type FileDescriptionImpl interface { // OnClose is called when a file descriptor representing the // FileDescription is closed. Note that returning a non-nil error does not // prevent the file descriptor from being closed. - OnClose() error + OnClose(ctx context.Context) error // StatusFlags returns file description status flags, as for // fcntl(F_GETFL). @@ -180,12 +208,26 @@ type FileDescriptionImpl interface { // ConfigureMMap mutates opts to implement mmap(2) for the file. Most // implementations that support memory mapping can call // GenericConfigureMMap with the appropriate memmap.Mappable. - ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error + ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error // Ioctl implements the ioctl(2) syscall. Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) - // TODO: extended attributes; file locking + // Listxattr returns all extended attribute names for the file. + Listxattr(ctx context.Context) ([]string, error) + + // Getxattr returns the value associated with the given extended attribute + // for the file. + Getxattr(ctx context.Context, name string) (string, error) + + // Setxattr changes the value associated with the given extended attribute + // for the file. + Setxattr(ctx context.Context, opts SetxattrOptions) error + + // Removexattr removes the given extended attribute from the file. + Removexattr(ctx context.Context, name string) error + + // TODO: file locking } // Dirent holds the information contained in struct linux_dirent64. @@ -199,8 +241,11 @@ type Dirent struct { // Ino is the inode number. Ino uint64 - // Off is this Dirent's offset. - Off int64 + // NextOff is the offset of the *next* Dirent in the directory; that is, + // FileDescription.Seek(NextOff, SEEK_SET) (as called by seekdir(3)) will + // cause the next call to FileDescription.IterDirents() to yield the next + // Dirent. (The offset of the first Dirent in a directory is always 0.) + NextOff int64 } // IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents. @@ -211,3 +256,172 @@ type IterDirentsCallback interface { // called. Handle(dirent Dirent) bool } + +// OnClose is called when a file descriptor representing the FileDescription is +// closed. Returning a non-nil error should not prevent the file descriptor +// from being closed. +func (fd *FileDescription) OnClose(ctx context.Context) error { + return fd.impl.OnClose(ctx) +} + +// StatusFlags returns file description status flags, as for fcntl(F_GETFL). +func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) { + flags, err := fd.impl.StatusFlags(ctx) + flags |= linux.O_LARGEFILE + return flags, err +} + +// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL). +func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { + return fd.impl.SetStatusFlags(ctx, flags) +} + +// Stat returns metadata for the file represented by fd. +func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { + return fd.impl.Stat(ctx, opts) +} + +// SetStat updates metadata for the file represented by fd. +func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error { + return fd.impl.SetStat(ctx, opts) +} + +// StatFS returns metadata for the filesystem containing the file represented +// by fd. +func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { + return fd.impl.StatFS(ctx) +} + +// PRead reads from the file represented by fd into dst, starting at the given +// offset, and returns the number of bytes read. PRead is permitted to return +// partial reads with a nil error. +func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return fd.impl.PRead(ctx, dst, offset, opts) +} + +// Read is similar to PRead, but does not specify an offset. +func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + return fd.impl.Read(ctx, dst, opts) +} + +// PWrite writes src to the file represented by fd, starting at the given +// offset, and returns the number of bytes written. PWrite is permitted to +// return partial writes with a nil error. +func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return fd.impl.PWrite(ctx, src, offset, opts) +} + +// Write is similar to PWrite, but does not specify an offset. +func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return fd.impl.Write(ctx, src, opts) +} + +// IterDirents invokes cb on each entry in the directory represented by fd. If +// IterDirents has been called since the last call to Seek, it continues +// iteration from the end of the last call. +func (fd *FileDescription) IterDirents(ctx context.Context, cb IterDirentsCallback) error { + return fd.impl.IterDirents(ctx, cb) +} + +// Seek changes fd's offset (assuming one exists) and returns its new value. +func (fd *FileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return fd.impl.Seek(ctx, offset, whence) +} + +// Sync has the semantics of fsync(2). +func (fd *FileDescription) Sync(ctx context.Context) error { + return fd.impl.Sync(ctx) +} + +// ConfigureMMap mutates opts to implement mmap(2) for the file represented by +// fd. +func (fd *FileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return fd.impl.ConfigureMMap(ctx, opts) +} + +// Ioctl implements the ioctl(2) syscall. +func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return fd.impl.Ioctl(ctx, uio, args) +} + +// Listxattr returns all extended attribute names for the file represented by +// fd. +func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) { + names, err := fd.impl.Listxattr(ctx) + if err == syserror.ENOTSUP { + // Linux doesn't actually return ENOTSUP in this case; instead, + // fs/xattr.c:vfs_listxattr() falls back to allowing the security + // subsystem to return security extended attributes, which by default + // don't exist. + return nil, nil + } + return names, err +} + +// Getxattr returns the value associated with the given extended attribute for +// the file represented by fd. +func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) { + return fd.impl.Getxattr(ctx, name) +} + +// Setxattr changes the value associated with the given extended attribute for +// the file represented by fd. +func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error { + return fd.impl.Setxattr(ctx, opts) +} + +// Removexattr removes the given extended attribute from the file represented +// by fd. +func (fd *FileDescription) Removexattr(ctx context.Context, name string) error { + return fd.impl.Removexattr(ctx, name) +} + +// SyncFS instructs the filesystem containing fd to execute the semantics of +// syncfs(2). +func (fd *FileDescription) SyncFS(ctx context.Context) error { + return fd.vd.mount.fs.impl.Sync(ctx) +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (fd *FileDescription) MappedName(ctx context.Context) string { + vfsroot := RootFromContext(ctx) + s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd) + if vfsroot.Ok() { + vfsroot.DecRef() + } + return s +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (fd *FileDescription) DeviceID() uint64 { + stat, err := fd.impl.Stat(context.Background(), StatOptions{ + // There is no STATX_DEV; we assume that Stat will return it if it's + // available regardless of mask. + Mask: 0, + // fs/proc/task_mmu.c:show_map_vma() just reads inode::i_sb->s_dev + // directly. + Sync: linux.AT_STATX_DONT_SYNC, + }) + if err != nil { + return 0 + } + return uint64(linux.MakeDeviceID(uint16(stat.DevMajor), stat.DevMinor)) +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (fd *FileDescription) InodeID() uint64 { + stat, err := fd.impl.Stat(context.Background(), StatOptions{ + Mask: linux.STATX_INO, + // fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly. + Sync: linux.AT_STATX_DONT_SYNC, + }) + if err != nil || stat.Mask&linux.STATX_INO == 0 { + return 0 + } + return stat.Ino +} + +// Msync implements memmap.MappingIdentity.Msync. +func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error { + return fd.impl.Sync(ctx) +} diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index ba230da72..3df49991c 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -45,7 +45,7 @@ type FileDescriptionDefaultImpl struct{} // OnClose implements FileDescriptionImpl.OnClose analogously to // file_operations::flush == NULL in Linux. -func (FileDescriptionDefaultImpl) OnClose() error { +func (FileDescriptionDefaultImpl) OnClose(ctx context.Context) error { return nil } @@ -117,7 +117,7 @@ func (FileDescriptionDefaultImpl) Sync(ctx context.Context) error { // ConfigureMMap implements FileDescriptionImpl.ConfigureMMap analogously to // file_operations::mmap == NULL in Linux. -func (FileDescriptionDefaultImpl) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error { +func (FileDescriptionDefaultImpl) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return syserror.ENODEV } @@ -127,6 +127,31 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg return 0, syserror.ENOTTY } +// Listxattr implements FileDescriptionImpl.Listxattr analogously to +// inode_operations::listxattr == NULL in Linux. +func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context) ([]string, error) { + // This isn't exactly accurate; see FileDescription.Listxattr. + return nil, syserror.ENOTSUP +} + +// Getxattr implements FileDescriptionImpl.Getxattr analogously to +// inode::i_opflags & IOP_XATTR == 0 in Linux. +func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, name string) (string, error) { + return "", syserror.ENOTSUP +} + +// Setxattr implements FileDescriptionImpl.Setxattr analogously to +// inode::i_opflags & IOP_XATTR == 0 in Linux. +func (FileDescriptionDefaultImpl) Setxattr(ctx context.Context, opts SetxattrOptions) error { + return syserror.ENOTSUP +} + +// Removexattr implements FileDescriptionImpl.Removexattr analogously to +// inode::i_opflags & IOP_XATTR == 0 in Linux. +func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) error { + return syserror.ENOTSUP +} + // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl that always represent directories to obtain // implementations of non-directory I/O methods that return EISDIR. @@ -252,3 +277,12 @@ func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int6 fd.off = offset return offset, nil } + +// GenericConfigureMMap may be used by most implementations of +// FileDescriptionImpl.ConfigureMMap. +func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.MMapOpts) error { + opts.Mappable = m + opts.MappingIdentity = fd + fd.IncRef() + return nil +} diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 511b829fc..ac7799296 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -90,7 +90,7 @@ func TestGenCountFD(t *testing.T) { vfsObj := New() // vfs.New() vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{}) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &NewFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &GetFilesystemOptions{}) if err != nil { t.Fatalf("failed to create testfs root mount: %v", err) } @@ -103,7 +103,7 @@ func TestGenCountFD(t *testing.T) { // The first read causes Generate to be called to fill the FD's buffer. buf := make([]byte, 2) ioseq := usermem.BytesIOSequence(buf) - n, err := fd.Impl().Read(ctx, ioseq, ReadOptions{}) + n, err := fd.Read(ctx, ioseq, ReadOptions{}) if n != 1 || (err != nil && err != io.EOF) { t.Fatalf("first Read: got (%d, %v), wanted (1, nil or EOF)", n, err) } @@ -112,17 +112,17 @@ func TestGenCountFD(t *testing.T) { } // A second read without seeking is still at EOF. - n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{}) + n, err = fd.Read(ctx, ioseq, ReadOptions{}) if n != 0 || err != io.EOF { t.Fatalf("second Read: got (%d, %v), wanted (0, EOF)", n, err) } // Seeking to the beginning of the file causes it to be regenerated. - n, err = fd.Impl().Seek(ctx, 0, linux.SEEK_SET) + n, err = fd.Seek(ctx, 0, linux.SEEK_SET) if n != 0 || err != nil { t.Fatalf("Seek: got (%d, %v), wanted (0, nil)", n, err) } - n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{}) + n, err = fd.Read(ctx, ioseq, ReadOptions{}) if n != 1 || (err != nil && err != io.EOF) { t.Fatalf("Read after Seek: got (%d, %v), wanted (1, nil or EOF)", n, err) } @@ -131,7 +131,7 @@ func TestGenCountFD(t *testing.T) { } // PRead at the beginning of the file also causes it to be regenerated. - n, err = fd.Impl().PRead(ctx, ioseq, 0, ReadOptions{}) + n, err = fd.PRead(ctx, ioseq, 0, ReadOptions{}) if n != 1 || (err != nil && err != io.EOF) { t.Fatalf("PRead: got (%d, %v), wanted (1, nil or EOF)", n, err) } diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 7a074b718..b766614e7 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -18,6 +18,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" ) @@ -33,15 +34,28 @@ type Filesystem struct { // operations. refs int64 + // vfs is the VirtualFilesystem that uses this Filesystem. vfs is + // immutable. + vfs *VirtualFilesystem + // impl is the FilesystemImpl associated with this Filesystem. impl is // immutable. This should be the last field in Dentry. impl FilesystemImpl } // Init must be called before first use of fs. -func (fs *Filesystem) Init(impl FilesystemImpl) { +func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, impl FilesystemImpl) { fs.refs = 1 + fs.vfs = vfsObj fs.impl = impl + vfsObj.filesystemsMu.Lock() + vfsObj.filesystems[fs] = struct{}{} + vfsObj.filesystemsMu.Unlock() +} + +// VirtualFilesystem returns the containing VirtualFilesystem. +func (fs *Filesystem) VirtualFilesystem() *VirtualFilesystem { + return fs.vfs } // Impl returns the FilesystemImpl associated with fs. @@ -49,14 +63,35 @@ func (fs *Filesystem) Impl() FilesystemImpl { return fs.impl } -func (fs *Filesystem) incRef() { +// IncRef increments fs' reference count. +func (fs *Filesystem) IncRef() { if atomic.AddInt64(&fs.refs, 1) <= 1 { - panic("Filesystem.incRef() called without holding a reference") + panic("Filesystem.IncRef() called without holding a reference") + } +} + +// TryIncRef increments fs' reference count and returns true. If fs' reference +// count is zero, TryIncRef does nothing and returns false. +// +// TryIncRef does not require that a reference is held on fs. +func (fs *Filesystem) TryIncRef() bool { + for { + refs := atomic.LoadInt64(&fs.refs) + if refs <= 0 { + return false + } + if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) { + return true + } } } -func (fs *Filesystem) decRef() { +// DecRef decrements fs' reference count. +func (fs *Filesystem) DecRef() { if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 { + fs.vfs.filesystemsMu.Lock() + delete(fs.vfs.filesystems, fs) + fs.vfs.filesystemsMu.Unlock() fs.impl.Release() } else if refs < 0 { panic("Filesystem.decRef() called without holding a reference") @@ -151,5 +186,70 @@ type FilesystemImpl interface { // UnlinkAt removes the non-directory file at rp. UnlinkAt(ctx context.Context, rp *ResolvingPath) error - // TODO: d_path(); extended attributes; inotify_add_watch(); bind() + // ListxattrAt returns all extended attribute names for the file at rp. + ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) + + // GetxattrAt returns the value associated with the given extended + // attribute for the file at rp. + GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) + + // SetxattrAt changes the value associated with the given extended + // attribute for the file at rp. + SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error + + // RemovexattrAt removes the given extended attribute from the file at rp. + RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error + + // PrependPath prepends a path from vd to vd.Mount().Root() to b. + // + // If vfsroot.Ok(), it is the contextual VFS root; if it is encountered + // before vd.Mount().Root(), PrependPath should stop prepending path + // components and return a PrependPathAtVFSRootError. + // + // If traversal of vd.Dentry()'s ancestors encounters an independent + // ("root") Dentry that is not vd.Mount().Root() (i.e. vd.Dentry() is not a + // descendant of vd.Mount().Root()), PrependPath should stop prepending + // path components and return a PrependPathAtNonMountRootError. + // + // Filesystems for which Dentries do not have meaningful paths may prepend + // an arbitrary descriptive string to b and then return a + // PrependPathSyntheticError. + // + // Most implementations can acquire the appropriate locks to ensure that + // Dentry.Name() and Dentry.Parent() are fixed for vd.Dentry() and all of + // its ancestors, then call GenericPrependPath. + // + // Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl. + PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error + + // TODO: inotify_add_watch(); bind() +} + +// PrependPathAtVFSRootError is returned by implementations of +// FilesystemImpl.PrependPath() when they encounter the contextual VFS root. +type PrependPathAtVFSRootError struct{} + +// Error implements error.Error. +func (PrependPathAtVFSRootError) Error() string { + return "vfs.FilesystemImpl.PrependPath() reached VFS root" +} + +// PrependPathAtNonMountRootError is returned by implementations of +// FilesystemImpl.PrependPath() when they encounter an independent ancestor +// Dentry that is not the Mount root. +type PrependPathAtNonMountRootError struct{} + +// Error implements error.Error. +func (PrependPathAtNonMountRootError) Error() string { + return "vfs.FilesystemImpl.PrependPath() reached root other than Mount root" +} + +// PrependPathSyntheticError is returned by implementations of +// FilesystemImpl.PrependPath() for which prepended names do not represent real +// paths. +type PrependPathSyntheticError struct{} + +// Error implements error.Error. +func (PrependPathSyntheticError) Error() string { + return "vfs.FilesystemImpl.PrependPath() prepended synthetic name" } diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go new file mode 100644 index 000000000..7315a588e --- /dev/null +++ b/pkg/sentry/vfs/filesystem_impl_util.go @@ -0,0 +1,69 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "strings" + + "gvisor.dev/gvisor/pkg/fspath" +) + +// GenericParseMountOptions parses a comma-separated list of options of the +// form "key" or "key=value", where neither key nor value contain commas, and +// returns it as a map. If str contains duplicate keys, then the last value +// wins. For example: +// +// str = "key0=value0,key1,key2=value2,key0=value3" -> map{'key0':'value3','key1':'','key2':'value2'} +// +// GenericParseMountOptions is not appropriate if values may contain commas, +// e.g. in the case of the mpol mount option for tmpfs(5). +func GenericParseMountOptions(str string) map[string]string { + m := make(map[string]string) + for _, opt := range strings.Split(str, ",") { + if len(opt) > 0 { + res := strings.SplitN(opt, "=", 2) + if len(res) == 2 { + m[res[0]] = res[1] + } else { + m[opt] = "" + } + } + } + return m +} + +// GenericPrependPath may be used by implementations of +// FilesystemImpl.PrependPath() for which a single statically-determined lock +// or set of locks is sufficient to ensure its preconditions (as opposed to +// e.g. per-Dentry locks). +// +// Preconditions: Dentry.Name() and Dentry.Parent() must be held constant for +// vd.Dentry() and all of its ancestors. +func GenericPrependPath(vfsroot, vd VirtualDentry, b *fspath.Builder) error { + mnt, d := vd.mount, vd.dentry + for { + if mnt == vfsroot.mount && d == vfsroot.dentry { + return PrependPathAtVFSRootError{} + } + if d == mnt.root { + return nil + } + if d.parent == nil { + return PrependPathAtNonMountRootError{} + } + b.PrependComponent(d.name) + d = d.parent + } +} diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go index f401ad7f3..c335e206d 100644 --- a/pkg/sentry/vfs/filesystem_type.go +++ b/pkg/sentry/vfs/filesystem_type.go @@ -25,21 +25,21 @@ import ( // // FilesystemType is analogous to Linux's struct file_system_type. type FilesystemType interface { - // NewFilesystem returns a Filesystem configured by the given options, + // GetFilesystem returns a Filesystem configured by the given options, // along with its mount root. A reference is taken on the returned // Filesystem and Dentry. - NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error) + GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error) } -// NewFilesystemOptions contains options to FilesystemType.NewFilesystem. -type NewFilesystemOptions struct { +// GetFilesystemOptions contains options to FilesystemType.GetFilesystem. +type GetFilesystemOptions struct { // Data is the string passed as the 5th argument to mount(2), which is // usually a comma-separated list of filesystem-specific mount options. Data string // InternalData holds opaque FilesystemType-specific data. There is // intentionally no way for applications to specify InternalData; if it is - // not nil, the call to NewFilesystem originates from within the sentry. + // not nil, the call to GetFilesystem originates from within the sentry. InternalData interface{} } diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 11702f720..ec23ab0dd 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -18,6 +18,7 @@ import ( "math" "sync/atomic" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" @@ -38,16 +39,12 @@ import ( // Mount is analogous to Linux's struct mount. (gVisor does not distinguish // between struct mount and struct vfsmount.) type Mount struct { - // The lower 63 bits of refs are a reference count. The MSB of refs is set - // if the Mount has been eagerly unmounted, as by umount(2) without the - // MNT_DETACH flag. refs is accessed using atomic memory operations. - refs int64 - - // The lower 63 bits of writers is the number of calls to - // Mount.CheckBeginWrite() that have not yet been paired with a call to - // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect. - // writers is accessed using atomic memory operations. - writers int64 + // vfs, fs, and root are immutable. References are held on fs and root. + // + // Invariant: root belongs to fs. + vfs *VirtualFilesystem + fs *Filesystem + root *Dentry // key is protected by VirtualFilesystem.mountMu and // VirtualFilesystem.mounts.seq, and may be nil. References are held on @@ -57,13 +54,29 @@ type Mount struct { // key.parent.fs. key mountKey - // fs, root, and ns are immutable. References are held on fs and root (but - // not ns). - // - // Invariant: root belongs to fs. - fs *Filesystem - root *Dentry - ns *MountNamespace + // ns is the namespace in which this Mount was mounted. ns is protected by + // VirtualFilesystem.mountMu. + ns *MountNamespace + + // The lower 63 bits of refs are a reference count. The MSB of refs is set + // if the Mount has been eagerly umounted, as by umount(2) without the + // MNT_DETACH flag. refs is accessed using atomic memory operations. + refs int64 + + // children is the set of all Mounts for which Mount.key.parent is this + // Mount. children is protected by VirtualFilesystem.mountMu. + children map[*Mount]struct{} + + // umounted is true if VFS.umountRecursiveLocked() has been called on this + // Mount. VirtualFilesystem does not hold a reference on Mounts for which + // umounted is true. umounted is protected by VirtualFilesystem.mountMu. + umounted bool + + // The lower 63 bits of writers is the number of calls to + // Mount.CheckBeginWrite() that have not yet been paired with a call to + // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect. + // writers is accessed using atomic memory operations. + writers int64 } // A MountNamespace is a collection of Mounts. @@ -73,13 +86,16 @@ type Mount struct { // // MountNamespace is analogous to Linux's struct mnt_namespace. type MountNamespace struct { - refs int64 // accessed using atomic memory operations - // root is the MountNamespace's root mount. root is immutable. root *Mount - // mountpoints contains all Dentries which are mount points in this - // namespace. mountpoints is protected by VirtualFilesystem.mountMu. + // refs is the reference count. refs is accessed using atomic memory + // operations. + refs int64 + + // mountpoints maps all Dentries which are mount points in this namespace + // to the number of Mounts for which they are mount points. mountpoints is + // protected by VirtualFilesystem.mountMu. // // mountpoints is used to determine if a Dentry can be moved or removed // (which requires that the Dentry is not a mount point in the calling @@ -89,26 +105,27 @@ type MountNamespace struct { // MountNamespace; this is required to ensure that // VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate // correctly on unreferenced MountNamespaces. - mountpoints map[*Dentry]struct{} + mountpoints map[*Dentry]uint32 } // NewMountNamespace returns a new mount namespace with a root filesystem // configured by the given arguments. A reference is taken on the returned // MountNamespace. -func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *NewFilesystemOptions) (*MountNamespace, error) { +func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) { fsType := vfs.getFilesystemType(fsTypeName) if fsType == nil { return nil, syserror.ENODEV } - fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts) + fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts) if err != nil { return nil, err } mntns := &MountNamespace{ refs: 1, - mountpoints: make(map[*Dentry]struct{}), + mountpoints: make(map[*Dentry]uint32), } mntns.root = &Mount{ + vfs: vfs, fs: fs, root: root, ns: mntns, @@ -117,13 +134,13 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth return mntns, nil } -// NewMount creates and mounts a new Filesystem. -func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *NewFilesystemOptions) error { +// MountAt creates and mounts a Filesystem configured by the given arguments. +func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error { fsType := vfs.getFilesystemType(fsTypeName) if fsType == nil { return syserror.ENODEV } - fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts) + fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) if err != nil { return err } @@ -131,17 +148,19 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti // lock ordering. vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{}) if err != nil { - root.decRef(fs) - fs.decRef() + root.DecRef() + fs.DecRef() return err } vfs.mountMu.Lock() + vd.dentry.mu.Lock() for { if vd.dentry.IsDisowned() { + vd.dentry.mu.Unlock() vfs.mountMu.Unlock() vd.DecRef() - root.decRef(fs) - fs.decRef() + root.DecRef() + fs.DecRef() return syserror.ENOENT } // vd might have been mounted over between vfs.GetDentryAt() and @@ -153,36 +172,272 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti if nextmnt == nil { break } - nextmnt.incRef() - nextmnt.root.incRef(nextmnt.fs) + // It's possible that nextmnt has been umounted but not disconnected, + // in which case vfs no longer holds a reference on it, and the last + // reference may be concurrently dropped even though we're holding + // vfs.mountMu. + if !nextmnt.tryIncMountedRef() { + break + } + // This can't fail since we're holding vfs.mountMu. + nextmnt.root.IncRef() + vd.dentry.mu.Unlock() vd.DecRef() vd = VirtualDentry{ mount: nextmnt, dentry: nextmnt.root, } + vd.dentry.mu.Lock() } // TODO: Linux requires that either both the mount point and the mount root // are directories, or neither are, and returns ENOTDIR if this is not the // case. mntns := vd.mount.ns mnt := &Mount{ + vfs: vfs, fs: fs, root: root, ns: mntns, refs: 1, } - mnt.storeKey(vd.mount, vd.dentry) + vfs.mounts.seq.BeginWrite() + vfs.connectLocked(mnt, vd, mntns) + vfs.mounts.seq.EndWrite() + vd.dentry.mu.Unlock() + vfs.mountMu.Unlock() + return nil +} + +// UmountAt removes the Mount at the given path. +func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error { + if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 { + return syserror.EINVAL + } + + // MNT_FORCE is currently unimplemented except for the permission check. + if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) { + return syserror.EPERM + } + + vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{}) + if err != nil { + return err + } + defer vd.DecRef() + if vd.dentry != vd.mount.root { + return syserror.EINVAL + } + vfs.mountMu.Lock() + if mntns := MountNamespaceFromContext(ctx); mntns != nil && mntns != vd.mount.ns { + vfs.mountMu.Unlock() + return syserror.EINVAL + } + + // TODO(jamieliu): Linux special-cases umount of the caller's root, which + // we don't implement yet (we'll just fail it since the caller holds a + // reference on it). + + vfs.mounts.seq.BeginWrite() + if opts.Flags&linux.MNT_DETACH == 0 { + if len(vd.mount.children) != 0 { + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + return syserror.EBUSY + } + // We are holding a reference on vd.mount. + expectedRefs := int64(1) + if !vd.mount.umounted { + expectedRefs = 2 + } + if atomic.LoadInt64(&vd.mount.refs)&^math.MinInt64 != expectedRefs { // mask out MSB + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + return syserror.EBUSY + } + } + vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(vd.mount, &umountRecursiveOptions{ + eager: opts.Flags&linux.MNT_DETACH == 0, + disconnectHierarchy: true, + }, nil, nil) + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + for _, vd := range vdsToDecRef { + vd.DecRef() + } + for _, mnt := range mountsToDecRef { + mnt.DecRef() + } + return nil +} + +type umountRecursiveOptions struct { + // If eager is true, ensure that future calls to Mount.tryIncMountedRef() + // on umounted mounts fail. + // + // eager is analogous to Linux's UMOUNT_SYNC. + eager bool + + // If disconnectHierarchy is true, Mounts that are umounted hierarchically + // should be disconnected from their parents. (Mounts whose parents are not + // umounted, which in most cases means the Mount passed to the initial call + // to umountRecursiveLocked, are unconditionally disconnected for + // consistency with Linux.) + // + // disconnectHierarchy is analogous to Linux's !UMOUNT_CONNECTED. + disconnectHierarchy bool +} + +// umountRecursiveLocked marks mnt and its descendants as umounted. It does not +// release mount or dentry references; instead, it appends VirtualDentries and +// Mounts on which references must be dropped to vdsToDecRef and mountsToDecRef +// respectively, and returns updated slices. (This is necessary because +// filesystem locks possibly taken by DentryImpl.DecRef() may precede +// vfs.mountMu in the lock order, and Mount.DecRef() may lock vfs.mountMu.) +// +// umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree(). +// +// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a +// writer critical section. +func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) { + if !mnt.umounted { + mnt.umounted = true + mountsToDecRef = append(mountsToDecRef, mnt) + if parent := mnt.parent(); parent != nil && (opts.disconnectHierarchy || !parent.umounted) { + vdsToDecRef = append(vdsToDecRef, vfs.disconnectLocked(mnt)) + } + } + if opts.eager { + for { + refs := atomic.LoadInt64(&mnt.refs) + if refs < 0 { + break + } + if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs|math.MinInt64) { + break + } + } + } + for child := range mnt.children { + vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(child, opts, vdsToDecRef, mountsToDecRef) + } + return vdsToDecRef, mountsToDecRef +} + +// connectLocked makes vd the mount parent/point for mnt. It consumes +// references held by vd. +// +// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a +// writer critical section. d.mu must be locked. mnt.parent() == nil. +func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) { + mnt.storeKey(vd) + if vd.mount.children == nil { + vd.mount.children = make(map[*Mount]struct{}) + } + vd.mount.children[mnt] = struct{}{} atomic.AddUint32(&vd.dentry.mounts, 1) - mntns.mountpoints[vd.dentry] = struct{}{} + mntns.mountpoints[vd.dentry]++ + vfs.mounts.insertSeqed(mnt) vfsmpmounts, ok := vfs.mountpoints[vd.dentry] if !ok { vfsmpmounts = make(map[*Mount]struct{}) vfs.mountpoints[vd.dentry] = vfsmpmounts } vfsmpmounts[mnt] = struct{}{} - vfs.mounts.Insert(mnt) - vfs.mountMu.Unlock() - return nil +} + +// disconnectLocked makes vd have no mount parent/point and returns its old +// mount parent/point with a reference held. +// +// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a +// writer critical section. mnt.parent() != nil. +func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry { + vd := mnt.loadKey() + mnt.storeKey(VirtualDentry{}) + delete(vd.mount.children, mnt) + atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1 + mnt.ns.mountpoints[vd.dentry]-- + if mnt.ns.mountpoints[vd.dentry] == 0 { + delete(mnt.ns.mountpoints, vd.dentry) + } + vfs.mounts.removeSeqed(mnt) + vfsmpmounts := vfs.mountpoints[vd.dentry] + delete(vfsmpmounts, mnt) + if len(vfsmpmounts) == 0 { + delete(vfs.mountpoints, vd.dentry) + } + return vd +} + +// tryIncMountedRef increments mnt's reference count and returns true. If mnt's +// reference count is already zero, or has been eagerly umounted, +// tryIncMountedRef does nothing and returns false. +// +// tryIncMountedRef does not require that a reference is held on mnt. +func (mnt *Mount) tryIncMountedRef() bool { + for { + refs := atomic.LoadInt64(&mnt.refs) + if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted + return false + } + if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) { + return true + } + } +} + +// IncRef increments mnt's reference count. +func (mnt *Mount) IncRef() { + // In general, negative values for mnt.refs are valid because the MSB is + // the eager-unmount bit. + atomic.AddInt64(&mnt.refs, 1) +} + +// DecRef decrements mnt's reference count. +func (mnt *Mount) DecRef() { + refs := atomic.AddInt64(&mnt.refs, -1) + if refs&^math.MinInt64 == 0 { // mask out MSB + var vd VirtualDentry + if mnt.parent() != nil { + mnt.vfs.mountMu.Lock() + mnt.vfs.mounts.seq.BeginWrite() + vd = mnt.vfs.disconnectLocked(mnt) + mnt.vfs.mounts.seq.EndWrite() + mnt.vfs.mountMu.Unlock() + } + mnt.root.DecRef() + mnt.fs.DecRef() + if vd.Ok() { + vd.DecRef() + } + } +} + +// IncRef increments mntns' reference count. +func (mntns *MountNamespace) IncRef() { + if atomic.AddInt64(&mntns.refs, 1) <= 1 { + panic("MountNamespace.IncRef() called without holding a reference") + } +} + +// DecRef decrements mntns' reference count. +func (mntns *MountNamespace) DecRef(vfs *VirtualFilesystem) { + if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 { + vfs.mountMu.Lock() + vfs.mounts.seq.BeginWrite() + vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{ + disconnectHierarchy: true, + }, nil, nil) + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + for _, vd := range vdsToDecRef { + vd.DecRef() + } + for _, mnt := range mountsToDecRef { + mnt.DecRef() + } + } else if refs < 0 { + panic("MountNamespace.DecRef() called without holding a reference") + } } // getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes @@ -223,7 +478,7 @@ retryFirst: // Raced with umount. continue } - mnt.decRef() + mnt.DecRef() mnt = next d = next.root } @@ -231,12 +486,12 @@ retryFirst: } // getMountpointAt returns the mount point for the stack of Mounts including -// mnt. It takes a reference on the returned Mount and Dentry. If no such mount +// mnt. It takes a reference on the returned VirtualDentry. If no such mount // point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil). // // Preconditions: References are held on mnt and root. vfsroot is not (mnt, // mnt.root). -func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) (*Mount, *Dentry) { +func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) VirtualDentry { // The first mount is special-cased: // // - The caller must have already checked mnt against vfsroot. @@ -246,21 +501,26 @@ func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) // - We don't drop the caller's reference on mnt. retryFirst: epoch := vfs.mounts.seq.BeginRead() - parent, point := mnt.loadKey() + parent, point := mnt.parent(), mnt.point() if !vfs.mounts.seq.ReadOk(epoch) { goto retryFirst } if parent == nil { - return nil, nil + return VirtualDentry{} } if !parent.tryIncMountedRef() { // Raced with umount. goto retryFirst } - if !point.tryIncRef(parent.fs) { + if !point.TryIncRef() { // Since Mount holds a reference on Mount.key.point, this can only // happen due to a racing change to Mount.key. - parent.decRef() + parent.DecRef() + goto retryFirst + } + if !vfs.mounts.seq.ReadOk(epoch) { + point.DecRef() + parent.DecRef() goto retryFirst } mnt = parent @@ -274,7 +534,7 @@ retryFirst: } retryNotFirst: epoch := vfs.mounts.seq.BeginRead() - parent, point := mnt.loadKey() + parent, point := mnt.parent(), mnt.point() if !vfs.mounts.seq.ReadOk(epoch) { goto retryNotFirst } @@ -285,59 +545,23 @@ retryFirst: // Raced with umount. goto retryNotFirst } - if !point.tryIncRef(parent.fs) { + if !point.TryIncRef() { // Since Mount holds a reference on Mount.key.point, this can // only happen due to a racing change to Mount.key. - parent.decRef() + parent.DecRef() goto retryNotFirst } if !vfs.mounts.seq.ReadOk(epoch) { - point.decRef(parent.fs) - parent.decRef() + point.DecRef() + parent.DecRef() goto retryNotFirst } - d.decRef(mnt.fs) - mnt.decRef() + d.DecRef() + mnt.DecRef() mnt = parent d = point } - return mnt, d -} - -// tryIncMountedRef increments mnt's reference count and returns true. If mnt's -// reference count is already zero, or has been eagerly unmounted, -// tryIncMountedRef does nothing and returns false. -// -// tryIncMountedRef does not require that a reference is held on mnt. -func (mnt *Mount) tryIncMountedRef() bool { - for { - refs := atomic.LoadInt64(&mnt.refs) - if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted - return false - } - if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) { - return true - } - } -} - -func (mnt *Mount) incRef() { - // In general, negative values for mnt.refs are valid because the MSB is - // the eager-unmount bit. - atomic.AddInt64(&mnt.refs, 1) -} - -func (mnt *Mount) decRef() { - refs := atomic.AddInt64(&mnt.refs, -1) - if refs&^math.MinInt64 == 0 { // mask out MSB - parent, point := mnt.loadKey() - if point != nil { - point.decRef(parent.fs) - parent.decRef() - } - mnt.root.decRef(mnt.fs) - mnt.fs.decRef() - } + return VirtualDentry{mnt, d} } // CheckBeginWrite increments the counter of in-progress write operations on @@ -360,7 +584,7 @@ func (mnt *Mount) EndWrite() { atomic.AddInt64(&mnt.writers, -1) } -// Preconditions: VirtualFilesystem.mountMu must be locked for writing. +// Preconditions: VirtualFilesystem.mountMu must be locked. func (mnt *Mount) setReadOnlyLocked(ro bool) error { if oldRO := atomic.LoadInt64(&mnt.writers) < 0; oldRO == ro { return nil @@ -383,22 +607,6 @@ func (mnt *Mount) Filesystem() *Filesystem { return mnt.fs } -// IncRef increments mntns' reference count. -func (mntns *MountNamespace) IncRef() { - if atomic.AddInt64(&mntns.refs, 1) <= 1 { - panic("MountNamespace.IncRef() called without holding a reference") - } -} - -// DecRef decrements mntns' reference count. -func (mntns *MountNamespace) DecRef() { - if refs := atomic.AddInt64(&mntns.refs, 0); refs == 0 { - // TODO: unmount mntns.root - } else if refs < 0 { - panic("MountNamespace.DecRef() called without holding a reference") - } -} - // Root returns mntns' root. A reference is taken on the returned // VirtualDentry. func (mntns *MountNamespace) Root() VirtualDentry { diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go index f394d7483..adff0b94b 100644 --- a/pkg/sentry/vfs/mount_test.go +++ b/pkg/sentry/vfs/mount_test.go @@ -37,7 +37,7 @@ func TestMountTableInsertLookup(t *testing.T) { mt.Init() mount := &Mount{} - mount.storeKey(&Mount{}, &Dentry{}) + mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}}) mt.Insert(mount) if m := mt.Lookup(mount.parent(), mount.point()); m != mount { @@ -78,18 +78,10 @@ const enableComparativeBenchmarks = false func newBenchMount() *Mount { mount := &Mount{} - mount.storeKey(&Mount{}, &Dentry{}) + mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}}) return mount } -func vdkey(mnt *Mount) VirtualDentry { - parent, point := mnt.loadKey() - return VirtualDentry{ - mount: parent, - dentry: point, - } -} - func BenchmarkMountTableParallelLookup(b *testing.B) { for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 { for _, numMounts := range benchNumMounts { @@ -101,7 +93,7 @@ func BenchmarkMountTableParallelLookup(b *testing.B) { for i := 0; i < numMounts; i++ { mount := newBenchMount() mt.Insert(mount) - keys = append(keys, vdkey(mount)) + keys = append(keys, mount.loadKey()) } var ready sync.WaitGroup @@ -153,7 +145,7 @@ func BenchmarkMountMapParallelLookup(b *testing.B) { keys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { mount := newBenchMount() - key := vdkey(mount) + key := mount.loadKey() ms[key] = mount keys = append(keys, key) } @@ -208,7 +200,7 @@ func BenchmarkMountSyncMapParallelLookup(b *testing.B) { keys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { mount := newBenchMount() - key := vdkey(mount) + key := mount.loadKey() ms.Store(key, mount) keys = append(keys, key) } @@ -290,7 +282,7 @@ func BenchmarkMountMapNegativeLookup(b *testing.B) { ms := make(map[VirtualDentry]*Mount) for i := 0; i < numMounts; i++ { mount := newBenchMount() - ms[vdkey(mount)] = mount + ms[mount.loadKey()] = mount } negkeys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { @@ -325,7 +317,7 @@ func BenchmarkMountSyncMapNegativeLookup(b *testing.B) { var ms sync.Map for i := 0; i < numMounts; i++ { mount := newBenchMount() - ms.Store(vdkey(mount), mount) + ms.Store(mount.loadKey(), mount) } negkeys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { @@ -379,7 +371,7 @@ func BenchmarkMountMapInsert(b *testing.B) { b.ResetTimer() for i := range mounts { mount := mounts[i] - ms[vdkey(mount)] = mount + ms[mount.loadKey()] = mount } } @@ -399,7 +391,7 @@ func BenchmarkMountSyncMapInsert(b *testing.B) { b.ResetTimer() for i := range mounts { mount := mounts[i] - ms.Store(vdkey(mount), mount) + ms.Store(mount.loadKey(), mount) } } @@ -432,13 +424,13 @@ func BenchmarkMountMapRemove(b *testing.B) { ms := make(map[VirtualDentry]*Mount) for i := range mounts { mount := mounts[i] - ms[vdkey(mount)] = mount + ms[mount.loadKey()] = mount } b.ResetTimer() for i := range mounts { mount := mounts[i] - delete(ms, vdkey(mount)) + delete(ms, mount.loadKey()) } } @@ -454,12 +446,12 @@ func BenchmarkMountSyncMapRemove(b *testing.B) { var ms sync.Map for i := range mounts { mount := mounts[i] - ms.Store(vdkey(mount), mount) + ms.Store(mount.loadKey(), mount) } b.ResetTimer() for i := range mounts { mount := mounts[i] - ms.Delete(vdkey(mount)) + ms.Delete(mount.loadKey()) } } diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index b0511aa40..ab13fa461 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.14 +// +build !go1.15 // Check go:linkname function signatures when updating Go version. @@ -26,7 +26,7 @@ import ( "sync/atomic" "unsafe" - "gvisor.dev/gvisor/third_party/gvsync" + "gvisor.dev/gvisor/pkg/syncutil" ) // mountKey represents the location at which a Mount is mounted. It is @@ -38,16 +38,6 @@ type mountKey struct { point unsafe.Pointer // *Dentry } -// Invariant: mnt.key's fields are nil. parent and point are non-nil. -func (mnt *Mount) storeKey(parent *Mount, point *Dentry) { - atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(parent)) - atomic.StorePointer(&mnt.key.point, unsafe.Pointer(point)) -} - -func (mnt *Mount) loadKey() (*Mount, *Dentry) { - return (*Mount)(atomic.LoadPointer(&mnt.key.parent)), (*Dentry)(atomic.LoadPointer(&mnt.key.point)) -} - func (mnt *Mount) parent() *Mount { return (*Mount)(atomic.LoadPointer(&mnt.key.parent)) } @@ -56,6 +46,19 @@ func (mnt *Mount) point() *Dentry { return (*Dentry)(atomic.LoadPointer(&mnt.key.point)) } +func (mnt *Mount) loadKey() VirtualDentry { + return VirtualDentry{ + mount: mnt.parent(), + dentry: mnt.point(), + } +} + +// Invariant: mnt.key.parent == nil. vd.Ok(). +func (mnt *Mount) storeKey(vd VirtualDentry) { + atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount)) + atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry)) +} + // mountTable maps (mount parent, mount point) pairs to mounts. It supports // efficient concurrent lookup, even in the presence of concurrent mutators // (provided mutation is sufficiently uncommon). @@ -72,7 +75,7 @@ type mountTable struct { // intrinsics and inline assembly, limiting the performance of this // approach.) - seq gvsync.SeqCount + seq syncutil.SeqCount seed uint32 // for hashing keys // size holds both length (number of elements) and capacity (number of @@ -201,9 +204,19 @@ loop: // Insert inserts the given mount into mt. // -// Preconditions: There are no concurrent mutators of mt. mt must not already -// contain a Mount with the same mount point and parent. +// Preconditions: mt must not already contain a Mount with the same mount point +// and parent. func (mt *mountTable) Insert(mount *Mount) { + mt.seq.BeginWrite() + mt.insertSeqed(mount) + mt.seq.EndWrite() +} + +// insertSeqed inserts the given mount into mt. +// +// Preconditions: mt.seq must be in a writer critical section. mt must not +// already contain a Mount with the same mount point and parent. +func (mt *mountTable) insertSeqed(mount *Mount) { hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) // We're under the maximum load factor if: @@ -215,10 +228,8 @@ func (mt *mountTable) Insert(mount *Mount) { tcap := uintptr(1) << order if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) { // Atomically insert the new element into the table. - mt.seq.BeginWrite() atomic.AddUint64(&mt.size, mtSizeLenOne) mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash) - mt.seq.EndWrite() return } @@ -241,8 +252,6 @@ func (mt *mountTable) Insert(mount *Mount) { for { oldSlot := (*mountSlot)(oldCur) if oldSlot.value != nil { - // Don't need to lock mt.seq yet since newSlots isn't visible - // to readers. mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash) } if oldCur == oldLast { @@ -252,11 +261,9 @@ func (mt *mountTable) Insert(mount *Mount) { } // Insert the new element into the new table. mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash) - // Atomically switch to the new table. - mt.seq.BeginWrite() + // Switch to the new table. atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne) atomic.StorePointer(&mt.slots, newSlots) - mt.seq.EndWrite() } // Preconditions: There are no concurrent mutators of the table (slots, cap). @@ -294,9 +301,18 @@ func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, has // Remove removes the given mount from mt. // -// Preconditions: There are no concurrent mutators of mt. mt must contain -// mount. +// Preconditions: mt must contain mount. func (mt *mountTable) Remove(mount *Mount) { + mt.seq.BeginWrite() + mt.removeSeqed(mount) + mt.seq.EndWrite() +} + +// removeSeqed removes the given mount from mt. +// +// Preconditions: mt.seq must be in a writer critical section. mt must contain +// mount. +func (mt *mountTable) removeSeqed(mount *Mount) { hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) tcap := uintptr(1) << (mt.size & mtSizeOrderMask) mask := tcap - 1 @@ -311,7 +327,6 @@ func (mt *mountTable) Remove(mount *Mount) { // backward until we either find an empty slot, or an element that // is already in its first-probed slot. (This is backward shift // deletion.) - mt.seq.BeginWrite() for { nextOff := (off + mountSlotBytes) & offmask nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff)) @@ -330,7 +345,6 @@ func (mt *mountTable) Remove(mount *Mount) { } atomic.StorePointer(&slot.value, nil) atomic.AddUint64(&mt.size, mtSizeLenNegOne) - mt.seq.EndWrite() return } if checkInvariants && slotValue == nil { diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index 187e5410c..97ee4a446 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -31,14 +31,14 @@ type GetDentryOptions struct { // FilesystemImpl.MkdirAt(). type MkdirOptions struct { // Mode is the file mode bits for the created directory. - Mode uint16 + Mode linux.FileMode } // MknodOptions contains options to VirtualFilesystem.MknodAt() and // FilesystemImpl.MknodAt(). type MknodOptions struct { // Mode is the file type and mode bits for the created file. - Mode uint16 + Mode linux.FileMode // If Mode specifies a character or block device special file, DevMajor and // DevMinor are the major and minor device numbers for the created device. @@ -46,6 +46,12 @@ type MknodOptions struct { DevMinor uint32 } +// MountOptions contains options to VirtualFilesystem.MountAt(). +type MountOptions struct { + // GetFilesystemOptions contains options to FilesystemType.GetFilesystem(). + GetFilesystemOptions GetFilesystemOptions +} + // OpenOptions contains options to VirtualFilesystem.OpenAt() and // FilesystemImpl.OpenAt(). type OpenOptions struct { @@ -61,7 +67,7 @@ type OpenOptions struct { // If FilesystemImpl.OpenAt() creates a file, Mode is the file mode for the // created file. - Mode uint16 + Mode linux.FileMode } // ReadOptions contains options to FileDescription.PRead(), @@ -95,6 +101,20 @@ type SetStatOptions struct { Stat linux.Statx } +// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(), +// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and +// FileDescriptionImpl.Setxattr(). +type SetxattrOptions struct { + // Name is the name of the extended attribute being mutated. + Name string + + // Value is the extended attribute's new value. + Value string + + // Flags contains flags as specified for setxattr/lsetxattr/fsetxattr(2). + Flags uint32 +} + // StatOptions contains options to VirtualFilesystem.StatAt(), // FilesystemImpl.StatAt(), FileDescription.Stat(), and // FileDescriptionImpl.Stat(). @@ -114,6 +134,12 @@ type StatOptions struct { Sync uint32 } +// UmountOptions contains options to VirtualFilesystem.UmountAt(). +type UmountOptions struct { + // Flags contains flags as specified for umount2(2). + Flags uint32 +} + // WriteOptions contains options to FileDescription.PWrite(), // FileDescriptionImpl.PWrite(), FileDescription.Write(), and // FileDescriptionImpl.Write(). diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go new file mode 100644 index 000000000..8e155654f --- /dev/null +++ b/pkg/sentry/vfs/pathname.go @@ -0,0 +1,153 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "sync" + + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/syserror" +) + +var fspathBuilderPool = sync.Pool{ + New: func() interface{} { + return &fspath.Builder{} + }, +} + +func getFSPathBuilder() *fspath.Builder { + return fspathBuilderPool.Get().(*fspath.Builder) +} + +func putFSPathBuilder(b *fspath.Builder) { + // No methods can be called on b after b.String(), so reset it to its zero + // value (as returned by fspathBuilderPool.New) instead. + *b = fspath.Builder{} + fspathBuilderPool.Put(b) +} + +// PathnameWithDeleted returns an absolute pathname to vd, consistent with +// Linux's d_path(). In particular, if vd.Dentry() has been disowned, +// PathnameWithDeleted appends " (deleted)" to the returned pathname. +func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) { + b := getFSPathBuilder() + defer putFSPathBuilder(b) + haveRef := false + defer func() { + if haveRef { + vd.DecRef() + } + }() + + origD := vd.dentry +loop: + for { + err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b) + switch err.(type) { + case nil: + if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { + // GenericPrependPath() will have returned + // PrependPathAtVFSRootError in this case since it checks + // against vfsroot before mnt.root, but other implementations + // of FilesystemImpl.PrependPath() may return nil instead. + break loop + } + nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + if !nextVD.Ok() { + break loop + } + if haveRef { + vd.DecRef() + } + vd = nextVD + haveRef = true + // continue loop + case PrependPathSyntheticError: + // Skip prepending "/" and appending " (deleted)". + return b.String(), nil + case PrependPathAtVFSRootError, PrependPathAtNonMountRootError: + break loop + default: + return "", err + } + } + b.PrependByte('/') + if origD.IsDisowned() { + b.AppendString(" (deleted)") + } + return b.String(), nil +} + +// PathnameForGetcwd returns an absolute pathname to vd, consistent with +// Linux's sys_getcwd(). +func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) { + if vd.dentry.IsDisowned() { + return "", syserror.ENOENT + } + + b := getFSPathBuilder() + defer putFSPathBuilder(b) + haveRef := false + defer func() { + if haveRef { + vd.DecRef() + } + }() + unreachable := false +loop: + for { + err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b) + switch err.(type) { + case nil: + if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { + break loop + } + nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + if !nextVD.Ok() { + unreachable = true + break loop + } + if haveRef { + vd.DecRef() + } + vd = nextVD + haveRef = true + case PrependPathAtVFSRootError: + break loop + case PrependPathAtNonMountRootError, PrependPathSyntheticError: + unreachable = true + break loop + default: + return "", err + } + } + b.PrependByte('/') + if unreachable { + b.PrependString("(unreachable)") + } + return b.String(), nil +} + +// As of this writing, we do not have equivalents to: +// +// - d_absolute_path(), which returns EINVAL if (effectively) any call to +// FilesystemImpl.PrependPath() would return PrependPathAtNonMountRootError. +// +// - dentry_path(), which does not walk up mounts (and only returns the path +// relative to Filesystem root), but also appends "//deleted" for disowned +// Dentries. +// +// These should be added as necessary. diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index f8e74355c..f1edb0680 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -119,3 +119,65 @@ func MayWriteFileWithOpenFlags(flags uint32) bool { return false } } + +// CheckSetStat checks that creds has permission to change the metadata of a +// file with the given permissions, UID, and GID as specified by stat, subject +// to the rules of Linux's fs/attr.c:setattr_prepare(). +func CheckSetStat(creds *auth.Credentials, stat *linux.Statx, mode uint16, kuid auth.KUID, kgid auth.KGID) error { + if stat.Mask&linux.STATX_MODE != 0 { + if !CanActAsOwner(creds, kuid) { + return syserror.EPERM + } + // TODO(b/30815691): "If the calling process is not privileged (Linux: + // does not have the CAP_FSETID capability), and the group of the file + // does not match the effective group ID of the process or one of its + // supplementary group IDs, the S_ISGID bit will be turned off, but + // this will not cause an error to be returned." - chmod(2) + } + if stat.Mask&linux.STATX_UID != 0 { + if !((creds.EffectiveKUID == kuid && auth.KUID(stat.UID) == kuid) || + HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) { + return syserror.EPERM + } + } + if stat.Mask&linux.STATX_GID != 0 { + if !((creds.EffectiveKUID == kuid && creds.InGroup(auth.KGID(stat.GID))) || + HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) { + return syserror.EPERM + } + } + if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME) != 0 { + if !CanActAsOwner(creds, kuid) { + if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) || + (stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW) || + (stat.Mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) { + return syserror.EPERM + } + // isDir is irrelevant in the following call to + // GenericCheckPermissions since ats == MayWrite means that + // CAP_DAC_READ_SEARCH does not apply, and CAP_DAC_OVERRIDE + // applies, regardless of isDir. + if err := GenericCheckPermissions(creds, MayWrite, false /* isDir */, mode, kuid, kgid); err != nil { + return err + } + } + } + return nil +} + +// CanActAsOwner returns true if creds can act as the owner of a file with the +// given owning UID, consistent with Linux's +// fs/inode.c:inode_owner_or_capable(). +func CanActAsOwner(creds *auth.Credentials, kuid auth.KUID) bool { + if creds.EffectiveKUID == kuid { + return true + } + return creds.HasCapability(linux.CAP_FOWNER) && creds.UserNamespace.MapFromKUID(kuid).Ok() +} + +// HasCapabilityOnFile returns true if creds has the given capability with +// respect to a file with the given owning UID and GID, consistent with Linux's +// kernel/capability.c:capable_wrt_inode_uidgid(). +func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth.KUID, kgid auth.KGID) bool { + return creds.HasCapability(cp) && creds.UserNamespace.MapFromKUID(kuid).Ok() && creds.UserNamespace.MapFromKGID(kgid).Ok() +} diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 8d05c8583..621f5a6f8 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -149,20 +149,20 @@ func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) { func (rp *ResolvingPath) decRefStartAndMount() { if rp.flags&rpflagsHaveStartRef != 0 { - rp.start.decRef(rp.mount.fs) + rp.start.DecRef() } if rp.flags&rpflagsHaveMountRef != 0 { - rp.mount.decRef() + rp.mount.DecRef() } } func (rp *ResolvingPath) releaseErrorState() { if rp.nextStart != nil { - rp.nextStart.decRef(rp.nextMount.fs) + rp.nextStart.DecRef() rp.nextStart = nil } if rp.nextMount != nil { - rp.nextMount.decRef() + rp.nextMount.DecRef() rp.nextMount = nil } } @@ -269,11 +269,11 @@ func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) { parent = d } else if d == rp.mount.root { // At mount root ... - mnt, mntpt := rp.vfs.getMountpointAt(rp.mount, rp.root) - if mnt != nil { + vd := rp.vfs.getMountpointAt(rp.mount, rp.root) + if vd.Ok() { // ... of non-root mount. - rp.nextMount = mnt - rp.nextStart = mntpt + rp.nextMount = vd.mount + rp.nextStart = vd.dentry return nil, resolveMountRootError{} } // ... of root mount. diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go deleted file mode 100644 index 23f2b9e08..000000000 --- a/pkg/sentry/vfs/syscalls.go +++ /dev/null @@ -1,217 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" -) - -// PathOperation specifies the path operated on by a VFS method. -// -// PathOperation is passed to VFS methods by pointer to reduce memory copying: -// it's somewhat large and should never escape. (Options structs are passed by -// pointer to VFS and FileDescription methods for the same reason.) -type PathOperation struct { - // Root is the VFS root. References on Root are borrowed from the provider - // of the PathOperation. - // - // Invariants: Root.Ok(). - Root VirtualDentry - - // Start is the starting point for the path traversal. References on Start - // are borrowed from the provider of the PathOperation (i.e. the caller of - // the VFS method to which the PathOperation was passed). - // - // Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root. - Start VirtualDentry - - // Path is the pathname traversed by this operation. - Pathname string - - // If FollowFinalSymlink is true, and the Dentry traversed by the final - // path component represents a symbolic link, the symbolic link should be - // followed. - FollowFinalSymlink bool -} - -// GetDentryAt returns a VirtualDentry representing the given path, at which a -// file must exist. A reference is taken on the returned VirtualDentry. -func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return VirtualDentry{}, err - } - for { - d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts) - if err == nil { - vd := VirtualDentry{ - mount: rp.mount, - dentry: d, - } - rp.mount.incRef() - vfs.putResolvingPath(rp) - return vd, nil - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return VirtualDentry{}, err - } - } -} - -// MkdirAt creates a directory at the given path. -func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { - // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is - // also honored." - mkdir(2) - opts.Mode &= 01777 - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return err - } - for { - err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) - if err == nil { - vfs.putResolvingPath(rp) - return nil - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return err - } - } -} - -// OpenAt returns a FileDescription providing access to the file at the given -// path. A reference is taken on the returned FileDescription. -func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { - // Remove: - // - // - O_LARGEFILE, which we always report in FileDescription status flags - // since only 64-bit architectures are supported at this time. - // - // - O_CLOEXEC, which affects file descriptors and therefore must be - // handled outside of VFS. - // - // - Unknown flags. - opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE - // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC. - if opts.Flags&linux.O_SYNC != 0 { - opts.Flags |= linux.O_DSYNC - } - // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified - // with O_DIRECTORY and a writable access mode (to ensure that it fails on - // filesystem implementations that do not support it). - if opts.Flags&linux.O_TMPFILE != 0 { - if opts.Flags&linux.O_DIRECTORY == 0 { - return nil, syserror.EINVAL - } - if opts.Flags&linux.O_CREAT != 0 { - return nil, syserror.EINVAL - } - if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { - return nil, syserror.EINVAL - } - } - // O_PATH causes most other flags to be ignored. - if opts.Flags&linux.O_PATH != 0 { - opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH - } - // "On Linux, the following bits are also honored in mode: [S_ISUID, - // S_ISGID, S_ISVTX]" - open(2) - opts.Mode &= 07777 - - if opts.Flags&linux.O_NOFOLLOW != 0 { - pop.FollowFinalSymlink = false - } - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return nil, err - } - if opts.Flags&linux.O_DIRECTORY != 0 { - rp.mustBeDir = true - rp.mustBeDirOrig = true - } - for { - fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) - if err == nil { - vfs.putResolvingPath(rp) - return fd, nil - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return nil, err - } - } -} - -// StatAt returns metadata for the file at the given path. -func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return linux.Statx{}, err - } - for { - stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) - if err == nil { - vfs.putResolvingPath(rp) - return stat, nil - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return linux.Statx{}, err - } - } -} - -// StatusFlags returns file description status flags. -func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) { - flags, err := fd.impl.StatusFlags(ctx) - flags |= linux.O_LARGEFILE - return flags, err -} - -// SetStatusFlags sets file description status flags. -func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { - return fd.impl.SetStatusFlags(ctx, flags) -} - -// TODO: -// -// - VFS.SyncAllFilesystems() for sync(2) -// -// - Something for syncfs(2) -// -// - VFS.LinkAt() -// -// - VFS.MknodAt() -// -// - VFS.ReadlinkAt() -// -// - VFS.RenameAt() -// -// - VFS.RmdirAt() -// -// - VFS.SetStatAt() -// -// - VFS.StatFSAt() -// -// - VFS.SymlinkAt() -// -// - VFS.UnlinkAt() -// -// - FileDescription.(almost everything) diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go index 70b192ece..d94117bce 100644 --- a/pkg/sentry/vfs/testutil.go +++ b/pkg/sentry/vfs/testutil.go @@ -15,7 +15,10 @@ package vfs import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" @@ -33,10 +36,10 @@ type FDTestFilesystem struct { vfsfs Filesystem } -// NewFilesystem implements FilesystemType.NewFilesystem. -func (fstype FDTestFilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error) { +// GetFilesystem implements FilesystemType.GetFilesystem. +func (fstype FDTestFilesystemType) GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error) { var fs FDTestFilesystem - fs.vfsfs.Init(&fs) + fs.vfsfs.Init(vfsObj, &fs) return &fs.vfsfs, fs.NewDentry(), nil } @@ -114,6 +117,32 @@ func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) err return syserror.EPERM } +// ListxattrAt implements FilesystemImpl.ListxattrAt. +func (fs *FDTestFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) { + return nil, syserror.EPERM +} + +// GetxattrAt implements FilesystemImpl.GetxattrAt. +func (fs *FDTestFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) { + return "", syserror.EPERM +} + +// SetxattrAt implements FilesystemImpl.SetxattrAt. +func (fs *FDTestFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error { + return syserror.EPERM +} + +// RemovexattrAt implements FilesystemImpl.RemovexattrAt. +func (fs *FDTestFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error { + return syserror.EPERM +} + +// PrependPath implements FilesystemImpl.PrependPath. +func (fs *FDTestFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error { + b.PrependComponent(fmt.Sprintf("vfs.fdTestDentry:%p", vd.dentry.impl.(*fdTestDentry))) + return PrependPathSyntheticError{} +} + type fdTestDentry struct { vfsd Dentry } @@ -126,14 +155,14 @@ func (fs *FDTestFilesystem) NewDentry() *Dentry { } // IncRef implements DentryImpl.IncRef. -func (d *fdTestDentry) IncRef(vfsfs *Filesystem) { +func (d *fdTestDentry) IncRef() { } // TryIncRef implements DentryImpl.TryIncRef. -func (d *fdTestDentry) TryIncRef(vfsfs *Filesystem) bool { +func (d *fdTestDentry) TryIncRef() bool { return true } // DecRef implements DentryImpl.DecRef. -func (d *fdTestDentry) DecRef(vfsfs *Filesystem) { +func (d *fdTestDentry) DecRef() { } diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 4a8a69540..e60898d7c 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -16,13 +16,24 @@ // // Lock order: // -// Filesystem implementation locks +// FilesystemImpl/FileDescriptionImpl locks // VirtualFilesystem.mountMu +// Dentry.mu +// Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry +// VirtualFilesystem.filesystemsMu // VirtualFilesystem.fsTypesMu +// +// Locking Dentry.mu in multiple Dentries requires holding +// VirtualFilesystem.mountMu. package vfs import ( "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" ) // A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts. @@ -33,7 +44,7 @@ type VirtualFilesystem struct { // mountMu serializes mount mutations. // // mountMu is analogous to Linux's namespace_sem. - mountMu sync.RWMutex + mountMu sync.Mutex // mounts maps (mount parent, mount point) pairs to mounts. (Since mounts // are uniquely namespaced, including mount parent in the key correctly @@ -52,7 +63,7 @@ type VirtualFilesystem struct { // mountpoints maps mount points to mounts at those points in all // namespaces. mountpoints is protected by mountMu. // - // mountpoints is used to find mounts that must be unmounted due to + // mountpoints is used to find mounts that must be umounted due to // removal of a mount point Dentry from another mount namespace. ("A file // or directory that is a mount point in one namespace that is not a mount // point in another namespace, may be renamed, unlinked, or removed @@ -62,6 +73,11 @@ type VirtualFilesystem struct { // mountpoints is analogous to Linux's mountpoint_hashtable. mountpoints map[*Dentry]map[*Mount]struct{} + // filesystems contains all Filesystems. filesystems is protected by + // filesystemsMu. + filesystemsMu sync.Mutex + filesystems map[*Filesystem]struct{} + // fsTypes contains all FilesystemTypes that are usable in the // VirtualFilesystem. fsTypes is protected by fsTypesMu. fsTypesMu sync.RWMutex @@ -72,12 +88,466 @@ type VirtualFilesystem struct { func New() *VirtualFilesystem { vfs := &VirtualFilesystem{ mountpoints: make(map[*Dentry]map[*Mount]struct{}), + filesystems: make(map[*Filesystem]struct{}), fsTypes: make(map[string]FilesystemType), } vfs.mounts.Init() return vfs } +// PathOperation specifies the path operated on by a VFS method. +// +// PathOperation is passed to VFS methods by pointer to reduce memory copying: +// it's somewhat large and should never escape. (Options structs are passed by +// pointer to VFS and FileDescription methods for the same reason.) +type PathOperation struct { + // Root is the VFS root. References on Root are borrowed from the provider + // of the PathOperation. + // + // Invariants: Root.Ok(). + Root VirtualDentry + + // Start is the starting point for the path traversal. References on Start + // are borrowed from the provider of the PathOperation (i.e. the caller of + // the VFS method to which the PathOperation was passed). + // + // Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root. + Start VirtualDentry + + // Path is the pathname traversed by this operation. + Pathname string + + // If FollowFinalSymlink is true, and the Dentry traversed by the final + // path component represents a symbolic link, the symbolic link should be + // followed. + FollowFinalSymlink bool +} + +// GetDentryAt returns a VirtualDentry representing the given path, at which a +// file must exist. A reference is taken on the returned VirtualDentry. +func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return VirtualDentry{}, err + } + for { + d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts) + if err == nil { + vd := VirtualDentry{ + mount: rp.mount, + dentry: d, + } + rp.mount.IncRef() + vfs.putResolvingPath(rp) + return vd, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return VirtualDentry{}, err + } + } +} + +// LinkAt creates a hard link at newpop representing the existing file at +// oldpop. +func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error { + oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{}) + if err != nil { + return err + } + rp, err := vfs.getResolvingPath(creds, newpop) + if err != nil { + oldVD.DecRef() + return err + } + for { + err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD) + if err == nil { + oldVD.DecRef() + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + oldVD.DecRef() + vfs.putResolvingPath(rp) + return err + } + } +} + +// MkdirAt creates a directory at the given path. +func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { + // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is + // also honored." - mkdir(2) + opts.Mode &= 0777 | linux.S_ISVTX + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return err + } + for { + err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// MknodAt creates a file of the given mode at the given path. It returns an +// error from the syserror package. +func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return nil + } + for { + if err = rp.mount.fs.impl.MknodAt(ctx, rp, *opts); err == nil { + vfs.putResolvingPath(rp) + return nil + } + // Handle mount traversals. + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// OpenAt returns a FileDescription providing access to the file at the given +// path. A reference is taken on the returned FileDescription. +func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { + // Remove: + // + // - O_LARGEFILE, which we always report in FileDescription status flags + // since only 64-bit architectures are supported at this time. + // + // - O_CLOEXEC, which affects file descriptors and therefore must be + // handled outside of VFS. + // + // - Unknown flags. + opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE + // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC. + if opts.Flags&linux.O_SYNC != 0 { + opts.Flags |= linux.O_DSYNC + } + // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified + // with O_DIRECTORY and a writable access mode (to ensure that it fails on + // filesystem implementations that do not support it). + if opts.Flags&linux.O_TMPFILE != 0 { + if opts.Flags&linux.O_DIRECTORY == 0 { + return nil, syserror.EINVAL + } + if opts.Flags&linux.O_CREAT != 0 { + return nil, syserror.EINVAL + } + if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { + return nil, syserror.EINVAL + } + } + // O_PATH causes most other flags to be ignored. + if opts.Flags&linux.O_PATH != 0 { + opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH + } + // "On Linux, the following bits are also honored in mode: [S_ISUID, + // S_ISGID, S_ISVTX]" - open(2) + opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX + + if opts.Flags&linux.O_NOFOLLOW != 0 { + pop.FollowFinalSymlink = false + } + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return nil, err + } + if opts.Flags&linux.O_DIRECTORY != 0 { + rp.mustBeDir = true + rp.mustBeDirOrig = true + } + for { + fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return fd, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return nil, err + } + } +} + +// ReadlinkAt returns the target of the symbolic link at the given path. +func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return "", err + } + for { + target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return target, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return "", err + } + } +} + +// RenameAt renames the file at oldpop to newpop. +func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error { + oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{}) + if err != nil { + return err + } + rp, err := vfs.getResolvingPath(creds, newpop) + if err != nil { + oldVD.DecRef() + return err + } + for { + err := rp.mount.fs.impl.RenameAt(ctx, rp, oldVD, *opts) + if err == nil { + oldVD.DecRef() + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + oldVD.DecRef() + vfs.putResolvingPath(rp) + return err + } + } +} + +// RmdirAt removes the directory at the given path. +func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return err + } + for { + err := rp.mount.fs.impl.RmdirAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// SetStatAt changes metadata for the file at the given path. +func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return err + } + for { + err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// StatAt returns metadata for the file at the given path. +func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return linux.Statx{}, err + } + for { + stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return stat, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return linux.Statx{}, err + } + } +} + +// StatFSAt returns metadata for the filesystem containing the file at the +// given path. +func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return linux.Statfs{}, err + } + for { + statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return statfs, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return linux.Statfs{}, err + } + } +} + +// SymlinkAt creates a symbolic link at the given path with the given target. +func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return err + } + for { + err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// UnlinkAt deletes the non-directory file at the given path. +func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return err + } + for { + err := rp.mount.fs.impl.UnlinkAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// ListxattrAt returns all extended attribute names for the file at the given +// path. +func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) ([]string, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return nil, err + } + for { + names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return names, nil + } + if err == syserror.ENOTSUP { + // Linux doesn't actually return ENOTSUP in this case; instead, + // fs/xattr.c:vfs_listxattr() falls back to allowing the security + // subsystem to return security extended attributes, which by + // default don't exist. + vfs.putResolvingPath(rp) + return nil, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return nil, err + } + } +} + +// GetxattrAt returns the value associated with the given extended attribute +// for the file at the given path. +func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) (string, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return "", err + } + for { + val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, name) + if err == nil { + vfs.putResolvingPath(rp) + return val, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return "", err + } + } +} + +// SetxattrAt changes the value associated with the given extended attribute +// for the file at the given path. +func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return err + } + for { + err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// RemovexattrAt removes the given extended attribute from the file at rp. +func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return err + } + for { + err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// SyncAllFilesystems has the semantics of Linux's sync(2). +func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { + fss := make(map[*Filesystem]struct{}) + vfs.filesystemsMu.Lock() + for fs := range vfs.filesystems { + if !fs.TryIncRef() { + continue + } + fss[fs] = struct{}{} + } + vfs.filesystemsMu.Unlock() + var retErr error + for fs := range fss { + if err := fs.impl.Sync(ctx); err != nil && retErr == nil { + retErr = err + } + fs.DecRef() + } + return retErr +} + // A VirtualDentry represents a node in a VFS tree, by combining a Dentry // (which represents a node in a Filesystem's tree) and a Mount (which // represents the Filesystem's position in a VFS mount tree). @@ -111,15 +581,15 @@ func (vd VirtualDentry) Ok() bool { // IncRef increments the reference counts on the Mount and Dentry represented // by vd. func (vd VirtualDentry) IncRef() { - vd.mount.incRef() - vd.dentry.incRef(vd.mount.fs) + vd.mount.IncRef() + vd.dentry.IncRef() } // DecRef decrements the reference counts on the Mount and Dentry represented // by vd. func (vd VirtualDentry) DecRef() { - vd.dentry.decRef(vd.mount.fs) - vd.mount.decRef() + vd.dentry.DecRef() + vd.mount.DecRef() } // Mount returns the Mount associated with vd. It does not take a reference on diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index 145102c0d..5e4611333 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -42,8 +42,35 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" ) -// DefaultTimeout is a resonable timeout value for most applications. -const DefaultTimeout = 3 * time.Minute +// Opts configures the watchdog. +type Opts struct { + // TaskTimeout is the amount of time to allow a task to execute the + // same syscall without blocking before it's declared stuck. + TaskTimeout time.Duration + + // TaskTimeoutAction indicates what action to take when a stuck tasks + // is detected. + TaskTimeoutAction Action + + // StartupTimeout is the amount of time to allow between watchdog + // creation and calling watchdog.Start. + StartupTimeout time.Duration + + // StartupTimeoutAction indicates what action to take when + // watchdog.Start is not called within the timeout. + StartupTimeoutAction Action +} + +// DefaultOpts is a default set of options for the watchdog. +var DefaultOpts = Opts{ + // Task timeout. + TaskTimeout: 3 * time.Minute, + TaskTimeoutAction: LogWarning, + + // Startup timeout. + StartupTimeout: 30 * time.Second, + StartupTimeoutAction: LogWarning, +} // descheduleThreshold is the amount of time scheduling needs to be off before the entire wait period // is discounted from task's last update time. It's set high enough that small scheduling delays won't @@ -61,6 +88,7 @@ type Action int const ( // LogWarning logs warning message followed by stack trace. LogWarning Action = iota + // Panic will do the same logging as LogWarning and panic(). Panic ) @@ -80,17 +108,13 @@ func (a Action) String() string { // Watchdog is the main watchdog class. It controls a goroutine that periodically // analyses all tasks and reports if any of them appear to be stuck. type Watchdog struct { + // Configuration options are embedded. + Opts + // period indicates how often to check all tasks. It's calculated based on - // 'taskTimeout'. + // opts.TaskTimeout. period time.Duration - // taskTimeout is the amount of time to allow a task to execute the same syscall - // without blocking before it's declared stuck. - taskTimeout time.Duration - - // timeoutAction indicates what action to take when a stuck tasks is detected. - timeoutAction Action - // k is where the tasks come from. k *kernel.Kernel @@ -113,8 +137,12 @@ type Watchdog struct { // mu protects the fields below. mu sync.Mutex - // started is true if the watchdog has been started before. - started bool + // running is true if the watchdog is running. + running bool + + // startCalled is true if Start has ever been called. It remains true + // even if Stop is called. + startCalled bool } type offender struct { @@ -122,58 +150,81 @@ type offender struct { } // New creates a new watchdog. -func New(k *kernel.Kernel, taskTimeout time.Duration, a Action) *Watchdog { - // 4 is arbitrary, just don't want to prolong 'taskTimeout' too much. - period := taskTimeout / 4 - return &Watchdog{ - k: k, - period: period, - taskTimeout: taskTimeout, - timeoutAction: a, - offenders: make(map[*kernel.Task]*offender), - stop: make(chan struct{}), - done: make(chan struct{}), +func New(k *kernel.Kernel, opts Opts) *Watchdog { + // 4 is arbitrary, just don't want to prolong 'TaskTimeout' too much. + period := opts.TaskTimeout / 4 + w := &Watchdog{ + Opts: opts, + k: k, + period: period, + offenders: make(map[*kernel.Task]*offender), + stop: make(chan struct{}), + done: make(chan struct{}), + } + + // Handle StartupTimeout if it exists. + if w.StartupTimeout > 0 { + log.Infof("Watchdog waiting %v for startup", w.StartupTimeout) + go w.waitForStart() // S/R-SAFE: watchdog is stopped buring save and restarted after restore. } + + return w } // Start starts the watchdog. func (w *Watchdog) Start() { - if w.taskTimeout == 0 { - log.Infof("Watchdog disabled") - return - } - w.mu.Lock() defer w.mu.Unlock() - if w.started { + w.startCalled = true + + if w.running { return } + if w.TaskTimeout == 0 { + log.Infof("Watchdog task timeout disabled") + return + } w.lastRun = w.k.MonotonicClock().Now() - log.Infof("Starting watchdog, period: %v, timeout: %v, action: %v", w.period, w.taskTimeout, w.timeoutAction) + log.Infof("Starting watchdog, period: %v, timeout: %v, action: %v", w.period, w.TaskTimeout, w.TaskTimeoutAction) go w.loop() // S/R-SAFE: watchdog is stopped during save and restarted after restore. - w.started = true + w.running = true } // Stop requests the watchdog to stop and wait for it. func (w *Watchdog) Stop() { - if w.taskTimeout == 0 { + if w.TaskTimeout == 0 { return } w.mu.Lock() defer w.mu.Unlock() - if !w.started { + if !w.running { return } log.Infof("Stopping watchdog") w.stop <- struct{}{} <-w.done - w.started = false + w.running = false log.Infof("Watchdog stopped") } +// waitForStart waits for Start to be called and takes action if it does not +// happen within the startup timeout. +func (w *Watchdog) waitForStart() { + <-time.After(w.StartupTimeout) + w.mu.Lock() + defer w.mu.Unlock() + if w.startCalled { + // We are fine. + return + } + var buf bytes.Buffer + buf.WriteString("Watchdog.Start() not called within %s:\n") + w.doAction(w.StartupTimeoutAction, false, &buf) +} + // loop is the main watchdog routine. It only returns when 'Stop()' is called. func (w *Watchdog) loop() { // Loop until someone stops it. @@ -202,7 +253,7 @@ func (w *Watchdog) runTurn() { select { case <-done: - case <-time.After(w.taskTimeout): + case <-time.After(w.TaskTimeout): // Report if the watchdog is not making progress. // No one is wathching the watchdog watcher though. w.reportStuckWatchdog() @@ -231,12 +282,14 @@ func (w *Watchdog) runTurn() { if tsched.State == kernel.TaskGoroutineRunningSys { lastUpdateTime := ktime.FromNanoseconds(int64(tsched.Timestamp * uint64(linux.ClockTick))) elapsed := now.Sub(lastUpdateTime) - discount - if elapsed > w.taskTimeout { + if elapsed > w.TaskTimeout { tc, ok := w.offenders[t] if !ok { // New stuck task detected. // - // TODO(b/65849403): Tasks blocked doing IO may be considered stuck in kernel. + // Note that tasks blocked doing IO may be considered stuck in kernel, + // unless they are surrounded b + // Task.UninterruptibleSleepStart/Finish. tc = &offender{lastUpdateTime: lastUpdateTime} stuckTasks.Increment() newTaskFound = true @@ -261,28 +314,34 @@ func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound boo tid := w.k.TaskSet().Root.IDOfTask(t) buf.WriteString(fmt.Sprintf("\tTask tid: %v (%#x), entered RunSys state %v ago.\n", tid, uint64(tid), now.Sub(o.lastUpdateTime))) } + buf.WriteString("Search for '(*Task).run(0x..., 0x<tid>)' in the stack dump to find the offending goroutine") - w.onStuckTask(newTaskFound, &buf) + + // Dump stack only if a new task is detected or if it sometime has + // passed since the last time a stack dump was generated. + skipStack := newTaskFound || time.Since(w.lastStackDump) >= stackDumpSameTaskPeriod + w.doAction(w.TaskTimeoutAction, skipStack, &buf) } func (w *Watchdog) reportStuckWatchdog() { var buf bytes.Buffer buf.WriteString("Watchdog goroutine is stuck:\n") - w.onStuckTask(true, &buf) + w.doAction(w.TaskTimeoutAction, false, &buf) } -func (w *Watchdog) onStuckTask(newTaskFound bool, msg *bytes.Buffer) { - switch w.timeoutAction { +// doAction will take the given action. If the action is LogWarnind and +// skipStack is true, then the stack printing will be skipped. +func (w *Watchdog) doAction(action Action, skipStack bool, msg *bytes.Buffer) { + switch action { case LogWarning: - // Dump stack only if a new task is detected or if it sometime has passed since - // the last time a stack dump was generated. - if !newTaskFound && time.Since(w.lastStackDump) < stackDumpSameTaskPeriod { + if skipStack { msg.WriteString("\n...[stack dump skipped]...") log.Warningf(msg.String()) - } else { - log.TracebackAll(msg.String()) - w.lastStackDump = time.Now() + return + } + log.TracebackAll(msg.String()) + w.lastStackDump = time.Now() case Panic: // Panic will skip over running tasks, which is likely the culprit here. So manually @@ -301,5 +360,8 @@ func (w *Watchdog) onStuckTask(newTaskFound bool, msg *bytes.Buffer) { case <-time.After(1 * time.Second): } panic(fmt.Sprintf("Stack for running G's are skipped while panicking.\n%s", msg.String())) + default: + panic(fmt.Sprintf("Unknown watchdog action %v", action)) + } } |