Diffstat (limited to 'pkg/sentry/kernel')
47 files changed, 1522 insertions, 233 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index e61d39c82..2706927ff 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -1,10 +1,11 @@ load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("@rules_cc//cc:defs.bzl", "cc_proto_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) -load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") - go_template_instance( name = "pending_signals_list", out = "pending_signals_list.go", @@ -34,7 +35,7 @@ go_template_instance( out = "seqatomic_taskgoroutineschedinfo_unsafe.go", package = "kernel", suffix = "TaskGoroutineSchedInfo", - template = "//third_party/gvsync:generic_seqatomic", + template = "//pkg/syncutil:generic_seqatomic", types = { "Value": "TaskGoroutineSchedInfo", }, @@ -83,6 +84,12 @@ proto_library( deps = ["//pkg/sentry/arch:registers_proto"], ) +cc_proto_library( + name = "uncaught_signal_cc_proto", + visibility = ["//visibility:public"], + deps = [":uncaught_signal_proto"], +) + go_proto_library( name = "uncaught_signal_go_proto", importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto", @@ -144,6 +151,7 @@ go_library( "threads.go", "timekeeper.go", "timekeeper_state.go", + "tty.go", "uts_namespace.go", "vdso.go", "version.go", @@ -201,12 +209,12 @@ go_library( "//pkg/sentry/usermem", "//pkg/state", "//pkg/state/statefile", + "//pkg/syncutil", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/stack", "//pkg/waiter", - "//third_party/gvsync", ], ) diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 1d00a6310..04c244447 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -1,14 +1,14 @@ -package(licenses = ["notice"]) - load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_template_instance( name = "atomicptr_credentials", out = "atomicptr_credentials_unsafe.go", package = "auth", suffix = "Credentials", - template = "//third_party/gvsync:generic_atomicptr", + template = "//pkg/syncutil:generic_atomicptr", types = { "Value": "Credentials", }, diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index e3f5b0d83..3c9dceaba 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -15,6 +15,8 @@ package kernel import ( + "time" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" ) @@ -97,6 +99,21 @@ func TaskFromContext(ctx context.Context) *Task { return nil } +// Deadline implements context.Context.Deadline. +func (*Task) Deadline() (time.Time, bool) { + return time.Time{}, false +} + +// Done implements context.Context.Done. +func (*Task) Done() <-chan struct{} { + return nil +} + +// Err implements context.Context.Err. +func (*Task) Err() error { + return nil +} + // AsyncContext returns a context.Context that may be used by goroutines that // do work on behalf of t and therefore share its contextual values, but are // not t's task goroutine (e.g. asynchronous I/O). @@ -129,6 +146,21 @@ func (ctx taskAsyncContext) IsLogging(level log.Level) bool { return ctx.t.IsLogging(level) } +// Deadline implements context.Context.Deadline. 
+func (ctx taskAsyncContext) Deadline() (time.Time, bool) { + return ctx.t.Deadline() +} + +// Done implements context.Context.Done. +func (ctx taskAsyncContext) Done() <-chan struct{} { + return ctx.t.Done() +} + +// Err implements context.Context.Err. +func (ctx taskAsyncContext) Err() error { + return ctx.t.Err() +} + // Value implements context.Context.Value. func (ctx taskAsyncContext) Value(key interface{}) interface{} { return ctx.t.Value(key) diff --git a/pkg/sentry/kernel/contexttest/BUILD b/pkg/sentry/kernel/contexttest/BUILD index bec13a3d9..3a88a585c 100644 --- a/pkg/sentry/kernel/contexttest/BUILD +++ b/pkg/sentry/kernel/contexttest/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "contexttest", testonly = 1, diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index f46c43128..3361e8b7d 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -1,7 +1,8 @@ -package(licenses = ["notice"]) - +load("@io_bazel_rules_go//go:def.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) go_template_instance( name = "epoll_list", diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index 1c5f979d4..e65b961e8 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -1,6 +1,7 @@ -package(licenses = ["notice"]) +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) go_library( name = "eventfd", diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index 5eddca115..49d81b712 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "fasync", srcs = ["fasync.go"], diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index cc3f43a45..11f613a11 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -81,6 +81,9 @@ type FDTable struct { // mu protects below. mu sync.Mutex `state:"nosave"` + // next is start position to find fd. + next int32 + // used contains the number of non-nil entries. It must be accessed // atomically. It may be read atomically without holding mu (but not // written). @@ -226,6 +229,11 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags f.mu.Lock() defer f.mu.Unlock() + // From f.next to find available fd. + if fd < f.next { + fd = f.next + } + // Install all entries. for i := fd; i < end && len(fds) < len(files); i++ { if d, _, _ := f.get(i); d == nil { @@ -242,6 +250,11 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags return nil, syscall.EMFILE } + if fd == f.next { + // Update next search start position. + f.next = fds[len(fds)-1] + 1 + } + return fds, nil } @@ -361,6 +374,11 @@ func (f *FDTable) Remove(fd int32) *fs.File { f.mu.Lock() defer f.mu.Unlock() + // Update current available position. + if fd < f.next { + f.next = fd + } + orig, _, _ := f.get(fd) if orig != nil { orig.IncRef() // Reference for caller. 
@@ -377,6 +395,10 @@ func (f *FDTable) RemoveIf(cond func(*fs.File, FDFlags) bool) { f.forEach(func(fd int32, file *fs.File, flags FDFlags) { if cond(file, flags) { f.set(fd, nil, FDFlags{}) // Clear from table. + // Update current available position. + if fd < f.next { + f.next = fd + } } }) } diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go index 2413788e7..2bcb6216a 100644 --- a/pkg/sentry/kernel/fd_table_test.go +++ b/pkg/sentry/kernel/fd_table_test.go @@ -70,6 +70,42 @@ func TestFDTableMany(t *testing.T) { if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil { t.Fatalf("fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) } + + i := int32(2) + fdTable.Remove(i) + if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != i { + t.Fatalf("Allocated %v FDs but wanted to allocate %v: %v", i, maxFD, err) + } + }) +} + +func TestFDTableOverLimit(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + if _, err := fdTable.NewFDs(ctx, maxFD, []*fs.File{file}, FDFlags{}); err == nil { + t.Fatalf("fdTable.NewFDs(maxFD, f): got nil, wanted error") + } + + if _, err := fdTable.NewFDs(ctx, maxFD-2, []*fs.File{file, file, file}, FDFlags{}); err == nil { + t.Fatalf("fdTable.NewFDs(maxFD-2, {f,f,f}): got nil, wanted error") + } + + if fds, err := fdTable.NewFDs(ctx, maxFD-3, []*fs.File{file, file, file}, FDFlags{}); err != nil { + t.Fatalf("fdTable.NewFDs(maxFD-3, {f,f,f}): got %v, wanted nil", err) + } else { + for _, fd := range fds { + fdTable.Remove(fd) + } + } + + if fds, err := fdTable.NewFDs(ctx, maxFD-1, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != maxFD-1 { + t.Fatalf("fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } + + if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Adding an FD to a resized map: got %v, want nil", err) + } else if len(fds) != 1 || fds[0] != 0 { + t.Fatalf("Added an FD to a resized map: got %v, want {1}", fds) + } }) } diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index 6a31dc044..75ec31761 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -1,14 +1,15 @@ -package(licenses = ["notice"]) - +load("@io_bazel_rules_go//go:def.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) go_template_instance( name = "atomicptr_bucket", out = "atomicptr_bucket_unsafe.go", package = "futex", suffix = "Bucket", - template = "//third_party/gvsync:generic_atomicptr", + template = "//pkg/syncutil:generic_atomicptr", types = { "Value": "bucket", }, diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 8c1f79ab5..bd3fb4c03 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -24,6 +24,7 @@ // TaskSet.mu // SignalHandlers.mu // Task.mu +// runningTasksMu // // Locking SignalHandlers.mu in multiple SignalHandlers requires locking // TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same @@ -135,6 +136,22 @@ type Kernel struct { // syslog is the kernel log. syslog syslog + // runningTasksMu synchronizes disable/enable of cpuClockTicker when + // the kernel is idle (runningTasks == 0). 
+ // + // runningTasksMu is used to exclude critical sections when the timer + // disables itself and when the first active task enables the timer, + // ensuring that tasks always see a valid cpuClock value. + runningTasksMu sync.Mutex `state:"nosave"` + + // runningTasks is the total count of tasks currently in + // TaskGoroutineRunningSys or TaskGoroutineRunningApp. i.e., they are + // not blocked or stopped. + // + // runningTasks must be accessed atomically. Increments from 0 to 1 are + // further protected by runningTasksMu (see incRunningTasks). + runningTasks int64 + // cpuClock is incremented every linux.ClockTick. cpuClock is used to // measure task CPU usage, since sampling monotonicClock twice on every // syscall turns out to be unreasonably expensive. This is similar to how @@ -150,6 +167,22 @@ type Kernel struct { // cpuClockTicker increments cpuClock. cpuClockTicker *ktime.Timer `state:"nosave"` + // cpuClockTickerDisabled indicates that cpuClockTicker has been + // disabled because no tasks are running. + // + // cpuClockTickerDisabled is protected by runningTasksMu. + cpuClockTickerDisabled bool + + // cpuClockTickerSetting is the ktime.Setting of cpuClockTicker at the + // point it was disabled. It is cached here to avoid a lock ordering + // violation with cpuClockTicker.mu when runningTaskMu is held. + // + // cpuClockTickerSetting is only valid when cpuClockTickerDisabled is + // true. + // + // cpuClockTickerSetting is protected by runningTasksMu. + cpuClockTickerSetting ktime.Setting + // fdMapUids is an ever-increasing counter for generating FDTable uids. // // fdMapUids is mutable, and is accessed using atomic memory operations. @@ -358,7 +391,7 @@ func (k *Kernel) SaveTo(w io.Writer) error { // // N.B. This will also be saved along with the full kernel save below. cpuidStart := time.Now() - if err := state.Save(w, k.FeatureSet(), nil); err != nil { + if err := state.Save(k.SupervisorContext(), w, k.FeatureSet(), nil); err != nil { return err } log.Infof("CPUID save took [%s].", time.Since(cpuidStart)) @@ -366,7 +399,7 @@ func (k *Kernel) SaveTo(w io.Writer) error { // Save the kernel state. kernelStart := time.Now() var stats state.Stats - if err := state.Save(w, k, &stats); err != nil { + if err := state.Save(k.SupervisorContext(), w, k, &stats); err != nil { return err } log.Infof("Kernel save stats: %s", &stats) @@ -374,7 +407,7 @@ func (k *Kernel) SaveTo(w io.Writer) error { // Save the memory file's state. memoryStart := time.Now() - if err := k.mf.SaveTo(w); err != nil { + if err := k.mf.SaveTo(k.SupervisorContext(), w); err != nil { return err } log.Infof("Memory save took [%s].", time.Since(memoryStart)) @@ -509,7 +542,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) // don't need to explicitly install it in the Kernel. cpuidStart := time.Now() var features cpuid.FeatureSet - if err := state.Load(r, &features, nil); err != nil { + if err := state.Load(k.SupervisorContext(), r, &features, nil); err != nil { return err } log.Infof("CPUID load took [%s].", time.Since(cpuidStart)) @@ -525,7 +558,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) // Load the kernel state. 
kernelStart := time.Now() var stats state.Stats - if err := state.Load(r, k, &stats); err != nil { + if err := state.Load(k.SupervisorContext(), r, k, &stats); err != nil { return err } log.Infof("Kernel load stats: %s", &stats) @@ -533,7 +566,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) // Load the memory file's state. memoryStart := time.Now() - if err := k.mf.LoadFrom(r); err != nil { + if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil { return err } log.Infof("Memory load took [%s].", time.Since(memoryStart)) @@ -771,8 +804,21 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, // Create a fresh task context. remainingTraversals = uint(args.MaxSymlinkTraversals) + loadArgs := loader.LoadArgs{ + Mounts: mounts, + Root: root, + WorkingDirectory: wd, + RemainingTraversals: &remainingTraversals, + ResolveFinal: true, + Filename: args.Filename, + File: args.File, + CloseOnExec: false, + Argv: args.Argv, + Envv: args.Envv, + Features: k.featureSet, + } - tc, se := k.LoadTaskImage(ctx, mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, k.featureSet) + tc, se := k.LoadTaskImage(ctx, loadArgs) if se != nil { return nil, 0, errors.New(se.String()) } @@ -795,9 +841,11 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, AbstractSocketNamespace: args.AbstractSocketNamespace, ContainerID: args.ContainerID, } - if _, err := k.tasks.NewTask(config); err != nil { + t, err := k.tasks.NewTask(config) + if err != nil { return nil, 0, err } + t.traceExecEvent(tc) // Simulate exec for tracing. // Success. tgid := k.tasks.Root.IDOfThreadGroup(tg) @@ -912,6 +960,102 @@ func (k *Kernel) resumeTimeLocked() { } } +func (k *Kernel) incRunningTasks() { + for { + tasks := atomic.LoadInt64(&k.runningTasks) + if tasks != 0 { + // Standard case. Simply increment. + if !atomic.CompareAndSwapInt64(&k.runningTasks, tasks, tasks+1) { + continue + } + return + } + + // Transition from 0 -> 1. Synchronize with other transitions and timer. + k.runningTasksMu.Lock() + tasks = atomic.LoadInt64(&k.runningTasks) + if tasks != 0 { + // We're no longer the first task, no need to + // re-enable. + atomic.AddInt64(&k.runningTasks, 1) + k.runningTasksMu.Unlock() + return + } + + if !k.cpuClockTickerDisabled { + // Timer was never disabled. + atomic.StoreInt64(&k.runningTasks, 1) + k.runningTasksMu.Unlock() + return + } + + // We need to update cpuClock for all of the ticks missed while we + // slept, and then re-enable the timer. + // + // The Notify in Swap isn't sufficient. kernelCPUClockTicker.Notify + // always increments cpuClock by 1 regardless of the number of + // expirations as a heuristic to avoid over-accounting in cases of CPU + // throttling. + // + // We want to cover the normal case, when all time should be accounted, + // so we increment for all expirations. Throttling is less concerning + // here because the ticker is only disabled from Notify. This means + // that Notify must schedule and compensate for the throttled period + // before the timer is disabled. Throttling while the timer is disabled + // doesn't matter, as nothing is running or reading cpuClock anyways. + // + // S/R also adds complication, as there are two cases. Recall that + // monotonicClock will jump forward on restore. + // + // 1. If the ticker is enabled during save, then on Restore Notify is + // called with many expirations, covering the time jump, but cpuClock + // is only incremented by 1. 
+ // + // 2. If the ticker is disabled during save, then after Restore the + // first wakeup will call this function and cpuClock will be + // incremented by the number of expirations across the S/R. + // + // These cause very different value of cpuClock. But again, since + // nothing was running while the ticker was disabled, those differences + // don't matter. + setting, exp := k.cpuClockTickerSetting.At(k.monotonicClock.Now()) + if exp > 0 { + atomic.AddUint64(&k.cpuClock, exp) + } + + // Now that cpuClock is updated it is safe to allow other tasks to + // transition to running. + atomic.StoreInt64(&k.runningTasks, 1) + + // N.B. we must unlock before calling Swap to maintain lock ordering. + // + // cpuClockTickerDisabled need not wait until after Swap to become + // true. It is sufficient that the timer *will* be enabled. + k.cpuClockTickerDisabled = false + k.runningTasksMu.Unlock() + + // This won't call Notify (unless it's been ClockTick since setting.At + // above). This means we skip the thread group work in Notify. However, + // since nothing was running while we were disabled, none of the timers + // could have expired. + k.cpuClockTicker.Swap(setting) + + return + } +} + +func (k *Kernel) decRunningTasks() { + tasks := atomic.AddInt64(&k.runningTasks, -1) + if tasks < 0 { + panic(fmt.Sprintf("Invalid running count %d", tasks)) + } + + // Nothing to do. The next CPU clock tick will disable the timer if + // there is still nothing running. This provides approximately one tick + // of slack in which we can switch back and forth between idle and + // active without an expensive transition. +} + // WaitExited blocks until all tasks in k have exited. func (k *Kernel) WaitExited() { k.tasks.liveGoroutines.Wait() @@ -976,6 +1120,22 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { return lastErr } +// RebuildTraceContexts rebuilds the trace context for all tasks. +// +// Unfortunately, if these are built while tracing is not enabled, then we will +// not have meaningful trace data. Rebuilding here ensures that we can do so +// after tracing has been enabled. +func (k *Kernel) RebuildTraceContexts() { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + + for t, tid := range k.tasks.Root.tids { + t.rebuildTraceContext(tid) + } +} + // FeatureSet returns the FeatureSet. func (k *Kernel) FeatureSet() *cpuid.FeatureSet { return k.featureSet @@ -1180,6 +1340,7 @@ func (k *Kernel) ListSockets() []*SocketEntry { return socks } +// supervisorContext is a privileged context. 
type supervisorContext struct { context.NoopSleeper log.Logger diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD index ebcfaa619..d7a7d1169 100644 --- a/pkg/sentry/kernel/memevent/BUILD +++ b/pkg/sentry/kernel/memevent/BUILD @@ -1,5 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +load("@rules_cc//cc:defs.bzl", "cc_proto_library") package(licenses = ["notice"]) @@ -24,6 +25,12 @@ proto_library( visibility = ["//visibility:public"], ) +cc_proto_library( + name = "memory_events_cc_proto", + visibility = ["//visibility:public"], + deps = [":memory_events_proto"], +) + go_proto_library( name = "memory_events_go_proto", importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto", diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 4d15cca85..9d34f6d4d 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -1,7 +1,8 @@ -package(licenses = ["notice"]) - +load("@io_bazel_rules_go//go:def.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) go_template_instance( name = "buffer_list", @@ -23,8 +24,10 @@ go_library( "device.go", "node.go", "pipe.go", + "pipe_util.go", "reader.go", "reader_writer.go", + "vfs.go", "writer.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/pipe", @@ -39,6 +42,7 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/safemem", "//pkg/sentry/usermem", + "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go index 69ef2a720..95bee2d37 100644 --- a/pkg/sentry/kernel/pipe/buffer.go +++ b/pkg/sentry/kernel/pipe/buffer.go @@ -15,6 +15,7 @@ package pipe import ( + "io" "sync" "gvisor.dev/gvisor/pkg/sentry/safemem" @@ -67,6 +68,17 @@ func (b *buffer) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { return n, err } +// WriteFromReader writes to the buffer from an io.Reader. +func (b *buffer) WriteFromReader(r io.Reader, count int64) (int64, error) { + dst := b.data[b.write:] + if count < int64(len(dst)) { + dst = b.data[b.write:][:count] + } + n, err := r.Read(dst) + b.write += n + return int64(n), err +} + // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (b *buffer) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.read:b.write])) @@ -75,6 +87,19 @@ func (b *buffer) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { return n, err } +// ReadToWriter reads from the buffer into an io.Writer. +func (b *buffer) ReadToWriter(w io.Writer, count int64, dup bool) (int64, error) { + src := b.data[b.read:b.write] + if count < int64(len(src)) { + src = b.data[b.read:][:count] + } + n, err := w.Write(src) + if !dup { + b.read += n + } + return int64(n), err +} + // bufferPool is a pool for buffers. 
var bufferPool = sync.Pool{ New: func() interface{} { diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index a2dc72204..4a19ab7ce 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -18,7 +18,6 @@ import ( "sync" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -91,10 +90,10 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi switch { case flags.Read && !flags.Write: // O_RDONLY. r := i.p.Open(ctx, d, flags) - i.newHandleLocked(&i.rWakeup) + newHandleLocked(&i.rWakeup) if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() { - if !i.waitFor(&i.wWakeup, ctx) { + if !waitFor(&i.mu, &i.wWakeup, ctx) { r.DecRef() return nil, syserror.ErrInterrupted } @@ -107,7 +106,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi case flags.Write && !flags.Read: // O_WRONLY. w := i.p.Open(ctx, d, flags) - i.newHandleLocked(&i.wWakeup) + newHandleLocked(&i.wWakeup) if i.p.isNamed && !i.p.HasReaders() { // On a nonblocking, write-only open, the open fails with ENXIO if the @@ -117,7 +116,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi return nil, syserror.ENXIO } - if !i.waitFor(&i.rWakeup, ctx) { + if !waitFor(&i.mu, &i.rWakeup, ctx) { w.DecRef() return nil, syserror.ErrInterrupted } @@ -127,8 +126,8 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi case flags.Read && flags.Write: // O_RDWR. // Pipes opened for read-write always succeeds without blocking. rw := i.p.Open(ctx, d, flags) - i.newHandleLocked(&i.rWakeup) - i.newHandleLocked(&i.wWakeup) + newHandleLocked(&i.rWakeup) + newHandleLocked(&i.wWakeup) return rw, nil default: @@ -136,65 +135,6 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi } } -// waitFor blocks until the underlying pipe has at least one reader/writer is -// announced via 'wakeupChan', or until 'sleeper' is cancelled. Any call to this -// function will block for either readers or writers, depending on where -// 'wakeupChan' points. -// -// f.mu must be held by the caller. waitFor returns with f.mu held, but it will -// drop f.mu before blocking for any reader/writers. -func (i *inodeOperations) waitFor(wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool { - // Ideally this function would simply use a condition variable. However, the - // wait needs to be interruptible via 'sleeper', so we must sychronize via a - // channel. The synchronization below relies on the fact that closing a - // channel unblocks all receives on the channel. - - // Does an appropriate wakeup channel already exist? If not, create a new - // one. This is all done under f.mu to avoid races. - if *wakeupChan == nil { - *wakeupChan = make(chan struct{}) - } - - // Grab a local reference to the wakeup channel since it may disappear as - // soon as we drop f.mu. - wakeup := *wakeupChan - - // Drop the lock and prepare to sleep. - i.mu.Unlock() - cancel := sleeper.SleepStart() - - // Wait for either a new reader/write to be signalled via 'wakeup', or - // for the sleep to be cancelled. - select { - case <-wakeup: - sleeper.SleepFinish(true) - case <-cancel: - sleeper.SleepFinish(false) - } - - // Take the lock and check if we were woken. If we were woken and - // interrupted, the former takes priority. 
- i.mu.Lock() - select { - case <-wakeup: - return true - default: - return false - } -} - -// newHandleLocked signals a new pipe reader or writer depending on where -// 'wakeupChan' points. This unblocks any corresponding reader or writer -// waiting for the other end of the channel to be opened, see Fifo.waitFor. -// -// i.mu must be held. -func (*inodeOperations) newHandleLocked(wakeupChan *chan struct{}) { - if *wakeupChan != nil { - close(*wakeupChan) - *wakeupChan = nil - } -} - func (*inodeOperations) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { return syserror.EPIPE } diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index adbad7764..16fa80abe 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -85,11 +85,11 @@ func testOpen(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs. } func newNamedPipe(t *testing.T) *Pipe { - return NewPipe(contexttest.Context(t), true, DefaultPipeSize, usermem.PageSize) + return NewPipe(true, DefaultPipeSize, usermem.PageSize) } func newAnonPipe(t *testing.T) *Pipe { - return NewPipe(contexttest.Context(t), false, DefaultPipeSize, usermem.PageSize) + return NewPipe(false, DefaultPipeSize, usermem.PageSize) } // assertRecvBlocks ensures that a recv attempt on c blocks for at least diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 247e2928e..1a1b38f83 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -99,7 +98,7 @@ type Pipe struct { // NewPipe initializes and returns a pipe. // // N.B. The size and atomicIOBytes will be bounded. -func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int64) *Pipe { +func NewPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *Pipe { if sizeBytes < MinimumPipeSize { sizeBytes = MinimumPipeSize } @@ -112,17 +111,33 @@ func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int64) if atomicIOBytes > sizeBytes { atomicIOBytes = sizeBytes } - return &Pipe{ - isNamed: isNamed, - max: sizeBytes, - atomicIOBytes: atomicIOBytes, + var p Pipe + initPipe(&p, isNamed, sizeBytes, atomicIOBytes) + return &p +} + +func initPipe(pipe *Pipe, isNamed bool, sizeBytes, atomicIOBytes int64) { + if sizeBytes < MinimumPipeSize { + sizeBytes = MinimumPipeSize + } + if sizeBytes > MaximumPipeSize { + sizeBytes = MaximumPipeSize + } + if atomicIOBytes <= 0 { + atomicIOBytes = 1 } + if atomicIOBytes > sizeBytes { + atomicIOBytes = sizeBytes + } + pipe.isNamed = isNamed + pipe.max = sizeBytes + pipe.atomicIOBytes = atomicIOBytes } // NewConnectedPipe initializes a pipe and returns a pair of objects // representing the read and write ends of the pipe. func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.File, *fs.File) { - p := NewPipe(ctx, false /* isNamed */, sizeBytes, atomicIOBytes) + p := NewPipe(false /* isNamed */, sizeBytes, atomicIOBytes) // Build an fs.Dirent for the pipe which will be shared by both // returned files. @@ -173,13 +188,24 @@ func (p *Pipe) Open(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) *fs.F } } +type readOps struct { + // left returns the bytes remaining. + left func() int64 + + // limit limits subsequence reads. 
+ limit func(int64) + + // read performs the actual read operation. + read func(*buffer) (int64, error) +} + // read reads data from the pipe into dst and returns the number of bytes // read, or returns ErrWouldBlock if the pipe is empty. // // Precondition: this pipe must have readers. -func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) { +func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) { // Don't block for a zero-length read even if the pipe is empty. - if dst.NumBytes() == 0 { + if ops.left() == 0 { return 0, nil } @@ -196,12 +222,12 @@ func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) } // Limit how much we consume. - if dst.NumBytes() > p.size { - dst = dst.TakeFirst64(p.size) + if ops.left() > p.size { + ops.limit(p.size) } done := int64(0) - for dst.NumBytes() > 0 { + for ops.left() > 0 { // Pop the first buffer. first := p.data.Front() if first == nil { @@ -209,10 +235,9 @@ func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) } // Copy user data. - n, err := dst.CopyOutFrom(ctx, first) + n, err := ops.read(first) done += int64(n) p.size -= n - dst = dst.DropFirst64(n) // Empty buffer? if first.Empty() { @@ -230,12 +255,57 @@ func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) return done, nil } +// dup duplicates all data from this pipe into the given writer. +// +// There is no blocking behavior implemented here. The writer may propagate +// some blocking error. All the writes must be complete writes. +func (p *Pipe) dup(ctx context.Context, ops readOps) (int64, error) { + p.mu.Lock() + defer p.mu.Unlock() + + // Is the pipe empty? + if p.size == 0 { + if !p.HasWriters() { + // See above. + return 0, nil + } + return 0, syserror.ErrWouldBlock + } + + // Limit how much we consume. + if ops.left() > p.size { + ops.limit(p.size) + } + + done := int64(0) + for buf := p.data.Front(); buf != nil; buf = buf.Next() { + n, err := ops.read(buf) + done += n + if err != nil { + return done, err + } + } + + return done, nil +} + +type writeOps struct { + // left returns the bytes remaining. + left func() int64 + + // limit should limit subsequent writes. + limit func(int64) + + // write should write to the provided buffer. + write func(*buffer) (int64, error) +} + // write writes data from sv into the pipe and returns the number of bytes // written. If no bytes are written because the pipe is full (or has less than // atomicIOBytes free capacity), write returns ErrWouldBlock. // // Precondition: this pipe must have writers. -func (p *Pipe) write(ctx context.Context, src usermem.IOSequence) (int64, error) { +func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) { p.mu.Lock() defer p.mu.Unlock() @@ -246,17 +316,16 @@ func (p *Pipe) write(ctx context.Context, src usermem.IOSequence) (int64, error) // POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be // atomic, but requires no atomicity for writes larger than this. - wanted := src.NumBytes() + wanted := ops.left() if avail := p.max - p.size; wanted > avail { if wanted <= p.atomicIOBytes { return 0, syserror.ErrWouldBlock } - // Limit to the available capacity. - src = src.TakeFirst64(avail) + ops.limit(avail) } done := int64(0) - for src.NumBytes() > 0 { + for ops.left() > 0 { // Need a new buffer? last := p.data.Back() if last == nil || last.Full() { @@ -266,10 +335,9 @@ func (p *Pipe) write(ctx context.Context, src usermem.IOSequence) (int64, error) } // Copy user data. 
- n, err := src.CopyInTo(ctx, last) + n, err := ops.write(last) done += int64(n) p.size += n - src = src.DropFirst64(n) // Handle errors. if err != nil { diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go new file mode 100644 index 000000000..ef9641e6a --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -0,0 +1,213 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "io" + "math" + "sync" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/amutex" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// This file contains Pipe file functionality that is tied to neither VFS nor +// the old fs architecture. + +// Release cleans up the pipe's state. +func (p *Pipe) Release() { + p.rClose() + p.wClose() + + // Wake up readers and writers. + p.Notify(waiter.EventIn | waiter.EventOut) +} + +// Read reads from the Pipe into dst. +func (p *Pipe) Read(ctx context.Context, dst usermem.IOSequence) (int64, error) { + n, err := p.read(ctx, readOps{ + left: func() int64 { + return dst.NumBytes() + }, + limit: func(l int64) { + dst = dst.TakeFirst64(l) + }, + read: func(buf *buffer) (int64, error) { + n, err := dst.CopyOutFrom(ctx, buf) + dst = dst.DropFirst64(n) + return n, err + }, + }) + if n > 0 { + p.Notify(waiter.EventOut) + } + return n, err +} + +// WriteTo writes to w from the Pipe. +func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool) (int64, error) { + ops := readOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + read: func(buf *buffer) (int64, error) { + n, err := buf.ReadToWriter(w, count, dup) + count -= n + return n, err + }, + } + if dup { + // There is no notification for dup operations. + return p.dup(ctx, ops) + } + n, err := p.read(ctx, ops) + if n > 0 { + p.Notify(waiter.EventOut) + } + return n, err +} + +// Write writes to the Pipe from src. +func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error) { + n, err := p.write(ctx, writeOps{ + left: func() int64 { + return src.NumBytes() + }, + limit: func(l int64) { + src = src.TakeFirst64(l) + }, + write: func(buf *buffer) (int64, error) { + n, err := src.CopyInTo(ctx, buf) + src = src.DropFirst64(n) + return n, err + }, + }) + if n > 0 { + p.Notify(waiter.EventIn) + } + return n, err +} + +// ReadFrom reads from r to the Pipe. +func (p *Pipe) ReadFrom(ctx context.Context, r io.Reader, count int64) (int64, error) { + n, err := p.write(ctx, writeOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + write: func(buf *buffer) (int64, error) { + n, err := buf.WriteFromReader(r, count) + count -= n + return n, err + }, + }) + if n > 0 { + p.Notify(waiter.EventIn) + } + return n, err +} + +// Readiness returns the ready events in the underlying pipe. 
+func (p *Pipe) Readiness(mask waiter.EventMask) waiter.EventMask { + return p.rwReadiness() & mask +} + +// Ioctl implements ioctls on the Pipe. +func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Switch on ioctl request. + switch int(args[1].Int()) { + case linux.FIONREAD: + v := p.queued() + if v > math.MaxInt32 { + v = math.MaxInt32 // Silently truncate. + } + // Copy result to user-space. + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + default: + return 0, syscall.ENOTTY + } +} + +// waitFor blocks until the underlying pipe has at least one reader/writer is +// announced via 'wakeupChan', or until 'sleeper' is cancelled. Any call to this +// function will block for either readers or writers, depending on where +// 'wakeupChan' points. +// +// mu must be held by the caller. waitFor returns with mu held, but it will +// drop mu before blocking for any reader/writers. +func waitFor(mu *sync.Mutex, wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool { + // Ideally this function would simply use a condition variable. However, the + // wait needs to be interruptible via 'sleeper', so we must sychronize via a + // channel. The synchronization below relies on the fact that closing a + // channel unblocks all receives on the channel. + + // Does an appropriate wakeup channel already exist? If not, create a new + // one. This is all done under f.mu to avoid races. + if *wakeupChan == nil { + *wakeupChan = make(chan struct{}) + } + + // Grab a local reference to the wakeup channel since it may disappear as + // soon as we drop f.mu. + wakeup := *wakeupChan + + // Drop the lock and prepare to sleep. + mu.Unlock() + cancel := sleeper.SleepStart() + + // Wait for either a new reader/write to be signalled via 'wakeup', or + // for the sleep to be cancelled. + select { + case <-wakeup: + sleeper.SleepFinish(true) + case <-cancel: + sleeper.SleepFinish(false) + } + + // Take the lock and check if we were woken. If we were woken and + // interrupted, the former takes priority. + mu.Lock() + select { + case <-wakeup: + return true + default: + return false + } +} + +// newHandleLocked signals a new pipe reader or writer depending on where +// 'wakeupChan' points. This unblocks any corresponding reader or writer +// waiting for the other end of the channel to be opened, see Fifo.waitFor. +// +// Precondition: the mutex protecting wakeupChan must be held. +func newHandleLocked(wakeupChan *chan struct{}) { + if *wakeupChan != nil { + close(*wakeupChan) + *wakeupChan = nil + } +} diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index f69dbf27b..b4d29fc77 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -15,16 +15,13 @@ package pipe import ( - "math" - "syscall" + "io" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/waiter" ) // ReaderWriter satisfies the FileOperations interface and services both @@ -44,53 +41,27 @@ type ReaderWriter struct { *Pipe } -// Release implements fs.FileOperations.Release. -func (rw *ReaderWriter) Release() { - rw.Pipe.rClose() - rw.Pipe.wClose() - - // Wake up readers and writers. 
- rw.Pipe.Notify(waiter.EventIn | waiter.EventOut) -} - // Read implements fs.FileOperations.Read. func (rw *ReaderWriter) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { - n, err := rw.Pipe.read(ctx, dst) - if n > 0 { - rw.Pipe.Notify(waiter.EventOut) - } - return n, err + return rw.Pipe.Read(ctx, dst) +} + +// WriteTo implements fs.FileOperations.WriteTo. +func (rw *ReaderWriter) WriteTo(ctx context.Context, _ *fs.File, w io.Writer, count int64, dup bool) (int64, error) { + return rw.Pipe.WriteTo(ctx, w, count, dup) } // Write implements fs.FileOperations.Write. func (rw *ReaderWriter) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { - n, err := rw.Pipe.write(ctx, src) - if n > 0 { - rw.Pipe.Notify(waiter.EventIn) - } - return n, err + return rw.Pipe.Write(ctx, src) } -// Readiness returns the ready events in the underlying pipe. -func (rw *ReaderWriter) Readiness(mask waiter.EventMask) waiter.EventMask { - return rw.Pipe.rwReadiness() & mask +// ReadFrom implements fs.FileOperations.WriteTo. +func (rw *ReaderWriter) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) { + return rw.Pipe.ReadFrom(ctx, r, count) } // Ioctl implements fs.FileOperations.Ioctl. func (rw *ReaderWriter) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - // Switch on ioctl request. - switch int(args[1].Int()) { - case linux.FIONREAD: - v := rw.queued() - if v > math.MaxInt32 { - v = math.MaxInt32 // Silently truncate. - } - // Copy result to user-space. - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ - AddressSpaceActive: true, - }) - return 0, err - default: - return 0, syscall.ENOTTY - } + return rw.Pipe.Ioctl(ctx, io, args) } diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go new file mode 100644 index 000000000..6416e0dd8 --- /dev/null +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -0,0 +1,220 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// This file contains types enabling the pipe package to be used with the vfs +// package. + +// VFSPipe represents the actual pipe, analagous to an inode. VFSPipes should +// not be copied. +type VFSPipe struct { + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // pipe is the underlying pipe. + pipe Pipe + + // Channels for synchronizing the creation of new readers and writers + // of this fifo. See waitFor and newHandleLocked. 
+ // + // These are not saved/restored because all waiters are unblocked on + // save, and either automatically restart (via ERESTARTSYS) or return + // EINTR on resume. On restarts via ERESTARTSYS, the appropriate + // channel will be recreated. + rWakeup chan struct{} `state:"nosave"` + wWakeup chan struct{} `state:"nosave"` +} + +// NewVFSPipe returns an initialized VFSPipe. +func NewVFSPipe(sizeBytes, atomicIOBytes int64) *VFSPipe { + var vp VFSPipe + initPipe(&vp.pipe, true /* isNamed */, sizeBytes, atomicIOBytes) + return &vp +} + +// NewVFSPipeFD opens a named pipe. Named pipes have special blocking semantics +// during open: +// +// "Normally, opening the FIFO blocks until the other end is opened also. A +// process can open a FIFO in nonblocking mode. In this case, opening for +// read-only will succeed even if no-one has opened on the write side yet, +// opening for write-only will fail with ENXIO (no such device or address) +// unless the other end has already been opened. Under Linux, opening a FIFO +// for read and write will succeed both in blocking and nonblocking mode. POSIX +// leaves this behavior undefined. This can be used to open a FIFO for writing +// while there are no readers available." - fifo(7) +func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { + vp.mu.Lock() + defer vp.mu.Unlock() + + readable := vfs.MayReadFileWithOpenFlags(flags) + writable := vfs.MayWriteFileWithOpenFlags(flags) + if !readable && !writable { + return nil, syserror.EINVAL + } + + vfd, err := vp.open(rp, vfsd, vfsfd, flags) + if err != nil { + return nil, err + } + + switch { + case readable && writable: + // Pipes opened for read-write always succeed without blocking. + newHandleLocked(&vp.rWakeup) + newHandleLocked(&vp.wWakeup) + + case readable: + newHandleLocked(&vp.rWakeup) + // If this pipe is being opened as nonblocking and there's no + // writer, we have to wait for a writer to open the other end. + if flags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) { + return nil, syserror.EINTR + } + + case writable: + newHandleLocked(&vp.wWakeup) + + if !vp.pipe.HasReaders() { + // Nonblocking, write-only opens fail with ENXIO when + // the read side isn't open yet. + if flags&linux.O_NONBLOCK != 0 { + return nil, syserror.ENXIO + } + // Wait for a reader to open the other end. + if !waitFor(&vp.mu, &vp.rWakeup, ctx) { + return nil, syserror.EINTR + } + } + + default: + panic("invalid pipe flags: must be readable, writable, or both") + } + + return vfd, nil +} + +// Preconditions: vp.mu must be held. +func (vp *VFSPipe) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { + var fd VFSPipeFD + fd.flags = flags + fd.readable = vfs.MayReadFileWithOpenFlags(flags) + fd.writable = vfs.MayWriteFileWithOpenFlags(flags) + fd.vfsfd = vfsfd + fd.pipe = &vp.pipe + if fd.writable { + // The corresponding Mount.EndWrite() is in VFSPipe.Release(). + if err := rp.Mount().CheckBeginWrite(); err != nil { + return nil, err + } + } + + switch { + case fd.readable && fd.writable: + vp.pipe.rOpen() + vp.pipe.wOpen() + case fd.readable: + vp.pipe.rOpen() + case fd.writable: + vp.pipe.wOpen() + default: + panic("invalid pipe flags: must be readable, writable, or both") + } + + return &fd, nil +} + +// VFSPipeFD implements a subset of vfs.FileDescriptionImpl for pipes. 
It is +// expected that filesystesm will use this in a struct implementing +// vfs.FileDescriptionImpl. +type VFSPipeFD struct { + pipe *Pipe + flags uint32 + readable bool + writable bool + vfsfd *vfs.FileDescription +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *VFSPipeFD) Release() { + var event waiter.EventMask + if fd.readable { + fd.pipe.rClose() + event |= waiter.EventIn + } + if fd.writable { + fd.pipe.wClose() + event |= waiter.EventOut + } + if event == 0 { + panic("invalid pipe flags: must be readable, writable, or both") + } + + if fd.writable { + fd.vfsfd.VirtualDentry().Mount().EndWrite() + } + + fd.pipe.Notify(event) +} + +// OnClose implements vfs.FileDescriptionImpl.OnClose. +func (fd *VFSPipeFD) OnClose(_ context.Context) error { + return nil +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *VFSPipeFD) PRead(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *VFSPipeFD) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + if !fd.readable { + return 0, syserror.EINVAL + } + + return fd.pipe.Read(ctx, dst) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *VFSPipeFD) PWrite(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { + if !fd.writable { + return 0, syserror.EINVAL + } + + return fd.pipe.Write(ctx, src) +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return fd.pipe.Ioctl(ctx, uio, args) +} diff --git a/pkg/sentry/kernel/posixtimer.go b/pkg/sentry/kernel/posixtimer.go index c5d095af7..2e861a5a8 100644 --- a/pkg/sentry/kernel/posixtimer.go +++ b/pkg/sentry/kernel/posixtimer.go @@ -117,9 +117,9 @@ func (it *IntervalTimer) signalRejectedLocked() { } // Notify implements ktime.TimerListener.Notify. -func (it *IntervalTimer) Notify(exp uint64) { +func (it *IntervalTimer) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { if it.target == nil { - return + return ktime.Setting{}, false } it.target.tg.pidns.owner.mu.RLock() @@ -129,7 +129,7 @@ func (it *IntervalTimer) Notify(exp uint64) { if it.sigpending { it.overrunCur += exp - return + return ktime.Setting{}, false } // sigpending must be set before sendSignalTimerLocked() so that it can be @@ -148,6 +148,8 @@ func (it *IntervalTimer) Notify(exp uint64) { if err := it.target.sendSignalTimerLocked(si, it.group, it); err != nil { it.signalRejectedLocked() } + + return ktime.Setting{}, false } // Destroy implements ktime.TimerListener.Destroy. 
Users of Timer should call diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go index 0acdf769d..61e412911 100644 --- a/pkg/sentry/kernel/ptrace_arm64.go +++ b/pkg/sentry/kernel/ptrace_arm64.go @@ -17,7 +17,6 @@ package kernel import ( - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD index 1725b8562..98ea7a0d8 100644 --- a/pkg/sentry/kernel/sched/BUILD +++ b/pkg/sentry/kernel/sched/BUILD @@ -1,4 +1,5 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") package(licenses = ["notice"]) diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index 36edf10f3..f4c00cd86 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -1,7 +1,8 @@ -package(licenses = ["notice"]) - +load("@io_bazel_rules_go//go:def.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) go_template_instance( name = "waiter_list", diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 93fe68a3e..de9617e9d 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -302,7 +302,7 @@ func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Cred return syserror.ERANGE } - // TODO(b/29354920): Clear undo entries in all processes + // TODO(gvisor.dev/issue/137): Clear undo entries in all processes. sem.value = val sem.pid = pid s.changeTime = ktime.NowFromContext(ctx) @@ -336,7 +336,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti for i, val := range vals { sem := &s.sems[i] - // TODO(b/29354920): Clear undo entries in all processes + // TODO(gvisor.dev/issue/137): Clear undo entries in all processes. sem.value = int16(val) sem.pid = pid sem.wakeWaiters() @@ -481,7 +481,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch } // All operations succeeded, apply them. - // TODO(b/29354920): handle undo operations. + // TODO(gvisor.dev/issue/137): handle undo operations. for i, v := range tmpVals { s.sems[i].value = v s.sems[i].wakeWaiters() diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 81fcd8258..047b5214d 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -47,6 +47,11 @@ type Session struct { // The id is immutable. id SessionID + // foreground is the foreground process group. + // + // This is protected by TaskSet.mu. + foreground *ProcessGroup + // ProcessGroups is a list of process groups in this Session. This is // protected by TaskSet.mu. processGroups processGroupList @@ -260,12 +265,14 @@ func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error { func (tg *ThreadGroup) CreateSession() error { tg.pidns.owner.mu.Lock() defer tg.pidns.owner.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() return tg.createSession() } // createSession creates a new session for a threadgroup. // -// Precondition: callers must hold TaskSet.mu for writing. +// Precondition: callers must hold TaskSet.mu and the signal mutex for writing. 
func (tg *ThreadGroup) createSession() error { // Get the ID for this thread in the current namespace. id := tg.pidns.tgids[tg] @@ -321,8 +328,14 @@ func (tg *ThreadGroup) createSession() error { childTG.processGroup.incRefWithParent(pg) childTG.processGroup.decRefWithParent(oldParentPG) }) - tg.processGroup.decRefWithParent(oldParentPG) + // If tg.processGroup is an orphan, decRefWithParent will lock + // the signal mutex of each thread group in tg.processGroup. + // However, tg's signal mutex may already be locked at this + // point. We change tg's process group before calling + // decRefWithParent to avoid locking tg's signal mutex twice. + oldPG := tg.processGroup tg.processGroup = pg + oldPG.decRefWithParent(oldParentPG) } else { // The current process group may be nil only in the case of an // unparented thread group (i.e. the init process). This would @@ -346,6 +359,9 @@ func (tg *ThreadGroup) createSession() error { ns.processGroups[ProcessGroupID(local)] = pg } + // Disconnect from the controlling terminal. + tg.tty = nil + return nil } diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index aa7471eb6..cd48945e6 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "shm", srcs = [ diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD new file mode 100644 index 000000000..9f7e19b4d --- /dev/null +++ b/pkg/sentry/kernel/signalfd/BUILD @@ -0,0 +1,22 @@ +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "signalfd", + srcs = ["signalfd.go"], + importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/signalfd", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel", + "//pkg/sentry/usermem", + "//pkg/syserror", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go new file mode 100644 index 000000000..4b08d7d72 --- /dev/null +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -0,0 +1,140 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package signalfd provides an implementation of signal file descriptors. +package signalfd + +import ( + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/anon" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SignalOperations represent a file with signalfd semantics. 
+// +// +stateify savable +type SignalOperations struct { + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + // target is the original task target. + // + // The semantics here are a bit broken. Linux will always use current + // for all reads, regardless of where the signalfd originated. We can't + // do exactly that because we need to plumb the context through + // EventRegister in order to support proper blocking behavior. This + // will undoubtedly become very complicated quickly. + target *kernel.Task + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // mask is the signal mask. Protected by mu. + mask linux.SignalSet +} + +// New creates a new signalfd object with the supplied mask. +func New(ctx context.Context, mask linux.SignalSet) (*fs.File, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // No task context? Not valid. + return nil, syserror.EINVAL + } + // name matches fs/signalfd.c:signalfd4. + dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[signalfd]") + return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &SignalOperations{ + target: t, + mask: mask, + }), nil +} + +// Release implements fs.FileOperations.Release. +func (s *SignalOperations) Release() {} + +// Mask returns the signal mask. +func (s *SignalOperations) Mask() linux.SignalSet { + s.mu.Lock() + mask := s.mask + s.mu.Unlock() + return mask +} + +// SetMask sets the signal mask. +func (s *SignalOperations) SetMask(mask linux.SignalSet) { + s.mu.Lock() + s.mask = mask + s.mu.Unlock() +} + +// Read implements fs.FileOperations.Read. +func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + // Attempt to dequeue relevant signals. + info, err := s.target.Sigtimedwait(s.Mask(), 0) + if err != nil { + // There must be no signal available. + return 0, syserror.ErrWouldBlock + } + + // Copy out the signal info using the specified format. + var buf [128]byte + binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{ + Signo: uint32(info.Signo), + Errno: info.Errno, + Code: info.Code, + PID: uint32(info.Pid()), + UID: uint32(info.Uid()), + Status: info.Status(), + Overrun: uint32(info.Overrun()), + Addr: info.Addr(), + }) + n, err := dst.CopyOut(ctx, buf[:]) + return int64(n), err +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *SignalOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + if mask&waiter.EventIn != 0 && s.target.PendingSignals()&s.Mask() != 0 { + return waiter.EventIn // Pending signals. + } + return 0 +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *SignalOperations) EventRegister(entry *waiter.Entry, _ waiter.EventMask) { + // Register for the signal set; ignore the passed events. + s.target.SignalRegister(entry, waiter.EventMask(s.Mask())) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *SignalOperations) EventUnregister(entry *waiter.Entry) { + // Unregister the original entry. 
+ s.target.SignalUnregister(entry) +} diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 220fa73a2..2fdee0282 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -339,6 +339,14 @@ func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn { return nil } +// LookupName looks up a syscall name. +func (s *SyscallTable) LookupName(sysno uintptr) string { + if sc, ok := s.Table[sysno]; ok { + return sc.Name + } + return fmt.Sprintf("sys_%d", sysno) // Unlikely. +} + // LookupEmulate looks up an emulation syscall number. func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { sysno, ok := s.Emulate[addr] diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index e91f82bb3..ab0c6c4aa 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -15,6 +15,8 @@ package kernel import ( + gocontext "context" + "runtime/trace" "sync" "sync/atomic" @@ -35,7 +37,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/third_party/gvsync" + "gvisor.dev/gvisor/pkg/syncutil" + "gvisor.dev/gvisor/pkg/waiter" ) // Task represents a thread of execution in the untrusted app. It @@ -82,7 +85,7 @@ type Task struct { // // gosched is protected by goschedSeq. gosched is owned by the task // goroutine. - goschedSeq gvsync.SeqCount `state:"nosave"` + goschedSeq syncutil.SeqCount `state:"nosave"` gosched TaskGoroutineSchedInfo // yieldCount is the number of times the task goroutine has called @@ -133,6 +136,13 @@ type Task struct { // signalStack is exclusive to the task goroutine. signalStack arch.SignalStack + // signalQueue is a set of registered waiters for signal-related events. + // + // signalQueue is protected by the signalMutex. Note that the task does + // not implement all queue methods, specifically the readiness checks. + // The task only broadcast a notification on signal delivery. + signalQueue waiter.Queue `state:"zerovalue"` + // If groupStopPending is true, the task should participate in a group // stop in the interrupt path. // @@ -382,7 +392,14 @@ type Task struct { // logPrefix is a string containing the task's thread ID in the root PID // namespace, and is prepended to log messages emitted by Task.Infof etc. - logPrefix atomic.Value `state:".(string)"` + logPrefix atomic.Value `state:"nosave"` + + // traceContext and traceTask are both used for tracing, and are + // updated along with the logPrefix in updateInfoLocked. + // + // These are exclusive to the task goroutine. + traceContext gocontext.Context `state:"nosave"` + traceTask *trace.Task `state:"nosave"` // creds is the task's credentials. // @@ -520,14 +537,6 @@ func (t *Task) loadPtraceTracer(tracer *Task) { t.ptraceTracer.Store(tracer) } -func (t *Task) saveLogPrefix() string { - return t.logPrefix.Load().(string) -} - -func (t *Task) loadLogPrefix(prefix string) { - t.logPrefix.Store(prefix) -} - func (t *Task) saveSyscallFilters() []bpf.Program { if f := t.syscallFilters.Load(); f != nil { return f.([]bpf.Program) @@ -541,6 +550,7 @@ func (t *Task) loadSyscallFilters(filters []bpf.Program) { // afterLoad is invoked by stateify. func (t *Task) afterLoad() { + t.updateInfoLocked() t.interruptChan = make(chan struct{}, 1) t.gosched.State = TaskGoroutineNonexistent if t.stop != nil { @@ -701,9 +711,9 @@ func (t *Task) FDTable() *FDTable { return t.fdTable } -// GetFile is a convenience wrapper t.FDTable().GetFile. 
+// GetFile is a convenience wrapper for t.FDTable().Get. // -// Precondition: same as FDTable. +// Precondition: same as FDTable.Get. func (t *Task) GetFile(fd int32) *fs.File { f, _ := t.fdTable.Get(fd) return f diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index dd69939f9..4a4a69ee2 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -16,6 +16,7 @@ package kernel import ( "runtime" + "runtime/trace" "time" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -133,19 +134,24 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error { runtime.Gosched() } + region := trace.StartRegion(t.traceContext, blockRegion) select { case <-C: + region.End() t.SleepFinish(true) + // Woken by event. return nil case <-interrupt: + region.End() t.SleepFinish(false) // Return the indicated error on interrupt. return syserror.ErrInterrupted case <-timerChan: - // We've timed out. + region.End() t.SleepFinish(true) + // We've timed out. return syserror.ETIMEDOUT } } diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 0916fd658..3eadfedb4 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -299,6 +299,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // nt that it must receive before its task goroutine starts running. tid := nt.k.tasks.Root.IDOfTask(nt) defer nt.Start(tid) + t.traceCloneEvent(tid) // "If fork/clone and execve are allowed by @prog, any child processes will // be constrained to the same filters and system call ABI as the parent." - diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 8639d379f..bb5560acf 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -18,10 +18,8 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/mm" @@ -132,30 +130,21 @@ func (t *Task) Stack() *arch.Stack { return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())} } -// LoadTaskImage loads filename into a new TaskContext. +// LoadTaskImage loads a specified file into a new TaskContext. // -// It takes several arguments: -// * mounts: MountNamespace to lookup filename in -// * root: Root to lookup filename under -// * wd: Working directory to lookup filename under -// * maxTraversals: maximum number of symlinks to follow -// * filename: path to binary to load -// * file: an open fs.File object of the binary to load. If set, -// file will be loaded and not filename. -// * argv: Binary argv -// * envv: Binary envv -// * fs: Binary FeatureSet -func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, file *fs.File, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) { - // If File is not nil, we should load that instead of resolving filename. - if file != nil { - filename = file.MappedName(ctx) +// args.MemoryManager does not need to be set by the caller. +func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) { + // If File is not nil, we should load that instead of resolving Filename. 
+ if args.File != nil { + args.Filename = args.File.MappedName(ctx) } // Prepare a new user address space to load into. m := mm.NewMemoryManager(k, k) defer m.DecUsers(ctx) + args.MemoryManager = m - os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv, envv, k.extraAuxv, k.vdso) + os, ac, name, err := loader.Load(ctx, args, k.extraAuxv, k.vdso) if err != nil { return nil, err } diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 17a089b90..90a6190f1 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -129,6 +129,7 @@ type runSyscallAfterExecStop struct { } func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { + t.traceExecEvent(r.tc) t.tg.pidns.owner.mu.Lock() t.tg.execing = nil if t.killed() { @@ -253,7 +254,7 @@ func (t *Task) promoteLocked() { t.tg.leader = t t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t]) - t.updateLogPrefixLocked() + t.updateInfoLocked() // Reap the original leader. If it has a tracer, detach it instead of // waiting for it to acknowledge the original leader's death. oldLeader.exitParentNotified = true diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 535f03e50..435761e5a 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -236,6 +236,7 @@ func (*runExit) execute(t *Task) taskRunState { type runExitMain struct{} func (*runExitMain) execute(t *Task) taskRunState { + t.traceExitEvent() lastExiter := t.exitThreadGroup() // If the task has a cleartid, and the thread group wasn't killed by a diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index 78ff14b20..ce3e6ef28 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -465,8 +465,8 @@ func (t *Task) SetKeepCaps(k bool) { // disables the features we don't support anyway, is always set. This // drastically simplifies this function. // -// - We don't implement AT_SECURE, because no_new_privs always being set means -// that the conditions that require AT_SECURE never arise. (Compare Linux's +// - We don't set AT_SECURE = 1, because no_new_privs always being set means +// that the conditions that require AT_SECURE = 1 never arise. (Compare Linux's // security/commoncap.c:cap_bprm_set_creds() and cap_bprm_secureexec().) // // - We don't check for CAP_SYS_ADMIN in prctl(PR_SET_SECCOMP), since diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index a29e9b9eb..0fb3661de 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -16,6 +16,7 @@ package kernel import ( "fmt" + "runtime/trace" "sort" "gvisor.dev/gvisor/pkg/log" @@ -127,11 +128,88 @@ func (t *Task) debugDumpStack() { } } -// updateLogPrefix updates the task's cached log prefix to reflect its -// current thread ID. +// trace definitions. +// +// Note that all region names are prefixed by ':' in order to ensure that they +// are lexically ordered before all system calls, which use the naked system +// call name (e.g. "read") for maximum clarity. +const ( + traceCategory = "task" + runRegion = ":run" + blockRegion = ":block" + cpuidRegion = ":cpuid" + faultRegion = ":fault" +) + +// updateInfoLocked updates the task's cached log prefix and tracing +// information to reflect its current thread ID. // // Preconditions: The task's owning TaskSet.mu must be locked. 
-func (t *Task) updateLogPrefixLocked() { +func (t *Task) updateInfoLocked() { // Use the task's TID in the root PID namespace for logging. - t.logPrefix.Store(fmt.Sprintf("[% 4d] ", t.tg.pidns.owner.Root.tids[t])) + tid := t.tg.pidns.owner.Root.tids[t] + t.logPrefix.Store(fmt.Sprintf("[% 4d] ", tid)) + t.rebuildTraceContext(tid) +} + +// rebuildTraceContext rebuilds the trace context. +// +// Precondition: the passed tid must be the tid in the root namespace. +func (t *Task) rebuildTraceContext(tid ThreadID) { + // Re-initialize the trace context. + if t.traceTask != nil { + t.traceTask.End() + } + + // Note that we define the "task type" to be the dynamic TID. This does + // not align perfectly with the documentation for "tasks" in the + // tracing package. Tasks may be assumed to be bounded by analysis + // tools. However, if we just use a generic "task" type here, then the + // "user-defined tasks" page on the tracing dashboard becomes nearly + // unusable, as it loads all traces from all tasks. + // + // We can assume that the number of tasks in the system is not + // arbitrarily large (in general it won't be, especially for cases + // where we're collecting a brief profile), so using the TID is a + // reasonable compromise in this case. + t.traceContext, t.traceTask = trace.NewTask(t, fmt.Sprintf("tid:%d", tid)) +} + +// traceCloneEvent is called when a new task is spawned. +// +// ntid must be the new task's ThreadID in the root namespace. +func (t *Task) traceCloneEvent(ntid ThreadID) { + if !trace.IsEnabled() { + return + } + trace.Logf(t.traceContext, traceCategory, "spawn: %d", ntid) +} + +// traceExitEvent is called when a task exits. +func (t *Task) traceExitEvent() { + if !trace.IsEnabled() { + return + } + trace.Logf(t.traceContext, traceCategory, "exit status: 0x%x", t.exitStatus.Status()) +} + +// traceExecEvent is called when a task calls exec. +func (t *Task) traceExecEvent(tc *TaskContext) { + if !trace.IsEnabled() { + return + } + d := tc.MemoryManager.Executable() + if d == nil { + trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") + return + } + defer d.DecRef() + root := t.fsContext.RootDirectory() + if root == nil { + trace.Logf(t.traceContext, traceCategory, "exec: << no root directory >>") + return + } + defer root.DecRef() + n, _ := d.FullName(root) + trace.Logf(t.traceContext, traceCategory, "exec: %s", n) } diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index c92266c59..d97f8c189 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -17,6 +17,7 @@ package kernel import ( "bytes" "runtime" + "runtime/trace" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -205,9 +206,11 @@ func (*runApp) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.RUnlock() } + region := trace.StartRegion(t.traceContext, runRegion) t.accountTaskGoroutineEnter(TaskGoroutineRunningApp) info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU) t.accountTaskGoroutineLeave(TaskGoroutineRunningApp) + region.End() if clearSinglestep { t.Arch().ClearSingleStep() @@ -225,6 +228,7 @@ func (*runApp) execute(t *Task) taskRunState { case platform.ErrContextSignalCPUID: // Is this a CPUID instruction? + region := trace.StartRegion(t.traceContext, cpuidRegion) expected := arch.CPUIDInstruction[:] found := make([]byte, len(expected)) _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found) @@ -232,10 +236,12 @@ func (*runApp) execute(t *Task) taskRunState { // Skip the cpuid instruction. 
t.Arch().CPUIDEmulate(t) t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected))) + region.End() // Resume execution. return (*runApp)(nil) } + region.End() // Not an actual CPUID, but required copy-in. // The instruction at the given RIP was not a CPUID, and we // fallthrough to the default signal deliver behavior below. @@ -251,8 +257,10 @@ func (*runApp) execute(t *Task) taskRunState { // an application-generated signal and we should continue execution // normally. if at.Any() { + region := trace.StartRegion(t.traceContext, faultRegion) addr := usermem.Addr(info.Addr()) err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack())) + region.End() if err == nil { // The fault was handled appropriately. // We can resume running the application. @@ -260,6 +268,12 @@ func (*runApp) execute(t *Task) taskRunState { } // Is this a vsyscall that we need emulate? + // + // Note that we don't track vsyscalls as part of a + // specific trace region. This is because regions don't + // stack, and the actual system call will count as a + // region. We should be able to easily identify + // vsyscalls by having a <fault><syscall> pair. if at.Execute { if sysno, ok := t.tc.st.LookupEmulate(addr); ok { return t.doVsyscall(addr, sysno) diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index e76c069b0..8b148db35 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -126,12 +126,22 @@ func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) { t.gosched.Timestamp = now t.gosched.State = state t.goschedSeq.EndWrite() + + if state != TaskGoroutineRunningApp { + // Task is blocking/stopping. + t.k.decRunningTasks() + } } // Preconditions: The caller must be running on the task goroutine, and leaving // a state indicated by a previous call to // t.accountTaskGoroutineEnter(state). func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) { + if state != TaskGoroutineRunningApp { + // Task is unblocking/continuing. + t.k.incRunningTasks() + } + now := t.k.CPUClockNow() if t.gosched.State != state { panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys)) @@ -330,7 +340,7 @@ func newKernelCPUClockTicker(k *Kernel) *kernelCPUClockTicker { } // Notify implements ktime.TimerListener.Notify. -func (ticker *kernelCPUClockTicker) Notify(exp uint64) { +func (ticker *kernelCPUClockTicker) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { // Only increment cpuClock by 1 regardless of the number of expirations. // This approximately compensates for cases where thread throttling or bad // Go runtime scheduling prevents the kernelCPUClockTicker goroutine, and @@ -426,6 +436,27 @@ func (ticker *kernelCPUClockTicker) Notify(exp uint64) { tgs[i] = nil } ticker.tgs = tgs[:0] + + // If nothing is running, we can disable the timer. + tasks := atomic.LoadInt64(&ticker.k.runningTasks) + if tasks == 0 { + ticker.k.runningTasksMu.Lock() + defer ticker.k.runningTasksMu.Unlock() + tasks := atomic.LoadInt64(&ticker.k.runningTasks) + if tasks != 0 { + // Raced with a 0 -> 1 transition. + return setting, false + } + + // Stop the timer. We must cache the current setting so the + // kernel can access it without violating the lock order. + ticker.k.cpuClockTickerSetting = setting + ticker.k.cpuClockTickerDisabled = true + setting.Enabled = false + return setting, true + } + + return setting, false } // Destroy implements ktime.TimerListener.Destroy. 
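The new hooks in task_log.go, task_run.go and task_block.go are plain uses of the standard runtime/trace package. As context only, here is a minimal, self-contained sketch of the same pattern (not gVisor code; the TID, the work placeholder, and writing the trace to stderr are made up for illustration): one trace.Task per kernel task, short-lived regions around each accounted slice of work, and point events guarded by trace.IsEnabled.

    package main

    import (
        "context"
        "os"
        "runtime/trace"
    )

    func main() {
        // Begin collecting an execution trace. In the sentry, trace
        // collection is started elsewhere; stderr is only for this sketch.
        if err := trace.Start(os.Stderr); err != nil {
            return
        }
        defer trace.Stop()

        // One trace.Task per kernel task, named by root-namespace TID, so
        // the tracing UI groups regions per task rather than under a single
        // generic task type.
        ctx, task := trace.NewTask(context.Background(), "tid:1")
        defer task.End()

        // Regions bracket each accounted slice of work; the change uses
        // ":run", ":block", ":cpuid", ":fault" and bare syscall names.
        region := trace.StartRegion(ctx, ":run")
        // ... switch to the untrusted application here ...
        region.End()

        // Point events (clone, exec, exit) are emitted only when tracing is
        // enabled, matching traceCloneEvent and friends above.
        if trace.IsEnabled() {
            trace.Logf(ctx, "task", "exit status: %#x", 0)
        }
    }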
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 266959a07..39cd1340d 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -28,6 +28,7 @@ import ( ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" ) // SignalAction is an internal signal action. @@ -497,6 +498,9 @@ func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) { // // Preconditions: The signal mutex must be locked. func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool { + // Notify that the signal is queued. + t.signalQueue.Notify(waiter.EventMask(linux.MakeSignalSet(sig))) + // - Do not choose tasks that are blocking the signal. if linux.SignalSetOf(sig)&t.signalMask != 0 { return false @@ -1108,3 +1112,17 @@ func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { t.tg.signalHandlers.mu.Unlock() return t.deliverSignal(info, act) } + +// SignalRegister registers a waiter for pending signals. +func (t *Task) SignalRegister(e *waiter.Entry, mask waiter.EventMask) { + t.tg.signalHandlers.mu.Lock() + t.signalQueue.EventRegister(e, mask) + t.tg.signalHandlers.mu.Unlock() +} + +// SignalUnregister unregisters a waiter for pending signals. +func (t *Task) SignalUnregister(e *waiter.Entry) { + t.tg.signalHandlers.mu.Lock() + t.signalQueue.EventUnregister(e) + t.tg.signalHandlers.mu.Unlock() +} diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index d60cd62c7..3522a4ae5 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -154,10 +154,10 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { // Below this point, newTask is expected not to fail (there is no rollback // of assignTIDsLocked or any of the following). - // Logging on t's behalf will panic if t.logPrefix hasn't been initialized. - // This is the earliest point at which we can do so (since t now has thread - // IDs). - t.updateLogPrefixLocked() + // Logging on t's behalf will panic if t.logPrefix hasn't been + // initialized. This is the earliest point at which we can do so + // (since t now has thread IDs). + t.updateInfoLocked() if cfg.InheritParent != nil { t.parent = cfg.InheritParent.parent @@ -172,9 +172,10 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { if parentPG := tg.parentPG(); parentPG == nil { tg.createSession() } else { - // Inherit the process group. + // Inherit the process group and terminal. parentPG.incRefWithParent(parentPG) tg.processGroup = parentPG + tg.tty = t.parent.tg.tty } } tg.tasks.PushBack(t) diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index b543d536a..3180f5560 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -17,6 +17,7 @@ package kernel import ( "fmt" "os" + "runtime/trace" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -160,6 +161,10 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u ctrl = ctrlStopAndReinvokeSyscall } else { fn := s.Lookup(sysno) + var region *trace.Region // Only non-nil if tracing == true. + if trace.IsEnabled() { + region = trace.StartRegion(t.traceContext, s.LookupName(sysno)) + } if fn != nil { // Call our syscall implementation. 
rval, ctrl, err = fn(t, args) @@ -167,6 +172,9 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u // Use the missing function if not found. rval, err = t.SyscallTable().Missing(t, sysno, args) } + if region != nil { + region.End() + } } if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) { diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 2a97e3e8e..72568d296 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -19,10 +19,13 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/syserror" ) // A ThreadGroup is a logical grouping of tasks that has widespread @@ -245,6 +248,12 @@ type ThreadGroup struct { // // mounts is immutable. mounts *fs.MountNamespace + + // tty is the thread group's controlling terminal. If nil, there is no + // controlling terminal. + // + // tty is protected by the signal mutex. + tty *TTY } // newThreadGroup returns a new, empty thread group in PID namespace ns. The @@ -324,6 +333,176 @@ func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) { } } +// SetControllingTTY sets tty as the controlling terminal of tg. +func (tg *ThreadGroup) SetControllingTTY(tty *TTY, arg int32) error { + tty.mu.Lock() + defer tty.mu.Unlock() + + // We might be asked to set the controlling terminal of multiple + // processes, so we lock both the TaskSet and SignalHandlers. + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + + // "The calling process must be a session leader and not have a + // controlling terminal already." - tty_ioctl(4) + if tg.processGroup.session.leader != tg || tg.tty != nil { + return syserror.EINVAL + } + + // "If this terminal is already the controlling terminal of a different + // session group, then the ioctl fails with EPERM, unless the caller + // has the CAP_SYS_ADMIN capability and arg equals 1, in which case the + // terminal is stolen, and all processes that had it as controlling + // terminal lose it." - tty_ioctl(4) + if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session { + if !auth.CredentialsFromContext(tg.leader).HasCapability(linux.CAP_SYS_ADMIN) || arg != 1 { + return syserror.EPERM + } + // Steal the TTY away. Unlike TIOCNOTTY, don't send signals. + for othertg := range tg.pidns.owner.Root.tgids { + // This won't deadlock by locking tg.signalHandlers + // because at this point: + // - We only lock signalHandlers if it's in the same + // session as the tty's controlling thread group. + // - We know that the calling thread group is not in + // the same session as the tty's controlling thread + // group. + if othertg.processGroup.session == tty.tg.processGroup.session { + othertg.signalHandlers.mu.Lock() + othertg.tty = nil + othertg.signalHandlers.mu.Unlock() + } + } + } + + // Set the controlling terminal and foreground process group. + tg.tty = tty + tg.processGroup.session.foreground = tg.processGroup + // Set this as the controlling process of the terminal. + tty.tg = tg + + return nil +} + +// ReleaseControllingTTY gives up tty as the controlling tty of tg. 
+func (tg *ThreadGroup) ReleaseControllingTTY(tty *TTY) error { + tty.mu.Lock() + defer tty.mu.Unlock() + + // We might be asked to set the controlling terminal of multiple + // processes, so we lock both the TaskSet and SignalHandlers. + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + + // Just below, we may re-lock signalHandlers in order to send signals. + // Thus we can't defer Unlock here. + tg.signalHandlers.mu.Lock() + + if tg.tty == nil || tg.tty != tty { + tg.signalHandlers.mu.Unlock() + return syserror.ENOTTY + } + + // "If the process was session leader, then send SIGHUP and SIGCONT to + // the foreground process group and all processes in the current + // session lose their controlling terminal." - tty_ioctl(4) + // Remove tty as the controlling tty for each process in the session, + // then send them SIGHUP and SIGCONT. + + // If we're not the session leader, we don't have to do much. + if tty.tg != tg { + tg.tty = nil + tg.signalHandlers.mu.Unlock() + return nil + } + + tg.signalHandlers.mu.Unlock() + + // We're the session leader. SIGHUP and SIGCONT the foreground process + // group and remove all controlling terminals in the session. + var lastErr error + for othertg := range tg.pidns.owner.Root.tgids { + if othertg.processGroup.session == tg.processGroup.session { + othertg.signalHandlers.mu.Lock() + othertg.tty = nil + if othertg.processGroup == tg.processGroup.session.foreground { + if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGHUP)}, true /* group */); err != nil { + lastErr = err + } + if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGCONT)}, true /* group */); err != nil { + lastErr = err + } + } + othertg.signalHandlers.mu.Unlock() + } + } + + return lastErr +} + +// ForegroundProcessGroup returns the process group ID of the foreground +// process group. +func (tg *ThreadGroup) ForegroundProcessGroup(tty *TTY) (int32, error) { + tty.mu.Lock() + defer tty.mu.Unlock() + + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + + // "When fd does not refer to the controlling terminal of the calling + // process, -1 is returned" - tcgetpgrp(3) + if tg.tty != tty { + return -1, syserror.ENOTTY + } + + return int32(tg.processGroup.session.foreground.id), nil +} + +// SetForegroundProcessGroup sets the foreground process group of tty to pgid. +func (tg *ThreadGroup) SetForegroundProcessGroup(tty *TTY, pgid ProcessGroupID) (int32, error) { + tty.mu.Lock() + defer tty.mu.Unlock() + + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + + // TODO(b/129283598): "If tcsetpgrp() is called by a member of a + // background process group in its session, and the calling process is + // not blocking or ignoring SIGTTOU, a SIGTTOU signal is sent to all + // members of this background process group." + + // tty must be the controlling terminal. + if tg.tty != tty { + return -1, syserror.ENOTTY + } + + // pgid must be positive. + if pgid < 0 { + return -1, syserror.EINVAL + } + + // pg must not be empty. Empty process groups are removed from their + // pid namespaces. + pg, ok := tg.pidns.processGroups[pgid] + if !ok { + return -1, syserror.ESRCH + } + + // pg must be part of this process's session. 
+ if tg.processGroup.session != pg.session { + return -1, syserror.EPERM + } + + tg.processGroup.session.foreground.id = pgid + return 0, nil +} + // itimerRealListener implements ktime.Listener for ITIMER_REAL expirations. // // +stateify savable @@ -332,8 +511,9 @@ type itimerRealListener struct { } // Notify implements ktime.TimerListener.Notify. -func (l *itimerRealListener) Notify(exp uint64) { +func (l *itimerRealListener) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { l.tg.SendSignal(SignalInfoPriv(linux.SIGALRM)) + return ktime.Setting{}, false } // Destroy implements ktime.TimerListener.Destroy. diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index 9beae4b31..31847e1df 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "time", srcs = [ diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index aa6c75d25..107394183 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -280,13 +280,16 @@ func (ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask { // A TimerListener receives expirations from a Timer. type TimerListener interface { // Notify is called when its associated Timer expires. exp is the number of - // expirations. + // expirations. setting is the next timer Setting. // // Notify is called with the associated Timer's mutex locked, so Notify // must not take any locks that precede Timer.mu in lock order. // + // If Notify returns true, the timer will use the returned setting + // rather than the passed one. + // // Preconditions: exp > 0. - Notify(exp uint64) + Notify(exp uint64, setting Setting) (newSetting Setting, update bool) // Destroy is called when the timer is destroyed. Destroy() @@ -533,7 +536,9 @@ func (t *Timer) Tick() { s, exp := t.setting.At(now) t.setting = s if exp > 0 { - t.listener.Notify(exp) + if newS, ok := t.listener.Notify(exp, t.setting); ok { + t.setting = newS + } } t.resetKickerLocked(now) } @@ -588,7 +593,9 @@ func (t *Timer) Get() (Time, Setting) { s, exp := t.setting.At(now) t.setting = s if exp > 0 { - t.listener.Notify(exp) + if newS, ok := t.listener.Notify(exp, t.setting); ok { + t.setting = newS + } } t.resetKickerLocked(now) return now, s @@ -620,7 +627,9 @@ func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { } oldS, oldExp := t.setting.At(now) if oldExp > 0 { - t.listener.Notify(oldExp) + t.listener.Notify(oldExp, oldS) + // N.B. The returned Setting doesn't matter because we're about + // to overwrite. } if f != nil { f() @@ -628,7 +637,9 @@ func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { newS, newExp := s.At(now) t.setting = newS if newExp > 0 { - t.listener.Notify(newExp) + if newS, ok := t.listener.Notify(newExp, t.setting); ok { + t.setting = newS + } } t.resetKickerLocked(now) return now, oldS @@ -683,11 +694,13 @@ func NewChannelNotifier() (TimerListener, <-chan struct{}) { } // Notify implements ktime.TimerListener.Notify. -func (c *ChannelNotifier) Notify(uint64) { +func (c *ChannelNotifier) Notify(uint64, Setting) (Setting, bool) { select { case c.tchan <- struct{}{}: default: } + + return Setting{}, false } // Destroy implements ktime.TimerListener.Destroy and will close the channel. 
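The Notify signature change above lets a listener swap in a new Setting from inside the expiration callback, which is how kernelCPUClockTicker pauses the CPU clock when no tasks are running. As an illustration only, a hypothetical listener under the revised interface might look like the sketch below (oneShotListener is not part of this change, and it assumes the caller imports the time package as ktime, as the rest of the kernel package does).

    // oneShotListener disables its timer after the first expiration. Per the
    // revised contract, returning (newSetting, true) makes the Timer adopt
    // newSetting; returning (ktime.Setting{}, false) keeps the Setting the
    // Timer already computed.
    type oneShotListener struct {
        fired bool
    }

    // Notify implements ktime.TimerListener.Notify.
    func (l *oneShotListener) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) {
        if l.fired {
            return ktime.Setting{}, false // Keep the Timer's current setting.
        }
        l.fired = true
        setting.Enabled = false // Suppress further expirations.
        return setting, true
    }

    // Destroy implements ktime.TimerListener.Destroy.
    func (l *oneShotListener) Destroy() {}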
diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go new file mode 100644 index 000000000..048de26dc --- /dev/null +++ b/pkg/sentry/kernel/tty.go @@ -0,0 +1,39 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import "sync" + +// TTY defines the relationship between a thread group and its controlling +// terminal. +// +// +stateify savable +type TTY struct { + // Index is the terminal index. It is immutable. + Index uint32 + + mu sync.Mutex `state:"nosave"` + + // tg is protected by mu. + tg *ThreadGroup +} + +// TTY returns the thread group's controlling terminal. If nil, there is no +// controlling terminal. +func (tg *ThreadGroup) TTY() *TTY { + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + return tg.tty +} |
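For reference, the guest-visible behavior behind SetControllingTTY and the new tty field is the usual session and controlling-terminal sequence. The following is a rough sketch from the application's point of view (ordinary Linux Go code using golang.org/x/sys/unix, not gVisor internals; the terminal path is made up):

    package main

    import (
        "fmt"
        "os"

        "golang.org/x/sys/unix"
    )

    func main() {
        // Become a session leader with no controlling terminal; this fails
        // with EPERM if the caller already leads a process group.
        if _, err := unix.Setsid(); err != nil {
            fmt.Fprintln(os.Stderr, "setsid:", err)
        }

        // Hypothetical terminal path.
        tty, err := os.Open("/dev/pts/0")
        if err != nil {
            fmt.Fprintln(os.Stderr, "open:", err)
            return
        }
        defer tty.Close()

        // TIOCSCTTY acquires the terminal as the controlling terminal of
        // this session; arg 1 would steal it from another session and needs
        // CAP_SYS_ADMIN, mirroring the EPERM branch in SetControllingTTY.
        if err := unix.IoctlSetInt(int(tty.Fd()), unix.TIOCSCTTY, 0); err != nil {
            fmt.Fprintln(os.Stderr, "TIOCSCTTY:", err)
        }
    }

tcgetpgrp(3) and tcsetpgrp(3) then map onto ForegroundProcessGroup and SetForegroundProcessGroup via the TIOCGPGRP and TIOCSPGRP ioctls handled by the tty layer.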