From 198f1cddb82d46570ae63cb704b4a1b88cf0de1f Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 28 Oct 2019 10:18:55 -0700 Subject: Update comment FDTable.GetFile doesn't exist. PiperOrigin-RevId: 277089842 --- pkg/sentry/kernel/task.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index c82ef5486..11a8c6c87 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -709,9 +709,9 @@ func (t *Task) FDTable() *FDTable { return t.fdTable } -// GetFile is a convenience wrapper t.FDTable().GetFile. +// GetFile is a convenience wrapper t.FDTable().Get. // -// Precondition: same as FDTable. +// Precondition: same as FDTable.Get. func (t *Task) GetFile(fd int32) *fs.File { f, _ := t.fdTable.Get(fd) return f -- cgit v1.2.3 From 29273b03842a85bce8314799348231520ceb6e9c Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 29 Oct 2019 10:03:18 -0700 Subject: Disallow execveat on interpreter scripts with fd opened with O_CLOEXEC. When an interpreter script is opened with O_CLOEXEC and the resulting fd is passed into execveat, an ENOENT error should occur (the script would otherwise be inaccessible to the interpreter). This matches the actual behavior of Linux's execveat. 
PiperOrigin-RevId: 277306680 --- pkg/sentry/kernel/kernel.go | 1 + pkg/sentry/loader/loader.go | 9 +++++++++ pkg/sentry/syscalls/linux/sys_thread.go | 5 ++++- test/syscalls/linux/exec.cc | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index fcfe7a16d..e64d648e2 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -812,6 +812,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, ResolveFinal: true, Filename: args.Filename, File: args.File, + CloseOnExec: false, Argv: args.Argv, Envv: args.Envv, Features: k.featureSet, diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 818941762..f75ebe08a 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -66,6 +66,12 @@ type LoadArgs struct { // nil, then File will be loaded and Filename will be ignored. File *fs.File + // CloseOnExec indicates that the executable (or one of its parent + // directories) was opened with O_CLOEXEC. If the executable is an + // interpreter script, then cause an ENOENT error to occur, since the + // script would otherwise be inaccessible to the interpreter. + CloseOnExec bool + // Argv is the vector of arguments to pass to the executable. 
Argv []string @@ -279,6 +285,9 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context d.IncRef() return loaded, ac, d, args.Argv, err case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)): + if args.CloseOnExec { + return loadedELF{}, nil, nil, nil, syserror.ENOENT + } args.Filename, args.Argv, err = parseInterpreterScript(ctx, args.Filename, args.File, args.Argv) if err != nil { ctx.Infof("Error loading interpreter script: %v", err) diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 2476f8858..4115116ff 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -120,6 +120,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user var wd *fs.Dirent var executable *fs.File + var closeOnExec bool if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) { // Even if the pathname is absolute, we may still need the wd // for interpreter scripts if the path of the interpreter is @@ -127,11 +128,12 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user wd = t.FSContext().WorkingDirectory() } else { // Need to extract the given FD. 
- f := t.GetFile(dirFD) + f, fdFlags := t.FDTable().Get(dirFD) if f == nil { return 0, nil, syserror.EBADF } defer f.DecRef() + closeOnExec = fdFlags.CloseOnExec if atEmptyPath && len(pathname) == 0 { executable = f @@ -157,6 +159,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user ResolveFinal: resolveFinal, Filename: pathname, File: executable, + CloseOnExec: closeOnExec, Argv: argv, Envv: envv, Features: t.Arch().FeatureSet(), diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc index 21a5ffd40..a9067df2a 100644 --- a/test/syscalls/linux/exec.cc +++ b/test/syscalls/linux/exec.cc @@ -681,6 +681,39 @@ TEST(ExecveatTest, SymlinkNoFollowWithNormalFile) { ArgEnvExitStatus(0, 0), ""); } +TEST(ExecveatTest, BasicWithCloexecFD) { + std::string path = WorkloadPath(kBasicWorkload); + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CLOEXEC)); + + CheckExecveat(fd.get(), "", {path}, {}, AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH, + ArgEnvExitStatus(0, 0), absl::StrCat(path, "\n")); +} + +TEST(ExecveatTest, InterpreterScriptWithCloexecFD) { + std::string path = WorkloadPath(kExitScript); + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CLOEXEC)); + + int execve_errno; + ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(fd.get(), "", {path}, {}, + AT_EMPTY_PATH, /*child=*/nullptr, + &execve_errno)); + EXPECT_EQ(execve_errno, ENOENT); +} + +TEST(ExecveatTest, InterpreterScriptWithCloexecDirFD) { + std::string absolute_path = WorkloadPath(kExitScript); + std::string parent_dir = std::string(Dirname(absolute_path)); + std::string base = std::string(Basename(absolute_path)); + const FileDescriptor dirfd = + ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_CLOEXEC | O_DIRECTORY)); + + int execve_errno; + ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(dirfd.get(), base, {base}, {}, + /*flags=*/0, /*child=*/nullptr, + &execve_errno)); + EXPECT_EQ(execve_errno, ENOENT); +} + TEST(ExecveatTest, InvalidFlags) { int 
execve_errno; ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat( -- cgit v1.2.3 From d7f5e823e24501c33a377ee6c73210b00bf3d89f Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 29 Oct 2019 13:58:20 -0700 Subject: Fix grammar in comment. Missing "for". PiperOrigin-RevId: 277358513 --- pkg/sentry/kernel/task.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 11a8c6c87..9be3dae3c 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -709,7 +709,7 @@ func (t *Task) FDTable() *FDTable { return t.fdTable } -// GetFile is a convenience wrapper t.FDTable().Get. +// GetFile is a convenience wrapper for t.FDTable().Get. // // Precondition: same as FDTable.Get. func (t *Task) GetFile(fd int32) *fs.File { -- cgit v1.2.3 From a99d3479a84ca86843e500dbdf58db0af389b536 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Thu, 31 Oct 2019 18:02:04 -0700 Subject: Add context to state. PiperOrigin-RevId: 277840416 --- pkg/sentry/context/context.go | 63 +++++++++++++++++++++++--------------- pkg/sentry/kernel/context.go | 32 +++++++++++++++++++ pkg/sentry/kernel/kernel.go | 13 ++++---- pkg/sentry/pgalloc/save_restore.go | 13 ++++---- pkg/state/decode.go | 4 +++ pkg/state/encode.go | 4 +++ pkg/state/map.go | 11 +++++++ pkg/state/state.go | 7 +++-- pkg/state/state_test.go | 11 ++++--- 9 files changed, 115 insertions(+), 43 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/context/context.go b/pkg/sentry/context/context.go index dfd62cbdb..23e009ef3 100644 --- a/pkg/sentry/context/context.go +++ b/pkg/sentry/context/context.go @@ -12,10 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package context defines the sentry's Context type. +// Package context defines an internal context type. 
+// +// The given Context conforms to the standard Go context, but mandates +// additional methods that are specific to the kernel internals. Note however, +// that the Context described by this package carries additional constraints +// regarding concurrent access and retaining beyond the scope of a call. +// +// See the Context type for complete details. package context import ( + "context" + "time" + "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/log" ) @@ -59,6 +69,7 @@ func ThreadGroupIDFromContext(ctx Context) (tgid int32, ok bool) { type Context interface { log.Logger amutex.Sleeper + context.Context // UninterruptibleSleepStart indicates the beginning of an uninterruptible // sleep state (equivalent to Linux's TASK_UNINTERRUPTIBLE). If deactivate @@ -72,19 +83,36 @@ type Context interface { // AddressSpace is activated. Normally activate is the same value as the // deactivate parameter passed to UninterruptibleSleepStart. UninterruptibleSleepFinish(activate bool) +} + +// NoopSleeper is a noop implementation of amutex.Sleeper and UninterruptibleSleep +// methods for anonymous embedding in other types that do not implement sleeps. +type NoopSleeper struct { + amutex.NoopSleeper +} + +// UninterruptibleSleepStart does nothing. +func (NoopSleeper) UninterruptibleSleepStart(bool) {} + +// UninterruptibleSleepFinish does nothing. +func (NoopSleeper) UninterruptibleSleepFinish(bool) {} + +// Deadline returns zero values, meaning no deadline. +func (NoopSleeper) Deadline() (time.Time, bool) { + return time.Time{}, false +} + +// Done returns nil. +func (NoopSleeper) Done() <-chan struct{} { + return nil +} - // Value returns the value associated with this Context for key, or nil if - // no value is associated with key. Successive calls to Value with the same - // key returns the same result. - // - // A key identifies a specific value in a Context. 
Functions that wish to - // retrieve values from Context typically allocate a key in a global - // variable then use that key as the argument to Context.Value. A key can - // be any type that supports equality; packages should define keys as an - // unexported type to avoid collisions. - Value(key interface{}) interface{} +// Err returns nil. +func (NoopSleeper) Err() error { + return nil } +// logContext implements basic logging. type logContext struct { log.Logger NoopSleeper @@ -95,19 +123,6 @@ func (logContext) Value(key interface{}) interface{} { return nil } -// NoopSleeper is a noop implementation of amutex.Sleeper and -// Context.UninterruptibleSleep* methods for anonymous embedding in other types -// that do not want to notify kernel.Task about sleeps. -type NoopSleeper struct { - amutex.NoopSleeper -} - -// UninterruptibleSleepStart does nothing. -func (NoopSleeper) UninterruptibleSleepStart(bool) {} - -// UninterruptibleSleepFinish does nothing. -func (NoopSleeper) UninterruptibleSleepFinish(bool) {} - // bgContext is the context returned by context.Background. var bgContext = &logContext{Logger: log.Log()} diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index e3f5b0d83..3c9dceaba 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -15,6 +15,8 @@ package kernel import ( + "time" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" ) @@ -97,6 +99,21 @@ func TaskFromContext(ctx context.Context) *Task { return nil } +// Deadline implements context.Context.Deadline. +func (*Task) Deadline() (time.Time, bool) { + return time.Time{}, false +} + +// Done implements context.Context.Done. +func (*Task) Done() <-chan struct{} { + return nil +} + +// Err implements context.Context.Err. 
+func (*Task) Err() error { + return nil +} + // AsyncContext returns a context.Context that may be used by goroutines that // do work on behalf of t and therefore share its contextual values, but are // not t's task goroutine (e.g. asynchronous I/O). @@ -129,6 +146,21 @@ func (ctx taskAsyncContext) IsLogging(level log.Level) bool { return ctx.t.IsLogging(level) } +// Deadline implements context.Context.Deadline. +func (ctx taskAsyncContext) Deadline() (time.Time, bool) { + return ctx.t.Deadline() +} + +// Done implements context.Context.Done. +func (ctx taskAsyncContext) Done() <-chan struct{} { + return ctx.t.Done() +} + +// Err implements context.Context.Err. +func (ctx taskAsyncContext) Err() error { + return ctx.t.Err() +} + // Value implements context.Context.Value. func (ctx taskAsyncContext) Value(key interface{}) interface{} { return ctx.t.Value(key) diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index e64d648e2..28ba950bd 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -391,7 +391,7 @@ func (k *Kernel) SaveTo(w io.Writer) error { // // N.B. This will also be saved along with the full kernel save below. cpuidStart := time.Now() - if err := state.Save(w, k.FeatureSet(), nil); err != nil { + if err := state.Save(k.SupervisorContext(), w, k.FeatureSet(), nil); err != nil { return err } log.Infof("CPUID save took [%s].", time.Since(cpuidStart)) @@ -399,7 +399,7 @@ func (k *Kernel) SaveTo(w io.Writer) error { // Save the kernel state. kernelStart := time.Now() var stats state.Stats - if err := state.Save(w, k, &stats); err != nil { + if err := state.Save(k.SupervisorContext(), w, k, &stats); err != nil { return err } log.Infof("Kernel save stats: %s", &stats) @@ -407,7 +407,7 @@ func (k *Kernel) SaveTo(w io.Writer) error { // Save the memory file's state. 
memoryStart := time.Now() - if err := k.mf.SaveTo(w); err != nil { + if err := k.mf.SaveTo(k.SupervisorContext(), w); err != nil { return err } log.Infof("Memory save took [%s].", time.Since(memoryStart)) @@ -542,7 +542,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) // don't need to explicitly install it in the Kernel. cpuidStart := time.Now() var features cpuid.FeatureSet - if err := state.Load(r, &features, nil); err != nil { + if err := state.Load(k.SupervisorContext(), r, &features, nil); err != nil { return err } log.Infof("CPUID load took [%s].", time.Since(cpuidStart)) @@ -558,7 +558,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) // Load the kernel state. kernelStart := time.Now() var stats state.Stats - if err := state.Load(r, k, &stats); err != nil { + if err := state.Load(k.SupervisorContext(), r, k, &stats); err != nil { return err } log.Infof("Kernel load stats: %s", &stats) @@ -566,7 +566,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) // Load the memory file's state. memoryStart := time.Now() - if err := k.mf.LoadFrom(r); err != nil { + if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil { return err } log.Infof("Memory load took [%s].", time.Since(memoryStart)) @@ -1322,6 +1322,7 @@ func (k *Kernel) ListSockets() []*SocketEntry { return socks } +// supervisorContext is a privileged context. type supervisorContext struct { context.NoopSleeper log.Logger diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go index 1effc7735..aafce1d00 100644 --- a/pkg/sentry/pgalloc/save_restore.go +++ b/pkg/sentry/pgalloc/save_restore.go @@ -16,6 +16,7 @@ package pgalloc import ( "bytes" + "context" "fmt" "io" "runtime" @@ -29,7 +30,7 @@ import ( ) // SaveTo writes f's state to the given stream. 
-func (f *MemoryFile) SaveTo(w io.Writer) error { +func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error { // Wait for reclaim. f.mu.Lock() defer f.mu.Unlock() @@ -78,10 +79,10 @@ func (f *MemoryFile) SaveTo(w io.Writer) error { } // Save metadata. - if err := state.Save(w, &f.fileSize, nil); err != nil { + if err := state.Save(ctx, w, &f.fileSize, nil); err != nil { return err } - if err := state.Save(w, &f.usage, nil); err != nil { + if err := state.Save(ctx, w, &f.usage, nil); err != nil { return err } @@ -114,9 +115,9 @@ func (f *MemoryFile) SaveTo(w io.Writer) error { } // LoadFrom loads MemoryFile state from the given stream. -func (f *MemoryFile) LoadFrom(r io.Reader) error { +func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader) error { // Load metadata. - if err := state.Load(r, &f.fileSize, nil); err != nil { + if err := state.Load(ctx, r, &f.fileSize, nil); err != nil { return err } if err := f.file.Truncate(f.fileSize); err != nil { @@ -124,7 +125,7 @@ func (f *MemoryFile) LoadFrom(r io.Reader) error { } newMappings := make([]uintptr, f.fileSize>>chunkShift) f.mappings.Store(newMappings) - if err := state.Load(r, &f.usage, nil); err != nil { + if err := state.Load(ctx, r, &f.usage, nil); err != nil { return err } diff --git a/pkg/state/decode.go b/pkg/state/decode.go index 47e6b878a..590c241a3 100644 --- a/pkg/state/decode.go +++ b/pkg/state/decode.go @@ -16,6 +16,7 @@ package state import ( "bytes" + "context" "encoding/binary" "errors" "fmt" @@ -133,6 +134,9 @@ func (os *objectState) findCycle() []*objectState { // to ensure that all callbacks are executed, otherwise the callback graph was // not acyclic. type decodeState struct { + // ctx is the decode context. + ctx context.Context + // objectByID is the set of objects in progress. 
objectsByID map[uint64]*objectState diff --git a/pkg/state/encode.go b/pkg/state/encode.go index 5d9409a45..c5118d3a9 100644 --- a/pkg/state/encode.go +++ b/pkg/state/encode.go @@ -16,6 +16,7 @@ package state import ( "container/list" + "context" "encoding/binary" "fmt" "io" @@ -38,6 +39,9 @@ type queuedObject struct { // The encoding process is a breadth-first traversal of the object graph. The // inherent races and dependencies are much simpler than the decode case. type encodeState struct { + // ctx is the encode context. + ctx context.Context + // lastID is the last object ID. // // See idsByObject for context. Because of the special zero encoding diff --git a/pkg/state/map.go b/pkg/state/map.go index 7e6fefed4..4f3ebb0da 100644 --- a/pkg/state/map.go +++ b/pkg/state/map.go @@ -15,6 +15,7 @@ package state import ( + "context" "fmt" "reflect" "sort" @@ -219,3 +220,13 @@ func (m Map) AfterLoad(fn func()) { // data dependencies have been cleared. m.os.callbacks = append(m.os.callbacks, fn) } + +// Context returns the current context object. +func (m Map) Context() context.Context { + if m.es != nil { + return m.es.ctx + } else if m.ds != nil { + return m.ds.ctx + } + return context.Background() // No context. +} diff --git a/pkg/state/state.go b/pkg/state/state.go index d408ff84a..dbe507ab4 100644 --- a/pkg/state/state.go +++ b/pkg/state/state.go @@ -50,6 +50,7 @@ package state import ( + "context" "fmt" "io" "reflect" @@ -86,9 +87,10 @@ func UnwrapErrState(err error) error { } // Save saves the given object state. -func Save(w io.Writer, rootPtr interface{}, stats *Stats) error { +func Save(ctx context.Context, w io.Writer, rootPtr interface{}, stats *Stats) error { // Create the encoding state. es := &encodeState{ + ctx: ctx, idsByObject: make(map[uintptr]uint64), w: w, stats: stats, @@ -101,9 +103,10 @@ func Save(w io.Writer, rootPtr interface{}, stats *Stats) error { } // Load loads a checkpoint. 
-func Load(r io.Reader, rootPtr interface{}, stats *Stats) error { +func Load(ctx context.Context, r io.Reader, rootPtr interface{}, stats *Stats) error { // Create the decoding state. ds := &decodeState{ + ctx: ctx, objectsByID: make(map[uint64]*objectState), deferred: make(map[uint64]*pb.Object), r: r, diff --git a/pkg/state/state_test.go b/pkg/state/state_test.go index 7c24bbcda..d7221e9e8 100644 --- a/pkg/state/state_test.go +++ b/pkg/state/state_test.go @@ -16,6 +16,7 @@ package state import ( "bytes" + "context" "io/ioutil" "math" "reflect" @@ -46,7 +47,7 @@ func runTest(t *testing.T, tests []TestCase) { saveBuffer := &bytes.Buffer{} saveObjectPtr := reflect.New(reflect.TypeOf(root)) saveObjectPtr.Elem().Set(reflect.ValueOf(root)) - if err := Save(saveBuffer, saveObjectPtr.Interface(), nil); err != nil && !test.Fail { + if err := Save(context.Background(), saveBuffer, saveObjectPtr.Interface(), nil); err != nil && !test.Fail { t.Errorf(" FAIL: Save failed unexpectedly: %v", err) continue } else if err != nil { @@ -56,7 +57,7 @@ func runTest(t *testing.T, tests []TestCase) { // Load a new copy of the object. 
loadObjectPtr := reflect.New(reflect.TypeOf(root)) - if err := Load(bytes.NewReader(saveBuffer.Bytes()), loadObjectPtr.Interface(), nil); err != nil && !test.Fail { + if err := Load(context.Background(), bytes.NewReader(saveBuffer.Bytes()), loadObjectPtr.Interface(), nil); err != nil && !test.Fail { t.Errorf(" FAIL: Load failed unexpectedly: %v", err) continue } else if err != nil { @@ -624,7 +625,7 @@ func BenchmarkEncoding(b *testing.B) { bs := buildObject(b.N) var stats Stats b.StartTimer() - if err := Save(ioutil.Discard, bs, &stats); err != nil { + if err := Save(context.Background(), ioutil.Discard, bs, &stats); err != nil { b.Errorf("save failed: %v", err) } b.StopTimer() @@ -638,12 +639,12 @@ func BenchmarkDecoding(b *testing.B) { bs := buildObject(b.N) var newBS benchStruct buf := &bytes.Buffer{} - if err := Save(buf, bs, nil); err != nil { + if err := Save(context.Background(), buf, bs, nil); err != nil { b.Errorf("save failed: %v", err) } var stats Stats b.StartTimer() - if err := Load(buf, &newBS, &stats); err != nil { + if err := Load(context.Background(), buf, &newBS, &stats); err != nil { b.Errorf("load failed: %v", err) } b.StopTimer() -- cgit v1.2.3 From 1d8b7292d72ce93d465e4ded19237fb92c08bc56 Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Mon, 11 Nov 2019 09:42:04 +0000 Subject: Fix some build errors on arm64. Initialize the VDSO "os" and "arch" fields explicitly, or the VDSO load process would fail on the arm64 platform. 
Signed-off-by: Haibo Xu Change-Id: Ic6768df88e43cd7c7956eb630511672ae11ac52f --- pkg/sentry/kernel/ptrace_arm64.go | 1 - pkg/sentry/loader/vdso.go | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go index 0acdf769d..61e412911 100644 --- a/pkg/sentry/kernel/ptrace_arm64.go +++ b/pkg/sentry/kernel/ptrace_arm64.go @@ -17,7 +17,6 @@ package kernel import ( - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index ada28aea3..df8a81907 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -268,6 +268,8 @@ func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, er // some applications may not be able to handle multiple [vdso] // hints. vdso: mm.NewSpecialMappable("", mfp, vdso), + os: info.os, + arch: info.arch, phdrs: info.phdrs, }, nil } -- cgit v1.2.3 From c0f89eba6ebdec08460bd796fc62d6aef674d141 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Thu, 21 Nov 2019 11:29:49 -0800 Subject: Import and structure cleanup. 
PiperOrigin-RevId: 281795269 --- pkg/eventchannel/BUILD | 1 + pkg/flipcall/BUILD | 2 +- pkg/flipcall/flipcall_unsafe.go | 10 +- pkg/sentry/BUILD | 3 + pkg/sentry/fs/BUILD | 2 +- pkg/sentry/fs/fdpipe/pipe_opener_test.go | 1 + pkg/sentry/fs/overlay.go | 4 +- pkg/sentry/fsimpl/memfs/BUILD | 3 +- pkg/sentry/kernel/BUILD | 4 +- pkg/sentry/kernel/auth/BUILD | 2 +- pkg/sentry/kernel/futex/BUILD | 2 +- pkg/sentry/kernel/signalfd/BUILD | 4 +- pkg/sentry/kernel/task.go | 4 +- pkg/sentry/mm/BUILD | 2 +- pkg/sentry/mm/mm.go | 6 +- pkg/sentry/strace/strace.proto | 3 +- pkg/sentry/time/BUILD | 4 +- pkg/sentry/vfs/BUILD | 2 +- pkg/sentry/vfs/mount_unsafe.go | 4 +- pkg/state/object.proto | 56 ++++---- pkg/syncutil/BUILD | 54 ++++++++ pkg/syncutil/LICENSE | 27 ++++ pkg/syncutil/README.md | 5 + pkg/syncutil/atomicptr_unsafe.go | 47 +++++++ pkg/syncutil/atomicptrtest/BUILD | 29 ++++ pkg/syncutil/atomicptrtest/atomicptr_test.go | 31 +++++ pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go | 21 +++ pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go | 16 +++ pkg/syncutil/downgradable_rwmutex_test.go | 150 ++++++++++++++++++++ pkg/syncutil/downgradable_rwmutex_unsafe.go | 143 +++++++++++++++++++ pkg/syncutil/memmove_unsafe.go | 28 ++++ pkg/syncutil/norace_unsafe.go | 35 +++++ pkg/syncutil/race_unsafe.go | 41 ++++++ pkg/syncutil/seqatomic_unsafe.go | 72 ++++++++++ pkg/syncutil/seqatomictest/BUILD | 35 +++++ pkg/syncutil/seqatomictest/seqatomic_test.go | 132 ++++++++++++++++++ pkg/syncutil/seqcount.go | 149 ++++++++++++++++++++ pkg/syncutil/seqcount_test.go | 153 +++++++++++++++++++++ pkg/syncutil/syncutil.go | 7 + test/syscalls/linux/accept_bind.cc | 2 + test/syscalls/linux/accept_bind_stream.cc | 2 + test/syscalls/linux/chmod.cc | 1 + test/syscalls/linux/chroot.cc | 1 + test/syscalls/linux/clock_gettime.cc | 1 + test/syscalls/linux/concurrency.cc | 1 + test/syscalls/linux/exec_binary.cc | 1 + test/syscalls/linux/file_base.h | 1 + test/syscalls/linux/flock.cc | 1 + test/syscalls/linux/fork.cc 
| 1 + test/syscalls/linux/getdents.cc | 1 + test/syscalls/linux/ip_socket_test_util.cc | 5 +- test/syscalls/linux/memory_accounting.cc | 1 + test/syscalls/linux/mlock.cc | 1 + test/syscalls/linux/mmap.cc | 1 + test/syscalls/linux/mount.cc | 1 + test/syscalls/linux/read.cc | 1 + test/syscalls/linux/rename.cc | 1 + test/syscalls/linux/seccomp.cc | 1 + test/syscalls/linux/select.cc | 1 + test/syscalls/linux/shm.cc | 1 - test/syscalls/linux/socket_blocking.cc | 1 + test/syscalls/linux/socket_ip_loopback_blocking.cc | 1 + .../linux/socket_ip_tcp_generic_loopback.cc | 1 + .../linux/socket_ip_tcp_loopback_blocking.cc | 1 + .../linux/socket_ip_tcp_loopback_nonblock.cc | 1 + .../socket_ipv4_tcp_unbound_external_networking.cc | 1 + ...et_ipv4_tcp_unbound_external_networking_test.cc | 3 +- ...et_ipv4_udp_unbound_external_networking_test.cc | 3 +- test/syscalls/linux/socket_netlink_util.cc | 4 +- test/syscalls/linux/socket_unix_blocking_local.cc | 3 +- test/syscalls/linux/socket_unix_dgram.cc | 1 + .../linux/socket_unix_dgram_non_blocking.cc | 1 + .../linux/socket_unix_non_stream_blocking_local.cc | 3 +- test/syscalls/linux/socket_unix_seqpacket.cc | 1 + .../linux/socket_unix_stream_blocking_local.cc | 3 +- .../linux/socket_unix_stream_nonblock_local.cc | 3 +- .../syscalls/linux/socket_unix_unbound_abstract.cc | 1 + .../linux/socket_unix_unbound_filesystem.cc | 1 + .../linux/socket_unix_unbound_seqpacket.cc | 1 + test/syscalls/linux/socket_unix_unbound_stream.cc | 1 + test/syscalls/linux/sync.cc | 3 +- test/syscalls/linux/truncate.cc | 1 + .../syscalls/linux/unix_domain_socket_test_util.cc | 1 + test/syscalls/linux/unix_domain_socket_test_util.h | 1 + test/syscalls/linux/utimes.cc | 1 + test/syscalls/linux/vdso_clock_gettime.cc | 1 + test/util/fs_util_test.cc | 4 +- test/util/mount_util.h | 1 + test/util/posix_error_test.cc | 1 + test/util/rlimit_util.cc | 1 + test/util/signal_util.cc | 1 + test/util/signal_util.h | 1 + test/util/temp_path.h | 1 + test/util/test_util_test.cc | 
1 + third_party/gvsync/BUILD | 53 ------- third_party/gvsync/LICENSE | 27 ---- third_party/gvsync/README.md | 3 - third_party/gvsync/atomicptr_unsafe.go | 47 ------- third_party/gvsync/atomicptrtest/BUILD | 28 ---- third_party/gvsync/atomicptrtest/atomicptr_test.go | 31 ----- .../gvsync/downgradable_rwmutex_1_12_unsafe.go | 21 --- .../gvsync/downgradable_rwmutex_1_13_unsafe.go | 16 --- third_party/gvsync/downgradable_rwmutex_test.go | 150 -------------------- third_party/gvsync/downgradable_rwmutex_unsafe.go | 143 ------------------- third_party/gvsync/gvsync.go | 7 - third_party/gvsync/memmove_unsafe.go | 28 ---- third_party/gvsync/norace_unsafe.go | 35 ----- third_party/gvsync/race_unsafe.go | 41 ------ third_party/gvsync/seqatomic_unsafe.go | 72 ---------- third_party/gvsync/seqatomictest/BUILD | 34 ----- third_party/gvsync/seqatomictest/seqatomic_test.go | 132 ------------------ third_party/gvsync/seqcount.go | 149 -------------------- third_party/gvsync/seqcount_test.go | 153 --------------------- tools/go_marshal/test/BUILD | 3 +- tools/go_marshal/test/external/BUILD | 4 +- 115 files changed, 1302 insertions(+), 1250 deletions(-) create mode 100644 pkg/syncutil/BUILD create mode 100644 pkg/syncutil/LICENSE create mode 100644 pkg/syncutil/README.md create mode 100644 pkg/syncutil/atomicptr_unsafe.go create mode 100644 pkg/syncutil/atomicptrtest/BUILD create mode 100644 pkg/syncutil/atomicptrtest/atomicptr_test.go create mode 100644 pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go create mode 100644 pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go create mode 100644 pkg/syncutil/downgradable_rwmutex_test.go create mode 100644 pkg/syncutil/downgradable_rwmutex_unsafe.go create mode 100644 pkg/syncutil/memmove_unsafe.go create mode 100644 pkg/syncutil/norace_unsafe.go create mode 100644 pkg/syncutil/race_unsafe.go create mode 100644 pkg/syncutil/seqatomic_unsafe.go create mode 100644 pkg/syncutil/seqatomictest/BUILD create mode 100644 
pkg/syncutil/seqatomictest/seqatomic_test.go create mode 100644 pkg/syncutil/seqcount.go create mode 100644 pkg/syncutil/seqcount_test.go create mode 100644 pkg/syncutil/syncutil.go delete mode 100644 third_party/gvsync/BUILD delete mode 100644 third_party/gvsync/LICENSE delete mode 100644 third_party/gvsync/README.md delete mode 100644 third_party/gvsync/atomicptr_unsafe.go delete mode 100644 third_party/gvsync/atomicptrtest/BUILD delete mode 100644 third_party/gvsync/atomicptrtest/atomicptr_test.go delete mode 100644 third_party/gvsync/downgradable_rwmutex_1_12_unsafe.go delete mode 100644 third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go delete mode 100644 third_party/gvsync/downgradable_rwmutex_test.go delete mode 100644 third_party/gvsync/downgradable_rwmutex_unsafe.go delete mode 100644 third_party/gvsync/gvsync.go delete mode 100644 third_party/gvsync/memmove_unsafe.go delete mode 100644 third_party/gvsync/norace_unsafe.go delete mode 100644 third_party/gvsync/race_unsafe.go delete mode 100644 third_party/gvsync/seqatomic_unsafe.go delete mode 100644 third_party/gvsync/seqatomictest/BUILD delete mode 100644 third_party/gvsync/seqatomictest/seqatomic_test.go delete mode 100644 third_party/gvsync/seqcount.go delete mode 100644 third_party/gvsync/seqcount_test.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD index 71f2abc83..0b4b7cc44 100644 --- a/pkg/eventchannel/BUILD +++ b/pkg/eventchannel/BUILD @@ -25,6 +25,7 @@ go_library( proto_library( name = "eventchannel_proto", srcs = ["event.proto"], + visibility = ["//:sandbox"], ) go_proto_library( diff --git a/pkg/flipcall/BUILD b/pkg/flipcall/BUILD index 5643d5f26..e590a71ba 100644 --- a/pkg/flipcall/BUILD +++ b/pkg/flipcall/BUILD @@ -19,7 +19,7 @@ go_library( "//pkg/abi/linux", "//pkg/log", "//pkg/memutil", - "//third_party/gvsync", + "//pkg/syncutil", ], ) diff --git a/pkg/flipcall/flipcall_unsafe.go b/pkg/flipcall/flipcall_unsafe.go index 
a37952637..27b8939fc 100644 --- a/pkg/flipcall/flipcall_unsafe.go +++ b/pkg/flipcall/flipcall_unsafe.go @@ -18,7 +18,7 @@ import ( "reflect" "unsafe" - "gvisor.dev/gvisor/third_party/gvsync" + "gvisor.dev/gvisor/pkg/syncutil" ) // Packets consist of a 16-byte header followed by an arbitrarily-sized @@ -75,13 +75,13 @@ func (ep *Endpoint) Data() []byte { var ioSync int64 func raceBecomeActive() { - if gvsync.RaceEnabled { - gvsync.RaceAcquire((unsafe.Pointer)(&ioSync)) + if syncutil.RaceEnabled { + syncutil.RaceAcquire((unsafe.Pointer)(&ioSync)) } } func raceBecomeInactive() { - if gvsync.RaceEnabled { - gvsync.RaceReleaseMerge((unsafe.Pointer)(&ioSync)) + if syncutil.RaceEnabled { + syncutil.RaceReleaseMerge((unsafe.Pointer)(&ioSync)) } } diff --git a/pkg/sentry/BUILD b/pkg/sentry/BUILD index 2d6379c86..2a7122957 100644 --- a/pkg/sentry/BUILD +++ b/pkg/sentry/BUILD @@ -10,5 +10,8 @@ package_group( "//runsc/...", # Code generated by go_marshal relies on go_marshal libraries. "//tools/go_marshal/...", + + # Keep the old paths as a temporary measure. 
+ "//third_party/golang/gvisor/pkg/sentry/...", ], ) diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 378602cc9..c035ffff7 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -68,9 +68,9 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/state", + "//pkg/syncutil", "//pkg/syserror", "//pkg/waiter", - "//third_party/gvsync", ], ) diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go index 8e4d839e1..577445148 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go @@ -25,6 +25,7 @@ import ( "time" "github.com/google/uuid" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index 1d3ff39e0..25573e986 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syncutil" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/third_party/gvsync" ) // The virtual filesystem implements an overlay configuration. For a high-level @@ -199,7 +199,7 @@ type overlayEntry struct { upper *Inode // dirCacheMu protects dirCache. - dirCacheMu gvsync.DowngradableRWMutex `state:"nosave"` + dirCacheMu syncutil.DowngradableRWMutex `state:"nosave"` // dirCache is cache of DentAttrs from upper and lower Inodes. 
dirCache *SortedDentryMap diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD index 04d667273..952b20c51 100644 --- a/pkg/sentry/fsimpl/memfs/BUILD +++ b/pkg/sentry/fsimpl/memfs/BUILD @@ -1,10 +1,9 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) -load("//tools/go_generics:defs.bzl", "go_template_instance") - go_template_instance( name = "dentry_list", out = "dentry_list.go", diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index e041c51b3..2706927ff 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -35,7 +35,7 @@ go_template_instance( out = "seqatomic_taskgoroutineschedinfo_unsafe.go", package = "kernel", suffix = "TaskGoroutineSchedInfo", - template = "//third_party/gvsync:generic_seqatomic", + template = "//pkg/syncutil:generic_seqatomic", types = { "Value": "TaskGoroutineSchedInfo", }, @@ -209,12 +209,12 @@ go_library( "//pkg/sentry/usermem", "//pkg/state", "//pkg/state/statefile", + "//pkg/syncutil", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/stack", "//pkg/waiter", - "//third_party/gvsync", ], ) diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 51de4568a..04c244447 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -8,7 +8,7 @@ go_template_instance( out = "atomicptr_credentials_unsafe.go", package = "auth", suffix = "Credentials", - template = "//third_party/gvsync:generic_atomicptr", + template = "//pkg/syncutil:generic_atomicptr", types = { "Value": "Credentials", }, diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index 34286c7a8..75ec31761 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -9,7 +9,7 @@ go_template_instance( out = "atomicptr_bucket_unsafe.go", package = "futex", suffix = "Bucket", - template = 
"//third_party/gvsync:generic_atomicptr", + template = "//pkg/syncutil:generic_atomicptr", types = { "Value": "bucket", }, diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD index 50b69d154..9f7e19b4d 100644 --- a/pkg/sentry/kernel/signalfd/BUILD +++ b/pkg/sentry/kernel/signalfd/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "signalfd", srcs = ["signalfd.go"], diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 9be3dae3c..80c8e5464 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -35,8 +35,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syncutil" "gvisor.dev/gvisor/pkg/waiter" - "gvisor.dev/gvisor/third_party/gvsync" ) // Task represents a thread of execution in the untrusted app. It @@ -83,7 +83,7 @@ type Task struct { // // gosched is protected by goschedSeq. gosched is owned by the task // goroutine. 
- goschedSeq gvsync.SeqCount `state:"nosave"` + goschedSeq syncutil.SeqCount `state:"nosave"` gosched TaskGoroutineSchedInfo // yieldCount is the number of times the task goroutine has called diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index a804b8b5c..839931f67 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -118,9 +118,9 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/syncutil", "//pkg/syserror", "//pkg/tcpip/buffer", - "//third_party/gvsync", ], ) diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index f350e0109..58a5c186d 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -44,7 +44,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/third_party/gvsync" + "gvisor.dev/gvisor/pkg/syncutil" ) // MemoryManager implements a virtual address space. @@ -82,7 +82,7 @@ type MemoryManager struct { users int32 // mappingMu is analogous to Linux's struct mm_struct::mmap_sem. - mappingMu gvsync.DowngradableRWMutex `state:"nosave"` + mappingMu syncutil.DowngradableRWMutex `state:"nosave"` // vmas stores virtual memory areas. Since vmas are stored by value, // clients should usually use vmaIterator.ValuePtr() instead of @@ -125,7 +125,7 @@ type MemoryManager struct { // activeMu is loosely analogous to Linux's struct // mm_struct::page_table_lock. - activeMu gvsync.DowngradableRWMutex `state:"nosave"` + activeMu syncutil.DowngradableRWMutex `state:"nosave"` // pmas stores platform mapping areas used to implement vmas. 
Since pmas // are stored by value, clients should usually use pmaIterator.ValuePtr() diff --git a/pkg/sentry/strace/strace.proto b/pkg/sentry/strace/strace.proto index 4b2f73a5f..906c52c51 100644 --- a/pkg/sentry/strace/strace.proto +++ b/pkg/sentry/strace/strace.proto @@ -32,8 +32,7 @@ message Strace { } } -message StraceEnter { -} +message StraceEnter {} message StraceExit { // Return value formatted as string. diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index d3a4cd943..18e212dff 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -9,7 +9,7 @@ go_template_instance( out = "seqatomic_parameters_unsafe.go", package = "time", suffix = "Parameters", - template = "//third_party/gvsync:generic_seqatomic", + template = "//pkg/syncutil:generic_seqatomic", types = { "Value": "Parameters", }, @@ -36,8 +36,8 @@ go_library( deps = [ "//pkg/log", "//pkg/metric", + "//pkg/syncutil", "//pkg/syserror", - "//third_party/gvsync", ], ) diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 4f2c2de9f..74a325309 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -33,9 +33,9 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/usermem", + "//pkg/syncutil", "//pkg/syserror", "//pkg/waiter", - "//third_party/gvsync", ], ) diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index 75e6c7dfa..c98b42f91 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -26,7 +26,7 @@ import ( "sync/atomic" "unsafe" - "gvisor.dev/gvisor/third_party/gvsync" + "gvisor.dev/gvisor/pkg/syncutil" ) // mountKey represents the location at which a Mount is mounted. It is @@ -72,7 +72,7 @@ type mountTable struct { // intrinsics and inline assembly, limiting the performance of this // approach.) 
- seq gvsync.SeqCount + seq syncutil.SeqCount seed uint32 // for hashing keys // size holds both length (number of elements) and capacity (number of diff --git a/pkg/state/object.proto b/pkg/state/object.proto index 952289069..5ebcfb151 100644 --- a/pkg/state/object.proto +++ b/pkg/state/object.proto @@ -18,8 +18,8 @@ package gvisor.state.statefile; // Slice is a slice value. message Slice { - uint32 length = 1; - uint32 capacity = 2; + uint32 length = 1; + uint32 capacity = 2; uint64 ref_value = 3; } @@ -30,13 +30,13 @@ message Array { // Map is a map value. message Map { - repeated Object keys = 1; + repeated Object keys = 1; repeated Object values = 2; } // Interface is an interface value. message Interface { - string type = 1; + string type = 1; Object value = 2; } @@ -47,7 +47,7 @@ message Struct { // Field encodes a single field. message Field { - string name = 1; + string name = 1; Object value = 2; } @@ -113,28 +113,28 @@ message Float32s { // Note that ref_value references an Object.id, below. 
message Object { oneof value { - bool bool_value = 1; - bytes string_value = 2; - int64 int64_value = 3; - uint64 uint64_value = 4; - double double_value = 5; - uint64 ref_value = 6; - Slice slice_value = 7; - Array array_value = 8; - Interface interface_value = 9; - Struct struct_value = 10; - Map map_value = 11; - bytes byte_array_value = 12; - Uint16s uint16_array_value = 13; - Uint32s uint32_array_value = 14; - Uint64s uint64_array_value = 15; - Uintptrs uintptr_array_value = 16; - Int8s int8_array_value = 17; - Int16s int16_array_value = 18; - Int32s int32_array_value = 19; - Int64s int64_array_value = 20; - Bools bool_array_value = 21; - Float64s float64_array_value = 22; - Float32s float32_array_value = 23; + bool bool_value = 1; + bytes string_value = 2; + int64 int64_value = 3; + uint64 uint64_value = 4; + double double_value = 5; + uint64 ref_value = 6; + Slice slice_value = 7; + Array array_value = 8; + Interface interface_value = 9; + Struct struct_value = 10; + Map map_value = 11; + bytes byte_array_value = 12; + Uint16s uint16_array_value = 13; + Uint32s uint32_array_value = 14; + Uint64s uint64_array_value = 15; + Uintptrs uintptr_array_value = 16; + Int8s int8_array_value = 17; + Int16s int16_array_value = 18; + Int32s int32_array_value = 19; + Int64s int64_array_value = 20; + Bools bool_array_value = 21; + Float64s float64_array_value = 22; + Float32s float32_array_value = 23; } } diff --git a/pkg/syncutil/BUILD b/pkg/syncutil/BUILD new file mode 100644 index 000000000..b06a90bef --- /dev/null +++ b/pkg/syncutil/BUILD @@ -0,0 +1,54 @@ +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_generics:defs.bzl", "go_template") + +package( + default_visibility = ["//:sandbox"], + licenses = ["notice"], +) + +exports_files(["LICENSE"]) + +go_template( + name = "generic_atomicptr", + srcs = ["atomicptr_unsafe.go"], + types = [ + "Value", + ], +) + +go_template( + name = "generic_seqatomic", 
+ srcs = ["seqatomic_unsafe.go"], + types = [ + "Value", + ], + deps = [ + ":sync", + ], +) + +go_library( + name = "syncutil", + srcs = [ + "downgradable_rwmutex_1_12_unsafe.go", + "downgradable_rwmutex_1_13_unsafe.go", + "downgradable_rwmutex_unsafe.go", + "memmove_unsafe.go", + "norace_unsafe.go", + "race_unsafe.go", + "seqcount.go", + "syncutil.go", + ], + importpath = "gvisor.dev/gvisor/pkg/syncutil", +) + +go_test( + name = "syncutil_test", + size = "small", + srcs = [ + "downgradable_rwmutex_test.go", + "seqcount_test.go", + ], + embed = [":syncutil"], +) diff --git a/pkg/syncutil/LICENSE b/pkg/syncutil/LICENSE new file mode 100644 index 000000000..6a66aea5e --- /dev/null +++ b/pkg/syncutil/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pkg/syncutil/README.md b/pkg/syncutil/README.md new file mode 100644 index 000000000..2183c4e20 --- /dev/null +++ b/pkg/syncutil/README.md @@ -0,0 +1,5 @@ +# Syncutil + +This package provides additional synchronization primitives not provided by the +Go stdlib 'sync' package. It is partially derived from the upstream 'sync' +package from go1.10. diff --git a/pkg/syncutil/atomicptr_unsafe.go b/pkg/syncutil/atomicptr_unsafe.go new file mode 100644 index 000000000..525c4beed --- /dev/null +++ b/pkg/syncutil/atomicptr_unsafe.go @@ -0,0 +1,47 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package template doesn't exist. This file must be instantiated using the +// go_template_instance rule in tools/go_generics/defs.bzl. +package template + +import ( + "sync/atomic" + "unsafe" +) + +// Value is a required type parameter. +type Value struct{} + +// An AtomicPtr is a pointer to a value of type Value that can be atomically +// loaded and stored. The zero value of an AtomicPtr represents nil. +// +// Note that copying AtomicPtr by value performs a non-atomic read of the +// stored pointer, which is unsafe if Store() can be called concurrently; in +// this case, do `dst.Store(src.Load())` instead. 
+// +// +stateify savable +type AtomicPtr struct { + ptr unsafe.Pointer `state:".(*Value)"` +} + +func (p *AtomicPtr) savePtr() *Value { + return p.Load() +} + +func (p *AtomicPtr) loadPtr(v *Value) { + p.Store(v) +} + +// Load returns the value set by the most recent Store. It returns nil if there +// has been no previous call to Store. +func (p *AtomicPtr) Load() *Value { + return (*Value)(atomic.LoadPointer(&p.ptr)) +} + +// Store sets the value returned by Load to x. +func (p *AtomicPtr) Store(x *Value) { + atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x)) +} diff --git a/pkg/syncutil/atomicptrtest/BUILD b/pkg/syncutil/atomicptrtest/BUILD new file mode 100644 index 000000000..63f411a90 --- /dev/null +++ b/pkg/syncutil/atomicptrtest/BUILD @@ -0,0 +1,29 @@ +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "atomicptr_int", + out = "atomicptr_int_unsafe.go", + package = "atomicptr", + suffix = "Int", + template = "//pkg/syncutil:generic_atomicptr", + types = { + "Value": "int", + }, +) + +go_library( + name = "atomicptr", + srcs = ["atomicptr_int_unsafe.go"], + importpath = "gvisor.dev/gvisor/pkg/syncutil/atomicptr", +) + +go_test( + name = "atomicptr_test", + size = "small", + srcs = ["atomicptr_test.go"], + embed = [":atomicptr"], +) diff --git a/pkg/syncutil/atomicptrtest/atomicptr_test.go b/pkg/syncutil/atomicptrtest/atomicptr_test.go new file mode 100644 index 000000000..8fdc5112e --- /dev/null +++ b/pkg/syncutil/atomicptrtest/atomicptr_test.go @@ -0,0 +1,31 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package atomicptr + +import ( + "testing" +) + +func newInt(val int) *int { + return &val +} + +func TestAtomicPtr(t *testing.T) { + var p AtomicPtrInt + if got := p.Load(); got != nil { + t.Errorf("initial value is %p (%v), wanted nil", got, got) + } + want := newInt(42) + p.Store(want) + if got := p.Load(); got != want { + t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want) + } + want = newInt(100) + p.Store(want) + if got := p.Load(); got != want { + t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want) + } +} diff --git a/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go b/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go new file mode 100644 index 000000000..7c6336e62 --- /dev/null +++ b/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go @@ -0,0 +1,21 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Copyright 2019 The gVisor Authors. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.12 +// +build !go1.13 + +// TODO(b/133868570): Delete once Go 1.12 is no longer supported. + +package syncutil + +import _ "unsafe" + +//go:linkname runtimeSemrelease112 sync.runtime_Semrelease +func runtimeSemrelease112(s *uint32, handoff bool) + +func runtimeSemrelease(s *uint32, handoff bool, skipframes int) { + // 'skipframes' is only available starting from 1.13. + runtimeSemrelease112(s, handoff) +} diff --git a/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go b/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go new file mode 100644 index 000000000..3c3673119 --- /dev/null +++ b/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go @@ -0,0 +1,16 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Copyright 2019 The gVisor Authors. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.13 +// +build !go1.15 + +// Check go:linkname function signatures when updating Go version. 
+ +package syncutil + +import _ "unsafe" + +//go:linkname runtimeSemrelease sync.runtime_Semrelease +func runtimeSemrelease(s *uint32, handoff bool, skipframes int) diff --git a/pkg/syncutil/downgradable_rwmutex_test.go b/pkg/syncutil/downgradable_rwmutex_test.go new file mode 100644 index 000000000..ffaf7ecc7 --- /dev/null +++ b/pkg/syncutil/downgradable_rwmutex_test.go @@ -0,0 +1,150 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Copyright 2019 The gVisor Authors. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// GOMAXPROCS=10 go test + +// Copy/pasted from the standard library's sync/rwmutex_test.go, except for the +// addition of downgradingWriter and the renaming of num_iterations to +// numIterations to shut up Golint. + +package syncutil + +import ( + "fmt" + "runtime" + "sync/atomic" + "testing" +) + +func parallelReader(m *DowngradableRWMutex, clocked, cunlock, cdone chan bool) { + m.RLock() + clocked <- true + <-cunlock + m.RUnlock() + cdone <- true +} + +func doTestParallelReaders(numReaders, gomaxprocs int) { + runtime.GOMAXPROCS(gomaxprocs) + var m DowngradableRWMutex + clocked := make(chan bool) + cunlock := make(chan bool) + cdone := make(chan bool) + for i := 0; i < numReaders; i++ { + go parallelReader(&m, clocked, cunlock, cdone) + } + // Wait for all parallel RLock()s to succeed. + for i := 0; i < numReaders; i++ { + <-clocked + } + for i := 0; i < numReaders; i++ { + cunlock <- true + } + // Wait for the goroutines to finish. 
+ for i := 0; i < numReaders; i++ { + <-cdone + } +} + +func TestParallelReaders(t *testing.T) { + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1)) + doTestParallelReaders(1, 4) + doTestParallelReaders(3, 4) + doTestParallelReaders(4, 2) +} + +func reader(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { + for i := 0; i < numIterations; i++ { + rwm.RLock() + n := atomic.AddInt32(activity, 1) + if n < 1 || n >= 10000 { + panic(fmt.Sprintf("wlock(%d)\n", n)) + } + for i := 0; i < 100; i++ { + } + atomic.AddInt32(activity, -1) + rwm.RUnlock() + } + cdone <- true +} + +func writer(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { + for i := 0; i < numIterations; i++ { + rwm.Lock() + n := atomic.AddInt32(activity, 10000) + if n != 10000 { + panic(fmt.Sprintf("wlock(%d)\n", n)) + } + for i := 0; i < 100; i++ { + } + atomic.AddInt32(activity, -10000) + rwm.Unlock() + } + cdone <- true +} + +func downgradingWriter(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { + for i := 0; i < numIterations; i++ { + rwm.Lock() + n := atomic.AddInt32(activity, 10000) + if n != 10000 { + panic(fmt.Sprintf("wlock(%d)\n", n)) + } + for i := 0; i < 100; i++ { + } + atomic.AddInt32(activity, -10000) + rwm.DowngradeLock() + n = atomic.AddInt32(activity, 1) + if n < 1 || n >= 10000 { + panic(fmt.Sprintf("wlock(%d)\n", n)) + } + for i := 0; i < 100; i++ { + } + n = atomic.AddInt32(activity, -1) + rwm.RUnlock() + } + cdone <- true +} + +func HammerDowngradableRWMutex(gomaxprocs, numReaders, numIterations int) { + runtime.GOMAXPROCS(gomaxprocs) + // Number of active readers + 10000 * number of active writers. 
+ var activity int32 + var rwm DowngradableRWMutex + cdone := make(chan bool) + go writer(&rwm, numIterations, &activity, cdone) + go downgradingWriter(&rwm, numIterations, &activity, cdone) + var i int + for i = 0; i < numReaders/2; i++ { + go reader(&rwm, numIterations, &activity, cdone) + } + go writer(&rwm, numIterations, &activity, cdone) + go downgradingWriter(&rwm, numIterations, &activity, cdone) + for ; i < numReaders; i++ { + go reader(&rwm, numIterations, &activity, cdone) + } + // Wait for the 4 writers and all readers to finish. + for i := 0; i < 4+numReaders; i++ { + <-cdone + } +} + +func TestDowngradableRWMutex(t *testing.T) { + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1)) + n := 1000 + if testing.Short() { + n = 5 + } + HammerDowngradableRWMutex(1, 1, n) + HammerDowngradableRWMutex(1, 3, n) + HammerDowngradableRWMutex(1, 10, n) + HammerDowngradableRWMutex(4, 1, n) + HammerDowngradableRWMutex(4, 3, n) + HammerDowngradableRWMutex(4, 10, n) + HammerDowngradableRWMutex(10, 1, n) + HammerDowngradableRWMutex(10, 3, n) + HammerDowngradableRWMutex(10, 10, n) + HammerDowngradableRWMutex(10, 5, n) +} diff --git a/pkg/syncutil/downgradable_rwmutex_unsafe.go b/pkg/syncutil/downgradable_rwmutex_unsafe.go new file mode 100644 index 000000000..07feca402 --- /dev/null +++ b/pkg/syncutil/downgradable_rwmutex_unsafe.go @@ -0,0 +1,143 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Copyright 2019 The gVisor Authors. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.12 +// +build !go1.15 + +// Check go:linkname function signatures when updating Go version. + +// This is mostly copied from the standard library's sync/rwmutex.go. 
+// +// Happens-before relationships indicated to the race detector: +// - Unlock -> Lock (via writerSem) +// - Unlock -> RLock (via readerSem) +// - RUnlock -> Lock (via writerSem) +// - DowngradeLock -> RLock (via readerSem) + +package syncutil + +import ( + "sync" + "sync/atomic" + "unsafe" +) + +//go:linkname runtimeSemacquire sync.runtime_Semacquire +func runtimeSemacquire(s *uint32) + +// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock +// method. +type DowngradableRWMutex struct { + w sync.Mutex // held if there are pending writers + writerSem uint32 // semaphore for writers to wait for completing readers + readerSem uint32 // semaphore for readers to wait for completing writers + readerCount int32 // number of pending readers + readerWait int32 // number of departing readers +} + +const rwmutexMaxReaders = 1 << 30 + +// RLock locks rw for reading. +func (rw *DowngradableRWMutex) RLock() { + if RaceEnabled { + RaceDisable() + } + if atomic.AddInt32(&rw.readerCount, 1) < 0 { + // A writer is pending, wait for it. + runtimeSemacquire(&rw.readerSem) + } + if RaceEnabled { + RaceEnable() + RaceAcquire(unsafe.Pointer(&rw.readerSem)) + } +} + +// RUnlock undoes a single RLock call. +func (rw *DowngradableRWMutex) RUnlock() { + if RaceEnabled { + RaceReleaseMerge(unsafe.Pointer(&rw.writerSem)) + RaceDisable() + } + if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 { + if r+1 == 0 || r+1 == -rwmutexMaxReaders { + panic("RUnlock of unlocked DowngradableRWMutex") + } + // A writer is pending. + if atomic.AddInt32(&rw.readerWait, -1) == 0 { + // The last reader unblocks the writer. + runtimeSemrelease(&rw.writerSem, false, 0) + } + } + if RaceEnabled { + RaceEnable() + } +} + +// Lock locks rw for writing. +func (rw *DowngradableRWMutex) Lock() { + if RaceEnabled { + RaceDisable() + } + // First, resolve competition with other writers. + rw.w.Lock() + // Announce to readers there is a pending writer. 
+ r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders + // Wait for active readers. + if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 { + runtimeSemacquire(&rw.writerSem) + } + if RaceEnabled { + RaceEnable() + RaceAcquire(unsafe.Pointer(&rw.writerSem)) + } +} + +// Unlock unlocks rw for writing. +func (rw *DowngradableRWMutex) Unlock() { + if RaceEnabled { + RaceRelease(unsafe.Pointer(&rw.writerSem)) + RaceRelease(unsafe.Pointer(&rw.readerSem)) + RaceDisable() + } + // Announce to readers there is no active writer. + r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders) + if r >= rwmutexMaxReaders { + panic("Unlock of unlocked DowngradableRWMutex") + } + // Unblock blocked readers, if any. + for i := 0; i < int(r); i++ { + runtimeSemrelease(&rw.readerSem, false, 0) + } + // Allow other writers to proceed. + rw.w.Unlock() + if RaceEnabled { + RaceEnable() + } +} + +// DowngradeLock atomically unlocks rw for writing and locks it for reading. +func (rw *DowngradableRWMutex) DowngradeLock() { + if RaceEnabled { + RaceRelease(unsafe.Pointer(&rw.readerSem)) + RaceDisable() + } + // Announce to readers there is no active writer and one additional reader. + r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1) + if r >= rwmutexMaxReaders+1 { + panic("DowngradeLock of unlocked DowngradableRWMutex") + } + // Unblock blocked readers, if any. Note that this loop starts at 1 since r + // includes this goroutine. + for i := 1; i < int(r); i++ { + runtimeSemrelease(&rw.readerSem, false, 0) + } + // Allow other writers to proceed to rw.w.Lock(). Note that they will still + // block on rw.writerSem since at least this reader exists, such that + // DowngradeLock() is atomic with the previous write lock.
+ rw.w.Unlock() + if RaceEnabled { + RaceEnable() + } +} diff --git a/pkg/syncutil/memmove_unsafe.go b/pkg/syncutil/memmove_unsafe.go new file mode 100644 index 000000000..348675baa --- /dev/null +++ b/pkg/syncutil/memmove_unsafe.go @@ -0,0 +1,28 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.12 +// +build !go1.15 + +// Check go:linkname function signatures when updating Go version. + +package syncutil + +import ( + "unsafe" +) + +//go:linkname memmove runtime.memmove +//go:noescape +func memmove(to, from unsafe.Pointer, n uintptr) + +// Memmove is exported for SeqAtomicLoad/SeqAtomicTryLoad, which can't +// define it because go_generics can't update the go:linkname annotation. +// Furthermore, go:linkname silently doesn't work if the local name is exported +// (this is of course undocumented), which is why this indirection is +// necessary. +func Memmove(to, from unsafe.Pointer, n uintptr) { + memmove(to, from, n) +} diff --git a/pkg/syncutil/norace_unsafe.go b/pkg/syncutil/norace_unsafe.go new file mode 100644 index 000000000..0a0a9deda --- /dev/null +++ b/pkg/syncutil/norace_unsafe.go @@ -0,0 +1,35 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !race + +package syncutil + +import ( + "unsafe" +) + +// RaceEnabled is true if the Go data race detector is enabled. +const RaceEnabled = false + +// RaceDisable has the same semantics as runtime.RaceDisable. +func RaceDisable() { +} + +// RaceEnable has the same semantics as runtime.RaceEnable. +func RaceEnable() { +} + +// RaceAcquire has the same semantics as runtime.RaceAcquire. +func RaceAcquire(addr unsafe.Pointer) { +} + +// RaceRelease has the same semantics as runtime.RaceRelease. 
+func RaceRelease(addr unsafe.Pointer) { +} + +// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. +func RaceReleaseMerge(addr unsafe.Pointer) { +} diff --git a/pkg/syncutil/race_unsafe.go b/pkg/syncutil/race_unsafe.go new file mode 100644 index 000000000..206067ec1 --- /dev/null +++ b/pkg/syncutil/race_unsafe.go @@ -0,0 +1,41 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build race + +package syncutil + +import ( + "runtime" + "unsafe" +) + +// RaceEnabled is true if the Go data race detector is enabled. +const RaceEnabled = true + +// RaceDisable has the same semantics as runtime.RaceDisable. +func RaceDisable() { + runtime.RaceDisable() +} + +// RaceEnable has the same semantics as runtime.RaceEnable. +func RaceEnable() { + runtime.RaceEnable() +} + +// RaceAcquire has the same semantics as runtime.RaceAcquire. +func RaceAcquire(addr unsafe.Pointer) { + runtime.RaceAcquire(addr) +} + +// RaceRelease has the same semantics as runtime.RaceRelease. +func RaceRelease(addr unsafe.Pointer) { + runtime.RaceRelease(addr) +} + +// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. +func RaceReleaseMerge(addr unsafe.Pointer) { + runtime.RaceReleaseMerge(addr) +} diff --git a/pkg/syncutil/seqatomic_unsafe.go b/pkg/syncutil/seqatomic_unsafe.go new file mode 100644 index 000000000..cb6d2eb22 --- /dev/null +++ b/pkg/syncutil/seqatomic_unsafe.go @@ -0,0 +1,72 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package template doesn't exist. This file must be instantiated using the +// go_template_instance rule in tools/go_generics/defs.bzl. +package template + +import ( + "fmt" + "reflect" + "strings" + "unsafe" + + "gvisor.dev/gvisor/pkg/syncutil" +) + +// Value is a required type parameter. 
+// +// Value must not contain any pointers, including interface objects, function +// objects, slices, maps, channels, unsafe.Pointer, and arrays or structs +// containing any of the above. An init() function will panic if this property +// does not hold. +type Value struct{} + +// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race +// with any writer critical sections in sc. +func SeqAtomicLoad(sc *syncutil.SeqCount, ptr *Value) Value { + // This function doesn't use SeqAtomicTryLoad because doing so is + // measurably, significantly (~20%) slower; Go is awful at inlining. + var val Value + for { + epoch := sc.BeginRead() + if syncutil.RaceEnabled { + // runtime.RaceDisable() doesn't actually stop the race detector, + // so it can't help us here. Instead, call runtime.memmove + // directly, which is not instrumented by the race detector. + syncutil.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) + } else { + // This is ~40% faster for short reads than going through memmove. + val = *ptr + } + if sc.ReadOk(epoch) { + break + } + } + return val +} + +// SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section +// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read +// would race with a writer critical section, SeqAtomicTryLoad returns +// (unspecified, false). 
+func SeqAtomicTryLoad(sc *syncutil.SeqCount, epoch syncutil.SeqCountEpoch, ptr *Value) (Value, bool) { + var val Value + if syncutil.RaceEnabled { + syncutil.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) + } else { + val = *ptr + } + return val, sc.ReadOk(epoch) +} + +func init() { + var val Value + typ := reflect.TypeOf(val) + name := typ.Name() + if ptrs := syncutil.PointersInType(typ, name); len(ptrs) != 0 { + panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n"))) + } +} diff --git a/pkg/syncutil/seqatomictest/BUILD b/pkg/syncutil/seqatomictest/BUILD new file mode 100644 index 000000000..ba18f3238 --- /dev/null +++ b/pkg/syncutil/seqatomictest/BUILD @@ -0,0 +1,35 @@ +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "seqatomic_int", + out = "seqatomic_int_unsafe.go", + package = "seqatomic", + suffix = "Int", + template = "//pkg/syncutil:generic_seqatomic", + types = { + "Value": "int", + }, +) + +go_library( + name = "seqatomic", + srcs = ["seqatomic_int_unsafe.go"], + importpath = "gvisor.dev/gvisor/pkg/syncutil/seqatomic", + deps = [ + "//pkg/syncutil", + ], +) + +go_test( + name = "seqatomic_test", + size = "small", + srcs = ["seqatomic_test.go"], + embed = [":seqatomic"], + deps = [ + "//pkg/syncutil", + ], +) diff --git a/pkg/syncutil/seqatomictest/seqatomic_test.go b/pkg/syncutil/seqatomictest/seqatomic_test.go new file mode 100644 index 000000000..b0db44999 --- /dev/null +++ b/pkg/syncutil/seqatomictest/seqatomic_test.go @@ -0,0 +1,132 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seqatomic + +import ( + "sync/atomic" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/syncutil" +) + +func TestSeqAtomicLoadUncontended(t *testing.T) { + var seq syncutil.SeqCount + const want = 1 + data := want + if got := SeqAtomicLoadInt(&seq, &data); got != want { + t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) + } +} + +func TestSeqAtomicLoadAfterWrite(t *testing.T) { + var seq syncutil.SeqCount + var data int + const want = 1 + seq.BeginWrite() + data = want + seq.EndWrite() + if got := SeqAtomicLoadInt(&seq, &data); got != want { + t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) + } +} + +func TestSeqAtomicLoadDuringWrite(t *testing.T) { + var seq syncutil.SeqCount + var data int + const want = 1 + seq.BeginWrite() + go func() { + time.Sleep(time.Second) + data = want + seq.EndWrite() + }() + if got := SeqAtomicLoadInt(&seq, &data); got != want { + t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) + } +} + +func TestSeqAtomicTryLoadUncontended(t *testing.T) { + var seq syncutil.SeqCount + const want = 1 + data := want + epoch := seq.BeginRead() + if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want { + t.Errorf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want) + } +} + +func TestSeqAtomicTryLoadDuringWrite(t *testing.T) { + var seq syncutil.SeqCount + var data int + epoch := seq.BeginRead() + seq.BeginWrite() + if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok { + t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got) + } + 
seq.EndWrite() +} + +func TestSeqAtomicTryLoadAfterWrite(t *testing.T) { + var seq syncutil.SeqCount + var data int + epoch := seq.BeginRead() + seq.BeginWrite() + seq.EndWrite() + if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok { + t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got) + } +} + +func BenchmarkSeqAtomicLoadIntUncontended(b *testing.B) { + var seq syncutil.SeqCount + const want = 42 + data := want + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + if got := SeqAtomicLoadInt(&seq, &data); got != want { + b.Fatalf("SeqAtomicLoadInt: got %v, wanted %v", got, want) + } + } + }) +} + +func BenchmarkSeqAtomicTryLoadIntUncontended(b *testing.B) { + var seq syncutil.SeqCount + const want = 42 + data := want + b.RunParallel(func(pb *testing.PB) { + epoch := seq.BeginRead() + for pb.Next() { + if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want { + b.Fatalf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want) + } + } + }) +} + +// For comparison: +func BenchmarkAtomicValueLoadIntUncontended(b *testing.B) { + var a atomic.Value + const want = 42 + a.Store(int(want)) + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + if got := a.Load().(int); got != want { + b.Fatalf("atomic.Value.Load: got %v, wanted %v", got, want) + } + } + }) +} diff --git a/pkg/syncutil/seqcount.go b/pkg/syncutil/seqcount.go new file mode 100644 index 000000000..11d8dbfaa --- /dev/null +++ b/pkg/syncutil/seqcount.go @@ -0,0 +1,149 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syncutil + +import ( + "fmt" + "reflect" + "runtime" + "sync/atomic" +) + +// SeqCount is a synchronization primitive for optimistic reader/writer +// synchronization in cases where readers can work with stale data and +// therefore do not need to block writers. 
+// +// Compared to sync/atomic.Value: +// +// - Mutation of SeqCount-protected data does not require memory allocation, +// whereas atomic.Value generally does. This is a significant advantage when +// writes are common. +// +// - Atomic reads of SeqCount-protected data require copying. This is a +// disadvantage when atomic reads are common. +// +// - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other +// operations to be made atomic with reads of SeqCount-protected data. +// +// - SeqCount may be less flexible: as of this writing, SeqCount-protected data +// cannot include pointers. +// +// - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected +// data require instantiating function templates using go_generics (see +// seqatomic.go). +type SeqCount struct { + // epoch is incremented by BeginWrite and EndWrite, such that epoch is odd + // if a writer critical section is active, and a read from data protected + // by this SeqCount is atomic iff epoch is the same even value before and + // after the read. + epoch uint32 +} + +// SeqCountEpoch tracks writer critical sections in a SeqCount. +type SeqCountEpoch struct { + val uint32 +} + +// We assume that: +// +// - All functions in sync/atomic that perform a memory read are at least a +// read fence: memory reads before calls to such functions cannot be reordered +// after the call, and memory reads after calls to such functions cannot be +// reordered before the call, even if those reads do not use sync/atomic. +// +// - All functions in sync/atomic that perform a memory write are at least a +// write fence: memory writes before calls to such functions cannot be +// reordered after the call, and memory writes after calls to such functions +// cannot be reordered before the call, even if those writes do not use +// sync/atomic. 
+// +// As of this writing, the Go memory model completely fails to describe +// sync/atomic, but these properties are implied by +// https://groups.google.com/forum/#!topic/golang-nuts/7EnEhM3U7B8. + +// BeginRead indicates the beginning of a reader critical section. Reader +// critical sections DO NOT BLOCK writer critical sections, so operations in a +// reader critical section MAY RACE with writer critical sections. Races are +// detected by ReadOk at the end of the reader critical section. Thus, the +// low-level structure of readers is generally: +// +// for { +// epoch := seq.BeginRead() +// // do something idempotent with seq-protected data +// if seq.ReadOk(epoch) { +// break +// } +// } +// +// However, since reader critical sections may race with writer critical +// sections, the Go race detector will (accurately) flag data races in readers +// using this pattern. Most users of SeqCount will need to use the +// SeqAtomicLoad function template in seqatomic.go. +func (s *SeqCount) BeginRead() SeqCountEpoch { + epoch := atomic.LoadUint32(&s.epoch) + for epoch&1 != 0 { + runtime.Gosched() + epoch = atomic.LoadUint32(&s.epoch) + } + return SeqCountEpoch{epoch} +} + +// ReadOk returns true if the reader critical section initiated by a previous +// call to BeginRead() that returned epoch did not race with any writer critical +// sections. +// +// ReadOk may be called any number of times during a reader critical section. +// Reader critical sections do not need to be explicitly terminated; the last +// call to ReadOk is implicitly the end of the reader critical section. +func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool { + return atomic.LoadUint32(&s.epoch) == epoch.val +} + +// BeginWrite indicates the beginning of a writer critical section. +// +// SeqCount does not support concurrent writer critical sections; clients with +// concurrent writers must synchronize them using e.g. sync.Mutex. 
+func (s *SeqCount) BeginWrite() { + if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 == 0 { + panic("SeqCount.BeginWrite during writer critical section") + } +} + +// EndWrite ends the effect of a preceding BeginWrite. +func (s *SeqCount) EndWrite() { + if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 != 0 { + panic("SeqCount.EndWrite outside writer critical section") + } +} + +// PointersInType returns a list of pointers reachable from values named +// valName of the given type. +// +// PointersInType is not exhaustive, but it is guaranteed that if typ contains +// at least one pointer, then PointersInType returns a non-empty list. +func PointersInType(typ reflect.Type, valName string) []string { + switch kind := typ.Kind(); kind { + case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128: + return nil + + case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice, reflect.String, reflect.UnsafePointer: + return []string{valName} + + case reflect.Array: + return PointersInType(typ.Elem(), valName+"[]") + + case reflect.Struct: + var ptrs []string + for i, n := 0, typ.NumField(); i < n; i++ { + field := typ.Field(i) + ptrs = append(ptrs, PointersInType(field.Type, fmt.Sprintf("%s.%s", valName, field.Name))...) + } + return ptrs + + default: + return []string{fmt.Sprintf("%s (of type %s with unknown kind %s)", valName, typ, kind)} + } +} diff --git a/pkg/syncutil/seqcount_test.go b/pkg/syncutil/seqcount_test.go new file mode 100644 index 000000000..14d6aedea --- /dev/null +++ b/pkg/syncutil/seqcount_test.go @@ -0,0 +1,153 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package syncutil + +import ( + "reflect" + "testing" + "time" +) + +func TestSeqCountWriteUncontended(t *testing.T) { + var seq SeqCount + seq.BeginWrite() + seq.EndWrite() +} + +func TestSeqCountReadUncontended(t *testing.T) { + var seq SeqCount + epoch := seq.BeginRead() + if !seq.ReadOk(epoch) { + t.Errorf("ReadOk: got false, wanted true") + } +} + +func TestSeqCountBeginReadAfterWrite(t *testing.T) { + var seq SeqCount + var data int32 + const want = 1 + seq.BeginWrite() + data = want + seq.EndWrite() + epoch := seq.BeginRead() + if data != want { + t.Errorf("Reader: got %v, wanted %v", data, want) + } + if !seq.ReadOk(epoch) { + t.Errorf("ReadOk: got false, wanted true") + } +} + +func TestSeqCountBeginReadDuringWrite(t *testing.T) { + var seq SeqCount + var data int + const want = 1 + seq.BeginWrite() + go func() { + time.Sleep(time.Second) + data = want + seq.EndWrite() + }() + epoch := seq.BeginRead() + if data != want { + t.Errorf("Reader: got %v, wanted %v", data, want) + } + if !seq.ReadOk(epoch) { + t.Errorf("ReadOk: got false, wanted true") + } +} + +func TestSeqCountReadOkAfterWrite(t *testing.T) { + var seq SeqCount + epoch := seq.BeginRead() + seq.BeginWrite() + seq.EndWrite() + if seq.ReadOk(epoch) { + t.Errorf("ReadOk: got true, wanted false") + } +} + +func TestSeqCountReadOkDuringWrite(t *testing.T) { + var seq SeqCount + epoch := seq.BeginRead() + seq.BeginWrite() + if seq.ReadOk(epoch) { + t.Errorf("ReadOk: got true, wanted false") + } + seq.EndWrite() +} + +func BenchmarkSeqCountWriteUncontended(b *testing.B) { + var seq SeqCount + for i := 0; i < b.N; i++ { + seq.BeginWrite() + seq.EndWrite() + } +} + +func BenchmarkSeqCountReadUncontended(b *testing.B) { + var seq SeqCount + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + epoch := seq.BeginRead() + if !seq.ReadOk(epoch) { + b.Fatalf("ReadOk: got false, wanted true") + } + } + }) +} + +func TestPointersInType(t *testing.T) { + for _, test := range []struct { + name string // used 
for both test and value name + val interface{} + ptrs []string + }{ + { + name: "EmptyStruct", + val: struct{}{}, + }, + { + name: "Int", + val: int(0), + }, + { + name: "MixedStruct", + val: struct { + b bool + I int + ExportedPtr *struct{} + unexportedPtr *struct{} + arr [2]int + ptrArr [2]*int + nestedStruct struct { + nestedNonptr int + nestedPtr *int + } + structArr [1]struct { + nonptr int + ptr *int + } + }{}, + ptrs: []string{ + "MixedStruct.ExportedPtr", + "MixedStruct.unexportedPtr", + "MixedStruct.ptrArr[]", + "MixedStruct.nestedStruct.nestedPtr", + "MixedStruct.structArr[].ptr", + }, + }, + } { + t.Run(test.name, func(t *testing.T) { + typ := reflect.TypeOf(test.val) + ptrs := PointersInType(typ, test.name) + t.Logf("Found pointers: %v", ptrs) + if (len(ptrs) != 0 || len(test.ptrs) != 0) && !reflect.DeepEqual(ptrs, test.ptrs) { + t.Errorf("Got %v, wanted %v", ptrs, test.ptrs) + } + }) + } +} diff --git a/pkg/syncutil/syncutil.go b/pkg/syncutil/syncutil.go new file mode 100644 index 000000000..66e750d06 --- /dev/null +++ b/pkg/syncutil/syncutil.go @@ -0,0 +1,7 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package syncutil provides synchronization primitives. 
+package syncutil diff --git a/test/syscalls/linux/accept_bind.cc b/test/syscalls/linux/accept_bind.cc index 427c42ede..e08c578f0 100644 --- a/test/syscalls/linux/accept_bind.cc +++ b/test/syscalls/linux/accept_bind.cc @@ -14,8 +14,10 @@ #include #include + #include #include + #include "gtest/gtest.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" diff --git a/test/syscalls/linux/accept_bind_stream.cc b/test/syscalls/linux/accept_bind_stream.cc index 7bcd91e9e..4857f160b 100644 --- a/test/syscalls/linux/accept_bind_stream.cc +++ b/test/syscalls/linux/accept_bind_stream.cc @@ -14,8 +14,10 @@ #include #include + #include #include + #include "gtest/gtest.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" diff --git a/test/syscalls/linux/chmod.cc b/test/syscalls/linux/chmod.cc index 7e918b9b2..a06b5cfd6 100644 --- a/test/syscalls/linux/chmod.cc +++ b/test/syscalls/linux/chmod.cc @@ -16,6 +16,7 @@ #include #include #include + #include #include "gtest/gtest.h" diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc index de1611c21..04bc2d7b9 100644 --- a/test/syscalls/linux/chroot.cc +++ b/test/syscalls/linux/chroot.cc @@ -19,6 +19,7 @@ #include #include #include + #include #include diff --git a/test/syscalls/linux/clock_gettime.cc b/test/syscalls/linux/clock_gettime.cc index c9e3ed6b2..2aa91691e 100644 --- a/test/syscalls/linux/clock_gettime.cc +++ b/test/syscalls/linux/clock_gettime.cc @@ -14,6 +14,7 @@ #include #include + #include #include #include diff --git a/test/syscalls/linux/concurrency.cc b/test/syscalls/linux/concurrency.cc index 4e0a13f8b..00b96b34a 100644 --- a/test/syscalls/linux/concurrency.cc +++ b/test/syscalls/linux/concurrency.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include + #include #include "gtest/gtest.h" diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc index 0a3931e5a..736452b0c 100644 --- a/test/syscalls/linux/exec_binary.cc +++ b/test/syscalls/linux/exec_binary.cc @@ -20,6 +20,7 @@ #include #include #include + #include #include #include diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h index 4d155b618..4e048320e 100644 --- a/test/syscalls/linux/file_base.h +++ b/test/syscalls/linux/file_base.h @@ -27,6 +27,7 @@ #include #include #include + #include #include diff --git a/test/syscalls/linux/flock.cc b/test/syscalls/linux/flock.cc index b4a91455d..3ecb8db8e 100644 --- a/test/syscalls/linux/flock.cc +++ b/test/syscalls/linux/flock.cc @@ -14,6 +14,7 @@ #include #include + #include #include "gtest/gtest.h" diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc index dd6e1a422..371890110 100644 --- a/test/syscalls/linux/fork.cc +++ b/test/syscalls/linux/fork.cc @@ -20,6 +20,7 @@ #include #include #include + #include #include diff --git a/test/syscalls/linux/getdents.cc b/test/syscalls/linux/getdents.cc index fe9cfafe8..ad2dbacb8 100644 --- a/test/syscalls/linux/getdents.cc +++ b/test/syscalls/linux/getdents.cc @@ -23,6 +23,7 @@ #include #include #include + #include #include #include diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc index 57e99596f..8398fc95f 100644 --- a/test/syscalls/linux/ip_socket_test_util.cc +++ b/test/syscalls/linux/ip_socket_test_util.cc @@ -12,13 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "test/syscalls/linux/ip_socket_test_util.h" + #include #include #include #include -#include -#include "test/syscalls/linux/ip_socket_test_util.h" +#include namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/memory_accounting.cc b/test/syscalls/linux/memory_accounting.cc index ff2f49863..94aea4077 100644 --- a/test/syscalls/linux/memory_accounting.cc +++ b/test/syscalls/linux/memory_accounting.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include #include "gtest/gtest.h" diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc index 283c21ed3..620b4f8b4 100644 --- a/test/syscalls/linux/mlock.cc +++ b/test/syscalls/linux/mlock.cc @@ -16,6 +16,7 @@ #include #include #include + #include #include diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc index a112316e9..6f2639d8a 100644 --- a/test/syscalls/linux/mmap.cc +++ b/test/syscalls/linux/mmap.cc @@ -28,6 +28,7 @@ #include #include #include + #include #include "gmock/gmock.h" diff --git a/test/syscalls/linux/mount.cc b/test/syscalls/linux/mount.cc index e35be3cab..a3e9745cf 100644 --- a/test/syscalls/linux/mount.cc +++ b/test/syscalls/linux/mount.cc @@ -18,6 +18,7 @@ #include #include #include + #include #include #include diff --git a/test/syscalls/linux/read.cc b/test/syscalls/linux/read.cc index 4430fa3c2..2633ba31b 100644 --- a/test/syscalls/linux/read.cc +++ b/test/syscalls/linux/read.cc @@ -14,6 +14,7 @@ #include #include + #include #include "gtest/gtest.h" diff --git a/test/syscalls/linux/rename.cc b/test/syscalls/linux/rename.cc index 5b474ff32..833c0dc4f 100644 --- a/test/syscalls/linux/rename.cc +++ b/test/syscalls/linux/rename.cc @@ -14,6 +14,7 @@ #include #include + #include #include "gtest/gtest.h" diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc index e77586852..7e41fe7d8 100644 --- a/test/syscalls/linux/seccomp.cc +++ b/test/syscalls/linux/seccomp.cc @@ -25,6 +25,7 @@ #include #include 
#include + #include #include "gmock/gmock.h" diff --git a/test/syscalls/linux/select.cc b/test/syscalls/linux/select.cc index e06a2666d..424e2a67f 100644 --- a/test/syscalls/linux/select.cc +++ b/test/syscalls/linux/select.cc @@ -16,6 +16,7 @@ #include #include #include + #include #include #include diff --git a/test/syscalls/linux/shm.cc b/test/syscalls/linux/shm.cc index eb7a3966f..7ba752599 100644 --- a/test/syscalls/linux/shm.cc +++ b/test/syscalls/linux/shm.cc @@ -13,7 +13,6 @@ // limitations under the License. #include - #include #include #include diff --git a/test/syscalls/linux/socket_blocking.cc b/test/syscalls/linux/socket_blocking.cc index d7ce57566..7e88aa2d9 100644 --- a/test/syscalls/linux/socket_blocking.cc +++ b/test/syscalls/linux/socket_blocking.cc @@ -17,6 +17,7 @@ #include #include #include + #include #include "gtest/gtest.h" diff --git a/test/syscalls/linux/socket_ip_loopback_blocking.cc b/test/syscalls/linux/socket_ip_loopback_blocking.cc index d7fc9715b..e58eedaba 100644 --- a/test/syscalls/linux/socket_ip_loopback_blocking.cc +++ b/test/syscalls/linux/socket_ip_loopback_blocking.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include #include "test/syscalls/linux/ip_socket_test_util.h" diff --git a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc index 0dc274e2d..d11f7cc23 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc +++ b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include #include "test/syscalls/linux/ip_socket_test_util.h" diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc index cd3ad97d0..fcd20102f 100644 --- a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc +++ b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include + #include #include "test/syscalls/linux/ip_socket_test_util.h" diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc index 1acdecc17..63a05b799 100644 --- a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc +++ b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include #include "test/syscalls/linux/ip_socket_test_util.h" diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc index 3c3712b50..80f12b0a9 100644 --- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc +++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc @@ -18,6 +18,7 @@ #include #include #include + #include #include diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc index 92f03e045..3ac790873 100644 --- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc +++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h" + #include #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/util/test_util.h" diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc index 9d4e1ab97..8f47952b0 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h" + #include #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/util/test_util.h" diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc index 5f05bab10..723f5d728 100644 --- a/test/syscalls/linux/socket_netlink_util.cc +++ b/test/syscalls/linux/socket_netlink_util.cc @@ -12,15 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include +#include "test/syscalls/linux/socket_netlink_util.h" #include #include +#include #include #include "absl/strings/str_cat.h" -#include "test/syscalls/linux/socket_netlink_util.h" #include "test/syscalls/linux/socket_test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_blocking_local.cc b/test/syscalls/linux/socket_unix_blocking_local.cc index 1994139e6..6f84221b2 100644 --- a/test/syscalls/linux/socket_unix_blocking_local.cc +++ b/test/syscalls/linux/socket_unix_blocking_local.cc @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "test/syscalls/linux/socket_blocking.h" - #include +#include "test/syscalls/linux/socket_blocking.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" #include "test/util/test_util.h" diff --git a/test/syscalls/linux/socket_unix_dgram.cc b/test/syscalls/linux/socket_unix_dgram.cc index 3245cf7c9..af0df4fb4 100644 --- a/test/syscalls/linux/socket_unix_dgram.cc +++ b/test/syscalls/linux/socket_unix_dgram.cc @@ -16,6 +16,7 @@ #include #include + #include "gtest/gtest.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" diff --git a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc index cd4fba25c..2db8b68d3 100644 --- a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc +++ b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc @@ -14,6 +14,7 @@ #include #include + #include "gtest/gtest.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" diff --git a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc index da762cd83..8855d5001 100644 --- a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc +++ 
b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "test/syscalls/linux/socket_non_stream_blocking.h" - #include +#include "test/syscalls/linux/socket_non_stream_blocking.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" #include "test/util/test_util.h" diff --git a/test/syscalls/linux/socket_unix_seqpacket.cc b/test/syscalls/linux/socket_unix_seqpacket.cc index 60fa9e38a..84d3a569e 100644 --- a/test/syscalls/linux/socket_unix_seqpacket.cc +++ b/test/syscalls/linux/socket_unix_seqpacket.cc @@ -16,6 +16,7 @@ #include #include + #include "gtest/gtest.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" diff --git a/test/syscalls/linux/socket_unix_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_stream_blocking_local.cc index fa0a9d367..08e579ba7 100644 --- a/test/syscalls/linux/socket_unix_stream_blocking_local.cc +++ b/test/syscalls/linux/socket_unix_stream_blocking_local.cc @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "test/syscalls/linux/socket_stream_blocking.h" - #include +#include "test/syscalls/linux/socket_stream_blocking.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" #include "test/util/test_util.h" diff --git a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc index ec777c59f..1936aa135 100644 --- a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc +++ b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc @@ -11,10 +11,9 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. -#include "test/syscalls/linux/socket_stream_nonblock.h" - #include +#include "test/syscalls/linux/socket_stream_nonblock.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" #include "test/util/test_util.h" diff --git a/test/syscalls/linux/socket_unix_unbound_abstract.cc b/test/syscalls/linux/socket_unix_unbound_abstract.cc index 7f5816ace..8b1762000 100644 --- a/test/syscalls/linux/socket_unix_unbound_abstract.cc +++ b/test/syscalls/linux/socket_unix_unbound_abstract.cc @@ -14,6 +14,7 @@ #include #include + #include "gtest/gtest.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" diff --git a/test/syscalls/linux/socket_unix_unbound_filesystem.cc b/test/syscalls/linux/socket_unix_unbound_filesystem.cc index b14f24086..cab912152 100644 --- a/test/syscalls/linux/socket_unix_unbound_filesystem.cc +++ b/test/syscalls/linux/socket_unix_unbound_filesystem.cc @@ -14,6 +14,7 @@ #include #include + #include "gtest/gtest.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" diff --git a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc index 50ffa1d04..cb99030f5 100644 --- a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc +++ b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc @@ -14,6 +14,7 @@ #include #include + #include "gtest/gtest.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" diff --git a/test/syscalls/linux/socket_unix_unbound_stream.cc b/test/syscalls/linux/socket_unix_unbound_stream.cc index 344918c34..f185dded3 100644 --- a/test/syscalls/linux/socket_unix_unbound_stream.cc +++ b/test/syscalls/linux/socket_unix_unbound_stream.cc @@ -14,6 +14,7 @@ 
#include #include + #include "gtest/gtest.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" diff --git a/test/syscalls/linux/sync.cc b/test/syscalls/linux/sync.cc index fe479390d..8aa2525a9 100644 --- a/test/syscalls/linux/sync.cc +++ b/test/syscalls/linux/sync.cc @@ -14,10 +14,9 @@ #include #include -#include - #include #include + #include #include "gtest/gtest.h" diff --git a/test/syscalls/linux/truncate.cc b/test/syscalls/linux/truncate.cc index e5cc5d97c..c988c6380 100644 --- a/test/syscalls/linux/truncate.cc +++ b/test/syscalls/linux/truncate.cc @@ -19,6 +19,7 @@ #include #include #include + #include #include diff --git a/test/syscalls/linux/unix_domain_socket_test_util.cc b/test/syscalls/linux/unix_domain_socket_test_util.cc index 7fb9eed8d..b05ab2900 100644 --- a/test/syscalls/linux/unix_domain_socket_test_util.cc +++ b/test/syscalls/linux/unix_domain_socket_test_util.cc @@ -15,6 +15,7 @@ #include "test/syscalls/linux/unix_domain_socket_test_util.h" #include + #include #include "gtest/gtest.h" diff --git a/test/syscalls/linux/unix_domain_socket_test_util.h b/test/syscalls/linux/unix_domain_socket_test_util.h index 5eca0b7f0..b8073db17 100644 --- a/test/syscalls/linux/unix_domain_socket_test_util.h +++ b/test/syscalls/linux/unix_domain_socket_test_util.h @@ -16,6 +16,7 @@ #define GVISOR_TEST_SYSCALLS_UNIX_DOMAIN_SOCKET_TEST_UTIL_H_ #include + #include "test/syscalls/linux/socket_test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/utimes.cc b/test/syscalls/linux/utimes.cc index 80716859a..12b925a51 100644 --- a/test/syscalls/linux/utimes.cc +++ b/test/syscalls/linux/utimes.cc @@ -20,6 +20,7 @@ #include #include #include + #include #include "absl/time/time.h" diff --git a/test/syscalls/linux/vdso_clock_gettime.cc b/test/syscalls/linux/vdso_clock_gettime.cc index 40c0014b9..ce1899f45 100644 --- a/test/syscalls/linux/vdso_clock_gettime.cc +++ b/test/syscalls/linux/vdso_clock_gettime.cc 
@@ -17,6 +17,7 @@ #include #include #include + #include #include #include diff --git a/test/util/fs_util_test.cc b/test/util/fs_util_test.cc index 2a200320a..657b6a46e 100644 --- a/test/util/fs_util_test.cc +++ b/test/util/fs_util_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "test/util/fs_util.h" + #include + #include #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "test/util/fs_util.h" #include "test/util/posix_error.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" diff --git a/test/util/mount_util.h b/test/util/mount_util.h index 38ec6c8a1..484de560e 100644 --- a/test/util/mount_util.h +++ b/test/util/mount_util.h @@ -17,6 +17,7 @@ #include #include + #include #include diff --git a/test/util/posix_error_test.cc b/test/util/posix_error_test.cc index d67270842..bf9465abb 100644 --- a/test/util/posix_error_test.cc +++ b/test/util/posix_error_test.cc @@ -15,6 +15,7 @@ #include "test/util/posix_error.h" #include + #include "gmock/gmock.h" #include "gtest/gtest.h" diff --git a/test/util/rlimit_util.cc b/test/util/rlimit_util.cc index 684253f78..d7bfc1606 100644 --- a/test/util/rlimit_util.cc +++ b/test/util/rlimit_util.cc @@ -15,6 +15,7 @@ #include "test/util/rlimit_util.h" #include + #include #include "test/util/cleanup.h" diff --git a/test/util/signal_util.cc b/test/util/signal_util.cc index 26738864f..5ee95ee80 100644 --- a/test/util/signal_util.cc +++ b/test/util/signal_util.cc @@ -15,6 +15,7 @@ #include "test/util/signal_util.h" #include + #include #include "gtest/gtest.h" diff --git a/test/util/signal_util.h b/test/util/signal_util.h index 7fd2af015..bcf85c337 100644 --- a/test/util/signal_util.h +++ b/test/util/signal_util.h @@ -18,6 +18,7 @@ #include #include #include + #include #include "gmock/gmock.h" diff --git a/test/util/temp_path.h b/test/util/temp_path.h index 92d669503..9e5ac11f4 100644 --- a/test/util/temp_path.h +++ 
b/test/util/temp_path.h @@ -16,6 +16,7 @@ #define GVISOR_TEST_UTIL_TEMP_PATH_H_ #include + #include #include diff --git a/test/util/test_util_test.cc b/test/util/test_util_test.cc index b7300d9e5..f42100374 100644 --- a/test/util/test_util_test.cc +++ b/test/util/test_util_test.cc @@ -15,6 +15,7 @@ #include "test/util/test_util.h" #include + #include #include "gmock/gmock.h" diff --git a/third_party/gvsync/BUILD b/third_party/gvsync/BUILD deleted file mode 100644 index 7d6d59c48..000000000 --- a/third_party/gvsync/BUILD +++ /dev/null @@ -1,53 +0,0 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_generics:defs.bzl", "go_template") - -package( - default_visibility = ["//:sandbox"], - licenses = ["notice"], -) - -exports_files(["LICENSE"]) - -go_template( - name = "generic_atomicptr", - srcs = ["atomicptr_unsafe.go"], - types = [ - "Value", - ], -) - -go_template( - name = "generic_seqatomic", - srcs = ["seqatomic_unsafe.go"], - types = [ - "Value", - ], - deps = [ - ":sync", - ], -) - -go_library( - name = "gvsync", - srcs = [ - "downgradable_rwmutex_1_12_unsafe.go", - "downgradable_rwmutex_1_13_unsafe.go", - "downgradable_rwmutex_unsafe.go", - "gvsync.go", - "memmove_unsafe.go", - "norace_unsafe.go", - "race_unsafe.go", - "seqcount.go", - ], - importpath = "gvisor.dev/gvisor/third_party/gvsync", -) - -go_test( - name = "gvsync_test", - size = "small", - srcs = [ - "downgradable_rwmutex_test.go", - "seqcount_test.go", - ], - embed = [":gvsync"], -) diff --git a/third_party/gvsync/LICENSE b/third_party/gvsync/LICENSE deleted file mode 100644 index 6a66aea5e..000000000 --- a/third_party/gvsync/LICENSE +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. 
- -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/third_party/gvsync/README.md b/third_party/gvsync/README.md deleted file mode 100644 index fcc7e6f44..000000000 --- a/third_party/gvsync/README.md +++ /dev/null @@ -1,3 +0,0 @@ -This package provides additional synchronization primitives not provided by the -Go stdlib 'sync' package. It is partially derived from the upstream 'sync' -package. 
diff --git a/third_party/gvsync/atomicptr_unsafe.go b/third_party/gvsync/atomicptr_unsafe.go deleted file mode 100644 index 525c4beed..000000000 --- a/third_party/gvsync/atomicptr_unsafe.go +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package template doesn't exist. This file must be instantiated using the -// go_template_instance rule in tools/go_generics/defs.bzl. -package template - -import ( - "sync/atomic" - "unsafe" -) - -// Value is a required type parameter. -type Value struct{} - -// An AtomicPtr is a pointer to a value of type Value that can be atomically -// loaded and stored. The zero value of an AtomicPtr represents nil. -// -// Note that copying AtomicPtr by value performs a non-atomic read of the -// stored pointer, which is unsafe if Store() can be called concurrently; in -// this case, do `dst.Store(src.Load())` instead. -// -// +stateify savable -type AtomicPtr struct { - ptr unsafe.Pointer `state:".(*Value)"` -} - -func (p *AtomicPtr) savePtr() *Value { - return p.Load() -} - -func (p *AtomicPtr) loadPtr(v *Value) { - p.Store(v) -} - -// Load returns the value set by the most recent Store. It returns nil if there -// has been no previous call to Store. -func (p *AtomicPtr) Load() *Value { - return (*Value)(atomic.LoadPointer(&p.ptr)) -} - -// Store sets the value returned by Load to x. 
-func (p *AtomicPtr) Store(x *Value) { - atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x)) -} diff --git a/third_party/gvsync/atomicptrtest/BUILD b/third_party/gvsync/atomicptrtest/BUILD deleted file mode 100644 index 447ecf96a..000000000 --- a/third_party/gvsync/atomicptrtest/BUILD +++ /dev/null @@ -1,28 +0,0 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_generics:defs.bzl", "go_template_instance") - -package(licenses = ["notice"]) - -go_template_instance( - name = "atomicptr_int", - out = "atomicptr_int_unsafe.go", - package = "atomicptr", - suffix = "Int", - template = "//third_party/gvsync:generic_atomicptr", - types = { - "Value": "int", - }, -) - -go_library( - name = "atomicptr", - srcs = ["atomicptr_int_unsafe.go"], - importpath = "gvisor.dev/gvisor/third_party/gvsync/atomicptr", -) - -go_test( - name = "atomicptr_test", - size = "small", - srcs = ["atomicptr_test.go"], - embed = [":atomicptr"], -) diff --git a/third_party/gvsync/atomicptrtest/atomicptr_test.go b/third_party/gvsync/atomicptrtest/atomicptr_test.go deleted file mode 100644 index 8fdc5112e..000000000 --- a/third_party/gvsync/atomicptrtest/atomicptr_test.go +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -package atomicptr - -import ( - "testing" -) - -func newInt(val int) *int { - return &val -} - -func TestAtomicPtr(t *testing.T) { - var p AtomicPtrInt - if got := p.Load(); got != nil { - t.Errorf("initial value is %p (%v), wanted nil", got, got) - } - want := newInt(42) - p.Store(want) - if got := p.Load(); got != want { - t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want) - } - want = newInt(100) - p.Store(want) - if got := p.Load(); got != want { - t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want) - } -} diff --git a/third_party/gvsync/downgradable_rwmutex_1_12_unsafe.go b/third_party/gvsync/downgradable_rwmutex_1_12_unsafe.go deleted file mode 100644 index 855b2a2b1..000000000 --- a/third_party/gvsync/downgradable_rwmutex_1_12_unsafe.go +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Copyright 2019 The gVisor Authors. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build go1.12 -// +build !go1.13 - -// TODO(b/133868570): Delete once Go 1.12 is no longer supported. - -package gvsync - -import _ "unsafe" - -//go:linkname runtimeSemrelease112 sync.runtime_Semrelease -func runtimeSemrelease112(s *uint32, handoff bool) - -func runtimeSemrelease(s *uint32, handoff bool, skipframes int) { - // 'skipframes' is only available starting from 1.13. - runtimeSemrelease112(s, handoff) -} diff --git a/third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go b/third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go deleted file mode 100644 index 3b9346843..000000000 --- a/third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Copyright 2019 The gVisor Authors. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -// +build go1.13 -// +build !go1.15 - -// Check go:linkname function signatures when updating Go version. - -package gvsync - -import _ "unsafe" - -//go:linkname runtimeSemrelease sync.runtime_Semrelease -func runtimeSemrelease(s *uint32, handoff bool, skipframes int) diff --git a/third_party/gvsync/downgradable_rwmutex_test.go b/third_party/gvsync/downgradable_rwmutex_test.go deleted file mode 100644 index 40c384b8b..000000000 --- a/third_party/gvsync/downgradable_rwmutex_test.go +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Copyright 2019 The gVisor Authors. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// GOMAXPROCS=10 go test - -// Copy/pasted from the standard library's sync/rwmutex_test.go, except for the -// addition of downgradingWriter and the renaming of num_iterations to -// numIterations to shut up Golint. - -package gvsync - -import ( - "fmt" - "runtime" - "sync/atomic" - "testing" -) - -func parallelReader(m *DowngradableRWMutex, clocked, cunlock, cdone chan bool) { - m.RLock() - clocked <- true - <-cunlock - m.RUnlock() - cdone <- true -} - -func doTestParallelReaders(numReaders, gomaxprocs int) { - runtime.GOMAXPROCS(gomaxprocs) - var m DowngradableRWMutex - clocked := make(chan bool) - cunlock := make(chan bool) - cdone := make(chan bool) - for i := 0; i < numReaders; i++ { - go parallelReader(&m, clocked, cunlock, cdone) - } - // Wait for all parallel RLock()s to succeed. - for i := 0; i < numReaders; i++ { - <-clocked - } - for i := 0; i < numReaders; i++ { - cunlock <- true - } - // Wait for the goroutines to finish. 
- for i := 0; i < numReaders; i++ { - <-cdone - } -} - -func TestParallelReaders(t *testing.T) { - defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1)) - doTestParallelReaders(1, 4) - doTestParallelReaders(3, 4) - doTestParallelReaders(4, 2) -} - -func reader(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { - for i := 0; i < numIterations; i++ { - rwm.RLock() - n := atomic.AddInt32(activity, 1) - if n < 1 || n >= 10000 { - panic(fmt.Sprintf("wlock(%d)\n", n)) - } - for i := 0; i < 100; i++ { - } - atomic.AddInt32(activity, -1) - rwm.RUnlock() - } - cdone <- true -} - -func writer(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { - for i := 0; i < numIterations; i++ { - rwm.Lock() - n := atomic.AddInt32(activity, 10000) - if n != 10000 { - panic(fmt.Sprintf("wlock(%d)\n", n)) - } - for i := 0; i < 100; i++ { - } - atomic.AddInt32(activity, -10000) - rwm.Unlock() - } - cdone <- true -} - -func downgradingWriter(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { - for i := 0; i < numIterations; i++ { - rwm.Lock() - n := atomic.AddInt32(activity, 10000) - if n != 10000 { - panic(fmt.Sprintf("wlock(%d)\n", n)) - } - for i := 0; i < 100; i++ { - } - atomic.AddInt32(activity, -10000) - rwm.DowngradeLock() - n = atomic.AddInt32(activity, 1) - if n < 1 || n >= 10000 { - panic(fmt.Sprintf("wlock(%d)\n", n)) - } - for i := 0; i < 100; i++ { - } - n = atomic.AddInt32(activity, -1) - rwm.RUnlock() - } - cdone <- true -} - -func HammerDowngradableRWMutex(gomaxprocs, numReaders, numIterations int) { - runtime.GOMAXPROCS(gomaxprocs) - // Number of active readers + 10000 * number of active writers. 
- var activity int32 - var rwm DowngradableRWMutex - cdone := make(chan bool) - go writer(&rwm, numIterations, &activity, cdone) - go downgradingWriter(&rwm, numIterations, &activity, cdone) - var i int - for i = 0; i < numReaders/2; i++ { - go reader(&rwm, numIterations, &activity, cdone) - } - go writer(&rwm, numIterations, &activity, cdone) - go downgradingWriter(&rwm, numIterations, &activity, cdone) - for ; i < numReaders; i++ { - go reader(&rwm, numIterations, &activity, cdone) - } - // Wait for the 4 writers and all readers to finish. - for i := 0; i < 4+numReaders; i++ { - <-cdone - } -} - -func TestDowngradableRWMutex(t *testing.T) { - defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1)) - n := 1000 - if testing.Short() { - n = 5 - } - HammerDowngradableRWMutex(1, 1, n) - HammerDowngradableRWMutex(1, 3, n) - HammerDowngradableRWMutex(1, 10, n) - HammerDowngradableRWMutex(4, 1, n) - HammerDowngradableRWMutex(4, 3, n) - HammerDowngradableRWMutex(4, 10, n) - HammerDowngradableRWMutex(10, 1, n) - HammerDowngradableRWMutex(10, 3, n) - HammerDowngradableRWMutex(10, 10, n) - HammerDowngradableRWMutex(10, 5, n) -} diff --git a/third_party/gvsync/downgradable_rwmutex_unsafe.go b/third_party/gvsync/downgradable_rwmutex_unsafe.go deleted file mode 100644 index b7862d185..000000000 --- a/third_party/gvsync/downgradable_rwmutex_unsafe.go +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Copyright 2019 The gVisor Authors. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build go1.12 -// +build !go1.15 - -// Check go:linkname function signatures when updating Go version. - -// This is mostly copied from the standard library's sync/rwmutex.go. 
-// -// Happens-before relationships indicated to the race detector: -// - Unlock -> Lock (via writerSem) -// - Unlock -> RLock (via readerSem) -// - RUnlock -> Lock (via writerSem) -// - DowngradeLock -> RLock (via readerSem) - -package gvsync - -import ( - "sync" - "sync/atomic" - "unsafe" -) - -//go:linkname runtimeSemacquire sync.runtime_Semacquire -func runtimeSemacquire(s *uint32) - -// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock -// method. -type DowngradableRWMutex struct { - w sync.Mutex // held if there are pending writers - writerSem uint32 // semaphore for writers to wait for completing readers - readerSem uint32 // semaphore for readers to wait for completing writers - readerCount int32 // number of pending readers - readerWait int32 // number of departing readers -} - -const rwmutexMaxReaders = 1 << 30 - -// RLock locks rw for reading. -func (rw *DowngradableRWMutex) RLock() { - if RaceEnabled { - RaceDisable() - } - if atomic.AddInt32(&rw.readerCount, 1) < 0 { - // A writer is pending, wait for it. - runtimeSemacquire(&rw.readerSem) - } - if RaceEnabled { - RaceEnable() - RaceAcquire(unsafe.Pointer(&rw.readerSem)) - } -} - -// RUnlock undoes a single RLock call. -func (rw *DowngradableRWMutex) RUnlock() { - if RaceEnabled { - RaceReleaseMerge(unsafe.Pointer(&rw.writerSem)) - RaceDisable() - } - if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 { - if r+1 == 0 || r+1 == -rwmutexMaxReaders { - panic("RUnlock of unlocked DowngradableRWMutex") - } - // A writer is pending. - if atomic.AddInt32(&rw.readerWait, -1) == 0 { - // The last reader unblocks the writer. - runtimeSemrelease(&rw.writerSem, false, 0) - } - } - if RaceEnabled { - RaceEnable() - } -} - -// Lock locks rw for writing. -func (rw *DowngradableRWMutex) Lock() { - if RaceEnabled { - RaceDisable() - } - // First, resolve competition with other writers. - rw.w.Lock() - // Announce to readers there is a pending writer. 
- r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders - // Wait for active readers. - if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 { - runtimeSemacquire(&rw.writerSem) - } - if RaceEnabled { - RaceEnable() - RaceAcquire(unsafe.Pointer(&rw.writerSem)) - } -} - -// Unlock unlocks rw for writing. -func (rw *DowngradableRWMutex) Unlock() { - if RaceEnabled { - RaceRelease(unsafe.Pointer(&rw.writerSem)) - RaceRelease(unsafe.Pointer(&rw.readerSem)) - RaceDisable() - } - // Announce to readers there is no active writer. - r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders) - if r >= rwmutexMaxReaders { - panic("Unlock of unlocked DowngradableRWMutex") - } - // Unblock blocked readers, if any. - for i := 0; i < int(r); i++ { - runtimeSemrelease(&rw.readerSem, false, 0) - } - // Allow other writers to proceed. - rw.w.Unlock() - if RaceEnabled { - RaceEnable() - } -} - -// DowngradeLock atomically unlocks rw for writing and locks it for reading. -func (rw *DowngradableRWMutex) DowngradeLock() { - if RaceEnabled { - RaceRelease(unsafe.Pointer(&rw.readerSem)) - RaceDisable() - } - // Announce to readers there is no active writer and one additional reader. - r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1) - if r >= rwmutexMaxReaders+1 { - panic("DowngradeLock of unlocked DowngradableRWMutex") - } - // Unblock blocked readers, if any. Note that this loop starts as 1 since r - // includes this goroutine. - for i := 1; i < int(r); i++ { - runtimeSemrelease(&rw.readerSem, false, 0) - } - // Allow other writers to proceed to rw.w.Lock(). Note that they will still - // block on rw.writerSem since at least this reader exists, such that - // DowngradeLock() is atomic with the previous write lock. 
- rw.w.Unlock() - if RaceEnabled { - RaceEnable() - } -} diff --git a/third_party/gvsync/gvsync.go b/third_party/gvsync/gvsync.go deleted file mode 100644 index 3bbef13c3..000000000 --- a/third_party/gvsync/gvsync.go +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package gvsync provides synchronization primitives. -package gvsync diff --git a/third_party/gvsync/memmove_unsafe.go b/third_party/gvsync/memmove_unsafe.go deleted file mode 100644 index 9dd1d6142..000000000 --- a/third_party/gvsync/memmove_unsafe.go +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build go1.12 -// +build !go1.15 - -// Check go:linkname function signatures when updating Go version. - -package gvsync - -import ( - "unsafe" -) - -//go:linkname memmove runtime.memmove -//go:noescape -func memmove(to, from unsafe.Pointer, n uintptr) - -// Memmove is exported for SeqAtomicLoad/SeqAtomicTryLoad, which can't -// define it because go_generics can't update the go:linkname annotation. -// Furthermore, go:linkname silently doesn't work if the local name is exported -// (this is of course undocumented), which is why this indirection is -// necessary. -func Memmove(to, from unsafe.Pointer, n uintptr) { - memmove(to, from, n) -} diff --git a/third_party/gvsync/norace_unsafe.go b/third_party/gvsync/norace_unsafe.go deleted file mode 100644 index e3852db8c..000000000 --- a/third_party/gvsync/norace_unsafe.go +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !race - -package gvsync - -import ( - "unsafe" -) - -// RaceEnabled is true if the Go data race detector is enabled. 
-const RaceEnabled = false - -// RaceDisable has the same semantics as runtime.RaceDisable. -func RaceDisable() { -} - -// RaceEnable has the same semantics as runtime.RaceEnable. -func RaceEnable() { -} - -// RaceAcquire has the same semantics as runtime.RaceAcquire. -func RaceAcquire(addr unsafe.Pointer) { -} - -// RaceRelease has the same semantics as runtime.RaceRelease. -func RaceRelease(addr unsafe.Pointer) { -} - -// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. -func RaceReleaseMerge(addr unsafe.Pointer) { -} diff --git a/third_party/gvsync/race_unsafe.go b/third_party/gvsync/race_unsafe.go deleted file mode 100644 index 13c02a830..000000000 --- a/third_party/gvsync/race_unsafe.go +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build race - -package gvsync - -import ( - "runtime" - "unsafe" -) - -// RaceEnabled is true if the Go data race detector is enabled. -const RaceEnabled = true - -// RaceDisable has the same semantics as runtime.RaceDisable. -func RaceDisable() { - runtime.RaceDisable() -} - -// RaceEnable has the same semantics as runtime.RaceEnable. -func RaceEnable() { - runtime.RaceEnable() -} - -// RaceAcquire has the same semantics as runtime.RaceAcquire. -func RaceAcquire(addr unsafe.Pointer) { - runtime.RaceAcquire(addr) -} - -// RaceRelease has the same semantics as runtime.RaceRelease. -func RaceRelease(addr unsafe.Pointer) { - runtime.RaceRelease(addr) -} - -// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. -func RaceReleaseMerge(addr unsafe.Pointer) { - runtime.RaceReleaseMerge(addr) -} diff --git a/third_party/gvsync/seqatomic_unsafe.go b/third_party/gvsync/seqatomic_unsafe.go deleted file mode 100644 index 382eeed43..000000000 --- a/third_party/gvsync/seqatomic_unsafe.go +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2019 The gVisor Authors. 
-// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package template doesn't exist. This file must be instantiated using the -// go_template_instance rule in tools/go_generics/defs.bzl. -package template - -import ( - "fmt" - "reflect" - "strings" - "unsafe" - - "gvisor.dev/gvisor/third_party/gvsync" -) - -// Value is a required type parameter. -// -// Value must not contain any pointers, including interface objects, function -// objects, slices, maps, channels, unsafe.Pointer, and arrays or structs -// containing any of the above. An init() function will panic if this property -// does not hold. -type Value struct{} - -// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race -// with any writer critical sections in sc. -func SeqAtomicLoad(sc *gvsync.SeqCount, ptr *Value) Value { - // This function doesn't use SeqAtomicTryLoad because doing so is - // measurably, significantly (~20%) slower; Go is awful at inlining. - var val Value - for { - epoch := sc.BeginRead() - if gvsync.RaceEnabled { - // runtime.RaceDisable() doesn't actually stop the race detector, - // so it can't help us here. Instead, call runtime.memmove - // directly, which is not instrumented by the race detector. - gvsync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) - } else { - // This is ~40% faster for short reads than going through memmove. - val = *ptr - } - if sc.ReadOk(epoch) { - break - } - } - return val -} - -// SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section -// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read -// would race with a writer critical section, SeqAtomicTryLoad returns -// (unspecified, false). 
-func SeqAtomicTryLoad(sc *gvsync.SeqCount, epoch gvsync.SeqCountEpoch, ptr *Value) (Value, bool) { - var val Value - if gvsync.RaceEnabled { - gvsync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) - } else { - val = *ptr - } - return val, sc.ReadOk(epoch) -} - -func init() { - var val Value - typ := reflect.TypeOf(val) - name := typ.Name() - if ptrs := gvsync.PointersInType(typ, name); len(ptrs) != 0 { - panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n"))) - } -} diff --git a/third_party/gvsync/seqatomictest/BUILD b/third_party/gvsync/seqatomictest/BUILD deleted file mode 100644 index c858c20c4..000000000 --- a/third_party/gvsync/seqatomictest/BUILD +++ /dev/null @@ -1,34 +0,0 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_generics:defs.bzl", "go_template_instance") - -package(licenses = ["notice"]) - -go_template_instance( - name = "seqatomic_int", - out = "seqatomic_int_unsafe.go", - package = "seqatomic", - suffix = "Int", - template = "//third_party/gvsync:generic_seqatomic", - types = { - "Value": "int", - }, -) - -go_library( - name = "seqatomic", - srcs = ["seqatomic_int_unsafe.go"], - importpath = "gvisor.dev/gvisor/third_party/gvsync/seqatomic", - deps = [ - "//third_party/gvsync", - ], -) - -go_test( - name = "seqatomic_test", - size = "small", - srcs = ["seqatomic_test.go"], - embed = [":seqatomic"], - deps = [ - "//third_party/gvsync", - ], -) diff --git a/third_party/gvsync/seqatomictest/seqatomic_test.go b/third_party/gvsync/seqatomictest/seqatomic_test.go deleted file mode 100644 index a5447f589..000000000 --- a/third_party/gvsync/seqatomictest/seqatomic_test.go +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package seqatomic - -import ( - "sync/atomic" - "testing" - "time" - - "gvisor.dev/gvisor/third_party/gvsync" -) - -func TestSeqAtomicLoadUncontended(t *testing.T) { - var seq gvsync.SeqCount - const want = 1 - data := want - if got := SeqAtomicLoadInt(&seq, &data); got != want { - t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) - } -} - -func TestSeqAtomicLoadAfterWrite(t *testing.T) { - var seq gvsync.SeqCount - var data int - const want = 1 - seq.BeginWrite() - data = want - seq.EndWrite() - if got := SeqAtomicLoadInt(&seq, &data); got != want { - t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) - } -} - -func TestSeqAtomicLoadDuringWrite(t *testing.T) { - var seq gvsync.SeqCount - var data int - const want = 1 - seq.BeginWrite() - go func() { - time.Sleep(time.Second) - data = want - seq.EndWrite() - }() - if got := SeqAtomicLoadInt(&seq, &data); got != want { - t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) - } -} - -func TestSeqAtomicTryLoadUncontended(t *testing.T) { - var seq gvsync.SeqCount - const want = 1 - data := want - epoch := seq.BeginRead() - if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want { - t.Errorf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want) - } -} - -func TestSeqAtomicTryLoadDuringWrite(t *testing.T) { - var seq gvsync.SeqCount - var data int - epoch := seq.BeginRead() - seq.BeginWrite() - if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok { - t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got) - } - 
seq.EndWrite() -} - -func TestSeqAtomicTryLoadAfterWrite(t *testing.T) { - var seq gvsync.SeqCount - var data int - epoch := seq.BeginRead() - seq.BeginWrite() - seq.EndWrite() - if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok { - t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got) - } -} - -func BenchmarkSeqAtomicLoadIntUncontended(b *testing.B) { - var seq gvsync.SeqCount - const want = 42 - data := want - b.RunParallel(func(pb *testing.PB) { - for pb.Next() { - if got := SeqAtomicLoadInt(&seq, &data); got != want { - b.Fatalf("SeqAtomicLoadInt: got %v, wanted %v", got, want) - } - } - }) -} - -func BenchmarkSeqAtomicTryLoadIntUncontended(b *testing.B) { - var seq gvsync.SeqCount - const want = 42 - data := want - b.RunParallel(func(pb *testing.PB) { - epoch := seq.BeginRead() - for pb.Next() { - if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want { - b.Fatalf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want) - } - } - }) -} - -// For comparison: -func BenchmarkAtomicValueLoadIntUncontended(b *testing.B) { - var a atomic.Value - const want = 42 - a.Store(int(want)) - b.RunParallel(func(pb *testing.PB) { - for pb.Next() { - if got := a.Load().(int); got != want { - b.Fatalf("atomic.Value.Load: got %v, wanted %v", got, want) - } - } - }) -} diff --git a/third_party/gvsync/seqcount.go b/third_party/gvsync/seqcount.go deleted file mode 100644 index 2c9c2c3d6..000000000 --- a/third_party/gvsync/seqcount.go +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package gvsync - -import ( - "fmt" - "reflect" - "runtime" - "sync/atomic" -) - -// SeqCount is a synchronization primitive for optimistic reader/writer -// synchronization in cases where readers can work with stale data and -// therefore do not need to block writers. 
-// -// Compared to sync/atomic.Value: -// -// - Mutation of SeqCount-protected data does not require memory allocation, -// whereas atomic.Value generally does. This is a significant advantage when -// writes are common. -// -// - Atomic reads of SeqCount-protected data require copying. This is a -// disadvantage when atomic reads are common. -// -// - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other -// operations to be made atomic with reads of SeqCount-protected data. -// -// - SeqCount may be less flexible: as of this writing, SeqCount-protected data -// cannot include pointers. -// -// - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected -// data require instantiating function templates using go_generics (see -// seqatomic.go). -type SeqCount struct { - // epoch is incremented by BeginWrite and EndWrite, such that epoch is odd - // if a writer critical section is active, and a read from data protected - // by this SeqCount is atomic iff epoch is the same even value before and - // after the read. - epoch uint32 -} - -// SeqCountEpoch tracks writer critical sections in a SeqCount. -type SeqCountEpoch struct { - val uint32 -} - -// We assume that: -// -// - All functions in sync/atomic that perform a memory read are at least a -// read fence: memory reads before calls to such functions cannot be reordered -// after the call, and memory reads after calls to such functions cannot be -// reordered before the call, even if those reads do not use sync/atomic. -// -// - All functions in sync/atomic that perform a memory write are at least a -// write fence: memory writes before calls to such functions cannot be -// reordered after the call, and memory writes after calls to such functions -// cannot be reordered before the call, even if those writes do not use -// sync/atomic. 
-// -// As of this writing, the Go memory model completely fails to describe -// sync/atomic, but these properties are implied by -// https://groups.google.com/forum/#!topic/golang-nuts/7EnEhM3U7B8. - -// BeginRead indicates the beginning of a reader critical section. Reader -// critical sections DO NOT BLOCK writer critical sections, so operations in a -// reader critical section MAY RACE with writer critical sections. Races are -// detected by ReadOk at the end of the reader critical section. Thus, the -// low-level structure of readers is generally: -// -// for { -// epoch := seq.BeginRead() -// // do something idempotent with seq-protected data -// if seq.ReadOk(epoch) { -// break -// } -// } -// -// However, since reader critical sections may race with writer critical -// sections, the Go race detector will (accurately) flag data races in readers -// using this pattern. Most users of SeqCount will need to use the -// SeqAtomicLoad function template in seqatomic.go. -func (s *SeqCount) BeginRead() SeqCountEpoch { - epoch := atomic.LoadUint32(&s.epoch) - for epoch&1 != 0 { - runtime.Gosched() - epoch = atomic.LoadUint32(&s.epoch) - } - return SeqCountEpoch{epoch} -} - -// ReadOk returns true if the reader critical section initiated by a previous -// call to BeginRead() that returned epoch did not race with any writer critical -// sections. -// -// ReadOk may be called any number of times during a reader critical section. -// Reader critical sections do not need to be explicitly terminated; the last -// call to ReadOk is implicitly the end of the reader critical section. -func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool { - return atomic.LoadUint32(&s.epoch) == epoch.val -} - -// BeginWrite indicates the beginning of a writer critical section. -// -// SeqCount does not support concurrent writer critical sections; clients with -// concurrent writers must synchronize them using e.g. sync.Mutex. 
-func (s *SeqCount) BeginWrite() { - if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 == 0 { - panic("SeqCount.BeginWrite during writer critical section") - } -} - -// EndWrite ends the effect of a preceding BeginWrite. -func (s *SeqCount) EndWrite() { - if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 != 0 { - panic("SeqCount.EndWrite outside writer critical section") - } -} - -// PointersInType returns a list of pointers reachable from values named -// valName of the given type. -// -// PointersInType is not exhaustive, but it is guaranteed that if typ contains -// at least one pointer, then PointersInTypeOf returns a non-empty list. -func PointersInType(typ reflect.Type, valName string) []string { - switch kind := typ.Kind(); kind { - case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128: - return nil - - case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice, reflect.String, reflect.UnsafePointer: - return []string{valName} - - case reflect.Array: - return PointersInType(typ.Elem(), valName+"[]") - - case reflect.Struct: - var ptrs []string - for i, n := 0, typ.NumField(); i < n; i++ { - field := typ.Field(i) - ptrs = append(ptrs, PointersInType(field.Type, fmt.Sprintf("%s.%s", valName, field.Name))...) - } - return ptrs - - default: - return []string{fmt.Sprintf("%s (of type %s with unknown kind %s)", valName, typ, kind)} - } -} diff --git a/third_party/gvsync/seqcount_test.go b/third_party/gvsync/seqcount_test.go deleted file mode 100644 index 085e574b3..000000000 --- a/third_party/gvsync/seqcount_test.go +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -package gvsync - -import ( - "reflect" - "testing" - "time" -) - -func TestSeqCountWriteUncontended(t *testing.T) { - var seq SeqCount - seq.BeginWrite() - seq.EndWrite() -} - -func TestSeqCountReadUncontended(t *testing.T) { - var seq SeqCount - epoch := seq.BeginRead() - if !seq.ReadOk(epoch) { - t.Errorf("ReadOk: got false, wanted true") - } -} - -func TestSeqCountBeginReadAfterWrite(t *testing.T) { - var seq SeqCount - var data int32 - const want = 1 - seq.BeginWrite() - data = want - seq.EndWrite() - epoch := seq.BeginRead() - if data != want { - t.Errorf("Reader: got %v, wanted %v", data, want) - } - if !seq.ReadOk(epoch) { - t.Errorf("ReadOk: got false, wanted true") - } -} - -func TestSeqCountBeginReadDuringWrite(t *testing.T) { - var seq SeqCount - var data int - const want = 1 - seq.BeginWrite() - go func() { - time.Sleep(time.Second) - data = want - seq.EndWrite() - }() - epoch := seq.BeginRead() - if data != want { - t.Errorf("Reader: got %v, wanted %v", data, want) - } - if !seq.ReadOk(epoch) { - t.Errorf("ReadOk: got false, wanted true") - } -} - -func TestSeqCountReadOkAfterWrite(t *testing.T) { - var seq SeqCount - epoch := seq.BeginRead() - seq.BeginWrite() - seq.EndWrite() - if seq.ReadOk(epoch) { - t.Errorf("ReadOk: got true, wanted false") - } -} - -func TestSeqCountReadOkDuringWrite(t *testing.T) { - var seq SeqCount - epoch := seq.BeginRead() - seq.BeginWrite() - if seq.ReadOk(epoch) { - t.Errorf("ReadOk: got true, wanted false") - } - seq.EndWrite() -} - -func BenchmarkSeqCountWriteUncontended(b *testing.B) { - var seq SeqCount - for i := 0; i < b.N; i++ { - seq.BeginWrite() - seq.EndWrite() - } -} - -func BenchmarkSeqCountReadUncontended(b *testing.B) { - var seq SeqCount - b.RunParallel(func(pb *testing.PB) { - for pb.Next() { - epoch := seq.BeginRead() - if !seq.ReadOk(epoch) { - b.Fatalf("ReadOk: got false, wanted true") - } - } - }) -} - -func TestPointersInType(t *testing.T) { - for _, test := range []struct { - name string // used 
for both test and value name - val interface{} - ptrs []string - }{ - { - name: "EmptyStruct", - val: struct{}{}, - }, - { - name: "Int", - val: int(0), - }, - { - name: "MixedStruct", - val: struct { - b bool - I int - ExportedPtr *struct{} - unexportedPtr *struct{} - arr [2]int - ptrArr [2]*int - nestedStruct struct { - nestedNonptr int - nestedPtr *int - } - structArr [1]struct { - nonptr int - ptr *int - } - }{}, - ptrs: []string{ - "MixedStruct.ExportedPtr", - "MixedStruct.unexportedPtr", - "MixedStruct.ptrArr[]", - "MixedStruct.nestedStruct.nestedPtr", - "MixedStruct.structArr[].ptr", - }, - }, - } { - t.Run(test.name, func(t *testing.T) { - typ := reflect.TypeOf(test.val) - ptrs := PointersInType(typ, test.name) - t.Logf("Found pointers: %v", ptrs) - if (len(ptrs) != 0 || len(test.ptrs) != 0) && !reflect.DeepEqual(ptrs, test.ptrs) { - t.Errorf("Got %v, wanted %v", ptrs, test.ptrs) - } - }) - } -} diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD index fa82f8e9b..d412e1ccf 100644 --- a/tools/go_marshal/test/BUILD +++ b/tools/go_marshal/test/BUILD @@ -1,9 +1,8 @@ load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_marshal:defs.bzl", "go_library") package(licenses = ["notice"]) -load("//tools/go_marshal:defs.bzl", "go_library") - package_group( name = "gomarshal_test", packages = [ diff --git a/tools/go_marshal/test/external/BUILD b/tools/go_marshal/test/external/BUILD index 8fb43179b..9bb89e1da 100644 --- a/tools/go_marshal/test/external/BUILD +++ b/tools/go_marshal/test/external/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) - load("//tools/go_marshal:defs.bzl", "go_library") +package(licenses = ["notice"]) + go_library( name = "external", testonly = 1, -- cgit v1.2.3 From 663fe840f79ac3d8e2ce1a1f1409d84cf2a9d37e Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 6 Dec 2019 14:32:53 -0800 Subject: Implement TTY field in control.Processes(). 
Threadgroups already know their TTY (if they have one), which now contains the TTY Index, and is returned in the Processes() call. PiperOrigin-RevId: 284263850 --- pkg/sentry/control/proc.go | 26 +++++++--- pkg/sentry/control/proc_test.go | 10 ++-- pkg/sentry/fs/tty/terminal.go | 4 +- pkg/sentry/kernel/tty.go | 11 +++++ runsc/container/container_test.go | 95 +++++++++++++++++++++++++++++++++++- runsc/container/test_app/BUILD | 1 + runsc/container/test_app/test_app.go | 40 +++++++++++++++ 7 files changed, 173 insertions(+), 14 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index c35faeb4c..a6f90b2bb 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -268,7 +268,6 @@ func (proc *Proc) Ps(args *PsArgs, out *string) error { } // Process contains information about a single process in a Sandbox. -// TODO(b/117881927): Implement TTY field. type Process struct { UID auth.KUID `json:"uid"` PID kernel.ThreadID `json:"pid"` @@ -276,6 +275,9 @@ type Process struct { PPID kernel.ThreadID `json:"ppid"` // Processor utilization C int32 `json:"c"` + // TTY name of the process. Will be of the form "pts/N" if there is a + // TTY, or "?" if there is not. 
+ TTY string `json:"tty"` // Start time STime string `json:"stime"` // CPU time @@ -285,18 +287,19 @@ type Process struct { } // ProcessListToTable prints a table with the following format: -// UID PID PPID C STIME TIME CMD -// 0 1 0 0 14:04 505262ns tail +// UID PID PPID C TTY STIME TIME CMD +// 0 1 0 0 pts/4 14:04 505262ns tail func ProcessListToTable(pl []*Process) string { var buf bytes.Buffer tw := tabwriter.NewWriter(&buf, 10, 1, 3, ' ', 0) - fmt.Fprint(tw, "UID\tPID\tPPID\tC\tSTIME\tTIME\tCMD") + fmt.Fprint(tw, "UID\tPID\tPPID\tC\tTTY\tSTIME\tTIME\tCMD") for _, d := range pl { - fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s", + fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s", d.UID, d.PID, d.PPID, d.C, + d.TTY, d.STime, d.Time, d.Cmd) @@ -347,7 +350,7 @@ func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error { if p := tg.Leader().Parent(); p != nil { ppid = p.PIDNamespace().IDOfThreadGroup(p.ThreadGroup()) } - *out = append(*out, &Process{ + p := Process{ UID: tg.Leader().Credentials().EffectiveKUID, PID: pid, PPID: ppid, @@ -355,7 +358,9 @@ func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error { C: percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now), Time: tg.CPUStats().SysTime.String(), Cmd: tg.Leader().Name(), - }) + TTY: ttyName(tg.TTY()), + } + *out = append(*out, &p) } sort.Slice(*out, func(i, j int) bool { return (*out)[i].PID < (*out)[j].PID }) return nil @@ -395,3 +400,10 @@ func percentCPU(stats usage.CPUStats, startTime, now ktime.Time) int32 { } return int32(percentCPU) } + +func ttyName(tty *kernel.TTY) string { + if tty == nil { + return "?"
+ } + return fmt.Sprintf("pts/%d", tty.Index) +} diff --git a/pkg/sentry/control/proc_test.go b/pkg/sentry/control/proc_test.go index d8ada2694..0a88459b2 100644 --- a/pkg/sentry/control/proc_test.go +++ b/pkg/sentry/control/proc_test.go @@ -34,7 +34,7 @@ func TestProcessListTable(t *testing.T) { }{ { pl: []*Process{}, - expected: "UID PID PPID C STIME TIME CMD", + expected: "UID PID PPID C TTY STIME TIME CMD", }, { pl: []*Process{ @@ -43,6 +43,7 @@ func TestProcessListTable(t *testing.T) { PID: 0, PPID: 0, C: 0, + TTY: "?", STime: "0", Time: "0", Cmd: "zero", @@ -52,14 +53,15 @@ func TestProcessListTable(t *testing.T) { PID: 1, PPID: 1, C: 1, + TTY: "pts/4", STime: "1", Time: "1", Cmd: "one", }, }, - expected: `UID PID PPID C STIME TIME CMD -0 0 0 0 0 0 zero -1 1 1 1 1 1 one`, + expected: `UID PID PPID C TTY STIME TIME CMD +0 0 0 0 ? 0 0 zero +1 1 1 1 pts/4 1 1 one`, }, } diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index ff8138820..917f90cc0 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -53,8 +53,8 @@ func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal d: d, n: n, ld: newLineDiscipline(termios), - masterKTTY: &kernel.TTY{}, - slaveKTTY: &kernel.TTY{}, + masterKTTY: &kernel.TTY{Index: n}, + slaveKTTY: &kernel.TTY{Index: n}, } t.EnableLeakCheck("tty.Terminal") return &t diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go index 34f84487a..048de26dc 100644 --- a/pkg/sentry/kernel/tty.go +++ b/pkg/sentry/kernel/tty.go @@ -21,8 +21,19 @@ import "sync" // // +stateify savable type TTY struct { + // Index is the terminal index. It is immutable. + Index uint32 + mu sync.Mutex `state:"nosave"` // tg is protected by mu. tg *ThreadGroup } + +// TTY returns the thread group's controlling terminal. If nil, there is no +// controlling terminal. 
+func (tg *ThreadGroup) TTY() *TTY { + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + return tg.tty +} diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 07eacaac0..1d06f2780 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -98,10 +98,14 @@ func procListsEqual(got, want []*control.Process) bool { for i := range got { pd1 := got[i] pd2 := want[i] - // Zero out unimplemented and timing dependant fields. + // Zero out timing dependant fields. pd1.Time = "" pd1.STime = "" pd1.C = 0 + // Ignore TTY field too, since it's not relevant in the cases + // where we use this method. Tests that care about the TTY + // field should check for it themselves. + pd1.TTY = "" if *pd1 != *pd2 { return false } @@ -2112,6 +2116,95 @@ func TestOverlayfsStaleRead(t *testing.T) { } } +// TestTTYField checks TTY field returned by container.Processes(). +func TestTTYField(t *testing.T) { + stop := testutil.StartReaper() + defer stop() + + testApp, err := testutil.FindFile("runsc/container/test_app/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + testCases := []struct { + name string + useTTY bool + wantTTYField string + }{ + { + name: "no tty", + useTTY: false, + wantTTYField: "?", + }, + { + name: "tty used", + useTTY: true, + wantTTYField: "pts/0", + }, + } + + for _, test := range testCases { + t.Run(test.name, func(t *testing.T) { + conf := testutil.TestConfig() + + // We will run /bin/sleep, possibly with an open TTY. + cmd := []string{"/bin/sleep", "10000"} + if test.useTTY { + // Run inside the "pty-runner". + cmd = append([]string{testApp, "pty-runner"}, cmd...) + } + + spec := testutil.NewSpecWithArgs(cmd...) + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. 
+ args := Args{ + ID: testutil.UniqueContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Wait for sleep to be running, and check the TTY + // field. + var gotTTYField string + cb := func() error { + ps, err := c.Processes() + if err != nil { + err = fmt.Errorf("error getting process data from container: %v", err) + return &backoff.PermanentError{Err: err} + } + for _, p := range ps { + if strings.Contains(p.Cmd, "sleep") { + gotTTYField = p.TTY + return nil + } + } + return fmt.Errorf("sleep not running") + } + if err := testutil.Poll(cb, 30*time.Second); err != nil { + t.Fatalf("error waiting for sleep process: %v", err) + } + + if gotTTYField != test.wantTTYField { + t.Errorf("tty field got %q, want %q", gotTTYField, test.wantTTYField) + } + }) + } +} + // executeSync synchronously executes a new process. 
func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { pid, err := cont.Execute(args) diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD index 9bf9e6e9d..bfd338bb6 100644 --- a/runsc/container/test_app/BUILD +++ b/runsc/container/test_app/BUILD @@ -15,5 +15,6 @@ go_binary( "//pkg/unet", "//runsc/testutil", "@com_github_google_subcommands//:go_default_library", + "@com_github_kr_pty//:go_default_library", ], ) diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go index 913d781c6..a1c8a741a 100644 --- a/runsc/container/test_app/test_app.go +++ b/runsc/container/test_app/test_app.go @@ -19,6 +19,7 @@ package main import ( "context" "fmt" + "io" "io/ioutil" "log" "net" @@ -31,6 +32,7 @@ import ( "flag" "github.com/google/subcommands" + "github.com/kr/pty" "gvisor.dev/gvisor/runsc/testutil" ) @@ -41,6 +43,7 @@ func main() { subcommands.Register(new(fdReceiver), "") subcommands.Register(new(fdSender), "") subcommands.Register(new(forkBomb), "") + subcommands.Register(new(ptyRunner), "") subcommands.Register(new(reaper), "") subcommands.Register(new(syscall), "") subcommands.Register(new(taskTree), "") @@ -352,3 +355,40 @@ func (c *capability) Execute(ctx context.Context, f *flag.FlagSet, args ...inter return subcommands.ExitSuccess } + +type ptyRunner struct{} + +// Name implements subcommands.Command. +func (*ptyRunner) Name() string { + return "pty-runner" +} + +// Synopsis implements subcommands.Command. +func (*ptyRunner) Synopsis() string { + return "runs the given command with an open pty terminal" +} + +// Usage implements subcommands.Command. +func (*ptyRunner) Usage() string { + return "pty-runner [command]" +} + +// SetFlags implements subcommands.Command.SetFlags. +func (*ptyRunner) SetFlags(f *flag.FlagSet) {} + +// Execute implements subcommands.Command. 
+func (*ptyRunner) Execute(_ context.Context, fs *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus { + c := exec.Command(fs.Args()[0], fs.Args()[1:]...) + f, err := pty.Start(c) + if err != nil { + fmt.Printf("pty.Start failed: %v", err) + return subcommands.ExitFailure + } + defer f.Close() + + // Copy stdout from the command to keep this process alive until the + // subprocess exits. + io.Copy(os.Stdout, f) + + return subcommands.ExitSuccess +} -- cgit v1.2.3 From 371e210b83c244d8828ad2fa1b3d7cef15fbf463 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 6 Dec 2019 16:58:28 -0800 Subject: Add runtime tracing. This adds meaningful annotations to the trace generated by the runtime/trace package. PiperOrigin-RevId: 284290115 --- pkg/sentry/control/pprof.go | 15 ++++++- pkg/sentry/kernel/kernel.go | 20 ++++++++- pkg/sentry/kernel/syscalls.go | 8 ++++ pkg/sentry/kernel/task.go | 20 +++++---- pkg/sentry/kernel/task_block.go | 8 +++- pkg/sentry/kernel/task_clone.go | 1 + pkg/sentry/kernel/task_exec.go | 3 +- pkg/sentry/kernel/task_exit.go | 1 + pkg/sentry/kernel/task_log.go | 86 +++++++++++++++++++++++++++++++++++++-- pkg/sentry/kernel/task_run.go | 14 +++++++ pkg/sentry/kernel/task_start.go | 8 ++-- pkg/sentry/kernel/task_syscall.go | 8 ++++ runsc/boot/controller.go | 4 +- runsc/cmd/debug.go | 29 +++++++------ scripts/dev.sh | 3 +- 15 files changed, 190 insertions(+), 38 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go index 1f78d54a2..e1f2fea60 100644 --- a/pkg/sentry/control/pprof.go +++ b/pkg/sentry/control/pprof.go @@ -22,6 +22,7 @@ import ( "sync" "gvisor.dev/gvisor/pkg/fd" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/urpc" ) @@ -56,6 +57,9 @@ type Profile struct { // traceFile is the current execution trace output file. traceFile *fd.FD + + // Kernel is the kernel under profile. 
+ Kernel *kernel.Kernel } // StartCPUProfile is an RPC stub which starts recording the CPU profile in a @@ -147,6 +151,9 @@ func (p *Profile) StartTrace(o *ProfileOpts, _ *struct{}) error { return err } + // Ensure all trace contexts are registered. + p.Kernel.RebuildTraceContexts() + p.traceFile = output return nil } @@ -158,9 +165,15 @@ func (p *Profile) StopTrace(_, _ *struct{}) error { defer p.mu.Unlock() if p.traceFile == nil { - return errors.New("Execution tracing not start") + return errors.New("Execution tracing not started") } + // Similarly to the case above, if tasks have not ended traces, we will + // lose information. Thus we need to rebuild the tasks in order to have + // complete information. This will not lose information if multiple + // traces are overlapping. + p.Kernel.RebuildTraceContexts() + trace.Stop() p.traceFile.Close() p.traceFile = nil diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 28ba950bd..bd3fb4c03 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -841,9 +841,11 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, AbstractSocketNamespace: args.AbstractSocketNamespace, ContainerID: args.ContainerID, } - if _, err := k.tasks.NewTask(config); err != nil { + t, err := k.tasks.NewTask(config) + if err != nil { return nil, 0, err } + t.traceExecEvent(tc) // Simulate exec for tracing. // Success. tgid := k.tasks.Root.IDOfThreadGroup(tg) @@ -1118,6 +1120,22 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { return lastErr } +// RebuildTraceContexts rebuilds the trace context for all tasks. +// +// Unfortunately, if these are built while tracing is not enabled, then we will +// not have meaningful trace data. Rebuilding here ensures that we can do so +// after tracing has been enabled. 
+func (k *Kernel) RebuildTraceContexts() { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + + for t, tid := range k.tasks.Root.tids { + t.rebuildTraceContext(tid) + } +} + // FeatureSet returns the FeatureSet. func (k *Kernel) FeatureSet() *cpuid.FeatureSet { return k.featureSet diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 220fa73a2..2fdee0282 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -339,6 +339,14 @@ func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn { return nil } +// LookupName looks up a syscall name. +func (s *SyscallTable) LookupName(sysno uintptr) string { + if sc, ok := s.Table[sysno]; ok { + return sc.Name + } + return fmt.Sprintf("sys_%d", sysno) // Unlikely. +} + // LookupEmulate looks up an emulation syscall number. func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { sysno, ok := s.Emulate[addr] diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 80c8e5464..ab0c6c4aa 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -15,6 +15,8 @@ package kernel import ( + gocontext "context" + "runtime/trace" "sync" "sync/atomic" @@ -390,7 +392,14 @@ type Task struct { // logPrefix is a string containing the task's thread ID in the root PID // namespace, and is prepended to log messages emitted by Task.Infof etc. - logPrefix atomic.Value `state:".(string)"` + logPrefix atomic.Value `state:"nosave"` + + // traceContext and traceTask are both used for tracing, and are + // updated along with the logPrefix in updateInfoLocked. + // + // These are exclusive to the task goroutine. + traceContext gocontext.Context `state:"nosave"` + traceTask *trace.Task `state:"nosave"` // creds is the task's credentials. 
// @@ -528,14 +537,6 @@ func (t *Task) loadPtraceTracer(tracer *Task) { t.ptraceTracer.Store(tracer) } -func (t *Task) saveLogPrefix() string { - return t.logPrefix.Load().(string) -} - -func (t *Task) loadLogPrefix(prefix string) { - t.logPrefix.Store(prefix) -} - func (t *Task) saveSyscallFilters() []bpf.Program { if f := t.syscallFilters.Load(); f != nil { return f.([]bpf.Program) @@ -549,6 +550,7 @@ func (t *Task) loadSyscallFilters(filters []bpf.Program) { // afterLoad is invoked by stateify. func (t *Task) afterLoad() { + t.updateInfoLocked() t.interruptChan = make(chan struct{}, 1) t.gosched.State = TaskGoroutineNonexistent if t.stop != nil { diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index dd69939f9..4a4a69ee2 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -16,6 +16,7 @@ package kernel import ( "runtime" + "runtime/trace" "time" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -133,19 +134,24 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error { runtime.Gosched() } + region := trace.StartRegion(t.traceContext, blockRegion) select { case <-C: + region.End() t.SleepFinish(true) + // Woken by event. return nil case <-interrupt: + region.End() t.SleepFinish(false) // Return the indicated error on interrupt. return syserror.ErrInterrupted case <-timerChan: - // We've timed out. + region.End() t.SleepFinish(true) + // We've timed out. return syserror.ETIMEDOUT } } diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 0916fd658..3eadfedb4 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -299,6 +299,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // nt that it must receive before its task goroutine starts running. 
tid := nt.k.tasks.Root.IDOfTask(nt) defer nt.Start(tid) + t.traceCloneEvent(tid) // "If fork/clone and execve are allowed by @prog, any child processes will // be constrained to the same filters and system call ABI as the parent." - diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 17a089b90..90a6190f1 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -129,6 +129,7 @@ type runSyscallAfterExecStop struct { } func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { + t.traceExecEvent(r.tc) t.tg.pidns.owner.mu.Lock() t.tg.execing = nil if t.killed() { @@ -253,7 +254,7 @@ func (t *Task) promoteLocked() { t.tg.leader = t t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t]) - t.updateLogPrefixLocked() + t.updateInfoLocked() // Reap the original leader. If it has a tracer, detach it instead of // waiting for it to acknowledge the original leader's death. oldLeader.exitParentNotified = true diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 535f03e50..435761e5a 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -236,6 +236,7 @@ func (*runExit) execute(t *Task) taskRunState { type runExitMain struct{} func (*runExitMain) execute(t *Task) taskRunState { + t.traceExitEvent() lastExiter := t.exitThreadGroup() // If the task has a cleartid, and the thread group wasn't killed by a diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index a29e9b9eb..0fb3661de 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -16,6 +16,7 @@ package kernel import ( "fmt" + "runtime/trace" "sort" "gvisor.dev/gvisor/pkg/log" @@ -127,11 +128,88 @@ func (t *Task) debugDumpStack() { } } -// updateLogPrefix updates the task's cached log prefix to reflect its -// current thread ID. +// trace definitions. 
+// +// Note that all region names are prefixed by ':' in order to ensure that they +// are lexically ordered before all system calls, which use the naked system +// call name (e.g. "read") for maximum clarity. +const ( + traceCategory = "task" + runRegion = ":run" + blockRegion = ":block" + cpuidRegion = ":cpuid" + faultRegion = ":fault" +) + +// updateInfoLocked updates the task's cached log prefix and tracing +// information to reflect its current thread ID. // // Preconditions: The task's owning TaskSet.mu must be locked. -func (t *Task) updateLogPrefixLocked() { +func (t *Task) updateInfoLocked() { // Use the task's TID in the root PID namespace for logging. - t.logPrefix.Store(fmt.Sprintf("[% 4d] ", t.tg.pidns.owner.Root.tids[t])) + tid := t.tg.pidns.owner.Root.tids[t] + t.logPrefix.Store(fmt.Sprintf("[% 4d] ", tid)) + t.rebuildTraceContext(tid) +} + +// rebuildTraceContext rebuilds the trace context. +// +// Precondition: the passed tid must be the tid in the root namespace. +func (t *Task) rebuildTraceContext(tid ThreadID) { + // Re-initialize the trace context. + if t.traceTask != nil { + t.traceTask.End() + } + + // Note that we define the "task type" to be the dynamic TID. This does + // not align perfectly with the documentation for "tasks" in the + // tracing package. Tasks may be assumed to be bounded by analysis + // tools. However, if we just use a generic "task" type here, then the + // "user-defined tasks" page on the tracing dashboard becomes nearly + // unusable, as it loads all traces from all tasks. + // + // We can assume that the number of tasks in the system is not + // arbitrarily large (in general it won't be, especially for cases + // where we're collecting a brief profile), so using the TID is a + // reasonable compromise in this case. + t.traceContext, t.traceTask = trace.NewTask(t, fmt.Sprintf("tid:%d", tid)) +} + +// traceCloneEvent is called when a new task is spawned. 
+// +// ntid must be the new task's ThreadID in the root namespace. +func (t *Task) traceCloneEvent(ntid ThreadID) { + if !trace.IsEnabled() { + return + } + trace.Logf(t.traceContext, traceCategory, "spawn: %d", ntid) +} + +// traceExitEvent is called when a task exits. +func (t *Task) traceExitEvent() { + if !trace.IsEnabled() { + return + } + trace.Logf(t.traceContext, traceCategory, "exit status: 0x%x", t.exitStatus.Status()) +} + +// traceExecEvent is called when a task calls exec. +func (t *Task) traceExecEvent(tc *TaskContext) { + if !trace.IsEnabled() { + return + } + d := tc.MemoryManager.Executable() + if d == nil { + trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") + return + } + defer d.DecRef() + root := t.fsContext.RootDirectory() + if root == nil { + trace.Logf(t.traceContext, traceCategory, "exec: << no root directory >>") + return + } + defer root.DecRef() + n, _ := d.FullName(root) + trace.Logf(t.traceContext, traceCategory, "exec: %s", n) } diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index c92266c59..d97f8c189 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -17,6 +17,7 @@ package kernel import ( "bytes" "runtime" + "runtime/trace" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -205,9 +206,11 @@ func (*runApp) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.RUnlock() } + region := trace.StartRegion(t.traceContext, runRegion) t.accountTaskGoroutineEnter(TaskGoroutineRunningApp) info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU) t.accountTaskGoroutineLeave(TaskGoroutineRunningApp) + region.End() if clearSinglestep { t.Arch().ClearSingleStep() @@ -225,6 +228,7 @@ func (*runApp) execute(t *Task) taskRunState { case platform.ErrContextSignalCPUID: // Is this a CPUID instruction? 
+ region := trace.StartRegion(t.traceContext, cpuidRegion) expected := arch.CPUIDInstruction[:] found := make([]byte, len(expected)) _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found) @@ -232,10 +236,12 @@ func (*runApp) execute(t *Task) taskRunState { // Skip the cpuid instruction. t.Arch().CPUIDEmulate(t) t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected))) + region.End() // Resume execution. return (*runApp)(nil) } + region.End() // Not an actual CPUID, but required copy-in. // The instruction at the given RIP was not a CPUID, and we // fallthrough to the default signal deliver behavior below. @@ -251,8 +257,10 @@ func (*runApp) execute(t *Task) taskRunState { // an application-generated signal and we should continue execution // normally. if at.Any() { + region := trace.StartRegion(t.traceContext, faultRegion) addr := usermem.Addr(info.Addr()) err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack())) + region.End() if err == nil { // The fault was handled appropriately. // We can resume running the application. @@ -260,6 +268,12 @@ func (*runApp) execute(t *Task) taskRunState { } // Is this a vsyscall that we need emulate? + // + // Note that we don't track vsyscalls as part of a + // specific trace region. This is because regions don't + // stack, and the actual system call will count as a + // region. We should be able to easily identify + // vsyscalls by having a pair. if at.Execute { if sysno, ok := t.tc.st.LookupEmulate(addr); ok { return t.doVsyscall(addr, sysno) diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index ae6fc4025..3522a4ae5 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -154,10 +154,10 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { // Below this point, newTask is expected not to fail (there is no rollback // of assignTIDsLocked or any of the following). 
- // Logging on t's behalf will panic if t.logPrefix hasn't been initialized. - // This is the earliest point at which we can do so (since t now has thread - // IDs). - t.updateLogPrefixLocked() + // Logging on t's behalf will panic if t.logPrefix hasn't been + // initialized. This is the earliest point at which we can do so + // (since t now has thread IDs). + t.updateInfoLocked() if cfg.InheritParent != nil { t.parent = cfg.InheritParent.parent diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index b543d536a..3180f5560 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -17,6 +17,7 @@ package kernel import ( "fmt" "os" + "runtime/trace" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -160,6 +161,10 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u ctrl = ctrlStopAndReinvokeSyscall } else { fn := s.Lookup(sysno) + var region *trace.Region // Only non-nil if tracing == true. + if trace.IsEnabled() { + region = trace.StartRegion(t.traceContext, s.LookupName(sysno)) + } if fn != nil { // Call our syscall implementation. rval, ctrl, err = fn(t, args) @@ -167,6 +172,9 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u // Use the missing function if not found. 
rval, err = t.SyscallTable().Missing(t, sysno, args) } + if region != nil { + region.End() + } } if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) { diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index f62be4c59..9c9e94864 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -152,7 +152,9 @@ func newController(fd int, l *Loader) (*controller, error) { srv.Register(&debug{}) srv.Register(&control.Logging{}) if l.conf.ProfileEnable { - srv.Register(&control.Profile{}) + srv.Register(&control.Profile{ + Kernel: l.k, + }) } return &controller{ diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index 7313e473f..38da7ee02 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -32,16 +32,16 @@ import ( // Debug implements subcommands.Command for the "debug" command. type Debug struct { - pid int - stacks bool - signal int - profileHeap string - profileCPU string - profileDelay int - trace string - strace string - logLevel string - logPackets string + pid int + stacks bool + signal int + profileHeap string + profileCPU string + trace string + strace string + logLevel string + logPackets string + duration time.Duration } // Name implements subcommands.Command. 
@@ -65,7 +65,7 @@ func (d *Debug) SetFlags(f *flag.FlagSet) { f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log") f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.") f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.") - f.IntVar(&d.profileDelay, "profile-delay", 5, "amount of time to wait before stoping CPU profile") + f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles") f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.") f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox") f.StringVar(&d.strace, "strace", "", `A comma separated list of syscalls to trace. "all" enables all traces, "off" disables all`) @@ -163,7 +163,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if err := c.Sandbox.StartCPUProfile(f); err != nil { return Errorf(err.Error()) } - log.Infof("CPU profile started for %d sec, writing to %q", d.profileDelay, d.profileCPU) + log.Infof("CPU profile started for %v, writing to %q", d.duration, d.profileCPU) } if d.trace != "" { delay = true @@ -181,8 +181,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if err := c.Sandbox.StartTrace(f); err != nil { return Errorf(err.Error()) } - log.Infof("Tracing started for %d sec, writing to %q", d.profileDelay, d.trace) - + log.Infof("Tracing started for %v, writing to %q", d.duration, d.trace) } if d.strace != "" || len(d.logLevel) != 0 || len(d.logPackets) != 0 { @@ -243,7 +242,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } if delay { - time.Sleep(time.Duration(d.profileDelay) * time.Second) + time.Sleep(d.duration) } return subcommands.ExitSuccess diff --git a/scripts/dev.sh b/scripts/dev.sh index c67003018..6238b4d0b 100755 --- a/scripts/dev.sh +++ b/scripts/dev.sh @@ -54,9 +54,10 @@ declare 
OUTPUT="$(build //runsc)" if [[ ${REFRESH} -eq 0 ]]; then install_runsc "${RUNTIME}" --net-raw install_runsc "${RUNTIME}-d" --net-raw --debug --strace --log-packets + install_runsc "${RUNTIME}-p" --net-raw --profile echo - echo "Runtimes ${RUNTIME} and ${RUNTIME}-d (debug enabled) setup." + echo "Runtimes ${RUNTIME}, ${RUNTIME}-d (debug enabled), and ${RUNTIME}-p installed." echo "Use --runtime="${RUNTIME}" with your Docker command." echo " docker run --rm --runtime="${RUNTIME}" hello-world" echo -- cgit v1.2.3 From 898dcc2f839a975a9171271824af32176c2e5c27 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 9 Dec 2019 12:03:31 -0800 Subject: Redirect TODOs to gvisor.dev PiperOrigin-RevId: 284606233 --- pkg/sentry/fs/gofer/session.go | 6 +++--- pkg/sentry/kernel/semaphore/semaphore.go | 6 +++--- pkg/sentry/syscalls/linux/linux64_amd64.go | 2 +- pkg/sentry/syscalls/linux/linux64_arm64.go | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 0da608548..4e358a46a 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -143,9 +143,9 @@ type session struct { // socket files. This allows unix domain sockets to be used with paths that // belong to a gofer. // - // TODO(b/77154739): there are few possible races with someone stat'ing the - // file and another deleting it concurrently, where the file will not be - // reported as socket file. + // TODO(gvisor.dev/issue/1200): there are few possible races with someone + // stat'ing the file and another deleting it concurrently, where the file + // will not be reported as socket file. 
endpoints *endpointMaps `state:"wait"` } diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 93fe68a3e..de9617e9d 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -302,7 +302,7 @@ func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Cred return syserror.ERANGE } - // TODO(b/29354920): Clear undo entries in all processes + // TODO(gvisor.dev/issue/137): Clear undo entries in all processes. sem.value = val sem.pid = pid s.changeTime = ktime.NowFromContext(ctx) @@ -336,7 +336,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti for i, val := range vals { sem := &s.sems[i] - // TODO(b/29354920): Clear undo entries in all processes + // TODO(gvisor.dev/issue/137): Clear undo entries in all processes. sem.value = int16(val) sem.pid = pid sem.wakeWaiters() @@ -481,7 +481,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch } // All operations succeeded, apply them. - // TODO(b/29354920): handle undo operations. + // TODO(gvisor.dev/issue/137): handle undo operations. 
for i, v := range tmpVals { s.sems[i].value = v s.sems[i].wakeWaiters() diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go index 81e4f93a6..5642d69ea 100644 --- a/pkg/sentry/syscalls/linux/linux64_amd64.go +++ b/pkg/sentry/syscalls/linux/linux64_amd64.go @@ -260,7 +260,7 @@ var AMD64 = &kernel.SyscallTable{ 217: syscalls.Supported("getdents64", Getdents64), 218: syscalls.Supported("set_tid_address", SetTidAddress), 219: syscalls.Supported("restart_syscall", RestartSyscall), - 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920) + 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), 221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), 222: syscalls.Supported("timer_create", TimerCreate), 223: syscalls.Supported("timer_settime", TimerSettime), diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go index f1dd4b0c0..f897bfff8 100644 --- a/pkg/sentry/syscalls/linux/linux64_arm64.go +++ b/pkg/sentry/syscalls/linux/linux64_arm64.go @@ -224,7 +224,7 @@ var ARM64 = &kernel.SyscallTable{ 189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) 190: syscalls.Supported("semget", Semget), 191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), - 192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920) + 192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), 193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), 194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), 195: 
syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), -- cgit v1.2.3 From 3c125eb21946e1f6bf8f22f4169baafb7f07bf60 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 26 Dec 2019 14:42:19 -0800 Subject: Initial procfs implementation in VFSv2 Updates #1195 PiperOrigin-RevId: 287227722 --- pkg/sentry/fsimpl/kernfs/BUILD | 1 + pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 20 +- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 5 +- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 20 ++ pkg/sentry/fsimpl/kernfs/kernfs.go | 9 + pkg/sentry/fsimpl/kernfs/kernfs_test.go | 2 +- pkg/sentry/fsimpl/kernfs/symlink.go | 45 +++ pkg/sentry/fsimpl/proc/BUILD | 33 +- pkg/sentry/fsimpl/proc/boot_test.go | 149 +++++++++ pkg/sentry/fsimpl/proc/filesystem.go | 69 +++++ pkg/sentry/fsimpl/proc/filesystems.go | 25 -- pkg/sentry/fsimpl/proc/loadavg.go | 8 +- pkg/sentry/fsimpl/proc/meminfo.go | 6 +- pkg/sentry/fsimpl/proc/proc.go | 16 - pkg/sentry/fsimpl/proc/stat.go | 6 +- pkg/sentry/fsimpl/proc/task.go | 341 ++++++++------------ pkg/sentry/fsimpl/proc/task_files.go | 272 ++++++++++++++++ pkg/sentry/fsimpl/proc/tasks.go | 162 ++++++++++ pkg/sentry/fsimpl/proc/tasks_files.go | 92 ++++++ pkg/sentry/fsimpl/proc/tasks_test.go | 412 +++++++++++++++++++++++++ pkg/sentry/fsimpl/proc/version.go | 6 +- pkg/sentry/kernel/kernel.go | 7 +- pkg/sentry/kernel/task_clone.go | 2 +- pkg/sentry/kernel/thread_group.go | 8 +- pkg/sentry/vfs/file_description_impl_util.go | 11 + 25 files changed, 1454 insertions(+), 273 deletions(-) create mode 100644 pkg/sentry/fsimpl/kernfs/symlink.go create mode 100644 pkg/sentry/fsimpl/proc/boot_test.go create mode 100644 pkg/sentry/fsimpl/proc/filesystem.go delete mode 100644 pkg/sentry/fsimpl/proc/filesystems.go delete mode 100644 pkg/sentry/fsimpl/proc/proc.go create mode 100644 pkg/sentry/fsimpl/proc/task_files.go create mode 100644 pkg/sentry/fsimpl/proc/tasks.go create mode 100644 pkg/sentry/fsimpl/proc/tasks_files.go 
create mode 100644 pkg/sentry/fsimpl/proc/tasks_test.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index 59f7f39e2..39c03ee9d 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -25,6 +25,7 @@ go_library( "inode_impl_util.go", "kernfs.go", "slot_list.go", + "symlink.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs", visibility = ["//pkg/sentry:internal"], diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 51102ce48..c5fe65722 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -15,6 +15,8 @@ package kernfs import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -26,7 +28,10 @@ import ( // DynamicBytesFile implements kernfs.Inode and represents a read-only // file whose contents are backed by a vfs.DynamicBytesSource. // -// Must be initialized with Init before first use. +// Must be instantiated with NewDynamicBytesFile or initialized with Init +// before first use. +// +// +stateify savable type DynamicBytesFile struct { InodeAttrs InodeNoopRefCount @@ -36,9 +41,14 @@ type DynamicBytesFile struct { data vfs.DynamicBytesSource } -// Init intializes a dynamic bytes file. -func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource) { - f.InodeAttrs.Init(creds, ino, linux.ModeRegular|0444) +var _ Inode = (*DynamicBytesFile)(nil) + +// Init initializes a dynamic bytes file. 
+func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) { + if perm&^linux.PermissionsMask != 0 { + panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) + } + f.InodeAttrs.Init(creds, ino, linux.ModeRegular|perm) f.data = data } @@ -59,6 +69,8 @@ func (f *DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { // DynamicBytesFile. // // Must be initialized with Init before first use. +// +// +stateify savable type DynamicBytesFD struct { vfs.FileDescriptionDefaultImpl vfs.DynamicBytesFileDescriptionImpl diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index bd402330f..77975583b 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -154,7 +154,10 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent fd.off++ } - return nil + var err error + relOffset := fd.off - int64(len(fd.children.set)) - 2 + fd.off, err = fd.inode().IterDirents(ctx, cb, fd.off, relOffset) + return err } // Seek implements vfs.FileDecriptionImpl.Seek. diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 7b45b702a..752e0f659 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -139,6 +139,11 @@ func (*InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, panic("Lookup called on non-directory inode") } +// IterDirents implements Inode.IterDirents. +func (*InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { + panic("IterDirents called on non-directory inode") +} + // Valid implements Inode.Valid. 
func (*InodeNotDirectory) Valid(context.Context) bool { return true @@ -156,6 +161,11 @@ func (*InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dent return nil, syserror.ENOENT } +// IterDirents implements Inode.IterDirents. +func (*InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + return offset, nil +} + // Valid implements Inode.Valid. func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool { return true @@ -490,3 +500,13 @@ func (o *OrderedChildren) nthLocked(i int64) *slot { } return nil } + +// InodeSymlink partially implements Inode interface for symlinks. +type InodeSymlink struct { + InodeNotDirectory +} + +// Open implements Inode.Open. +func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + return nil, syserror.ELOOP +} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index ac802218d..d69b299ae 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -404,6 +404,15 @@ type inodeDynamicLookup interface { // Valid should return true if this inode is still valid, or needs to // be resolved again by a call to Lookup. Valid(ctx context.Context) bool + + // IterDirents is used to iterate over dynamically created entries. It invokes + // cb on each entry in the directory represented by the FileDescription. + // 'offset' is the offset for the entire IterDirents call, which may include + // results from the caller. 'relOffset' is the offset inside the entries + // returned by this IterDirents invocation. In other words, + // 'offset+relOffset+1' is the value that should be set in vfs.Dirent.NextOff, + // while 'relOffset' is the place where iteration should start from. 
+ IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) } type inodeSymlink interface { diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 73b6e43b5..3db12caa0 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -133,7 +133,7 @@ type file struct { func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry { f := &file{} f.content = content - f.DynamicBytesFile.Init(creds, fs.NextIno(), f) + f.DynamicBytesFile.Init(creds, fs.NextIno(), f, 0777) d := &kernfs.Dentry{} d.Init(f) diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go new file mode 100644 index 000000000..068063f4e --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -0,0 +1,45 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +type staticSymlink struct { + InodeAttrs + InodeNoopRefCount + InodeSymlink + + target string +} + +var _ Inode = (*staticSymlink)(nil) + +// NewStaticSymlink creates a new symlink file pointing to 'target'. 
+func NewStaticSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, target string) *Dentry { + inode := &staticSymlink{target: target} + inode.Init(creds, ino, linux.ModeSymlink|perm) + + d := &Dentry{} + d.Init(inode) + return d +} + +func (s *staticSymlink) Readlink(_ context.Context) (string, error) { + return s.target, nil +} diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index ade6ac946..1f44b3217 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -6,15 +6,17 @@ package(licenses = ["notice"]) go_library( name = "proc", srcs = [ - "filesystems.go", + "filesystem.go", "loadavg.go", "meminfo.go", "mounts.go", "net.go", - "proc.go", "stat.go", "sys.go", "task.go", + "task_files.go", + "tasks.go", + "tasks_files.go", "version.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc", @@ -24,8 +26,10 @@ go_library( "//pkg/log", "//pkg/sentry/context", "//pkg/sentry/fs", + "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", "//pkg/sentry/limits", "//pkg/sentry/mm", "//pkg/sentry/socket", @@ -34,17 +38,40 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/sentry/vfs", + "//pkg/syserror", ], ) go_test( name = "proc_test", size = "small", - srcs = ["net_test.go"], + srcs = [ + "boot_test.go", + "net_test.go", + "tasks_test.go", + ], embed = [":proc"], deps = [ "//pkg/abi/linux", + "//pkg/cpuid", + "//pkg/fspath", + "//pkg/memutil", + "//pkg/sentry/context", "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/platform/kvm", + "//pkg/sentry/platform/ptrace", + "//pkg/sentry/time", + "//pkg/sentry/usermem", + "//pkg/sentry/vfs", + "//pkg/syserror", ], ) diff --git a/pkg/sentry/fsimpl/proc/boot_test.go 
b/pkg/sentry/fsimpl/proc/boot_test.go new file mode 100644 index 000000000..84a93ee56 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/boot_test.go @@ -0,0 +1,149 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "flag" + "fmt" + "os" + "runtime" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/time" + + // Platforms are pluggable. + _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm" + _ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" +) + +var ( + platformFlag = flag.String("platform", "ptrace", "specify which platform to use") +) + +// boot initializes a new bare bones kernel for test.
+func boot() (*kernel.Kernel, error) { + platformCtr, err := platform.Lookup(*platformFlag) + if err != nil { + return nil, fmt.Errorf("platform not found: %v", err) + } + deviceFile, err := platformCtr.OpenDevice() + if err != nil { + return nil, fmt.Errorf("creating platform: %v", err) + } + plat, err := platformCtr.New(deviceFile) + if err != nil { + return nil, fmt.Errorf("creating platform: %v", err) + } + + k := &kernel.Kernel{ + Platform: plat, + } + + mf, err := createMemoryFile() + if err != nil { + return nil, err + } + k.SetMemoryFile(mf) + + // Pass k as the platform since it is savable, unlike the actual platform. + vdso, err := loader.PrepareVDSO(nil, k) + if err != nil { + return nil, fmt.Errorf("creating vdso: %v", err) + } + + // Create timekeeper. + tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) + if err != nil { + return nil, fmt.Errorf("creating timekeeper: %v", err) + } + tk.SetClocks(time.NewCalibratedClocks()) + + creds := auth.NewRootCredentials(auth.NewRootUserNamespace()) + + // Initiate the Kernel object, which is required by the Context passed + // to createVFS in order to mount (among other things) procfs. + if err = k.Init(kernel.InitKernelArgs{ + ApplicationCores: uint(runtime.GOMAXPROCS(-1)), + FeatureSet: cpuid.HostFeatureSet(), + Timekeeper: tk, + RootUserNamespace: creds.UserNamespace, + Vdso: vdso, + RootUTSNamespace: kernel.NewUTSNamespace("hostname", "domain", creds.UserNamespace), + RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), + RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), + PIDNamespace: kernel.NewRootPIDNamespace(creds.UserNamespace), + }); err != nil { + return nil, fmt.Errorf("initializing kernel: %v", err) + } + + ctx := k.SupervisorContext() + + // Create mount namespace without root as it's the minimum required to create + // the global thread group. 
+ mntns, err := fs.NewMountNamespace(ctx, nil) + if err != nil { + return nil, err + } + ls, err := limits.NewLinuxLimitSet() + if err != nil { + return nil, err + } + tg := k.NewThreadGroup(mntns, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls) + k.TestOnly_SetGlobalInit(tg) + + return k, nil +} + +// createTask creates a new bare bones task for tests. +func createTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kernel.Task, error) { + k := kernel.KernelFromContext(ctx) + config := &kernel.TaskConfig{ + Kernel: k, + ThreadGroup: tc, + TaskContext: &kernel.TaskContext{Name: name}, + Credentials: auth.CredentialsFromContext(ctx), + AllowedCPUMask: sched.NewFullCPUSet(k.ApplicationCores()), + UTSNamespace: kernel.UTSNamespaceFromContext(ctx), + IPCNamespace: kernel.IPCNamespaceFromContext(ctx), + AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), + } + return k.TaskSet().NewTask(config) +} + +func createMemoryFile() (*pgalloc.MemoryFile, error) { + const memfileName = "test-memory" + memfd, err := memutil.CreateMemFD(memfileName, 0) + if err != nil { + return nil, fmt.Errorf("error creating memfd: %v", err) + } + memfile := os.NewFile(uintptr(memfd), memfileName) + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) + if err != nil { + memfile.Close() + return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) + } + return mf, nil +} diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go new file mode 100644 index 000000000..d09182c77 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -0,0 +1,69 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package proc implements a partial in-memory file system for procfs. +package proc + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// procFSType is the factory class for procfs. +// +// +stateify savable +type procFSType struct{} + +var _ vfs.FilesystemType = (*procFSType)(nil) + +// GetFilesystem implements vfs.FilesystemType. +func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + k := kernel.KernelFromContext(ctx) + if k == nil { + return nil, nil, fmt.Errorf("procfs requires a kernel") + } + pidns := kernel.PIDNamespaceFromContext(ctx) + if pidns == nil { + return nil, nil, fmt.Errorf("procfs requires a PID namespace") + } + + procfs := &kernfs.Filesystem{} + procfs.VFSFilesystem().Init(vfsObj, procfs) + + _, dentry := newTasksInode(procfs, k, pidns) + return procfs.VFSFilesystem(), dentry.VFSDentry(), nil +} + +// dynamicInode is an overfitted interface for common Inodes with +// dynamicByteSource types used in procfs. 
+type dynamicInode interface { + kernfs.Inode + vfs.DynamicBytesSource + + Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) +} + +func newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { + inode.Init(creds, ino, inode, perm) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} diff --git a/pkg/sentry/fsimpl/proc/filesystems.go b/pkg/sentry/fsimpl/proc/filesystems.go deleted file mode 100644 index 0e016bca5..000000000 --- a/pkg/sentry/fsimpl/proc/filesystems.go +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -// filesystemsData implements vfs.DynamicBytesSource for /proc/filesystems. -// -// +stateify savable -type filesystemsData struct{} - -// TODO(gvisor.dev/issue/1195): Implement vfs.DynamicBytesSource.Generate for -// filesystemsData. We would need to retrive filesystem names from -// vfs.VirtualFilesystem. Also needs vfs replacement for -// fs.Filesystem.AllowUserList() and fs.FilesystemRequiresDev. 
diff --git a/pkg/sentry/fsimpl/proc/loadavg.go b/pkg/sentry/fsimpl/proc/loadavg.go index 9135afef1..5351d86e8 100644 --- a/pkg/sentry/fsimpl/proc/loadavg.go +++ b/pkg/sentry/fsimpl/proc/loadavg.go @@ -19,15 +19,17 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" ) // loadavgData backs /proc/loadavg. // // +stateify savable -type loadavgData struct{} +type loadavgData struct { + kernfs.DynamicBytesFile +} -var _ vfs.DynamicBytesSource = (*loadavgData)(nil) +var _ dynamicInode = (*loadavgData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { diff --git a/pkg/sentry/fsimpl/proc/meminfo.go b/pkg/sentry/fsimpl/proc/meminfo.go index 9a827cd66..cbdd4f3fc 100644 --- a/pkg/sentry/fsimpl/proc/meminfo.go +++ b/pkg/sentry/fsimpl/proc/meminfo.go @@ -19,21 +19,23 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/sentry/vfs" ) // meminfoData implements vfs.DynamicBytesSource for /proc/meminfo. // // +stateify savable type meminfoData struct { + kernfs.DynamicBytesFile + // k is the owning Kernel. k *kernel.Kernel } -var _ vfs.DynamicBytesSource = (*meminfoData)(nil) +var _ dynamicInode = (*meminfoData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { diff --git a/pkg/sentry/fsimpl/proc/proc.go b/pkg/sentry/fsimpl/proc/proc.go deleted file mode 100644 index 31dec36de..000000000 --- a/pkg/sentry/fsimpl/proc/proc.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package proc implements a partial in-memory file system for procfs. -package proc diff --git a/pkg/sentry/fsimpl/proc/stat.go b/pkg/sentry/fsimpl/proc/stat.go index 720db3828..50894a534 100644 --- a/pkg/sentry/fsimpl/proc/stat.go +++ b/pkg/sentry/fsimpl/proc/stat.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/vfs" ) // cpuStats contains the breakdown of CPU time for /proc/stat. @@ -66,11 +66,13 @@ func (c cpuStats) String() string { // // +stateify savable type statData struct { + kernfs.DynamicBytesFile + // k is the owning Kernel. k *kernel.Kernel } -var _ vfs.DynamicBytesSource = (*statData)(nil) +var _ dynamicInode = (*statData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. 
func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error { diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 0d87be52b..11a64c777 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -15,251 +15,176 @@ package proc import ( - "bytes" - "fmt" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" ) -// mapsCommon is embedded by mapsData and smapsData. -type mapsCommon struct { - t *kernel.Task -} - -// mm gets the kernel task's MemoryManager. No additional reference is taken on -// mm here. This is safe because MemoryManager.destroy is required to leave the -// MemoryManager in a state where it's still usable as a DynamicBytesSource. -func (md *mapsCommon) mm() *mm.MemoryManager { - var tmm *mm.MemoryManager - md.t.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - tmm = mm - } - }) - return tmm -} - -// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. +// taskInode represents the inode for /proc/PID/ directory. // // +stateify savable -type mapsData struct { - mapsCommon +type taskInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNoDynamicLookup + kernfs.InodeAttrs + kernfs.OrderedChildren + + task *kernel.Task } -var _ vfs.DynamicBytesSource = (*mapsData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. 
-func (md *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { - if mm := md.mm(); mm != nil { - mm.ReadMapsDataInto(ctx, buf) +var _ kernfs.Inode = (*taskInode)(nil) + +func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool) *kernfs.Dentry { + contents := map[string]*kernfs.Dentry{ + //"auxv": newAuxvec(t, msrc), + //"cmdline": newExecArgInode(t, msrc, cmdlineExecArg), + //"comm": newComm(t, msrc), + //"environ": newExecArgInode(t, msrc, environExecArg), + //"exe": newExe(t, msrc), + //"fd": newFdDir(t, msrc), + //"fdinfo": newFdInfoDir(t, msrc), + //"gid_map": newGIDMap(t, msrc), + "io": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, newIO(task, isThreadGroup)), + "maps": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &mapsData{task: task}), + //"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), + //"mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), + //"ns": newNamespaceDir(t, msrc), + "smaps": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &smapsData{task: task}), + "stat": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &taskStatData{t: task, pidns: pidns, tgstats: isThreadGroup}), + "statm": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statmData{t: task}), + "status": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statusData{t: task, pidns: pidns}), + //"uid_map": newUIDMap(t, msrc), } - return nil -} - -// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps. -// -// +stateify savable -type smapsData struct { - mapsCommon -} - -var _ vfs.DynamicBytesSource = (*smapsData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. 
-func (sd *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { - if mm := sd.mm(); mm != nil { - mm.ReadSmapsDataInto(ctx, buf) + if isThreadGroup { + //contents["task"] = p.newSubtasks(t, msrc) } - return nil -} - -// +stateify savable -type taskStatData struct { - t *kernel.Task + //if len(p.cgroupControllers) > 0 { + // contents["cgroup"] = newCGroupInode(t, msrc, p.cgroupControllers) + //} - // If tgstats is true, accumulate fault stats (not implemented) and CPU - // time across all tasks in t's thread group. - tgstats bool + taskInode := &taskInode{task: task} + // Note: credentials are overridden by taskOwnedInode. + taskInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) - // pidns is the PID namespace associated with the proc filesystem that - // includes the file using this statData. - pidns *kernel.PIDNamespace -} - -var _ vfs.DynamicBytesSource = (*taskStatData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t)) - fmt.Fprintf(buf, "(%s) ", s.t.Name()) - fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0]) - ppid := kernel.ThreadID(0) - if parent := s.t.Parent(); parent != nil { - ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) - } - fmt.Fprintf(buf, "%d ", ppid) - fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup())) - fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session())) - fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */) - fmt.Fprintf(buf, "0 " /* flags */) - fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */) - var cputime usage.CPUStats - if s.tgstats { - cputime = s.t.ThreadGroup().CPUStats() - } else { - cputime = s.t.CPUStats() - } - fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) - cputime = s.t.ThreadGroup().JoinedChildCPUStats() - 
fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) - fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness()) - fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count()) + inode := &taskOwnedInode{Inode: taskInode, owner: task} + dentry := &kernfs.Dentry{} + dentry.Init(inode) - // itrealvalue. Since kernel 2.6.17, this field is no longer - // maintained, and is hard coded as 0. - fmt.Fprintf(buf, "0 ") + taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + links := taskInode.OrderedChildren.Populate(dentry, contents) + taskInode.IncLinks(links) - // Start time is relative to boot time, expressed in clock ticks. - fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime()))) + return dentry +} - var vss, rss uint64 - s.t.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - } - }) - fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize) +// Valid implements kernfs.inodeDynamicLookup. This inode remains valid as long +// as the task is still running. When it's dead, another task with the same +// PID could replace it. +func (i *taskInode) Valid(ctx context.Context) bool { + return i.task.ExitState() != kernel.TaskExitDead +} - // rsslim. - fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur) +// Open implements kernfs.Inode. 
+func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags) + return fd.VFSFileDescription(), nil +} - fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */) - fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) - fmt.Fprintf(buf, "0 0 " /* nswap cnswap */) - terminationSignal := linux.Signal(0) - if s.t == s.t.ThreadGroup().Leader() { - terminationSignal = s.t.ThreadGroup().TerminationSignal() +// SetStat implements kernfs.Inode. +func (i *taskInode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { + stat := opts.Stat + if stat.Mask&linux.STATX_MODE != 0 { + return syserror.EPERM } - fmt.Fprintf(buf, "%d ", terminationSignal) - fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */) - fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */) - fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */) - fmt.Fprintf(buf, "0\n" /* exit_code */) - return nil } -// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm. -// -// +stateify savable -type statmData struct { - t *kernel.Task +// taskOwnedInode implements kernfs.Inode and overrides inode owner with task +// effective user and group. +type taskOwnedInode struct { + kernfs.Inode + + // owner is the task that owns this inode. + owner *kernel.Task } -var _ vfs.DynamicBytesSource = (*statmData)(nil) +var _ kernfs.Inode = (*taskOwnedInode)(nil) -// Generate implements vfs.DynamicBytesSource.Generate. 
-func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { - var vss, rss uint64 - s.t.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - } - }) +func newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { + // Note: credentials are overridden by taskOwnedInode. + inode.Init(task.Credentials(), ino, inode, perm) - fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize) - return nil + taskInode := &taskOwnedInode{Inode: inode, owner: task} + d := &kernfs.Dentry{} + d.Init(taskInode) + return d } -// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status. -// -// +stateify savable -type statusData struct { - t *kernel.Task - pidns *kernel.PIDNamespace +// Stat implements kernfs.Inode. +func (i *taskOwnedInode) Stat(fs *vfs.Filesystem) linux.Statx { + stat := i.Inode.Stat(fs) + uid, gid := i.getOwner(linux.FileMode(stat.Mode)) + stat.UID = uint32(uid) + stat.GID = uint32(gid) + return stat } -var _ vfs.DynamicBytesSource = (*statusData)(nil) +// CheckPermissions implements kernfs.Inode. +func (i *taskOwnedInode) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + mode := i.Mode() + uid, gid := i.getOwner(mode) + return vfs.GenericCheckPermissions( + creds, + ats, + mode.FileType() == linux.ModeDirectory, + uint16(mode), + uid, + gid, + ) +} -// Generate implements vfs.DynamicBytesSource.Generate. 
-func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name()) - fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus()) - fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup())) - fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t)) - ppid := kernel.ThreadID(0) - if parent := s.t.Parent(); parent != nil { - ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) +func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) { + // By default, set the task owner as the file owner. + creds := i.owner.Credentials() + uid := creds.EffectiveKUID + gid := creds.EffectiveKGID + + // Linux doesn't apply dumpability adjustments to world readable/executable + // directories so that applications can stat /proc/PID to determine the + // effective UID of a process. See fs/proc/base.c:task_dump_owner. + if mode.FileType() == linux.ModeDirectory && mode.Permissions() == 0555 { + return uid, gid } - fmt.Fprintf(buf, "PPid:\t%d\n", ppid) - tpid := kernel.ThreadID(0) - if tracer := s.t.Tracer(); tracer != nil { - tpid = s.pidns.IDOfTask(tracer) + + // If the task is not dumpable, then root (in the namespace preferred) + // owns the file. 
+ m := getMM(i.owner) + if m == nil { + return auth.RootKUID, auth.RootKGID } - fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid) - var fds int - var vss, rss, data uint64 - s.t.WithMuLocked(func(t *kernel.Task) { - if fdTable := t.FDTable(); fdTable != nil { - fds = fdTable.Size() + if m.Dumpability() != mm.UserDumpable { + uid = auth.RootKUID + if kuid := creds.UserNamespace.MapToKUID(auth.RootUID); kuid.Ok() { + uid = kuid } - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - data = mm.VirtualDataSize() + gid = auth.RootKGID + if kgid := creds.UserNamespace.MapToKGID(auth.RootGID); kgid.Ok() { + gid = kgid } - }) - fmt.Fprintf(buf, "FDSize:\t%d\n", fds) - fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) - fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10) - fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10) - fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count()) - creds := s.t.Credentials() - fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps) - fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps) - fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) - fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps) - fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode()) - // We unconditionally report a single NUMA node. See - // pkg/sentry/syscalls/linux/sys_mempolicy.go. - fmt.Fprintf(buf, "Mems_allowed:\t1\n") - fmt.Fprintf(buf, "Mems_allowed_list:\t0\n") - return nil -} - -// ioUsage is the /proc//io and /proc//task//io data provider. -type ioUsage interface { - // IOUsage returns the io usage data. - IOUsage() *usage.IO -} - -// +stateify savable -type ioData struct { - ioUsage + } + return uid, gid } -var _ vfs.DynamicBytesSource = (*ioData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. 
-func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error { - io := usage.IO{} - io.Accumulate(i.IOUsage()) - - fmt.Fprintf(buf, "char: %d\n", io.CharsRead) - fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten) - fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls) - fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls) - fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead) - fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten) - fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled) - return nil +func newIO(t *kernel.Task, isThreadGroup bool) *ioData { + if isThreadGroup { + return &ioData{ioUsage: t.ThreadGroup()} + } + return &ioData{ioUsage: t} } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go new file mode 100644 index 000000000..93f0e1aa8 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -0,0 +1,272 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +// getMM gets the kernel task's MemoryManager. No additional reference is taken +// on mm here. 
This is safe because MemoryManager.destroy is required to leave the +// MemoryManager in a state where it's still usable as a DynamicBytesSource. +func getMM(task *kernel.Task) *mm.MemoryManager { + var tmm *mm.MemoryManager + task.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + tmm = mm + } + }) + return tmm +} + +// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. +// +// +stateify savable +type mapsData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*mapsData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { + if mm := getMM(d.task); mm != nil { + mm.ReadMapsDataInto(ctx, buf) + } + return nil +} + +// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps. +// +// +stateify savable +type smapsData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*smapsData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { + if mm := getMM(d.task); mm != nil { + mm.ReadSmapsDataInto(ctx, buf) + } + return nil +} + +// +stateify savable +type taskStatData struct { + kernfs.DynamicBytesFile + + t *kernel.Task + + // If tgstats is true, accumulate fault stats (not implemented) and CPU + // time across all tasks in t's thread group. + tgstats bool + + // pidns is the PID namespace associated with the proc filesystem that + // includes the file using this statData. + pidns *kernel.PIDNamespace +} + +var _ dynamicInode = (*taskStatData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. 
+func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t)) + fmt.Fprintf(buf, "(%s) ", s.t.Name()) + fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0]) + ppid := kernel.ThreadID(0) + if parent := s.t.Parent(); parent != nil { + ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) + } + fmt.Fprintf(buf, "%d ", ppid) + fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup())) + fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session())) + fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */) + fmt.Fprintf(buf, "0 " /* flags */) + fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */) + var cputime usage.CPUStats + if s.tgstats { + cputime = s.t.ThreadGroup().CPUStats() + } else { + cputime = s.t.CPUStats() + } + fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) + cputime = s.t.ThreadGroup().JoinedChildCPUStats() + fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) + fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness()) + fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count()) + + // itrealvalue. Since kernel 2.6.17, this field is no longer + // maintained, and is hard coded as 0. + fmt.Fprintf(buf, "0 ") + + // Start time is relative to boot time, expressed in clock ticks. + fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime()))) + + var vss, rss uint64 + s.t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } + }) + fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize) + + // rsslim. 
+ fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur) + + fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */) + fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) + fmt.Fprintf(buf, "0 0 " /* nswap cnswap */) + terminationSignal := linux.Signal(0) + if s.t == s.t.ThreadGroup().Leader() { + terminationSignal = s.t.ThreadGroup().TerminationSignal() + } + fmt.Fprintf(buf, "%d ", terminationSignal) + fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */) + fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */) + fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */) + fmt.Fprintf(buf, "0\n" /* exit_code */) + + return nil +} + +// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm. +// +// +stateify savable +type statmData struct { + kernfs.DynamicBytesFile + + t *kernel.Task +} + +var _ dynamicInode = (*statmData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { + var vss, rss uint64 + s.t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } + }) + + fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize) + return nil +} + +// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status. +// +// +stateify savable +type statusData struct { + kernfs.DynamicBytesFile + + t *kernel.Task + pidns *kernel.PIDNamespace +} + +var _ dynamicInode = (*statusData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. 
+func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name()) + fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus()) + fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup())) + fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t)) + ppid := kernel.ThreadID(0) + if parent := s.t.Parent(); parent != nil { + ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) + } + fmt.Fprintf(buf, "PPid:\t%d\n", ppid) + tpid := kernel.ThreadID(0) + if tracer := s.t.Tracer(); tracer != nil { + tpid = s.pidns.IDOfTask(tracer) + } + fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid) + var fds int + var vss, rss, data uint64 + s.t.WithMuLocked(func(t *kernel.Task) { + if fdTable := t.FDTable(); fdTable != nil { + fds = fdTable.Size() + } + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + data = mm.VirtualDataSize() + } + }) + fmt.Fprintf(buf, "FDSize:\t%d\n", fds) + fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) + fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10) + fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10) + fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count()) + creds := s.t.Credentials() + fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps) + fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps) + fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) + fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps) + fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode()) + // We unconditionally report a single NUMA node. See + // pkg/sentry/syscalls/linux/sys_mempolicy.go. + fmt.Fprintf(buf, "Mems_allowed:\t1\n") + fmt.Fprintf(buf, "Mems_allowed_list:\t0\n") + return nil +} + +// ioUsage is the /proc//io and /proc//task//io data provider. +type ioUsage interface { + // IOUsage returns the io usage data. 
+ IOUsage() *usage.IO +} + +// +stateify savable +type ioData struct { + kernfs.DynamicBytesFile + + ioUsage +} + +var _ dynamicInode = (*ioData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error { + io := usage.IO{} + io.Accumulate(i.IOUsage()) + + fmt.Fprintf(buf, "rchar: %d\n", io.CharsRead) + fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten) + fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls) + fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls) + fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead) + fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten) + fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled) + return nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go new file mode 100644 index 000000000..50b2a832f --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -0,0 +1,162 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "sort" + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +const defaultPermission = 0444 + +// InoGenerator generates unique inode numbers for a given filesystem. 
+type InoGenerator interface { + NextIno() uint64 +} + +// tasksInode represents the inode for /proc/ directory. +// +// +stateify savable +type tasksInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeAttrs + kernfs.OrderedChildren + + inoGen InoGenerator + pidns *kernel.PIDNamespace +} + +var _ kernfs.Inode = (*tasksInode)(nil) + +func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace) (*tasksInode, *kernfs.Dentry) { + root := auth.NewRootCredentials(pidns.UserNamespace()) + contents := map[string]*kernfs.Dentry{ + //"cpuinfo": newCPUInfo(ctx, msrc), + //"filesystems": seqfile.NewSeqFileInode(ctx, &filesystemsData{}, msrc), + "loadavg": newDentry(root, inoGen.NextIno(), defaultPermission, &loadavgData{}), + "meminfo": newDentry(root, inoGen.NextIno(), defaultPermission, &meminfoData{k: k}), + "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), defaultPermission, "self/mounts"), + "self": newSelfSymlink(root, inoGen.NextIno(), defaultPermission, pidns), + "stat": newDentry(root, inoGen.NextIno(), defaultPermission, &statData{k: k}), + "thread-self": newThreadSelfSymlink(root, inoGen.NextIno(), defaultPermission, pidns), + //"uptime": newUptime(ctx, msrc), + //"version": newVersionData(root, inoGen.NextIno(), k), + "version": newDentry(root, inoGen.NextIno(), defaultPermission, &versionData{k: k}), + } + + inode := &tasksInode{ + pidns: pidns, + inoGen: inoGen, + } + inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555) + + dentry := &kernfs.Dentry{} + dentry.Init(inode) + + inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + links := inode.OrderedChildren.Populate(dentry, contents) + inode.IncLinks(links) + + return inode, dentry +} + +// Lookup implements kernfs.inodeDynamicLookup. +func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + // Try to lookup a corresponding task. 
+ tid, err := strconv.ParseUint(name, 10, 31) + if err != nil { + return nil, syserror.ENOENT + } + + task := i.pidns.TaskWithID(kernel.ThreadID(tid)) + if task == nil { + return nil, syserror.ENOENT + } + + taskDentry := newTaskInode(i.inoGen, task, i.pidns, true) + return taskDentry.VFSDentry(), nil +} + +// Valid implements kernfs.inodeDynamicLookup. +func (i *tasksInode) Valid(ctx context.Context) bool { + return true +} + +// IterDirents implements kernfs.inodeDynamicLookup. +// +// TODO(gvisor.dev/issue/1195): Use tgid N offset = TGID_OFFSET + N. +func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + var tids []int + + // Collect all tasks. Per linux we only include it in directory listings if + // it's the leader. But for whatever crazy reason, you can still walk to the + // given node. + for _, tg := range i.pidns.ThreadGroups() { + if leader := tg.Leader(); leader != nil { + tids = append(tids, int(i.pidns.IDOfThreadGroup(tg))) + } + } + + if len(tids) == 0 { + return offset, nil + } + if relOffset >= int64(len(tids)) { + return offset, nil + } + + sort.Ints(tids) + for _, tid := range tids[relOffset:] { + dirent := vfs.Dirent{ + Name: strconv.FormatUint(uint64(tid), 10), + Type: linux.DT_DIR, + Ino: i.inoGen.NextIno(), + NextOff: offset + 1, + } + if !cb.Handle(dirent) { + return offset, nil + } + offset++ + } + return offset, nil +} + +// Open implements kernfs.Inode. +func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags) + return fd.VFSFileDescription(), nil +} + +func (i *tasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx { + stat := i.InodeAttrs.Stat(vsfs) + + // Add dynamic children to link count. 
+ for _, tg := range i.pidns.ThreadGroups() { + if leader := tg.Leader(); leader != nil { + stat.Nlink++ + } + } + + return stat +} diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go new file mode 100644 index 000000000..91f30a798 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -0,0 +1,92 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +type selfSymlink struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeSymlink + + pidns *kernel.PIDNamespace +} + +var _ kernfs.Inode = (*selfSymlink)(nil) + +func newSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry { + inode := &selfSymlink{pidns: pidns} + inode.Init(creds, ino, linux.ModeSymlink|perm) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // Who is reading this link? 
+ return "", syserror.EINVAL + } + tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) + if tgid == 0 { + return "", syserror.ENOENT + } + return strconv.FormatUint(uint64(tgid), 10), nil +} + +type threadSelfSymlink struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeSymlink + + pidns *kernel.PIDNamespace +} + +var _ kernfs.Inode = (*threadSelfSymlink)(nil) + +func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry { + inode := &threadSelfSymlink{pidns: pidns} + inode.Init(creds, ino, linux.ModeSymlink|perm) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // Who is reading this link? + return "", syserror.EINVAL + } + tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) + tid := s.pidns.IDOfTask(t) + if tid == 0 || tgid == 0 { + return "", syserror.ENOENT + } + return fmt.Sprintf("%d/task/%d", tgid, tid), nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go new file mode 100644 index 000000000..48201d75a --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -0,0 +1,412 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package proc + +import ( + "fmt" + "path" + "strconv" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +type testIterDirentsCallback struct { + dirents []vfs.Dirent +} + +func (t *testIterDirentsCallback) Handle(d vfs.Dirent) bool { + t.dirents = append(t.dirents, d) + return true +} + +func checkDots(dirs []vfs.Dirent) ([]vfs.Dirent, error) { + if got := len(dirs); got < 2 { + return dirs, fmt.Errorf("wrong number of dirents, want at least: 2, got: %d: %v", got, dirs) + } + for i, want := range []string{".", ".."} { + if got := dirs[i].Name; got != want { + return dirs, fmt.Errorf("wrong name, want: %s, got: %s", want, got) + } + if got := dirs[i].Type; got != linux.DT_DIR { + return dirs, fmt.Errorf("wrong type, want: %d, got: %d", linux.DT_DIR, got) + } + } + return dirs[2:], nil +} + +func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { + wants := map[string]vfs.Dirent{ + "loadavg": vfs.Dirent{Type: linux.DT_REG}, + "meminfo": vfs.Dirent{Type: linux.DT_REG}, + "mounts": vfs.Dirent{Type: linux.DT_LNK}, + "self": vfs.Dirent{Type: linux.DT_LNK}, + "stat": vfs.Dirent{Type: linux.DT_REG}, + "thread-self": vfs.Dirent{Type: linux.DT_LNK}, + "version": vfs.Dirent{Type: linux.DT_REG}, + } + return checkFiles(gots, wants) +} + +func checkTaskStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { + wants := map[string]vfs.Dirent{ + "io": vfs.Dirent{Type: linux.DT_REG}, + "maps": vfs.Dirent{Type: linux.DT_REG}, + "smaps": vfs.Dirent{Type: linux.DT_REG}, + "stat": vfs.Dirent{Type: linux.DT_REG}, + "statm": vfs.Dirent{Type: linux.DT_REG}, + "status": vfs.Dirent{Type: linux.DT_REG}, + } + return checkFiles(gots, wants) +} + +func checkFiles(gots []vfs.Dirent, wants 
map[string]vfs.Dirent) ([]vfs.Dirent, error) { + // Go over all files, when there is a match, the file is removed from both + // 'gots' and 'wants'. wants is expected to reach 0, as all files must + // be present. Remaining files in 'gots', is returned to caller to decide + // whether this is valid or not. + for i := 0; i < len(gots); i++ { + got := gots[i] + want, ok := wants[got.Name] + if !ok { + continue + } + if want.Type != got.Type { + return gots, fmt.Errorf("wrong file type, want: %v, got: %v: %+v", want.Type, got.Type, got) + } + + delete(wants, got.Name) + gots = append(gots[0:i], gots[i+1:]...) + i-- + } + if len(wants) != 0 { + return gots, fmt.Errorf("not all files were found, missing: %+v", wants) + } + return gots, nil +} + +func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error) { + k, err := boot() + if err != nil { + return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("creating kernel: %v", err) + } + + ctx := k.SupervisorContext() + creds := auth.CredentialsFromContext(ctx) + + vfsObj := vfs.New() + vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &vfs.GetFilesystemOptions{}) + if err != nil { + return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("NewMountNamespace(): %v", err) + } + return ctx, vfsObj, mntns.Root(), nil +} + +func TestTasksEmpty(t *testing.T) { + ctx, vfsObj, root, err := setup() + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + defer root.DecRef() + + fd, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt failed: %v", err) + } + + cb := testIterDirentsCallback{} + if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + cb.dirents, err = checkDots(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + cb.dirents, err = 
checkTasksStaticFiles(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + if len(cb.dirents) != 0 { + t.Errorf("found more files than expected: %+v", cb.dirents) + } +} + +func TestTasks(t *testing.T) { + ctx, vfsObj, root, err := setup() + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + defer root.DecRef() + + k := kernel.KernelFromContext(ctx) + var tasks []*kernel.Task + for i := 0; i < 5; i++ { + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + task, err := createTask(ctx, fmt.Sprintf("name-%d", i), tc) + if err != nil { + t.Fatalf("CreateTask(): %v", err) + } + tasks = append(tasks, task) + } + + fd, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) + } + + cb := testIterDirentsCallback{} + if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + cb.dirents, err = checkDots(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + cb.dirents, err = checkTasksStaticFiles(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + lastPid := 0 + for _, d := range cb.dirents { + pid, err := strconv.Atoi(d.Name) + if err != nil { + t.Fatalf("Invalid process directory %q", d.Name) + } + if lastPid > pid { + t.Errorf("pids not in order: %v", cb.dirents) + } + found := false + for _, t := range tasks { + if k.TaskSet().Root.IDOfTask(t) == kernel.ThreadID(pid) { + found = true + } + } + if !found { + t.Errorf("Additional task ID %d listed: %v", pid, tasks) + } + } + + // Test lookup. 
+ for _, path := range []string{"/1", "/2"} { + fd, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(path)}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err) + } + buf := make([]byte, 1) + bufIOSeq := usermem.BytesIOSequence(buf) + if _, err := fd.Read(ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR { + t.Errorf("wrong error reading directory: %v", err) + } + } + + if _, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/9999")}, + &vfs.OpenOptions{}, + ); err != syserror.ENOENT { + t.Fatalf("wrong error from vfsfs.OpenAt(/9999): %v", err) + } +} + +func TestTask(t *testing.T) { + ctx, vfsObj, root, err := setup() + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + defer root.DecRef() + + k := kernel.KernelFromContext(ctx) + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + _, err = createTask(ctx, "name", tc) + if err != nil { + t.Fatalf("CreateTask(): %v", err) + } + + fd, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/1")}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(/1) failed: %v", err) + } + + cb := testIterDirentsCallback{} + if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + cb.dirents, err = checkDots(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + cb.dirents, err = checkTaskStaticFiles(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + if len(cb.dirents) != 0 { + t.Errorf("found more files than expected: %+v", cb.dirents) + } +} + +func TestProcSelf(t *testing.T) { + ctx, vfsObj, root, err := setup() + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + defer root.DecRef() + + k := 
kernel.KernelFromContext(ctx) + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + task, err := createTask(ctx, "name", tc) + if err != nil { + t.Fatalf("CreateTask(): %v", err) + } + + fd, err := vfsObj.OpenAt( + task, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/self/"), FollowFinalSymlink: true}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(/self/) failed: %v", err) + } + + cb := testIterDirentsCallback{} + if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + cb.dirents, err = checkDots(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + cb.dirents, err = checkTaskStaticFiles(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + if len(cb.dirents) != 0 { + t.Errorf("found more files than expected: %+v", cb.dirents) + } +} + +func iterateDir(ctx context.Context, t *testing.T, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, fd *vfs.FileDescription) { + t.Logf("Iterating: /proc%s", fd.MappedName(ctx)) + + cb := testIterDirentsCallback{} + if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + var err error + cb.dirents, err = checkDots(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + for _, d := range cb.dirents { + childPath := path.Join(fd.MappedName(ctx), d.Name) + if d.Type == linux.DT_LNK { + link, err := vfsObj.ReadlinkAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(childPath)}, + ) + if err != nil { + t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", childPath, err) + } else { + t.Logf("Skipping symlink: /proc%s => %s", childPath, link) + } + continue + } + + t.Logf("Opening: /proc%s", childPath) + child, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: 
fspath.Parse(childPath)}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Errorf("vfsfs.OpenAt(%v) failed: %v", childPath, err) + continue + } + stat, err := child.Stat(ctx, vfs.StatOptions{}) + if err != nil { + t.Errorf("Stat(%v) failed: %v", childPath, err) + } + if got := linux.FileMode(stat.Mode).DirentType(); got != d.Type { + t.Errorf("wrong file mode, stat: %v, dirent: %v", got, d.Type) + } + if d.Type == linux.DT_DIR { + // Found another dir, let's do it again! + iterateDir(ctx, t, vfsObj, root, child) + } + } +} + +// TestTree iterates all directories and stats every file. +func TestTree(t *testing.T) { + uberCtx, vfsObj, root, err := setup() + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + defer root.DecRef() + + k := kernel.KernelFromContext(uberCtx) + var tasks []*kernel.Task + for i := 0; i < 5; i++ { + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + task, err := createTask(uberCtx, fmt.Sprintf("name-%d", i), tc) + if err != nil { + t.Fatalf("CreateTask(): %v", err) + } + tasks = append(tasks, task) + } + + ctx := tasks[0] + fd, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(uberCtx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) + } + iterateDir(ctx, t, vfsObj, root, fd) +} diff --git a/pkg/sentry/fsimpl/proc/version.go b/pkg/sentry/fsimpl/proc/version.go index e1643d4e0..367f2396b 100644 --- a/pkg/sentry/fsimpl/proc/version.go +++ b/pkg/sentry/fsimpl/proc/version.go @@ -19,19 +19,21 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/vfs" ) // versionData implements vfs.DynamicBytesSource for /proc/version. // // +stateify savable type versionData struct { + kernfs.DynamicBytesFile + // k is the owning Kernel. 
k *kernel.Kernel } -var _ vfs.DynamicBytesSource = (*versionData)(nil) +var _ dynamicInode = (*versionData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index bd3fb4c03..8653d2f63 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -762,7 +762,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, mounts.IncRef() } - tg := k.newThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) + tg := k.NewThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) ctx := args.NewContext(k) // Get the root directory from the MountNamespace. @@ -1191,6 +1191,11 @@ func (k *Kernel) GlobalInit() *ThreadGroup { return k.globalInit } +// TestOnly_SetGlobalInit sets the thread group with ID 1 in the root PID namespace. +func (k *Kernel) TestOnly_SetGlobalInit(tg *ThreadGroup) { + k.globalInit = tg +} + // ApplicationCores returns the number of CPUs visible to sandboxed // applications. 
func (k *Kernel) ApplicationCores() uint { diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 3eadfedb4..5f3589493 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -243,7 +243,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { if opts.NewSignalHandlers { sh = sh.Fork() } - tg = t.k.newThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock) + tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy()) } cfg := &TaskConfig{ diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 72568d296..0cded73f6 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -256,20 +256,20 @@ type ThreadGroup struct { tty *TTY } -// newThreadGroup returns a new, empty thread group in PID namespace ns. The +// NewThreadGroup returns a new, empty thread group in PID namespace ns. The // thread group leader will send its parent terminationSignal when it exits. // The new thread group isn't visible to the system until a task has been // created inside of it by a successful call to TaskSet.NewTask. 
-func (k *Kernel) newThreadGroup(mounts *fs.MountNamespace, ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup { +func (k *Kernel) NewThreadGroup(mntns *fs.MountNamespace, pidns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet) *ThreadGroup { tg := &ThreadGroup{ threadGroupNode: threadGroupNode{ - pidns: ns, + pidns: pidns, }, signalHandlers: sh, terminationSignal: terminationSignal, ioUsage: &usage.IO{}, limits: limits, - mounts: mounts, + mounts: mntns, } tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg}) tg.timers = make(map[linux.TimerID]*IntervalTimer) diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 3df49991c..de782e577 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -199,6 +199,17 @@ type DynamicBytesSource interface { Generate(ctx context.Context, buf *bytes.Buffer) error } +// StaticData implements DynamicBytesSource over a static string. +type StaticData struct { + Data string +} + +// Generate implements DynamicBytesSource. +func (s *StaticData) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString(s.Data) + return nil +} + // SetDataSource must be called exactly once on fd before first use. func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) { fd.data = data -- cgit v1.2.3 From 6410387ff9b4f0dbe88325ea0e30776f5f3efd5d Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 6 Jan 2020 09:27:35 -0800 Subject: Cleanup Shm reference handling Currently, shm.Registry.FindByID will return Shm instances without taking an additional reference on them, making it possible for them to disappear. More explicitly handle references. All callers hold a reference for the duration that they hold the instance. 
Registry.shms may transitively hold Shms with no references, so it must TryIncRef to determine if they are still valid. PiperOrigin-RevId: 288314529 --- pkg/sentry/kernel/shm/shm.go | 85 +++++++++++++++++++++++++----------- pkg/sentry/syscalls/linux/sys_shm.go | 7 ++- 2 files changed, 66 insertions(+), 26 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 5bd610f68..19034a21e 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -71,9 +71,20 @@ type Registry struct { mu sync.Mutex `state:"nosave"` // shms maps segment ids to segments. + // + // shms holds all referenced segments, which are removed on the last + // DecRef. Thus, it cannot itself hold a reference on the Shm. + // + // Since removal only occurs after the last (unlocked) DecRef, there + // exists a short window during which a Shm still exists in shms, but is + // unreferenced. Users must use TryIncRef to determine if the Shm is + // still valid. shms map[ID]*Shm // keysToShms maps segment keys to segments. + // + // Shms in keysToShms are guaranteed to be referenced, as they are + // removed by dissociateKey before the last DecRef. keysToShms map[Key]*Shm // Sum of the sizes of all existing segments rounded up to page size, in @@ -95,10 +106,18 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry { } // FindByID looks up a segment given an ID. +// +// FindByID returns a reference on Shm. func (r *Registry) FindByID(id ID) *Shm { r.mu.Lock() defer r.mu.Unlock() - return r.shms[id] + s := r.shms[id] + // Take a reference on s. If TryIncRef fails, s has reached the last + // DecRef, but hasn't quite been removed from r.shms yet. + if s != nil && s.TryIncRef() { + return s + } + return nil } // dissociateKey removes the association between a segment and its key, @@ -119,6 +138,8 @@ func (r *Registry) dissociateKey(s *Shm) { // FindOrCreate looks up or creates a segment in the registry. 
It's functionally // analogous to open(2). +// +// FindOrCreate returns a reference on Shm. func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) { // "A new segment was to be created and size is less than SHMMIN or @@ -166,6 +187,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui return nil, syserror.EEXIST } + shm.IncRef() return shm, nil } @@ -193,7 +215,14 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui // Need to create a new segment. creator := fs.FileOwnerFromContext(ctx) perms := fs.FilePermsFromMode(mode) - return r.newShm(ctx, pid, key, creator, perms, size) + s, err := r.newShm(ctx, pid, key, creator, perms, size) + if err != nil { + return nil, err + } + // The initial reference is held by s itself. Take another to return to + // the caller. + s.IncRef() + return s, nil } // newShm creates a new segment in the registry. @@ -296,22 +325,26 @@ func (r *Registry) remove(s *Shm) { // Shm represents a single shared memory segment. // -// Shm segment are backed directly by an allocation from platform -// memory. Segments are always mapped as a whole, greatly simplifying how -// mappings are tracked. However note that mremap and munmap calls may cause the -// vma for a segment to become fragmented; which requires special care when -// unmapping a segment. See mm/shm.go. +// Shm segments are backed directly by an allocation from platform memory. +// Segments are always mapped as a whole, greatly simplifying how mappings are +// tracked. However note that mremap and munmap calls may cause the vma for a +// segment to become fragmented, which requires special care when unmapping a +// segment. See mm/shm.go. // // Segments persist until they are explicitly marked for destruction via -// shmctl(SHM_RMID). +// MarkDestroyed(). 
// // Shm implements memmap.Mappable and memmap.MappingIdentity. // // +stateify savable type Shm struct { - // AtomicRefCount tracks the number of references to this segment from - // maps. A segment always holds a reference to itself, until it's marked for + // AtomicRefCount tracks the number of references to this segment. + // + // A segment holds a reference to itself until it is marked for // destruction. + // + // In addition to direct users, the MemoryManager will hold references + // via MappingIdentity. refs.AtomicRefCount mfp pgalloc.MemoryFileProvider @@ -484,9 +517,8 @@ type AttachOpts struct { // ConfigureAttach creates an mmap configuration for the segment with the // requested attach options. // -// ConfigureAttach returns with a ref on s on success. The caller should drop -// this once the map is installed. This reference prevents s from being -// destroyed before the returned configuration is used. +// Postconditions: The returned MMapOpts are valid only as long as a reference +// continues to be held on s. func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts AttachOpts) (memmap.MMapOpts, error) { s.mu.Lock() defer s.mu.Unlock() @@ -504,7 +536,6 @@ func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts Attac // in the user namespace that governs its IPC namespace." - man shmat(2) return memmap.MMapOpts{}, syserror.EACCES } - s.IncRef() return memmap.MMapOpts{ Length: s.size, Offset: 0, @@ -549,10 +580,15 @@ func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) { } creds := auth.CredentialsFromContext(ctx) - nattach := uint64(s.ReadRefs()) - // Don't report the self-reference we keep prior to being marked for - // destruction. However, also don't report a count of -1 for segments marked - // as destroyed, with no mappings. + // Use the reference count as a rudimentary count of the number of + // attaches. We exclude: + // + // 1. The reference the caller holds. + // 2. 
The self-reference held by s prior to destruction. + // + // Note that this may still overcount by including transient references + // used in concurrent calls. + nattach := uint64(s.ReadRefs()) - 1 if !s.pendingDestruction { nattach-- } @@ -620,18 +656,17 @@ func (s *Shm) MarkDestroyed() { s.registry.dissociateKey(s) s.mu.Lock() - // Only drop the segment's self-reference once, when destruction is - // requested. Otherwise, repeated calls to shmctl(IPC_RMID) would force a - // segment to be destroyed prematurely, potentially with active maps to the - // segment's address range. Remaining references are dropped when the - // segment is detached or unmaped. + defer s.mu.Unlock() if !s.pendingDestruction { s.pendingDestruction = true - s.mu.Unlock() // Must release s.mu before calling s.DecRef. + // Drop the self-reference so destruction occurs when all + // external references are gone. + // + // N.B. This cannot be the final DecRef, as the caller also + // holds a reference. s.DecRef() return } - s.mu.Unlock() } // checkOwnership verifies whether a segment may be accessed by ctx as an diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go index d57ffb3a1..4a8bc24a2 100644 --- a/pkg/sentry/syscalls/linux/sys_shm.go +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -39,10 +39,13 @@ func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } + defer segment.DecRef() return uintptr(segment.ID), nil, nil } // findSegment retrives a shm segment by the given id. +// +// findSegment returns a reference on Shm. 
func findSegment(t *kernel.Task, id shm.ID) (*shm.Shm, error) { r := t.IPCNamespace().ShmRegistry() segment := r.FindByID(id) @@ -63,6 +66,7 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, syserror.EINVAL } + defer segment.DecRef() opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{ Execute: flag&linux.SHM_EXEC == linux.SHM_EXEC, @@ -72,7 +76,6 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, err } - defer segment.DecRef() addr, err = t.MemoryManager().MMap(t, opts) return uintptr(addr), nil, err } @@ -105,6 +108,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, syserror.EINVAL } + defer segment.DecRef() stat, err := segment.IPCStat(t) if err == nil { @@ -128,6 +132,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, syserror.EINVAL } + defer segment.DecRef() switch cmd { case linux.IPC_SET: -- cgit v1.2.3 From 354a15a234c1270bcb9b902503f61835b2ccd2d0 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 6 Jan 2020 11:41:13 -0800 Subject: Implement rseq(2) PiperOrigin-RevId: 288342928 --- pkg/abi/linux/BUILD | 1 + pkg/abi/linux/rseq.go | 130 ++++++++++ pkg/sentry/arch/arch.go | 6 +- pkg/sentry/arch/arch_amd64.go | 4 +- pkg/sentry/kernel/rseq.go | 383 +++++++++++++++++++++++++---- pkg/sentry/kernel/task.go | 43 +++- pkg/sentry/kernel/task_clone.go | 7 + pkg/sentry/kernel/task_exec.go | 6 +- pkg/sentry/kernel/task_run.go | 16 +- pkg/sentry/kernel/task_start.go | 10 + pkg/sentry/kernel/thread_group.go | 18 +- pkg/sentry/syscalls/linux/BUILD | 1 + pkg/sentry/syscalls/linux/linux64_amd64.go | 2 +- pkg/sentry/syscalls/linux/linux64_arm64.go | 2 +- pkg/sentry/syscalls/linux/sys_rseq.go | 48 ++++ 15 files changed, 598 insertions(+), 79 deletions(-) create mode 100644 pkg/abi/linux/rseq.go create mode 
100644 pkg/sentry/syscalls/linux/sys_rseq.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 9553f164d..716ff22d2 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -41,6 +41,7 @@ go_library( "poll.go", "prctl.go", "ptrace.go", + "rseq.go", "rusage.go", "sched.go", "seccomp.go", diff --git a/pkg/abi/linux/rseq.go b/pkg/abi/linux/rseq.go new file mode 100644 index 000000000..76253ba30 --- /dev/null +++ b/pkg/abi/linux/rseq.go @@ -0,0 +1,130 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// Flags passed to rseq(2). +// +// Defined in include/uapi/linux/rseq.h. +const ( + // RSEQ_FLAG_UNREGISTER unregisters the current thread. + RSEQ_FLAG_UNREGISTER = 1 << 0 +) + +// Critical section flags used in RSeqCriticalSection.Flags and RSeq.Flags. +// +// Defined in include/uapi/linux/rseq.h. +const ( + // RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT inhibits restart on preemption. + RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = 1 << 0 + + // RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL inhibits restart on signal + // delivery. + RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = 1 << 1 + + // RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE inhibits restart on CPU + // migration. + RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = 1 << 2 +) + +// RSeqCriticalSection describes a restartable sequences critical section. It +// is equivalent to struct rseq_cs, defined in include/uapi/linux/rseq.h. 
+// +// In userspace, this structure is always aligned to 32 bytes. +// +// +marshal +type RSeqCriticalSection struct { + // Version is the version of this structure. Version 0 is defined here. + Version uint32 + + // Flags are the critical section flags, defined above. + Flags uint32 + + // Start is the start address of the critical section. + Start uint64 + + // PostCommitOffset is the offset from Start of the first instruction + // outside of the critical section. + PostCommitOffset uint64 + + // Abort is the abort address. It must be outside the critical section, + // and the 4 bytes prior must match the abort signature. + Abort uint64 +} + +const ( + // SizeOfRSeqCriticalSection is the size of RSeqCriticalSection. + SizeOfRSeqCriticalSection = 32 + + // SizeOfRSeqSignature is the size of the signature immediately + // preceding RSeqCriticalSection.Abort. + SizeOfRSeqSignature = 4 +) + +// Special values for RSeq.CPUID, defined in include/uapi/linux/rseq.h. +const ( + // RSEQ_CPU_ID_UNINITIALIZED indicates that this thread has not + // performed rseq initialization. + RSEQ_CPU_ID_UNINITIALIZED = ^uint32(0) // -1 + + // RSEQ_CPU_ID_REGISTRATION_FAILED indicates that rseq initialization + // failed. + RSEQ_CPU_ID_REGISTRATION_FAILED = ^uint32(1) // -2 +) + +// RSeq is the thread-local restartable sequences config/status. It +// is equivalent to struct rseq, defined in include/uapi/linux/rseq.h. +// +// In userspace, this structure is always aligned to 32 bytes. +type RSeq struct { + // CPUIDStart contains the current CPU ID if rseq is initialized. + // + // This field should only be read by the thread which registered this + // structure, and must be read atomically. + CPUIDStart uint32 + + // CPUID contains the current CPU ID or one of the CPU ID special + // values defined above. + // + // This field should only be read by the thread which registered this + // structure, and must be read atomically. 
+ CPUID uint32 + + // RSeqCriticalSection is a pointer to the current RSeqCriticalSection + // block, or NULL. It is reset to NULL by the kernel on restart or + // non-restarting preempt/signal. + // + // This field should only be written by the thread which registered + // this structure, and must be written atomically. + RSeqCriticalSection uint64 + + // Flags are the critical section flags that apply to all critical + // sections on this thread, defined above. + Flags uint32 +} + +const ( + // SizeOfRSeq is the size of RSeq. + // + // Note that RSeq is naively 24 bytes. However, it has 32-byte + // alignment, which in C increases sizeof to 32. That is the size that + // the Linux kernel uses. + SizeOfRSeq = 32 + + // AlignOfRSeq is the standard alignment of RSeq. + AlignOfRSeq = 32 + + // OffsetOfRSeqCriticalSection is the offset of RSeqCriticalSection in RSeq. + OffsetOfRSeqCriticalSection = 8 +) diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index 498ca4669..81ec98a77 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -125,9 +125,9 @@ type Context interface { // SetTLS sets the current TLS pointer. Returns false if value is invalid. SetTLS(value uintptr) bool - // SetRSEQInterruptedIP sets the register that contains the old IP when a - // restartable sequence is interrupted. - SetRSEQInterruptedIP(value uintptr) + // SetOldRSeqInterruptedIP sets the register that contains the old IP + // when an "old rseq" restartable sequence is interrupted. + SetOldRSeqInterruptedIP(value uintptr) // StateData returns a pointer to underlying architecture state. StateData() *State diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 67daa6c24..2aa08b1a9 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -174,8 +174,8 @@ func (c *context64) SetTLS(value uintptr) bool { return true } -// SetRSEQInterruptedIP implements Context.SetRSEQInterruptedIP. 
-func (c *context64) SetRSEQInterruptedIP(value uintptr) { +// SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP. +func (c *context64) SetOldRSeqInterruptedIP(value uintptr) { c.Regs.R10 = uint64(value) } diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 24ea002ba..b14429854 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -15,17 +15,29 @@ package kernel import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" ) -// Restartable sequences, as described in https://lwn.net/Articles/650333/. +// Restartable sequences. +// +// We support two different APIs for restartable sequences. +// +// 1. The upstream interface added in v4.18. +// 2. The interface described in https://lwn.net/Articles/650333/. +// +// Throughout this file and other parts of the kernel, the latter is referred +// to as "old rseq". This interface was never merged upstream, but is supported +// for a limited set of applications that use it regardless. -// RSEQCriticalRegion describes a restartable sequence critical region. +// OldRSeqCriticalRegion describes an old rseq critical region. // // +stateify savable -type RSEQCriticalRegion struct { +type OldRSeqCriticalRegion struct { // When a task in this thread group has its CPU preempted (as defined by // platform.ErrContextCPUPreempted) or has a signal delivered to an // application handler while its instruction pointer is in CriticalSection, @@ -35,86 +47,359 @@ type RSEQCriticalRegion struct { Restart usermem.Addr } -// RSEQAvailable returns true if t supports restartable sequences. -func (t *Task) RSEQAvailable() bool { +// RSeqAvailable returns true if t supports (old and new) restartable sequences. 
+func (t *Task) RSeqAvailable() bool { return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption() } -// RSEQCriticalRegion returns a copy of t's thread group's current restartable -// sequence. -func (t *Task) RSEQCriticalRegion() RSEQCriticalRegion { - return *t.tg.rscr.Load().(*RSEQCriticalRegion) +// SetRSeq registers addr as this thread's rseq structure. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) SetRSeq(addr usermem.Addr, length, signature uint32) error { + if t.rseqAddr != 0 { + if t.rseqAddr != addr { + return syserror.EINVAL + } + if t.rseqSignature != signature { + return syserror.EINVAL + } + return syserror.EBUSY + } + + // rseq must be aligned and correctly sized. + if addr&(linux.AlignOfRSeq-1) != 0 { + return syserror.EINVAL + } + if length != linux.SizeOfRSeq { + return syserror.EINVAL + } + if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok { + return syserror.EFAULT + } + + t.rseqAddr = addr + t.rseqSignature = signature + + // Initialize the CPUID. + // + // Linux implicitly does this on return from userspace, where failure + // would cause SIGSEGV. + if err := t.rseqUpdateCPU(); err != nil { + t.rseqAddr = 0 + t.rseqSignature = 0 + + t.Debugf("Failed to copy CPU to %#x for rseq: %v", addr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return syserror.EFAULT + } + + return nil } -// SetRSEQCriticalRegion replaces t's thread group's restartable sequence. +// ClearRSeq unregisters addr as this thread's rseq structure. // -// Preconditions: t.RSEQAvailable() == true. -func (t *Task) SetRSEQCriticalRegion(rscr RSEQCriticalRegion) error { +// Preconditions: The caller must be running on the task goroutine. 
+func (t *Task) ClearRSeq(addr usermem.Addr, length, signature uint32) error { + if t.rseqAddr == 0 { + return syserror.EINVAL + } + if t.rseqAddr != addr { + return syserror.EINVAL + } + if length != linux.SizeOfRSeq { + return syserror.EINVAL + } + if t.rseqSignature != signature { + return syserror.EPERM + } + + if err := t.rseqClearCPU(); err != nil { + return err + } + + t.rseqAddr = 0 + t.rseqSignature = 0 + + if t.oldRSeqCPUAddr == 0 { + // rseqCPU no longer needed. + t.rseqCPU = -1 + } + + return nil +} + +// OldRSeqCriticalRegion returns a copy of t's thread group's current +// old restartable sequence. +func (t *Task) OldRSeqCriticalRegion() OldRSeqCriticalRegion { + return *t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) +} + +// SetOldRSeqCriticalRegion replaces t's thread group's old restartable +// sequence. +// +// Preconditions: t.RSeqAvailable() == true. +func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error { // These checks are somewhat more lenient than in Linux, which (bizarrely) - // requires rscr.CriticalSection to be non-empty and rscr.Restart to be - // outside of rscr.CriticalSection, even if rscr.CriticalSection.Start == 0 + // requires r.CriticalSection to be non-empty and r.Restart to be + // outside of r.CriticalSection, even if r.CriticalSection.Start == 0 // (which disables the critical region). 
- if rscr.CriticalSection.Start == 0 { - rscr.CriticalSection.End = 0 - rscr.Restart = 0 - t.tg.rscr.Store(&rscr) + if r.CriticalSection.Start == 0 { + r.CriticalSection.End = 0 + r.Restart = 0 + t.tg.oldRSeqCritical.Store(&r) return nil } - if rscr.CriticalSection.Start >= rscr.CriticalSection.End { + if r.CriticalSection.Start >= r.CriticalSection.End { return syserror.EINVAL } - if rscr.CriticalSection.Contains(rscr.Restart) { + if r.CriticalSection.Contains(r.Restart) { return syserror.EINVAL } - // TODO(jamieliu): check that rscr.CriticalSection and rscr.Restart are in - // the application address range, for consistency with Linux - t.tg.rscr.Store(&rscr) + // TODO(jamieliu): check that r.CriticalSection and r.Restart are in + // the application address range, for consistency with Linux. + t.tg.oldRSeqCritical.Store(&r) return nil } -// RSEQCPUAddr returns the address that RSEQ will keep updated with t's CPU -// number. +// OldRSeqCPUAddr returns the address that old rseq will keep updated with t's +// CPU number. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) RSEQCPUAddr() usermem.Addr { - return t.rseqCPUAddr +func (t *Task) OldRSeqCPUAddr() usermem.Addr { + return t.oldRSeqCPUAddr } -// SetRSEQCPUAddr replaces the address that RSEQ will keep updated with t's CPU -// number. +// SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with +// t's CPU number. // -// Preconditions: t.RSEQAvailable() == true. The caller must be running on the +// Preconditions: t.RSeqAvailable() == true. The caller must be running on the // task goroutine. t's AddressSpace must be active. 
-func (t *Task) SetRSEQCPUAddr(addr usermem.Addr) error { - t.rseqCPUAddr = addr - if addr != 0 { - t.rseqCPU = int32(hostcpu.GetCPU()) - if err := t.rseqCopyOutCPU(); err != nil { - t.rseqCPUAddr = 0 - t.rseqCPU = -1 - return syserror.EINVAL // yes, EINVAL, not err or EFAULT - } - } else { - t.rseqCPU = -1 +func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error { + t.oldRSeqCPUAddr = addr + + // Check that addr is writable. + // + // N.B. rseqUpdateCPU may fail on a bad t.rseqAddr as well. That's + // unfortunate, but unlikely in a correct program. + if err := t.rseqUpdateCPU(); err != nil { + t.oldRSeqCPUAddr = 0 + return syserror.EINVAL // yes, EINVAL, not err or EFAULT } return nil } // Preconditions: The caller must be running on the task goroutine. t's // AddressSpace must be active. -func (t *Task) rseqCopyOutCPU() error { +func (t *Task) rseqUpdateCPU() error { + if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 { + t.rseqCPU = -1 + return nil + } + + t.rseqCPU = int32(hostcpu.GetCPU()) + + // Update both CPUs, even if one fails. + rerr := t.rseqCopyOutCPU() + oerr := t.oldRSeqCopyOutCPU() + + if rerr != nil { + return rerr + } + return oerr +} + +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) oldRSeqCopyOutCPU() error { + if t.oldRSeqCPUAddr == 0 { + return nil + } + buf := t.CopyScratchBuffer(4) usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) - _, err := t.CopyOutBytes(t.rseqCPUAddr, buf) + _, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf) + return err +} + +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) rseqCopyOutCPU() error { + if t.rseqAddr == 0 { + return nil + } + + buf := t.CopyScratchBuffer(8) + // CPUIDStart and CPUID are the first two fields in linux.RSeq. + usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) // CPUIDStart + usermem.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID + // N.B. 
This write is not atomic, but since this occurs on the task + // goroutine then as long as userspace uses a single-instruction read + // it can't see an invalid value. + _, err := t.CopyOutBytes(t.rseqAddr, buf) + return err +} + +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) rseqClearCPU() error { + buf := t.CopyScratchBuffer(8) + // CPUIDStart and CPUID are the first two fields in linux.RSeq. + usermem.ByteOrder.PutUint32(buf, 0) // CPUIDStart + usermem.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID + // N.B. This write is not atomic, but since this occurs on the task + // goroutine then as long as userspace uses a single-instruction read + // it can't see an invalid value. + _, err := t.CopyOutBytes(t.rseqAddr, buf) return err } +// rseqAddrInterrupt checks if IP is in a critical section, and aborts if so. +// +// This is a bit complex since both the RSeq and RSeqCriticalSection structs +// are stored in userspace. So we must: +// +// 1. Copy in the address of RSeqCriticalSection from RSeq. +// 2. Copy in RSeqCriticalSection itself. +// 3. Validate critical section struct version, address range, abort address. +// 4. Validate the abort signature (4 bytes preceding abort IP match expected +// signature). +// 5. Clear address of RSeqCriticalSection from RSeq. +// 6. Finally, conditionally abort. +// +// See kernel/rseq.c:rseq_ip_fixup for reference. +// +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) rseqAddrInterrupt() { + if t.rseqAddr == 0 { + return + } + + critAddrAddr, ok := t.rseqAddr.AddLength(linux.OffsetOfRSeqCriticalSection) + if !ok { + // SetRSeq should validate this. + panic(fmt.Sprintf("t.rseqAddr (%#x) not large enough", t.rseqAddr)) + } + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. 
+ t.Debugf("Only 64-bit rseq supported.") + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + buf := t.CopyScratchBuffer(8) + if _, err := t.CopyInBytes(critAddrAddr, buf); err != nil { + t.Debugf("Failed to copy critical section address from %#x for rseq: %v", critAddrAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + critAddr := usermem.Addr(usermem.ByteOrder.Uint64(buf)) + if critAddr == 0 { + return + } + + buf = t.CopyScratchBuffer(linux.SizeOfRSeqCriticalSection) + if _, err := t.CopyInBytes(critAddr, buf); err != nil { + t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + // Manually marshal RSeqCriticalSection as this is in the hot path when + // rseq is enabled. It must be as fast as possible. + // + // TODO(b/130243041): Replace with go_marshal. 
+ cs := linux.RSeqCriticalSection{ + Version: usermem.ByteOrder.Uint32(buf[0:4]), + Flags: usermem.ByteOrder.Uint32(buf[4:8]), + Start: usermem.ByteOrder.Uint64(buf[8:16]), + PostCommitOffset: usermem.ByteOrder.Uint64(buf[16:24]), + Abort: usermem.ByteOrder.Uint64(buf[24:32]), + } + + if cs.Version != 0 { + t.Debugf("Unknown version in %+v", cs) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + start := usermem.Addr(cs.Start) + critRange, ok := start.ToRange(cs.PostCommitOffset) + if !ok { + t.Debugf("Invalid start and offset in %+v", cs) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + abort := usermem.Addr(cs.Abort) + if critRange.Contains(abort) { + t.Debugf("Abort in critical section in %+v", cs) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + // Verify signature. + sigAddr := abort - linux.SizeOfRSeqSignature + + buf = t.CopyScratchBuffer(linux.SizeOfRSeqSignature) + if _, err := t.CopyInBytes(sigAddr, buf); err != nil { + t.Debugf("Failed to copy critical section signature from %#x for rseq: %v", sigAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + sig := usermem.ByteOrder.Uint32(buf) + if sig != t.rseqSignature { + t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + // Clear the critical section address. + // + // NOTE(b/143949567): We don't support any rseq flags, so we always + // restart if we are in the critical section, and thus *always* clear + // critAddrAddr. 
+ if _, err := t.MemoryManager().ZeroOut(t, critAddrAddr, int64(t.Arch().Width()), usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + t.Debugf("Failed to clear critical section address from %#x for rseq: %v", critAddrAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + // Finally we can actually decide whether or not to restart. + if !critRange.Contains(usermem.Addr(t.Arch().IP())) { + return + } + + t.Arch().SetIP(uintptr(cs.Abort)) +} + // Preconditions: The caller must be running on the task goroutine. -func (t *Task) rseqInterrupt() { - rscr := t.tg.rscr.Load().(*RSEQCriticalRegion) - if ip := t.Arch().IP(); rscr.CriticalSection.Contains(usermem.Addr(ip)) { - t.Debugf("Interrupted RSEQ critical section at %#x; restarting at %#x", ip, rscr.Restart) - t.Arch().SetIP(uintptr(rscr.Restart)) - t.Arch().SetRSEQInterruptedIP(ip) +func (t *Task) oldRSeqInterrupt() { + r := t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) + if ip := t.Arch().IP(); r.CriticalSection.Contains(usermem.Addr(ip)) { + t.Debugf("Interrupted rseq critical section at %#x; restarting at %#x", ip, r.Restart) + t.Arch().SetIP(uintptr(r.Restart)) + t.Arch().SetOldRSeqInterruptedIP(ip) } } + +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) rseqInterrupt() { + t.rseqAddrInterrupt() + t.oldRSeqInterrupt() +} diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index ab0c6c4aa..d25a7903b 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -489,18 +489,43 @@ type Task struct { // netns is protected by mu. netns is owned by the task goroutine. netns bool - // If rseqPreempted is true, before the next call to p.Switch(), interrupt - // RSEQ critical regions as defined by tg.rseq and write the task - // goroutine's CPU number to rseqCPUAddr. rseqCPU is the last CPU number - // written to rseqCPUAddr. 
+ // If rseqPreempted is true, before the next call to p.Switch(), + // interrupt rseq critical regions as defined by rseqAddr and + // tg.oldRSeqCritical and write the task goroutine's CPU number to + // rseqAddr/oldRSeqCPUAddr. // - // If rseqCPUAddr is 0, rseqCPU is -1. + // We support two ABIs for restartable sequences: // - // rseqCPUAddr, rseqCPU, and rseqPreempted are exclusive to the task - // goroutine. + // 1. The upstream interface added in v4.18, + // 2. An "old" interface never merged upstream. In the implementation, + // this is referred to as "old rseq". + // + // rseqPreempted is exclusive to the task goroutine. rseqPreempted bool `state:"nosave"` - rseqCPUAddr usermem.Addr - rseqCPU int32 + + // rseqCPU is the last CPU number written to rseqAddr/oldRSeqCPUAddr. + // + // If rseq is unused, rseqCPU is -1 for convenient use in + // platform.Context.Switch. + // + // rseqCPU is exclusive to the task goroutine. + rseqCPU int32 + + // oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable. + // + // oldRSeqCPUAddr is exclusive to the task goroutine. + oldRSeqCPUAddr usermem.Addr + + // rseqAddr is a pointer to the userspace linux.RSeq structure. + // + // rseqAddr is exclusive to the task goroutine. + rseqAddr usermem.Addr + + // rseqSignature is the signature that the rseq abort IP must be signed + // with. + // + // rseqSignature is exclusive to the task goroutine. 
+ rseqSignature uint32 // copyScratchBuffer is a buffer available to CopyIn/CopyOut // implementations that require an intermediate buffer to copy data diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 5f3589493..247bd4aba 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -236,7 +236,10 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } else if opts.NewPIDNamespace { pidns = pidns.NewChild(userns) } + tg := t.tg + rseqAddr := usermem.Addr(0) + rseqSignature := uint32(0) if opts.NewThreadGroup { tg.mounts.IncRef() sh := t.tg.signalHandlers @@ -244,6 +247,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { sh = sh.Fork() } tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy()) + rseqAddr = t.rseqAddr + rseqSignature = t.rseqSignature } cfg := &TaskConfig{ @@ -260,6 +265,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { UTSNamespace: utsns, IPCNamespace: ipcns, AbstractSocketNamespace: t.abstractSockets, + RSeqAddr: rseqAddr, + RSeqSignature: rseqSignature, ContainerID: t.ContainerID(), } if opts.NewThreadGroup { diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 90a6190f1..fa6528386 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -190,9 +190,11 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.updateRSSLocked() // Restartable sequence state is discarded. t.rseqPreempted = false - t.rseqCPUAddr = 0 t.rseqCPU = -1 - t.tg.rscr.Store(&RSEQCriticalRegion{}) + t.rseqAddr = 0 + t.rseqSignature = 0 + t.oldRSeqCPUAddr = 0 + t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{}) t.tg.pidns.owner.mu.Unlock() // Remove FDs with the CloseOnExec flag set. 
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index d97f8c189..6357273d3 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -169,12 +169,22 @@ func (*runApp) execute(t *Task) taskRunState { // Apply restartable sequences. if t.rseqPreempted { t.rseqPreempted = false - if t.rseqCPUAddr != 0 { + if t.rseqAddr != 0 || t.oldRSeqCPUAddr != 0 { + // Linux writes the CPU on every preemption. We only do + // so if it changed. Thus we may delay delivery of + // SIGSEGV if rseqAddr/oldRSeqCPUAddr is invalid. cpu := int32(hostcpu.GetCPU()) if t.rseqCPU != cpu { t.rseqCPU = cpu if err := t.rseqCopyOutCPU(); err != nil { - t.Warningf("Failed to copy CPU to %#x for RSEQ: %v", t.rseqCPUAddr, err) + t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err) + t.forceSignal(linux.SIGSEGV, false) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + // Re-enter the task run loop for signal delivery. + return (*runApp)(nil) + } + if err := t.oldRSeqCopyOutCPU(); err != nil { + t.Debugf("Failed to copy CPU to %#x for old rseq: %v", t.oldRSeqCPUAddr, err) t.forceSignal(linux.SIGSEGV, false) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) // Re-enter the task run loop for signal delivery. @@ -320,7 +330,7 @@ func (*runApp) execute(t *Task) taskRunState { return (*runApp)(nil) case platform.ErrContextCPUPreempted: - // Ensure that RSEQ critical sections are interrupted and per-thread + // Ensure that rseq critical sections are interrupted and per-thread // CPU values are updated before the next platform.Context.Switch(). 
t.rseqPreempted = true return (*runApp)(nil) diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 3522a4ae5..58af16ee2 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" ) @@ -79,6 +80,13 @@ type TaskConfig struct { // AbstractSocketNamespace is the AbstractSocketNamespace of the new task. AbstractSocketNamespace *AbstractSocketNamespace + // RSeqAddr is a pointer to the userspace linux.RSeq structure. + RSeqAddr usermem.Addr + + // RSeqSignature is the signature that the rseq abort IP must be signed + // with. + RSeqSignature uint32 + // ContainerID is the container the new task belongs to. ContainerID string } @@ -126,6 +134,8 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { ipcns: cfg.IPCNamespace, abstractSockets: cfg.AbstractSocketNamespace, rseqCPU: -1, + rseqAddr: cfg.RSeqAddr, + rseqSignature: cfg.RSeqSignature, futexWaiter: futex.NewWaiter(), containerID: cfg.ContainerID, } diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 0cded73f6..c0197a563 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -238,8 +238,8 @@ type ThreadGroup struct { // execed is protected by the TaskSet mutex. execed bool - // rscr is the thread group's RSEQ critical region. - rscr atomic.Value `state:".(*RSEQCriticalRegion)"` + // oldRSeqCritical is the thread group's old rseq critical region. + oldRSeqCritical atomic.Value `state:".(*OldRSeqCriticalRegion)"` // mounts is the thread group's mount namespace.
This does not really // correspond to a "mount namespace" in Linux, but is more like a @@ -273,18 +273,18 @@ func (k *Kernel) NewThreadGroup(mntns *fs.MountNamespace, pidns *PIDNamespace, s } tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg}) tg.timers = make(map[linux.TimerID]*IntervalTimer) - tg.rscr.Store(&RSEQCriticalRegion{}) + tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{}) return tg } -// saveRscr is invoked by stateify. -func (tg *ThreadGroup) saveRscr() *RSEQCriticalRegion { - return tg.rscr.Load().(*RSEQCriticalRegion) +// saveOldRSeqCritical is invoked by stateify. +func (tg *ThreadGroup) saveOldRSeqCritical() *OldRSeqCriticalRegion { + return tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) } -// loadRscr is invoked by stateify. -func (tg *ThreadGroup) loadRscr(rscr *RSEQCriticalRegion) { - tg.rscr.Store(rscr) +// loadOldRSeqCritical is invoked by stateify. +func (tg *ThreadGroup) loadOldRSeqCritical(r *OldRSeqCriticalRegion) { + tg.oldRSeqCritical.Store(r) } // SignalHandlers returns the signal handlers used by tg. 
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 6766ba587..a76975cee 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -30,6 +30,7 @@ go_library( "sys_random.go", "sys_read.go", "sys_rlimit.go", + "sys_rseq.go", "sys_rusage.go", "sys_sched.go", "sys_seccomp.go", diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go index 272ae9991..479c5f6ff 100644 --- a/pkg/sentry/syscalls/linux/linux64_amd64.go +++ b/pkg/sentry/syscalls/linux/linux64_amd64.go @@ -377,7 +377,7 @@ var AMD64 = &kernel.SyscallTable{ 331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), 332: syscalls.Supported("statx", Statx), 333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), - 334: syscalls.ErrorWithEvent("rseq", syserror.ENOSYS, "", nil), + 334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), // Linux skips ahead to syscall 424 to sync numbers between arches. 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go index 3b584eed9..d3f61f5e8 100644 --- a/pkg/sentry/syscalls/linux/linux64_arm64.go +++ b/pkg/sentry/syscalls/linux/linux64_arm64.go @@ -307,7 +307,7 @@ var ARM64 = &kernel.SyscallTable{ 290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), 291: syscalls.Supported("statx", Statx), 292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), - 293: syscalls.ErrorWithEvent("rseq", syserror.ENOSYS, "", nil), + 293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), // Linux skips ahead to syscall 424 to sync numbers between arches. 
424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), diff --git a/pkg/sentry/syscalls/linux/sys_rseq.go b/pkg/sentry/syscalls/linux/sys_rseq.go new file mode 100644 index 000000000..90db10ea6 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_rseq.go @@ -0,0 +1,48 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// RSeq implements syscall rseq(2). +func RSeq(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].Uint() + flags := args[2].Int() + signature := args[3].Uint() + + if !t.RSeqAvailable() { + // Event for applications that want rseq on a configuration + // that doesn't support them. + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.ENOSYS + } + + switch flags { + case 0: + // Register. + return 0, nil, t.SetRSeq(addr, length, signature) + case linux.RSEQ_FLAG_UNREGISTER: + return 0, nil, t.ClearRSeq(addr, length, signature) + default: + // Unknown flag. + return 0, nil, syserror.EINVAL + } +} -- cgit v1.2.3 From 27500d529f7fb87eef8812278fd1bbca67bcba72 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Thu, 9 Jan 2020 22:00:42 -0800 Subject: New sync package. * Rename syncutil to sync. * Add aliases to sync types. 
* Replace existing usage of standard library sync package. This will make it easier to swap out synchronization primitives. For example, this will allow us to use primitives from github.com/sasha-s/go-deadlock to check for lock ordering violations. Updates #1472 PiperOrigin-RevId: 289033387 --- pkg/amutex/BUILD | 1 + pkg/amutex/amutex_test.go | 3 +- pkg/atomicbitops/BUILD | 1 + pkg/atomicbitops/atomic_bitops_test.go | 3 +- pkg/compressio/BUILD | 5 +- pkg/compressio/compressio.go | 2 +- pkg/control/server/BUILD | 1 + pkg/control/server/server.go | 2 +- pkg/eventchannel/BUILD | 2 + pkg/eventchannel/event.go | 2 +- pkg/eventchannel/event_test.go | 2 +- pkg/fdchannel/BUILD | 1 + pkg/fdchannel/fdchannel_test.go | 3 +- pkg/fdnotifier/BUILD | 1 + pkg/fdnotifier/fdnotifier.go | 2 +- pkg/flipcall/BUILD | 3 +- pkg/flipcall/flipcall_example_test.go | 3 +- pkg/flipcall/flipcall_test.go | 3 +- pkg/flipcall/flipcall_unsafe.go | 10 +- pkg/gate/BUILD | 1 + pkg/gate/gate_test.go | 2 +- pkg/linewriter/BUILD | 1 + pkg/linewriter/linewriter.go | 3 +- pkg/log/BUILD | 5 +- pkg/log/log.go | 2 +- pkg/metric/BUILD | 1 + pkg/metric/metric.go | 2 +- pkg/p9/BUILD | 1 + pkg/p9/client.go | 2 +- pkg/p9/p9test/BUILD | 2 + pkg/p9/p9test/client_test.go | 2 +- pkg/p9/p9test/p9test.go | 2 +- pkg/p9/path_tree.go | 3 +- pkg/p9/pool.go | 2 +- pkg/p9/server.go | 2 +- pkg/p9/transport.go | 2 +- pkg/procid/BUILD | 2 + pkg/procid/procid_test.go | 3 +- pkg/rand/BUILD | 5 +- pkg/rand/rand_linux.go | 2 +- pkg/refs/BUILD | 2 + pkg/refs/refcounter.go | 2 +- pkg/refs/refcounter_test.go | 3 +- pkg/sentry/arch/BUILD | 1 + pkg/sentry/arch/arch_x86.go | 2 +- pkg/sentry/control/BUILD | 1 + pkg/sentry/control/pprof.go | 2 +- pkg/sentry/device/BUILD | 5 +- pkg/sentry/device/device.go | 2 +- pkg/sentry/fs/BUILD | 3 +- pkg/sentry/fs/copy_up.go | 2 +- pkg/sentry/fs/copy_up_test.go | 2 +- pkg/sentry/fs/dirent.go | 2 +- pkg/sentry/fs/dirent_cache.go | 3 +- pkg/sentry/fs/dirent_cache_limiter.go | 3 +- 
pkg/sentry/fs/fdpipe/BUILD | 1 + pkg/sentry/fs/fdpipe/pipe.go | 2 +- pkg/sentry/fs/fdpipe/pipe_state.go | 2 +- pkg/sentry/fs/file.go | 2 +- pkg/sentry/fs/file_overlay.go | 2 +- pkg/sentry/fs/filesystems.go | 2 +- pkg/sentry/fs/fs.go | 3 +- pkg/sentry/fs/fsutil/BUILD | 1 + pkg/sentry/fs/fsutil/host_file_mapper.go | 2 +- pkg/sentry/fs/fsutil/host_mappable.go | 2 +- pkg/sentry/fs/fsutil/inode.go | 3 +- pkg/sentry/fs/fsutil/inode_cached.go | 2 +- pkg/sentry/fs/gofer/BUILD | 1 + pkg/sentry/fs/gofer/inode.go | 2 +- pkg/sentry/fs/gofer/session.go | 2 +- pkg/sentry/fs/host/BUILD | 1 + pkg/sentry/fs/host/inode.go | 2 +- pkg/sentry/fs/host/socket.go | 2 +- pkg/sentry/fs/host/tty.go | 3 +- pkg/sentry/fs/inode.go | 3 +- pkg/sentry/fs/inode_inotify.go | 3 +- pkg/sentry/fs/inotify.go | 2 +- pkg/sentry/fs/inotify_watch.go | 2 +- pkg/sentry/fs/lock/BUILD | 1 + pkg/sentry/fs/lock/lock.go | 2 +- pkg/sentry/fs/mounts.go | 2 +- pkg/sentry/fs/overlay.go | 5 +- pkg/sentry/fs/proc/BUILD | 1 + pkg/sentry/fs/proc/seqfile/BUILD | 1 + pkg/sentry/fs/proc/seqfile/seqfile.go | 2 +- pkg/sentry/fs/proc/sys_net.go | 2 +- pkg/sentry/fs/ramfs/BUILD | 1 + pkg/sentry/fs/ramfs/dir.go | 2 +- pkg/sentry/fs/restore.go | 2 +- pkg/sentry/fs/tmpfs/BUILD | 1 + pkg/sentry/fs/tmpfs/inode_file.go | 2 +- pkg/sentry/fs/tty/BUILD | 1 + pkg/sentry/fs/tty/dir.go | 2 +- pkg/sentry/fs/tty/line_discipline.go | 2 +- pkg/sentry/fs/tty/queue.go | 3 +- pkg/sentry/fsimpl/ext/BUILD | 1 + pkg/sentry/fsimpl/ext/directory.go | 3 +- pkg/sentry/fsimpl/ext/filesystem.go | 2 +- pkg/sentry/fsimpl/ext/regular_file.go | 2 +- pkg/sentry/fsimpl/kernfs/BUILD | 2 + pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 2 +- pkg/sentry/fsimpl/tmpfs/BUILD | 1 + pkg/sentry/fsimpl/tmpfs/regular_file.go | 2 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 2 +- pkg/sentry/kernel/BUILD | 5 +- pkg/sentry/kernel/abstract_socket_namespace.go | 2 +- pkg/sentry/kernel/auth/BUILD | 3 
+- pkg/sentry/kernel/auth/user_namespace.go | 2 +- pkg/sentry/kernel/epoll/BUILD | 1 + pkg/sentry/kernel/epoll/epoll.go | 2 +- pkg/sentry/kernel/eventfd/BUILD | 1 + pkg/sentry/kernel/eventfd/eventfd.go | 2 +- pkg/sentry/kernel/fasync/BUILD | 1 + pkg/sentry/kernel/fasync/fasync.go | 3 +- pkg/sentry/kernel/fd_table.go | 2 +- pkg/sentry/kernel/fd_table_test.go | 2 +- pkg/sentry/kernel/fs_context.go | 2 +- pkg/sentry/kernel/futex/BUILD | 8 +- pkg/sentry/kernel/futex/futex.go | 3 +- pkg/sentry/kernel/futex/futex_test.go | 2 +- pkg/sentry/kernel/kernel.go | 2 +- pkg/sentry/kernel/memevent/BUILD | 1 + pkg/sentry/kernel/memevent/memory_events.go | 2 +- pkg/sentry/kernel/pipe/BUILD | 1 + pkg/sentry/kernel/pipe/buffer.go | 2 +- pkg/sentry/kernel/pipe/node.go | 3 +- pkg/sentry/kernel/pipe/pipe.go | 2 +- pkg/sentry/kernel/pipe/pipe_util.go | 2 +- pkg/sentry/kernel/pipe/vfs.go | 3 +- pkg/sentry/kernel/semaphore/BUILD | 1 + pkg/sentry/kernel/semaphore/semaphore.go | 2 +- pkg/sentry/kernel/shm/BUILD | 1 + pkg/sentry/kernel/shm/shm.go | 2 +- pkg/sentry/kernel/signal_handlers.go | 3 +- pkg/sentry/kernel/signalfd/BUILD | 1 + pkg/sentry/kernel/signalfd/signalfd.go | 3 +- pkg/sentry/kernel/syscalls.go | 2 +- pkg/sentry/kernel/syslog.go | 3 +- pkg/sentry/kernel/task.go | 5 +- pkg/sentry/kernel/thread_group.go | 2 +- pkg/sentry/kernel/threads.go | 2 +- pkg/sentry/kernel/time/BUILD | 1 + pkg/sentry/kernel/time/time.go | 2 +- pkg/sentry/kernel/timekeeper.go | 2 +- pkg/sentry/kernel/tty.go | 2 +- pkg/sentry/kernel/uts_namespace.go | 3 +- pkg/sentry/limits/BUILD | 1 + pkg/sentry/limits/limits.go | 3 +- pkg/sentry/mm/BUILD | 2 +- pkg/sentry/mm/aio_context.go | 3 +- pkg/sentry/mm/mm.go | 8 +- pkg/sentry/pgalloc/BUILD | 1 + pkg/sentry/pgalloc/pgalloc.go | 2 +- pkg/sentry/platform/interrupt/BUILD | 1 + pkg/sentry/platform/interrupt/interrupt.go | 3 +- pkg/sentry/platform/kvm/BUILD | 1 + pkg/sentry/platform/kvm/address_space.go | 2 +- pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go | 2 - 
pkg/sentry/platform/kvm/kvm.go | 2 +- pkg/sentry/platform/kvm/machine.go | 2 +- pkg/sentry/platform/ptrace/BUILD | 1 + pkg/sentry/platform/ptrace/ptrace.go | 2 +- pkg/sentry/platform/ptrace/subprocess.go | 2 +- .../platform/ptrace/subprocess_linux_unsafe.go | 2 +- pkg/sentry/platform/ring0/defs.go | 2 +- pkg/sentry/platform/ring0/defs_amd64.go | 1 + pkg/sentry/platform/ring0/defs_arm64.go | 1 + pkg/sentry/platform/ring0/pagetables/BUILD | 5 +- pkg/sentry/platform/ring0/pagetables/pcids_x86.go | 2 +- pkg/sentry/socket/netlink/BUILD | 1 + pkg/sentry/socket/netlink/port/BUILD | 1 + pkg/sentry/socket/netlink/port/port.go | 3 +- pkg/sentry/socket/netlink/socket.go | 2 +- pkg/sentry/socket/netstack/BUILD | 1 + pkg/sentry/socket/netstack/netstack.go | 2 +- pkg/sentry/socket/rpcinet/conn/BUILD | 1 + pkg/sentry/socket/rpcinet/conn/conn.go | 2 +- pkg/sentry/socket/rpcinet/notifier/BUILD | 1 + pkg/sentry/socket/rpcinet/notifier/notifier.go | 2 +- pkg/sentry/socket/unix/transport/BUILD | 1 + pkg/sentry/socket/unix/transport/connectioned.go | 3 +- pkg/sentry/socket/unix/transport/queue.go | 3 +- pkg/sentry/socket/unix/transport/unix.go | 2 +- pkg/sentry/syscalls/linux/BUILD | 1 + pkg/sentry/syscalls/linux/error.go | 2 +- pkg/sentry/time/BUILD | 4 +- pkg/sentry/time/calibrated_clock.go | 2 +- pkg/sentry/usage/BUILD | 1 + pkg/sentry/usage/memory.go | 2 +- pkg/sentry/vfs/BUILD | 3 +- pkg/sentry/vfs/dentry.go | 2 +- pkg/sentry/vfs/file_description_impl_util.go | 2 +- pkg/sentry/vfs/mount_test.go | 3 +- pkg/sentry/vfs/mount_unsafe.go | 4 +- pkg/sentry/vfs/pathname.go | 3 +- pkg/sentry/vfs/resolving_path.go | 2 +- pkg/sentry/vfs/vfs.go | 2 +- pkg/sentry/watchdog/BUILD | 1 + pkg/sentry/watchdog/watchdog.go | 2 +- pkg/sync/BUILD | 53 +++++++ pkg/sync/LICENSE | 27 ++++ pkg/sync/README.md | 5 + pkg/sync/aliases.go | 37 +++++ pkg/sync/atomicptr_unsafe.go | 47 +++++++ pkg/sync/atomicptrtest/BUILD | 29 ++++ pkg/sync/atomicptrtest/atomicptr_test.go | 31 +++++ 
pkg/sync/downgradable_rwmutex_test.go | 150 ++++++++++++++++++++ pkg/sync/downgradable_rwmutex_unsafe.go | 146 ++++++++++++++++++++ pkg/sync/memmove_unsafe.go | 28 ++++ pkg/sync/norace_unsafe.go | 35 +++++ pkg/sync/race_unsafe.go | 41 ++++++ pkg/sync/seqatomic_unsafe.go | 72 ++++++++++ pkg/sync/seqatomictest/BUILD | 33 +++++ pkg/sync/seqatomictest/seqatomic_test.go | 132 ++++++++++++++++++ pkg/sync/seqcount.go | 149 ++++++++++++++++++++ pkg/sync/seqcount_test.go | 153 +++++++++++++++++++++ pkg/sync/syncutil.go | 7 + pkg/syncutil/BUILD | 52 ------- pkg/syncutil/LICENSE | 27 ---- pkg/syncutil/README.md | 5 - pkg/syncutil/atomicptr_unsafe.go | 47 ------- pkg/syncutil/atomicptrtest/BUILD | 29 ---- pkg/syncutil/atomicptrtest/atomicptr_test.go | 31 ----- pkg/syncutil/downgradable_rwmutex_test.go | 150 -------------------- pkg/syncutil/downgradable_rwmutex_unsafe.go | 146 -------------------- pkg/syncutil/memmove_unsafe.go | 28 ---- pkg/syncutil/norace_unsafe.go | 35 ----- pkg/syncutil/race_unsafe.go | 41 ------ pkg/syncutil/seqatomic_unsafe.go | 72 ---------- pkg/syncutil/seqatomictest/BUILD | 35 ----- pkg/syncutil/seqatomictest/seqatomic_test.go | 132 ------------------ pkg/syncutil/seqcount.go | 149 -------------------- pkg/syncutil/seqcount_test.go | 153 --------------------- pkg/syncutil/syncutil.go | 7 - pkg/tcpip/BUILD | 1 + pkg/tcpip/adapters/gonet/BUILD | 1 + pkg/tcpip/adapters/gonet/gonet.go | 2 +- pkg/tcpip/link/fdbased/BUILD | 1 + pkg/tcpip/link/fdbased/endpoint.go | 2 +- pkg/tcpip/link/sharedmem/BUILD | 2 + pkg/tcpip/link/sharedmem/pipe/BUILD | 1 + pkg/tcpip/link/sharedmem/pipe/pipe_test.go | 3 +- pkg/tcpip/link/sharedmem/sharedmem.go | 2 +- pkg/tcpip/link/sharedmem/sharedmem_test.go | 2 +- pkg/tcpip/network/fragmentation/BUILD | 1 + pkg/tcpip/network/fragmentation/fragmentation.go | 2 +- pkg/tcpip/network/fragmentation/reassembler.go | 2 +- pkg/tcpip/ports/BUILD | 1 + pkg/tcpip/ports/ports.go | 2 +- pkg/tcpip/stack/BUILD | 2 + 
pkg/tcpip/stack/linkaddrcache.go | 2 +- pkg/tcpip/stack/linkaddrcache_test.go | 2 +- pkg/tcpip/stack/nic.go | 2 +- pkg/tcpip/stack/stack.go | 2 +- pkg/tcpip/stack/transport_demuxer.go | 2 +- pkg/tcpip/tcpip.go | 2 +- pkg/tcpip/transport/icmp/BUILD | 1 + pkg/tcpip/transport/icmp/endpoint.go | 3 +- pkg/tcpip/transport/packet/BUILD | 1 + pkg/tcpip/transport/packet/endpoint.go | 3 +- pkg/tcpip/transport/raw/BUILD | 1 + pkg/tcpip/transport/raw/endpoint.go | 3 +- pkg/tcpip/transport/tcp/BUILD | 1 + pkg/tcpip/transport/tcp/accept.go | 2 +- pkg/tcpip/transport/tcp/connect.go | 2 +- pkg/tcpip/transport/tcp/endpoint.go | 2 +- pkg/tcpip/transport/tcp/endpoint_state.go | 2 +- pkg/tcpip/transport/tcp/forwarder.go | 3 +- pkg/tcpip/transport/tcp/protocol.go | 2 +- pkg/tcpip/transport/tcp/segment_queue.go | 2 +- pkg/tcpip/transport/tcp/snd.go | 2 +- pkg/tcpip/transport/udp/BUILD | 1 + pkg/tcpip/transport/udp/endpoint.go | 3 +- pkg/tmutex/BUILD | 1 + pkg/tmutex/tmutex_test.go | 3 +- pkg/unet/BUILD | 1 + pkg/unet/unet_test.go | 3 +- pkg/urpc/BUILD | 1 + pkg/urpc/urpc.go | 2 +- pkg/waiter/BUILD | 1 + pkg/waiter/waiter.go | 2 +- runsc/boot/BUILD | 2 + runsc/boot/compat.go | 2 +- runsc/boot/limits.go | 2 +- runsc/boot/loader.go | 2 +- runsc/boot/loader_test.go | 2 +- runsc/cmd/BUILD | 1 + runsc/cmd/create.go | 1 + runsc/cmd/gofer.go | 2 +- runsc/cmd/start.go | 1 + runsc/container/BUILD | 2 + runsc/container/console_test.go | 2 +- runsc/container/container_test.go | 2 +- runsc/container/multi_container_test.go | 2 +- runsc/container/state_file.go | 2 +- runsc/fsgofer/BUILD | 1 + runsc/fsgofer/fsgofer.go | 2 +- runsc/sandbox/BUILD | 1 + runsc/sandbox/sandbox.go | 2 +- runsc/testutil/BUILD | 1 + runsc/testutil/testutil.go | 2 +- 303 files changed, 1507 insertions(+), 1368 deletions(-) create mode 100644 pkg/sync/BUILD create mode 100644 pkg/sync/LICENSE create mode 100644 pkg/sync/README.md create mode 100644 pkg/sync/aliases.go create mode 100644 pkg/sync/atomicptr_unsafe.go create mode 
100644 pkg/sync/atomicptrtest/BUILD create mode 100644 pkg/sync/atomicptrtest/atomicptr_test.go create mode 100644 pkg/sync/downgradable_rwmutex_test.go create mode 100644 pkg/sync/downgradable_rwmutex_unsafe.go create mode 100644 pkg/sync/memmove_unsafe.go create mode 100644 pkg/sync/norace_unsafe.go create mode 100644 pkg/sync/race_unsafe.go create mode 100644 pkg/sync/seqatomic_unsafe.go create mode 100644 pkg/sync/seqatomictest/BUILD create mode 100644 pkg/sync/seqatomictest/seqatomic_test.go create mode 100644 pkg/sync/seqcount.go create mode 100644 pkg/sync/seqcount_test.go create mode 100644 pkg/sync/syncutil.go delete mode 100644 pkg/syncutil/BUILD delete mode 100644 pkg/syncutil/LICENSE delete mode 100644 pkg/syncutil/README.md delete mode 100644 pkg/syncutil/atomicptr_unsafe.go delete mode 100644 pkg/syncutil/atomicptrtest/BUILD delete mode 100644 pkg/syncutil/atomicptrtest/atomicptr_test.go delete mode 100644 pkg/syncutil/downgradable_rwmutex_test.go delete mode 100644 pkg/syncutil/downgradable_rwmutex_unsafe.go delete mode 100644 pkg/syncutil/memmove_unsafe.go delete mode 100644 pkg/syncutil/norace_unsafe.go delete mode 100644 pkg/syncutil/race_unsafe.go delete mode 100644 pkg/syncutil/seqatomic_unsafe.go delete mode 100644 pkg/syncutil/seqatomictest/BUILD delete mode 100644 pkg/syncutil/seqatomictest/seqatomic_test.go delete mode 100644 pkg/syncutil/seqcount.go delete mode 100644 pkg/syncutil/seqcount_test.go delete mode 100644 pkg/syncutil/syncutil.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD index 6bc486b62..d99e37b40 100644 --- a/pkg/amutex/BUILD +++ b/pkg/amutex/BUILD @@ -15,4 +15,5 @@ go_test( size = "small", srcs = ["amutex_test.go"], embed = [":amutex"], + deps = ["//pkg/sync"], ) diff --git a/pkg/amutex/amutex_test.go b/pkg/amutex/amutex_test.go index 1d7f45641..8a3952f2a 100644 --- a/pkg/amutex/amutex_test.go +++ b/pkg/amutex/amutex_test.go @@ -15,9 +15,10 @@ package amutex import ( - "sync" "testing" 
"time" + + "gvisor.dev/gvisor/pkg/sync" ) type sleeper struct { diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD index 36beaade9..6403c60c2 100644 --- a/pkg/atomicbitops/BUILD +++ b/pkg/atomicbitops/BUILD @@ -20,4 +20,5 @@ go_test( size = "small", srcs = ["atomic_bitops_test.go"], embed = [":atomicbitops"], + deps = ["//pkg/sync"], ) diff --git a/pkg/atomicbitops/atomic_bitops_test.go b/pkg/atomicbitops/atomic_bitops_test.go index 965e9be79..9466d3e23 100644 --- a/pkg/atomicbitops/atomic_bitops_test.go +++ b/pkg/atomicbitops/atomic_bitops_test.go @@ -16,8 +16,9 @@ package atomicbitops import ( "runtime" - "sync" "testing" + + "gvisor.dev/gvisor/pkg/sync" ) const iterations = 100 diff --git a/pkg/compressio/BUILD b/pkg/compressio/BUILD index a0b21d4bd..2bb581b18 100644 --- a/pkg/compressio/BUILD +++ b/pkg/compressio/BUILD @@ -8,7 +8,10 @@ go_library( srcs = ["compressio.go"], importpath = "gvisor.dev/gvisor/pkg/compressio", visibility = ["//:sandbox"], - deps = ["//pkg/binary"], + deps = [ + "//pkg/binary", + "//pkg/sync", + ], ) go_test( diff --git a/pkg/compressio/compressio.go b/pkg/compressio/compressio.go index 3b0bb086e..5f52cbe74 100644 --- a/pkg/compressio/compressio.go +++ b/pkg/compressio/compressio.go @@ -52,9 +52,9 @@ import ( "hash" "io" "runtime" - "sync" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sync" ) var bufPool = sync.Pool{ diff --git a/pkg/control/server/BUILD b/pkg/control/server/BUILD index 21adf3adf..adbd1e3f8 100644 --- a/pkg/control/server/BUILD +++ b/pkg/control/server/BUILD @@ -9,6 +9,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/log", + "//pkg/sync", "//pkg/unet", "//pkg/urpc", ], diff --git a/pkg/control/server/server.go b/pkg/control/server/server.go index a56152d10..41abe1f2d 100644 --- a/pkg/control/server/server.go +++ b/pkg/control/server/server.go @@ -22,9 +22,9 @@ package server import ( "os" - "sync" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" 
"gvisor.dev/gvisor/pkg/urpc" ) diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD index 0b4b7cc44..9d68682c7 100644 --- a/pkg/eventchannel/BUILD +++ b/pkg/eventchannel/BUILD @@ -15,6 +15,7 @@ go_library( deps = [ ":eventchannel_go_proto", "//pkg/log", + "//pkg/sync", "//pkg/unet", "@com_github_golang_protobuf//proto:go_default_library", "@com_github_golang_protobuf//ptypes:go_default_library_gen", @@ -40,6 +41,7 @@ go_test( srcs = ["event_test.go"], embed = [":eventchannel"], deps = [ + "//pkg/sync", "@com_github_golang_protobuf//proto:go_default_library", ], ) diff --git a/pkg/eventchannel/event.go b/pkg/eventchannel/event.go index d37ad0428..9a29c58bd 100644 --- a/pkg/eventchannel/event.go +++ b/pkg/eventchannel/event.go @@ -22,13 +22,13 @@ package eventchannel import ( "encoding/binary" "fmt" - "sync" "syscall" "github.com/golang/protobuf/proto" "github.com/golang/protobuf/ptypes" pb "gvisor.dev/gvisor/pkg/eventchannel/eventchannel_go_proto" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) diff --git a/pkg/eventchannel/event_test.go b/pkg/eventchannel/event_test.go index 3649097d6..7f41b4a27 100644 --- a/pkg/eventchannel/event_test.go +++ b/pkg/eventchannel/event_test.go @@ -16,11 +16,11 @@ package eventchannel import ( "fmt" - "sync" "testing" "time" "github.com/golang/protobuf/proto" + "gvisor.dev/gvisor/pkg/sync" ) // testEmitter is an emitter that can be used in tests. 
It records all events diff --git a/pkg/fdchannel/BUILD b/pkg/fdchannel/BUILD index 56495cbd9..b0478c672 100644 --- a/pkg/fdchannel/BUILD +++ b/pkg/fdchannel/BUILD @@ -15,4 +15,5 @@ go_test( size = "small", srcs = ["fdchannel_test.go"], embed = [":fdchannel"], + deps = ["//pkg/sync"], ) diff --git a/pkg/fdchannel/fdchannel_test.go b/pkg/fdchannel/fdchannel_test.go index 5d01dc636..7a8a63a59 100644 --- a/pkg/fdchannel/fdchannel_test.go +++ b/pkg/fdchannel/fdchannel_test.go @@ -17,10 +17,11 @@ package fdchannel import ( "io/ioutil" "os" - "sync" "syscall" "testing" "time" + + "gvisor.dev/gvisor/pkg/sync" ) func TestSendRecvFD(t *testing.T) { diff --git a/pkg/fdnotifier/BUILD b/pkg/fdnotifier/BUILD index aca2d8a82..91a202a30 100644 --- a/pkg/fdnotifier/BUILD +++ b/pkg/fdnotifier/BUILD @@ -11,6 +11,7 @@ go_library( importpath = "gvisor.dev/gvisor/pkg/fdnotifier", visibility = ["//:sandbox"], deps = [ + "//pkg/sync", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/fdnotifier/fdnotifier.go b/pkg/fdnotifier/fdnotifier.go index f4aae1953..a6b63c982 100644 --- a/pkg/fdnotifier/fdnotifier.go +++ b/pkg/fdnotifier/fdnotifier.go @@ -22,10 +22,10 @@ package fdnotifier import ( "fmt" - "sync" "syscall" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/flipcall/BUILD b/pkg/flipcall/BUILD index e590a71ba..85bd83af1 100644 --- a/pkg/flipcall/BUILD +++ b/pkg/flipcall/BUILD @@ -19,7 +19,7 @@ go_library( "//pkg/abi/linux", "//pkg/log", "//pkg/memutil", - "//pkg/syncutil", + "//pkg/sync", ], ) @@ -31,4 +31,5 @@ go_test( "flipcall_test.go", ], embed = [":flipcall"], + deps = ["//pkg/sync"], ) diff --git a/pkg/flipcall/flipcall_example_test.go b/pkg/flipcall/flipcall_example_test.go index 8d88b845d..2e28a149a 100644 --- a/pkg/flipcall/flipcall_example_test.go +++ b/pkg/flipcall/flipcall_example_test.go @@ -17,7 +17,8 @@ package flipcall import ( "bytes" "fmt" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) 
func Example() { diff --git a/pkg/flipcall/flipcall_test.go b/pkg/flipcall/flipcall_test.go index 168a487ec..33fd55a44 100644 --- a/pkg/flipcall/flipcall_test.go +++ b/pkg/flipcall/flipcall_test.go @@ -16,9 +16,10 @@ package flipcall import ( "runtime" - "sync" "testing" "time" + + "gvisor.dev/gvisor/pkg/sync" ) var testPacketWindowSize = pageSize diff --git a/pkg/flipcall/flipcall_unsafe.go b/pkg/flipcall/flipcall_unsafe.go index 27b8939fc..ac974b232 100644 --- a/pkg/flipcall/flipcall_unsafe.go +++ b/pkg/flipcall/flipcall_unsafe.go @@ -18,7 +18,7 @@ import ( "reflect" "unsafe" - "gvisor.dev/gvisor/pkg/syncutil" + "gvisor.dev/gvisor/pkg/sync" ) // Packets consist of a 16-byte header followed by an arbitrarily-sized @@ -75,13 +75,13 @@ func (ep *Endpoint) Data() []byte { var ioSync int64 func raceBecomeActive() { - if syncutil.RaceEnabled { - syncutil.RaceAcquire((unsafe.Pointer)(&ioSync)) + if sync.RaceEnabled { + sync.RaceAcquire((unsafe.Pointer)(&ioSync)) } } func raceBecomeInactive() { - if syncutil.RaceEnabled { - syncutil.RaceReleaseMerge((unsafe.Pointer)(&ioSync)) + if sync.RaceEnabled { + sync.RaceReleaseMerge((unsafe.Pointer)(&ioSync)) } } diff --git a/pkg/gate/BUILD b/pkg/gate/BUILD index 4b9321711..f22bd070d 100644 --- a/pkg/gate/BUILD +++ b/pkg/gate/BUILD @@ -19,5 +19,6 @@ go_test( ], deps = [ ":gate", + "//pkg/sync", ], ) diff --git a/pkg/gate/gate_test.go b/pkg/gate/gate_test.go index 5dbd8d712..850693df8 100644 --- a/pkg/gate/gate_test.go +++ b/pkg/gate/gate_test.go @@ -15,11 +15,11 @@ package gate_test import ( - "sync" "testing" "time" "gvisor.dev/gvisor/pkg/gate" + "gvisor.dev/gvisor/pkg/sync" ) func TestBasicEnter(t *testing.T) { diff --git a/pkg/linewriter/BUILD b/pkg/linewriter/BUILD index a5d980d14..bcde6d308 100644 --- a/pkg/linewriter/BUILD +++ b/pkg/linewriter/BUILD @@ -8,6 +8,7 @@ go_library( srcs = ["linewriter.go"], importpath = "gvisor.dev/gvisor/pkg/linewriter", visibility = ["//visibility:public"], + deps = ["//pkg/sync"], ) go_test( 
diff --git a/pkg/linewriter/linewriter.go b/pkg/linewriter/linewriter.go index cd6e4e2ce..a1b1285d4 100644 --- a/pkg/linewriter/linewriter.go +++ b/pkg/linewriter/linewriter.go @@ -17,7 +17,8 @@ package linewriter import ( "bytes" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) // Writer is an io.Writer which buffers input, flushing diff --git a/pkg/log/BUILD b/pkg/log/BUILD index fc5f5779b..0df0f2849 100644 --- a/pkg/log/BUILD +++ b/pkg/log/BUILD @@ -16,7 +16,10 @@ go_library( visibility = [ "//visibility:public", ], - deps = ["//pkg/linewriter"], + deps = [ + "//pkg/linewriter", + "//pkg/sync", + ], ) go_test( diff --git a/pkg/log/log.go b/pkg/log/log.go index 9387586e6..91a81b288 100644 --- a/pkg/log/log.go +++ b/pkg/log/log.go @@ -25,12 +25,12 @@ import ( stdlog "log" "os" "runtime" - "sync" "sync/atomic" "syscall" "time" "gvisor.dev/gvisor/pkg/linewriter" + "gvisor.dev/gvisor/pkg/sync" ) // Level is the log level. diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD index dd6ca6d39..9145f3233 100644 --- a/pkg/metric/BUILD +++ b/pkg/metric/BUILD @@ -14,6 +14,7 @@ go_library( ":metric_go_proto", "//pkg/eventchannel", "//pkg/log", + "//pkg/sync", ], ) diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go index eadde06e4..93d4f2b8c 100644 --- a/pkg/metric/metric.go +++ b/pkg/metric/metric.go @@ -18,12 +18,12 @@ package metric import ( "errors" "fmt" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/log" pb "gvisor.dev/gvisor/pkg/metric/metric_go_proto" + "gvisor.dev/gvisor/pkg/sync" ) var ( diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD index f32244c69..a3e05c96d 100644 --- a/pkg/p9/BUILD +++ b/pkg/p9/BUILD @@ -29,6 +29,7 @@ go_library( "//pkg/fdchannel", "//pkg/flipcall", "//pkg/log", + "//pkg/sync", "//pkg/unet", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/p9/client.go b/pkg/p9/client.go index 221516c6c..4045e41fa 100644 --- a/pkg/p9/client.go +++ b/pkg/p9/client.go @@ -17,12 +17,12 @@ package p9 import ( "errors" 
"fmt" - "sync" "syscall" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/flipcall" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) diff --git a/pkg/p9/p9test/BUILD b/pkg/p9/p9test/BUILD index 28707c0ca..f4edd68b2 100644 --- a/pkg/p9/p9test/BUILD +++ b/pkg/p9/p9test/BUILD @@ -70,6 +70,7 @@ go_library( "//pkg/fd", "//pkg/log", "//pkg/p9", + "//pkg/sync", "//pkg/unet", "@com_github_golang_mock//gomock:go_default_library", ], @@ -83,6 +84,7 @@ go_test( deps = [ "//pkg/fd", "//pkg/p9", + "//pkg/sync", "@com_github_golang_mock//gomock:go_default_library", ], ) diff --git a/pkg/p9/p9test/client_test.go b/pkg/p9/p9test/client_test.go index 6e758148d..6e7bb3db2 100644 --- a/pkg/p9/p9test/client_test.go +++ b/pkg/p9/p9test/client_test.go @@ -22,7 +22,6 @@ import ( "os" "reflect" "strings" - "sync" "syscall" "testing" "time" @@ -30,6 +29,7 @@ import ( "github.com/golang/mock/gomock" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sync" ) func TestPanic(t *testing.T) { diff --git a/pkg/p9/p9test/p9test.go b/pkg/p9/p9test/p9test.go index 4d3271b37..dd8b01b6d 100644 --- a/pkg/p9/p9test/p9test.go +++ b/pkg/p9/p9test/p9test.go @@ -17,13 +17,13 @@ package p9test import ( "fmt" - "sync" "sync/atomic" "syscall" "testing" "github.com/golang/mock/gomock" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) diff --git a/pkg/p9/path_tree.go b/pkg/p9/path_tree.go index 865459411..72ef53313 100644 --- a/pkg/p9/path_tree.go +++ b/pkg/p9/path_tree.go @@ -16,7 +16,8 @@ package p9 import ( "fmt" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) // pathNode is a single node in a path traversal. diff --git a/pkg/p9/pool.go b/pkg/p9/pool.go index 52de889e1..2b14a5ce3 100644 --- a/pkg/p9/pool.go +++ b/pkg/p9/pool.go @@ -15,7 +15,7 @@ package p9 import ( - "sync" + "gvisor.dev/gvisor/pkg/sync" ) // pool is a simple allocator. 
diff --git a/pkg/p9/server.go b/pkg/p9/server.go index 40b8fa023..fdfa83648 100644 --- a/pkg/p9/server.go +++ b/pkg/p9/server.go @@ -17,7 +17,6 @@ package p9 import ( "io" "runtime/debug" - "sync" "sync/atomic" "syscall" @@ -25,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/fdchannel" "gvisor.dev/gvisor/pkg/flipcall" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) diff --git a/pkg/p9/transport.go b/pkg/p9/transport.go index 6e8b4bbcd..9c11e28ce 100644 --- a/pkg/p9/transport.go +++ b/pkg/p9/transport.go @@ -19,11 +19,11 @@ import ( "fmt" "io" "io/ioutil" - "sync" "syscall" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) diff --git a/pkg/procid/BUILD b/pkg/procid/BUILD index 078f084b2..b506813f0 100644 --- a/pkg/procid/BUILD +++ b/pkg/procid/BUILD @@ -21,6 +21,7 @@ go_test( "procid_test.go", ], embed = [":procid"], + deps = ["//pkg/sync"], ) go_test( @@ -31,4 +32,5 @@ go_test( "procid_test.go", ], embed = [":procid"], + deps = ["//pkg/sync"], ) diff --git a/pkg/procid/procid_test.go b/pkg/procid/procid_test.go index 88dd0b3ae..9ec08c3d6 100644 --- a/pkg/procid/procid_test.go +++ b/pkg/procid/procid_test.go @@ -17,9 +17,10 @@ package procid import ( "os" "runtime" - "sync" "syscall" "testing" + + "gvisor.dev/gvisor/pkg/sync" ) // runOnMain is used to send functions to run on the main (initial) thread. 
diff --git a/pkg/rand/BUILD b/pkg/rand/BUILD index f4f2001f3..9d5b4859b 100644 --- a/pkg/rand/BUILD +++ b/pkg/rand/BUILD @@ -10,5 +10,8 @@ go_library( ], importpath = "gvisor.dev/gvisor/pkg/rand", visibility = ["//:sandbox"], - deps = ["@org_golang_x_sys//unix:go_default_library"], + deps = [ + "//pkg/sync", + "@org_golang_x_sys//unix:go_default_library", + ], ) diff --git a/pkg/rand/rand_linux.go b/pkg/rand/rand_linux.go index 2b92db3e6..0bdad5fad 100644 --- a/pkg/rand/rand_linux.go +++ b/pkg/rand/rand_linux.go @@ -19,9 +19,9 @@ package rand import ( "crypto/rand" "io" - "sync" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/sync" ) // reader implements an io.Reader that returns pseudorandom bytes. diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD index 7ad59dfd7..974d9af9b 100644 --- a/pkg/refs/BUILD +++ b/pkg/refs/BUILD @@ -27,6 +27,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/log", + "//pkg/sync", ], ) @@ -35,4 +36,5 @@ go_test( size = "small", srcs = ["refcounter_test.go"], embed = [":refs"], + deps = ["//pkg/sync"], ) diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index ad69e0757..c45ba8200 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -21,10 +21,10 @@ import ( "fmt" "reflect" "runtime" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" ) // RefCounter is the interface to be implemented by objects that are reference diff --git a/pkg/refs/refcounter_test.go b/pkg/refs/refcounter_test.go index ffd3d3f07..1ab4a4440 100644 --- a/pkg/refs/refcounter_test.go +++ b/pkg/refs/refcounter_test.go @@ -16,8 +16,9 @@ package refs import ( "reflect" - "sync" "testing" + + "gvisor.dev/gvisor/pkg/sync" ) type testCounter struct { diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 18c73cc24..ae3e364cd 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/limits", "//pkg/sentry/usermem", + "//pkg/sync", 
"//pkg/syserror", ], ) diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index 9294ac773..9f41e566f 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -19,7 +19,6 @@ package arch import ( "fmt" "io" - "sync" "syscall" "gvisor.dev/gvisor/pkg/binary" @@ -27,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/log" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index 5522cecd0..2561a6109 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -30,6 +30,7 @@ go_library( "//pkg/sentry/strace", "//pkg/sentry/usage", "//pkg/sentry/watchdog", + "//pkg/sync", "//pkg/tcpip/link/sniffer", "//pkg/urpc", ], diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go index e1f2fea60..151808911 100644 --- a/pkg/sentry/control/pprof.go +++ b/pkg/sentry/control/pprof.go @@ -19,10 +19,10 @@ import ( "runtime" "runtime/pprof" "runtime/trace" - "sync" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/urpc" ) diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD index 1098ed777..97fa1512c 100644 --- a/pkg/sentry/device/BUILD +++ b/pkg/sentry/device/BUILD @@ -8,7 +8,10 @@ go_library( srcs = ["device.go"], importpath = "gvisor.dev/gvisor/pkg/sentry/device", visibility = ["//pkg/sentry:internal"], - deps = ["//pkg/abi/linux"], + deps = [ + "//pkg/abi/linux", + "//pkg/sync", + ], ) go_test( diff --git a/pkg/sentry/device/device.go b/pkg/sentry/device/device.go index 47945d1a7..69e71e322 100644 --- a/pkg/sentry/device/device.go +++ b/pkg/sentry/device/device.go @@ -19,10 +19,10 @@ package device import ( "bytes" "fmt" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sync" ) // Registry tracks all simple devices and related state on the 
system for diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index c035ffff7..7d5d72d5a 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -68,7 +68,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/state", - "//pkg/syncutil", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], @@ -115,6 +115,7 @@ go_test( "//pkg/sentry/fs/tmpfs", "//pkg/sentry/kernel/contexttest", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index 9ac62c84d..734177e90 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -17,12 +17,12 @@ package fs import ( "fmt" "io" - "sync" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index 1d80bf15a..738580c5f 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -19,13 +19,13 @@ import ( "crypto/rand" "fmt" "io" - "sync" "testing" "gvisor.dev/gvisor/pkg/sentry/fs" _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) const ( diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 3cb73bd78..31fc4d87b 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -18,7 +18,6 @@ import ( "fmt" "path" "sort" - "sync" "sync/atomic" "syscall" @@ -28,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index 60a15a275..25514ace4 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ 
b/pkg/sentry/fs/dirent_cache.go @@ -16,7 +16,8 @@ package fs import ( "fmt" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) // DirentCache is an LRU cache of Dirents. The Dirent's refCount is diff --git a/pkg/sentry/fs/dirent_cache_limiter.go b/pkg/sentry/fs/dirent_cache_limiter.go index ebb80bd50..525ee25f9 100644 --- a/pkg/sentry/fs/dirent_cache_limiter.go +++ b/pkg/sentry/fs/dirent_cache_limiter.go @@ -16,7 +16,8 @@ package fs import ( "fmt" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) // DirentCacheLimiter acts as a global limit for all dirent caches in the diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD index 277ee4c31..cc43de69d 100644 --- a/pkg/sentry/fs/fdpipe/BUILD +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -23,6 +23,7 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/safemem", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 669ffcb75..5b6cfeb0a 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -17,7 +17,6 @@ package fdpipe import ( "os" - "sync" "syscall" "gvisor.dev/gvisor/pkg/fd" @@ -29,6 +28,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/fdpipe/pipe_state.go b/pkg/sentry/fs/fdpipe/pipe_state.go index 29175fb3d..cee87f726 100644 --- a/pkg/sentry/fs/fdpipe/pipe_state.go +++ b/pkg/sentry/fs/fdpipe/pipe_state.go @@ -17,10 +17,10 @@ package fdpipe import ( "fmt" "io/ioutil" - "sync" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sync" ) // beforeSave is invoked by stateify. 
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index a2f966cb6..7c4586296 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -16,7 +16,6 @@ package fs import ( "math" - "sync" "sync/atomic" "time" @@ -29,6 +28,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 225e40186..8a633b1ba 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -16,13 +16,13 @@ package fs import ( "io" - "sync" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index b157fd228..c5b51620a 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -18,9 +18,9 @@ import ( "fmt" "sort" "strings" - "sync" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sync" ) // FilesystemFlags matches include/linux/fs.h:file_system_type.fs_flags. 
diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index 8b2a5e6b2..26abf49e2 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -54,10 +54,9 @@ package fs import ( - "sync" - "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sync" ) var ( diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 9ca695a95..945b6270d 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -93,6 +93,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/state", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index b06a71cc2..837fc70b5 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -16,7 +16,6 @@ package fsutil import ( "fmt" - "sync" "syscall" "gvisor.dev/gvisor/pkg/log" @@ -24,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // HostFileMapper caches mappings of an arbitrary host file descriptor. 
It is diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go index 30475f340..a625f0e26 100644 --- a/pkg/sentry/fs/fsutil/host_mappable.go +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -16,7 +16,6 @@ package fsutil import ( "math" - "sync" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -24,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // HostMappable implements memmap.Mappable and platform.File over a diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 4e100a402..adf5ec69c 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -15,13 +15,12 @@ package fsutil import ( - "sync" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 798920d18..20a014402 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -17,7 +17,6 @@ package fsutil import ( "fmt" "io" - "sync" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" @@ -30,6 +29,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // Lock order (compare the lock order model in mm/mm.go): diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index 4a005c605..fd870e8e1 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -44,6 +44,7 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", + 
"//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/unet", diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 91263ebdc..245fe2ef1 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -16,7 +16,6 @@ package gofer import ( "errors" - "sync" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -31,6 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/safemem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 4e358a46a..edc796ce0 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -16,7 +16,6 @@ package gofer import ( "fmt" - "sync" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/refs" @@ -25,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 23daeb528..2b581aa69 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -50,6 +50,7 @@ go_library( "//pkg/sentry/unimpl", "//pkg/sentry/uniqueid", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index a6e4a09e3..873a1c52d 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -15,7 +15,6 @@ package host import ( - "sync" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -28,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 
107336a3e..c076d5bdd 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -16,7 +16,6 @@ package host import ( "fmt" - "sync" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -30,6 +29,7 @@ import ( unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index 90331e3b2..753ef8cd6 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -15,8 +15,6 @@ package host import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" @@ -24,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index 91e2fde2f..468043df0 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -15,8 +15,6 @@ package fs import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" @@ -26,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go index 0f2a66a79..efd3c962b 100644 --- a/pkg/sentry/fs/inode_inotify.go +++ b/pkg/sentry/fs/inode_inotify.go @@ -16,7 +16,8 @@ package fs import ( "fmt" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) // Watches is the collection of inotify watches on an inode. 
diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index ba3e0233d..cc7dd1c92 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -16,7 +16,6 @@ package fs import ( "io" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -25,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go index 0aa0a5e9b..900cba3ca 100644 --- a/pkg/sentry/fs/inotify_watch.go +++ b/pkg/sentry/fs/inotify_watch.go @@ -15,10 +15,10 @@ package fs import ( - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sync" ) // Watch represent a particular inotify watch created by inotify_add_watch. diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD index 8d62642e7..2c332a82a 100644 --- a/pkg/sentry/fs/lock/BUILD +++ b/pkg/sentry/fs/lock/BUILD @@ -44,6 +44,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/log", + "//pkg/sync", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index 636484424..41b040818 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -52,9 +52,9 @@ package lock import ( "fmt" "math" - "sync" "syscall" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index ac0398bd9..db3dfd096 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -19,7 +19,6 @@ import ( "math" "path" "strings" - "sync" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -27,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/overlay.go 
b/pkg/sentry/fs/overlay.go index 25573e986..4cad55327 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -17,13 +17,12 @@ package fs import ( "fmt" "strings" - "sync" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/syncutil" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -199,7 +198,7 @@ type overlayEntry struct { upper *Inode // dirCacheMu protects dirCache. - dirCacheMu syncutil.DowngradableRWMutex `state:"nosave"` + dirCacheMu sync.DowngradableRWMutex `state:"nosave"` // dirCache is cache of DentAttrs from upper and lower Inodes. dirCache *SortedDentryMap diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 75cbb0622..94d46ab1b 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -51,6 +51,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/header", "//pkg/waiter", diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD index fe7067be1..38b246dff 100644 --- a/pkg/sentry/fs/proc/seqfile/BUILD +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -16,6 +16,7 @@ go_library( "//pkg/sentry/fs/proc/device", "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index 5fe823000..f9af191d5 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -17,7 +17,6 @@ package seqfile import ( "io" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" @@ -26,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" 
"gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index bd93f83fa..a37e1fa06 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -17,7 +17,6 @@ package proc import ( "fmt" "io" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" @@ -27,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index 012cb3e44..3fb7b0633 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 78e082b8e..dcbb8eb2e 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -17,7 +17,6 @@ package ramfs import ( "fmt" - "sync" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -25,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/restore.go b/pkg/sentry/fs/restore.go index f10168125..64c6a6ae9 100644 --- a/pkg/sentry/fs/restore.go +++ b/pkg/sentry/fs/restore.go @@ -15,7 +15,7 @@ package fs import ( - "sync" + "gvisor.dev/gvisor/pkg/sync" ) // RestoreEnvironment is the restore environment for file systems. 
It consists diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index 59ce400c2..3400b940c 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -31,6 +31,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index f86dfaa36..f1c87fe41 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -17,7 +17,6 @@ package tmpfs import ( "fmt" "io" - "sync" "time" "gvisor.dev/gvisor/pkg/abi/linux" @@ -31,6 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 95ad98cb0..f6f60d0cf 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -30,6 +30,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 2f639c823..88aa66b24 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -19,7 +19,6 @@ import ( "fmt" "math" "strconv" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" @@ -28,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index 7cc0eb409..894964260 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -16,13 +16,13 @@ package tty import ( "bytes" - "sync" 
"unicode/utf8" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index 231e4e6eb..8b5d4699a 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -15,13 +15,12 @@ package tty import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index bc90330bc..903874141 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -50,6 +50,7 @@ go_library( "//pkg/sentry/syscalls/linux", "//pkg/sentry/usermem", "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index 91802dc1e..8944171c8 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -15,8 +15,6 @@ package ext import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/log" @@ -25,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 616fc002a..9afb1a84c 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -17,13 +17,13 @@ package ext import ( "errors" "io" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/fspath" 
"gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go index aec33e00a..d11153c90 100644 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -16,7 +16,6 @@ package ext import ( "io" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" @@ -24,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index 39c03ee9d..809178250 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -39,6 +39,7 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/usermem", "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/syserror", ], ) @@ -56,6 +57,7 @@ go_test( "//pkg/sentry/kernel/auth", "//pkg/sentry/usermem", "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/syserror", "@com_github_google_go-cmp//cmp:go_default_library", ], diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 752e0f659..1d469a0db 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -16,7 +16,6 @@ package kernfs import ( "fmt" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -24,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index d69b299ae..bb12f39a2 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ 
b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -53,7 +53,6 @@ package kernfs import ( "fmt" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -61,6 +60,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" ) // FilesystemType implements vfs.FilesystemType. diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 4b6b95f5f..5c9d580e1 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -19,7 +19,6 @@ import ( "fmt" "io" "runtime" - "sync" "testing" "github.com/google/go-cmp/cmp" @@ -31,6 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index a5b285987..82f5c2f41 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -47,6 +47,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index f51e247a7..f200e767d 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -17,7 +17,6 @@ package tmpfs import ( "io" "math" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -30,6 +29,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 7be6faa5b..701826f90 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -26,7 +26,6 @@ package tmpfs import ( "fmt" "math" - 
"sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -34,6 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 2706927ff..ac85ba0c8 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -35,7 +35,7 @@ go_template_instance( out = "seqatomic_taskgoroutineschedinfo_unsafe.go", package = "kernel", suffix = "TaskGoroutineSchedInfo", - template = "//pkg/syncutil:generic_seqatomic", + template = "//pkg/sync:generic_seqatomic", types = { "Value": "TaskGoroutineSchedInfo", }, @@ -209,7 +209,7 @@ go_library( "//pkg/sentry/usermem", "//pkg/state", "//pkg/state/statefile", - "//pkg/syncutil", + "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", @@ -241,6 +241,7 @@ go_test( "//pkg/sentry/time", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index 244655b5c..920fe4329 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -15,11 +15,11 @@ package kernel import ( - "sync" "syscall" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sync" ) // +stateify savable diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 04c244447..1aa72fa47 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -8,7 +8,7 @@ go_template_instance( out = "atomicptr_credentials_unsafe.go", package = "auth", suffix = "Credentials", - template = "//pkg/syncutil:generic_atomicptr", + template = "//pkg/sync:generic_atomicptr", types = { "Value": "Credentials", }, @@ -64,6 +64,7 @@ go_library( "//pkg/bits", "//pkg/log", "//pkg/sentry/context", + 
"//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go index af28ccc65..9dd52c860 100644 --- a/pkg/sentry/kernel/auth/user_namespace.go +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -16,8 +16,8 @@ package auth import ( "math" - "sync" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index 3361e8b7d..c47f6b6fc 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 9c0a4e1b4..430311cc0 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -18,7 +18,6 @@ package epoll import ( "fmt" - "sync" "syscall" "gvisor.dev/gvisor/pkg/refs" @@ -27,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index e65b961e8..c831fbab2 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -16,6 +16,7 @@ go_library( "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 12f0d429b..687690679 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -18,7 +18,6 @@ package eventfd import ( "math" - "sync" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -28,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" 
"gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index 49d81b712..6b36bc63e 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -12,6 +12,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sync", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index 6b0bb0324..d32c3e90a 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -16,12 +16,11 @@ package fasync import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 11f613a11..cd1501f85 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -18,7 +18,6 @@ import ( "bytes" "fmt" "math" - "sync" "sync/atomic" "syscall" @@ -28,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sync" ) // FDFlags define flags for an individual descriptor. 
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go index 2bcb6216a..eccb7d1e7 100644 --- a/pkg/sentry/kernel/fd_table_test.go +++ b/pkg/sentry/kernel/fd_table_test.go @@ -16,7 +16,6 @@ package kernel import ( "runtime" - "sync" "testing" "gvisor.dev/gvisor/pkg/sentry/context" @@ -24,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/filetest" "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sync" ) const ( diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index ded27d668..2448c1d99 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -16,10 +16,10 @@ package kernel import ( "fmt" - "sync" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sync" ) // FSContext contains filesystem context. diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index 75ec31761..50db443ce 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -9,7 +9,7 @@ go_template_instance( out = "atomicptr_bucket_unsafe.go", package = "futex", suffix = "Bucket", - template = "//pkg/syncutil:generic_atomicptr", + template = "//pkg/sync:generic_atomicptr", types = { "Value": "bucket", }, @@ -42,6 +42,7 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/memmap", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", ], ) @@ -51,5 +52,8 @@ go_test( size = "small", srcs = ["futex_test.go"], embed = [":futex"], - deps = ["//pkg/sentry/usermem"], + deps = [ + "//pkg/sentry/usermem", + "//pkg/sync", + ], ) diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 278cc8143..d1931c8f4 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -18,11 +18,10 @@ package futex import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" + 
"gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index 65e5d1428..c23126ca5 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -17,13 +17,13 @@ package futex import ( "math" "runtime" - "sync" "sync/atomic" "syscall" "testing" "unsafe" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // testData implements the Target interface, and allows us to diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 8653d2f63..c85e97fef 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -36,7 +36,6 @@ import ( "fmt" "io" "path/filepath" - "sync" "sync/atomic" "time" @@ -67,6 +66,7 @@ import ( uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" ) diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD index d7a7d1169..7f36252a9 100644 --- a/pkg/sentry/kernel/memevent/BUILD +++ b/pkg/sentry/kernel/memevent/BUILD @@ -16,6 +16,7 @@ go_library( "//pkg/metric", "//pkg/sentry/kernel", "//pkg/sentry/usage", + "//pkg/sync", ], ) diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go index b0d98e7f0..200565bb8 100644 --- a/pkg/sentry/kernel/memevent/memory_events.go +++ b/pkg/sentry/kernel/memevent/memory_events.go @@ -17,7 +17,6 @@ package memevent import ( - "sync" "time" "gvisor.dev/gvisor/pkg/eventchannel" @@ -26,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" pb "gvisor.dev/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sync" ) var totalTicks = metric.MustCreateNewUint64Metric("/memory_events/ticks", false /*sync*/, "Total number of memory event periods that have 
elapsed since startup.") diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 9d34f6d4d..5eeaeff66 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -43,6 +43,7 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usermem", "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go index 95bee2d37..1c0f34269 100644 --- a/pkg/sentry/kernel/pipe/buffer.go +++ b/pkg/sentry/kernel/pipe/buffer.go @@ -16,9 +16,9 @@ package pipe import ( "io" - "sync" "gvisor.dev/gvisor/pkg/sentry/safemem" + "gvisor.dev/gvisor/pkg/sync" ) // buffer encapsulates a queueable byte buffer. diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 4a19ab7ce..716f589af 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -15,12 +15,11 @@ package pipe import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 1a1b38f83..e4fd7d420 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -17,12 +17,12 @@ package pipe import ( "fmt" - "sync" "sync/atomic" "syscall" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index ef9641e6a..8394eb78b 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -17,7 +17,6 @@ package pipe import ( "io" "math" - "sync" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -25,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" 
"gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 6416e0dd8..bf7461cbb 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -15,13 +15,12 @@ package pipe import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index f4c00cd86..13a961594 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -31,6 +31,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index de9617e9d..18299814e 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -17,7 +17,6 @@ package semaphore import ( "fmt" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" @@ -25,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index cd48945e6..7321b22ed 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -24,6 +24,7 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 19034a21e..8ddef7eb8 100644 --- 
a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -35,7 +35,6 @@ package shm import ( "fmt" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" @@ -49,6 +48,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go index a16f3d57f..768fda220 100644 --- a/pkg/sentry/kernel/signal_handlers.go +++ b/pkg/sentry/kernel/signal_handlers.go @@ -15,10 +15,9 @@ package kernel import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sync" ) // SignalHandlers holds information about signal actions. diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD index 9f7e19b4d..89e4d84b1 100644 --- a/pkg/sentry/kernel/signalfd/BUILD +++ b/pkg/sentry/kernel/signalfd/BUILD @@ -16,6 +16,7 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go index 4b08d7d72..28be4a939 100644 --- a/pkg/sentry/kernel/signalfd/signalfd.go +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -16,8 +16,6 @@ package signalfd import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/context" @@ -26,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 2fdee0282..d2d01add4 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -16,13 +16,13 @@ package kernel 
import ( "fmt" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // maxSyscallNum is the highest supported syscall number. diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index 8227ecf1d..4607cde2f 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -17,7 +17,8 @@ package kernel import ( "fmt" "math/rand" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) // syslog represents a sentry-global kernel log. diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index d25a7903b..978d66da8 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -17,7 +17,6 @@ package kernel import ( gocontext "context" "runtime/trace" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -37,7 +36,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/syncutil" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) @@ -85,7 +84,7 @@ type Task struct { // // gosched is protected by goschedSeq. gosched is owned by the task // goroutine. 
- goschedSeq syncutil.SeqCount `state:"nosave"` + goschedSeq sync.SeqCount `state:"nosave"` gosched TaskGoroutineSchedInfo // yieldCount is the number of times the task goroutine has called diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index c0197a563..768e958d2 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -15,7 +15,6 @@ package kernel import ( - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -25,6 +24,7 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index 8267929a6..bf2dabb6e 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -16,9 +16,9 @@ package kernel import ( "fmt" - "sync" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index 31847e1df..4e4de0512 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -13,6 +13,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/sentry/context", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index 107394183..706de83ef 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -19,10 +19,10 @@ package time import ( "fmt" "math" - "sync" "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index 76417342a..dc99301de 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -16,7 +16,6 @@ package kernel import ( "fmt" - "sync" "time" 
"gvisor.dev/gvisor/pkg/log" @@ -24,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sync" ) // Timekeeper manages all of the kernel clocks. diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go index 048de26dc..464d2306a 100644 --- a/pkg/sentry/kernel/tty.go +++ b/pkg/sentry/kernel/tty.go @@ -14,7 +14,7 @@ package kernel -import "sync" +import "gvisor.dev/gvisor/pkg/sync" // TTY defines the relationship between a thread group and its controlling // terminal. diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go index 0a563e715..8ccf04bd1 100644 --- a/pkg/sentry/kernel/uts_namespace.go +++ b/pkg/sentry/kernel/uts_namespace.go @@ -15,9 +15,8 @@ package kernel import ( - "sync" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" ) // UTSNamespace represents a UTS namespace, a holder of two system identifiers: diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD index 156e67bf8..9fa841e8b 100644 --- a/pkg/sentry/limits/BUILD +++ b/pkg/sentry/limits/BUILD @@ -15,6 +15,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/sentry/context", + "//pkg/sync", ], ) diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index b6c22656b..31b9e9ff6 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -16,8 +16,9 @@ package limits import ( - "sync" "syscall" + + "gvisor.dev/gvisor/pkg/sync" ) // LimitType defines a type of resource limit. 
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 839931f67..83e248431 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -118,7 +118,7 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/syncutil", + "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/buffer", ], diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 1b746d030..4b48866ad 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -15,8 +15,6 @@ package mm import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/context" @@ -25,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 58a5c186d..fa86ebced 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -35,8 +35,6 @@ package mm import ( - "sync" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -44,7 +42,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/syncutil" + "gvisor.dev/gvisor/pkg/sync" ) // MemoryManager implements a virtual address space. @@ -82,7 +80,7 @@ type MemoryManager struct { users int32 // mappingMu is analogous to Linux's struct mm_struct::mmap_sem. - mappingMu syncutil.DowngradableRWMutex `state:"nosave"` + mappingMu sync.DowngradableRWMutex `state:"nosave"` // vmas stores virtual memory areas. Since vmas are stored by value, // clients should usually use vmaIterator.ValuePtr() instead of @@ -125,7 +123,7 @@ type MemoryManager struct { // activeMu is loosely analogous to Linux's struct // mm_struct::page_table_lock. 
- activeMu syncutil.DowngradableRWMutex `state:"nosave"` + activeMu sync.DowngradableRWMutex `state:"nosave"` // pmas stores platform mapping areas used to implement vmas. Since pmas // are stored by value, clients should usually use pmaIterator.ValuePtr() diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index f404107af..a9a2642c5 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -73,6 +73,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/state", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index f7f7298c4..c99e023d9 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -25,7 +25,6 @@ import ( "fmt" "math" "os" - "sync" "sync/atomic" "syscall" "time" @@ -37,6 +36,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/platform/interrupt/BUILD b/pkg/sentry/platform/interrupt/BUILD index b6d008dbe..85e882df9 100644 --- a/pkg/sentry/platform/interrupt/BUILD +++ b/pkg/sentry/platform/interrupt/BUILD @@ -10,6 +10,7 @@ go_library( ], importpath = "gvisor.dev/gvisor/pkg/sentry/platform/interrupt", visibility = ["//pkg/sentry:internal"], + deps = ["//pkg/sync"], ) go_test( diff --git a/pkg/sentry/platform/interrupt/interrupt.go b/pkg/sentry/platform/interrupt/interrupt.go index a4651f500..57be41647 100644 --- a/pkg/sentry/platform/interrupt/interrupt.go +++ b/pkg/sentry/platform/interrupt/interrupt.go @@ -17,7 +17,8 @@ package interrupt import ( "fmt" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) // Receiver receives interrupt notifications from a Forwarder. 
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index f3afd98da..6a358d1d4 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -55,6 +55,7 @@ go_library( "//pkg/sentry/platform/safecopy", "//pkg/sentry/time", "//pkg/sentry/usermem", + "//pkg/sync", ], ) diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index ea8b9632e..a25f3c449 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -15,13 +15,13 @@ package kvm import ( - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // dirtySet tracks vCPUs for invalidation. diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go index e5fac0d6a..2f02c03cf 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go @@ -17,8 +17,6 @@ package kvm import ( - "unsafe" - "gvisor.dev/gvisor/pkg/sentry/arch" ) diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index f2c2c059e..a7850faed 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -18,13 +18,13 @@ package kvm import ( "fmt" "os" - "sync" "syscall" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // KVM represents a lightweight VM context. 
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 7d02ebf19..e6d912168 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -17,7 +17,6 @@ package kvm import ( "fmt" "runtime" - "sync" "sync/atomic" "syscall" @@ -27,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // machine contains state associated with the VM as a whole. diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index 0df8cfa0f..cd13390c3 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -33,6 +33,7 @@ go_library( "//pkg/sentry/platform/interrupt", "//pkg/sentry/platform/safecopy", "//pkg/sentry/usermem", + "//pkg/sync", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 7b120a15d..bb0e03880 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -46,13 +46,13 @@ package ptrace import ( "os" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) var ( diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 20244fd95..15dc46a5b 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -18,7 +18,6 @@ import ( "fmt" "os" "runtime" - "sync" "syscall" "golang.org/x/sys/unix" @@ -27,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // Linux kernel errnos which "should never be seen by user 
programs", but will diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go index 2e6fbe488..245b20722 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go @@ -18,7 +18,6 @@ package ptrace import ( - "sync" "sync/atomic" "syscall" "unsafe" @@ -26,6 +25,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/hostcpu" + "gvisor.dev/gvisor/pkg/sync" ) // maskPool contains reusable CPU masks for setting affinity. Unfortunately, diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go index 3f094c2a7..86fd5ed58 100644 --- a/pkg/sentry/platform/ring0/defs.go +++ b/pkg/sentry/platform/ring0/defs.go @@ -17,7 +17,7 @@ package ring0 import ( "syscall" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" ) // Kernel is a global kernel object. 
diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go index 10dbd381f..9dae0dccb 100644 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -18,6 +18,7 @@ package ring0 import ( "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.dev/gvisor/pkg/sentry/usermem" ) var ( diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go index dc0eeec01..a850ce6cf 100644 --- a/pkg/sentry/platform/ring0/defs_arm64.go +++ b/pkg/sentry/platform/ring0/defs_arm64.go @@ -18,6 +18,7 @@ package ring0 import ( "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.dev/gvisor/pkg/sentry/usermem" ) var ( diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index e2e15ba5c..387a7f6c3 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -96,7 +96,10 @@ go_library( "//pkg/sentry/platform/kvm:__subpackages__", "//pkg/sentry/platform/ring0:__subpackages__", ], - deps = ["//pkg/sentry/usermem"], + deps = [ + "//pkg/sentry/usermem", + "//pkg/sync", + ], ) go_test( diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go index 0f029f25d..e199bae18 100644 --- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go @@ -17,7 +17,7 @@ package pagetables import ( - "sync" + "gvisor.dev/gvisor/pkg/sync" ) // limitPCID is the number of valid PCIDs. 
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 136821963..103933144 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -27,6 +27,7 @@ go_library( "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD index 463544c1a..2d9f4ba9b 100644 --- a/pkg/sentry/socket/netlink/port/BUILD +++ b/pkg/sentry/socket/netlink/port/BUILD @@ -8,6 +8,7 @@ go_library( srcs = ["port.go"], importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port", visibility = ["//pkg/sentry:internal"], + deps = ["//pkg/sync"], ) go_test( diff --git a/pkg/sentry/socket/netlink/port/port.go b/pkg/sentry/socket/netlink/port/port.go index e9d3275b1..2cd3afc22 100644 --- a/pkg/sentry/socket/netlink/port/port.go +++ b/pkg/sentry/socket/netlink/port/port.go @@ -24,7 +24,8 @@ import ( "fmt" "math" "math/rand" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) // maxPorts is a sanity limit on the maximum number of ports to allocate per diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index d2e3644a6..cea56f4ed 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -17,7 +17,6 @@ package netlink import ( "math" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" @@ -34,6 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index e414d8055..f78784569 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -34,6 +34,7 @@ 
go_library( "//pkg/sentry/socket/netfilter", "//pkg/sentry/unimpl", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 764f11a6b..0affb8071 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -29,7 +29,6 @@ import ( "io" "math" "reflect" - "sync" "syscall" "time" @@ -49,6 +48,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" diff --git a/pkg/sentry/socket/rpcinet/conn/BUILD b/pkg/sentry/socket/rpcinet/conn/BUILD index 23eadcb1b..b2677c659 100644 --- a/pkg/sentry/socket/rpcinet/conn/BUILD +++ b/pkg/sentry/socket/rpcinet/conn/BUILD @@ -10,6 +10,7 @@ go_library( deps = [ "//pkg/binary", "//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto", + "//pkg/sync", "//pkg/syserr", "//pkg/unet", "@com_github_golang_protobuf//proto:go_default_library", diff --git a/pkg/sentry/socket/rpcinet/conn/conn.go b/pkg/sentry/socket/rpcinet/conn/conn.go index 356adad99..02f39c767 100644 --- a/pkg/sentry/socket/rpcinet/conn/conn.go +++ b/pkg/sentry/socket/rpcinet/conn/conn.go @@ -17,12 +17,12 @@ package conn import ( "fmt" - "sync" "sync/atomic" "syscall" "github.com/golang/protobuf/proto" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/unet" diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD index a3585e10d..a5954f22b 100644 --- a/pkg/sentry/socket/rpcinet/notifier/BUILD +++ b/pkg/sentry/socket/rpcinet/notifier/BUILD @@ -10,6 +10,7 @@ go_library( deps = [ "//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto", "//pkg/sentry/socket/rpcinet/conn", + "//pkg/sync", "//pkg/waiter", 
"@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/sentry/socket/rpcinet/notifier/notifier.go b/pkg/sentry/socket/rpcinet/notifier/notifier.go index 7efe4301f..82b75d6dd 100644 --- a/pkg/sentry/socket/rpcinet/notifier/notifier.go +++ b/pkg/sentry/socket/rpcinet/notifier/notifier.go @@ -17,12 +17,12 @@ package notifier import ( "fmt" - "sync" "syscall" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn" pb "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD index 788ad70d2..d7ba95dff 100644 --- a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/ilist", "//pkg/refs", "//pkg/sentry/context", + "//pkg/sync", "//pkg/syserr", "//pkg/tcpip", "//pkg/tcpip/buffer", diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index dea11e253..9e6fbc111 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -15,10 +15,9 @@ package transport import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/waiter" diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index e27b1c714..5dcd3d95e 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -15,9 +15,8 @@ package transport import ( - "sync" - "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 37c7ac3c1..fcc0da332 100644 
--- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -16,11 +16,11 @@ package transport import ( - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index a76975cee..aa05e208a 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -91,6 +91,7 @@ go_library( "//pkg/sentry/syscalls", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/waiter", diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 1d9018c96..60469549d 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -16,13 +16,13 @@ package linux import ( "io" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index 18e212dff..3cde3a0be 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -9,7 +9,7 @@ go_template_instance( out = "seqatomic_parameters_unsafe.go", package = "time", suffix = "Parameters", - template = "//pkg/syncutil:generic_seqatomic", + template = "//pkg/sync:generic_seqatomic", types = { "Value": "Parameters", }, @@ -36,7 +36,7 @@ go_library( deps = [ "//pkg/log", "//pkg/metric", - "//pkg/syncutil", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/time/calibrated_clock.go b/pkg/sentry/time/calibrated_clock.go index 318503277..f9a93115d 100644 --- a/pkg/sentry/time/calibrated_clock.go +++ b/pkg/sentry/time/calibrated_clock.go @@ -17,11 +17,11 @@ package time import ( - "sync" "time" 
"gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD index c32fe3241..5518ac3d0 100644 --- a/pkg/sentry/usage/BUILD +++ b/pkg/sentry/usage/BUILD @@ -18,5 +18,6 @@ go_library( deps = [ "//pkg/bits", "//pkg/memutil", + "//pkg/sync", ], ) diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index d6ef644d8..538c645eb 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -17,12 +17,12 @@ package usage import ( "fmt" "os" - "sync" "sync/atomic" "syscall" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/sync" ) // MemoryKind represents a type of memory used by the application. diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 4c6aa04a1..35c7be259 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -34,7 +34,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/usermem", - "//pkg/syncutil", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], @@ -54,6 +54,7 @@ go_test( "//pkg/sentry/context/contexttest", "//pkg/sentry/kernel/auth", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 1bc9c4a38..486a76475 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -16,9 +16,9 @@ package vfs import ( "fmt" - "sync" "sync/atomic" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 66eb57bc2..c00b3c84b 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -17,13 +17,13 @@ package vfs import ( "bytes" "io" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" 
"gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go index adff0b94b..3b933468d 100644 --- a/pkg/sentry/vfs/mount_test.go +++ b/pkg/sentry/vfs/mount_test.go @@ -17,8 +17,9 @@ package vfs import ( "fmt" "runtime" - "sync" "testing" + + "gvisor.dev/gvisor/pkg/sync" ) func TestMountTableLookupEmpty(t *testing.T) { diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index ab13fa461..bd90d36c4 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -26,7 +26,7 @@ import ( "sync/atomic" "unsafe" - "gvisor.dev/gvisor/pkg/syncutil" + "gvisor.dev/gvisor/pkg/sync" ) // mountKey represents the location at which a Mount is mounted. It is @@ -75,7 +75,7 @@ type mountTable struct { // intrinsics and inline assembly, limiting the performance of this // approach.) 
- seq syncutil.SeqCount + seq sync.SeqCount seed uint32 // for hashing keys // size holds both length (number of elements) and capacity (number of diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go index 8e155654f..cf80df90e 100644 --- a/pkg/sentry/vfs/pathname.go +++ b/pkg/sentry/vfs/pathname.go @@ -15,10 +15,9 @@ package vfs import ( - "sync" - "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index f0641d314..8a0b382f6 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -16,11 +16,11 @@ package vfs import ( "fmt" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index ea2db7031..1f21b0b31 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -29,12 +29,12 @@ package vfs import ( "fmt" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/watchdog/BUILD b/pkg/sentry/watchdog/BUILD index 4d8435265..28f21f13d 100644 --- a/pkg/sentry/watchdog/BUILD +++ b/pkg/sentry/watchdog/BUILD @@ -13,5 +13,6 @@ go_library( "//pkg/metric", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", + "//pkg/sync", ], ) diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index 5e4611333..bfb2fac26 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -32,7 +32,6 @@ package watchdog import ( "bytes" "fmt" - "sync" "time" "gvisor.dev/gvisor/pkg/abi/linux" @@ -40,6 +39,7 @@ import ( "gvisor.dev/gvisor/pkg/metric" 
"gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sync" ) // Opts configures the watchdog. diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD new file mode 100644 index 000000000..e8cd16b8f --- /dev/null +++ b/pkg/sync/BUILD @@ -0,0 +1,53 @@ +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_generics:defs.bzl", "go_template") + +package( + default_visibility = ["//:sandbox"], + licenses = ["notice"], +) + +exports_files(["LICENSE"]) + +go_template( + name = "generic_atomicptr", + srcs = ["atomicptr_unsafe.go"], + types = [ + "Value", + ], +) + +go_template( + name = "generic_seqatomic", + srcs = ["seqatomic_unsafe.go"], + types = [ + "Value", + ], + deps = [ + ":sync", + ], +) + +go_library( + name = "sync", + srcs = [ + "aliases.go", + "downgradable_rwmutex_unsafe.go", + "memmove_unsafe.go", + "norace_unsafe.go", + "race_unsafe.go", + "seqcount.go", + "syncutil.go", + ], + importpath = "gvisor.dev/gvisor/pkg/sync", +) + +go_test( + name = "sync_test", + size = "small", + srcs = [ + "downgradable_rwmutex_test.go", + "seqcount_test.go", + ], + embed = [":sync"], +) diff --git a/pkg/sync/LICENSE b/pkg/sync/LICENSE new file mode 100644 index 000000000..6a66aea5e --- /dev/null +++ b/pkg/sync/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. 
nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pkg/sync/README.md b/pkg/sync/README.md new file mode 100644 index 000000000..2183c4e20 --- /dev/null +++ b/pkg/sync/README.md @@ -0,0 +1,5 @@ +# Syncutil + +This package provides additional synchronization primitives not provided by the +Go stdlib 'sync' package. It is partially derived from the upstream 'sync' +package from go1.10. diff --git a/pkg/sync/aliases.go b/pkg/sync/aliases.go new file mode 100644 index 000000000..20c7ca041 --- /dev/null +++ b/pkg/sync/aliases.go @@ -0,0 +1,37 @@ +// Copyright 2020 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sync + +import ( + "sync" +) + +// Aliases of standard library types. +type ( + // Mutex is an alias of sync.Mutex. + Mutex = sync.Mutex + + // RWMutex is an alias of sync.RWMutex. + RWMutex = sync.RWMutex + + // Cond is an alias of sync.Cond. + Cond = sync.Cond + + // Locker is an alias of sync.Locker. + Locker = sync.Locker + + // Once is an alias of sync.Once. 
+ Once = sync.Once + + // Pool is an alias of sync.Pool. + Pool = sync.Pool + + // WaitGroup is an alias of sync.WaitGroup. + WaitGroup = sync.WaitGroup + + // Map is an alias of sync.Map. + Map = sync.Map +) diff --git a/pkg/sync/atomicptr_unsafe.go b/pkg/sync/atomicptr_unsafe.go new file mode 100644 index 000000000..525c4beed --- /dev/null +++ b/pkg/sync/atomicptr_unsafe.go @@ -0,0 +1,47 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package template doesn't exist. This file must be instantiated using the +// go_template_instance rule in tools/go_generics/defs.bzl. +package template + +import ( + "sync/atomic" + "unsafe" +) + +// Value is a required type parameter. +type Value struct{} + +// An AtomicPtr is a pointer to a value of type Value that can be atomically +// loaded and stored. The zero value of an AtomicPtr represents nil. +// +// Note that copying AtomicPtr by value performs a non-atomic read of the +// stored pointer, which is unsafe if Store() can be called concurrently; in +// this case, do `dst.Store(src.Load())` instead. +// +// +stateify savable +type AtomicPtr struct { + ptr unsafe.Pointer `state:".(*Value)"` +} + +func (p *AtomicPtr) savePtr() *Value { + return p.Load() +} + +func (p *AtomicPtr) loadPtr(v *Value) { + p.Store(v) +} + +// Load returns the value set by the most recent Store. It returns nil if there +// has been no previous call to Store. +func (p *AtomicPtr) Load() *Value { + return (*Value)(atomic.LoadPointer(&p.ptr)) +} + +// Store sets the value returned by Load to x. 
+func (p *AtomicPtr) Store(x *Value) { + atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x)) +} diff --git a/pkg/sync/atomicptrtest/BUILD b/pkg/sync/atomicptrtest/BUILD new file mode 100644 index 000000000..418eda29c --- /dev/null +++ b/pkg/sync/atomicptrtest/BUILD @@ -0,0 +1,29 @@ +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "atomicptr_int", + out = "atomicptr_int_unsafe.go", + package = "atomicptr", + suffix = "Int", + template = "//pkg/sync:generic_atomicptr", + types = { + "Value": "int", + }, +) + +go_library( + name = "atomicptr", + srcs = ["atomicptr_int_unsafe.go"], + importpath = "gvisor.dev/gvisor/pkg/sync/atomicptr", +) + +go_test( + name = "atomicptr_test", + size = "small", + srcs = ["atomicptr_test.go"], + embed = [":atomicptr"], +) diff --git a/pkg/sync/atomicptrtest/atomicptr_test.go b/pkg/sync/atomicptrtest/atomicptr_test.go new file mode 100644 index 000000000..8fdc5112e --- /dev/null +++ b/pkg/sync/atomicptrtest/atomicptr_test.go @@ -0,0 +1,31 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package atomicptr + +import ( + "testing" +) + +func newInt(val int) *int { + return &val +} + +func TestAtomicPtr(t *testing.T) { + var p AtomicPtrInt + if got := p.Load(); got != nil { + t.Errorf("initial value is %p (%v), wanted nil", got, got) + } + want := newInt(42) + p.Store(want) + if got := p.Load(); got != want { + t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want) + } + want = newInt(100) + p.Store(want) + if got := p.Load(); got != want { + t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want) + } +} diff --git a/pkg/sync/downgradable_rwmutex_test.go b/pkg/sync/downgradable_rwmutex_test.go new file mode 100644 index 000000000..f04496bc5 --- /dev/null +++ b/pkg/sync/downgradable_rwmutex_test.go @@ -0,0 +1,150 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Copyright 2019 The gVisor Authors. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// GOMAXPROCS=10 go test + +// Copy/pasted from the standard library's sync/rwmutex_test.go, except for the +// addition of downgradingWriter and the renaming of num_iterations to +// numIterations to shut up Golint. + +package sync + +import ( + "fmt" + "runtime" + "sync/atomic" + "testing" +) + +func parallelReader(m *DowngradableRWMutex, clocked, cunlock, cdone chan bool) { + m.RLock() + clocked <- true + <-cunlock + m.RUnlock() + cdone <- true +} + +func doTestParallelReaders(numReaders, gomaxprocs int) { + runtime.GOMAXPROCS(gomaxprocs) + var m DowngradableRWMutex + clocked := make(chan bool) + cunlock := make(chan bool) + cdone := make(chan bool) + for i := 0; i < numReaders; i++ { + go parallelReader(&m, clocked, cunlock, cdone) + } + // Wait for all parallel RLock()s to succeed. + for i := 0; i < numReaders; i++ { + <-clocked + } + for i := 0; i < numReaders; i++ { + cunlock <- true + } + // Wait for the goroutines to finish. 
+ for i := 0; i < numReaders; i++ { + <-cdone + } +} + +func TestParallelReaders(t *testing.T) { + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1)) + doTestParallelReaders(1, 4) + doTestParallelReaders(3, 4) + doTestParallelReaders(4, 2) +} + +func reader(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { + for i := 0; i < numIterations; i++ { + rwm.RLock() + n := atomic.AddInt32(activity, 1) + if n < 1 || n >= 10000 { + panic(fmt.Sprintf("wlock(%d)\n", n)) + } + for i := 0; i < 100; i++ { + } + atomic.AddInt32(activity, -1) + rwm.RUnlock() + } + cdone <- true +} + +func writer(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { + for i := 0; i < numIterations; i++ { + rwm.Lock() + n := atomic.AddInt32(activity, 10000) + if n != 10000 { + panic(fmt.Sprintf("wlock(%d)\n", n)) + } + for i := 0; i < 100; i++ { + } + atomic.AddInt32(activity, -10000) + rwm.Unlock() + } + cdone <- true +} + +func downgradingWriter(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { + for i := 0; i < numIterations; i++ { + rwm.Lock() + n := atomic.AddInt32(activity, 10000) + if n != 10000 { + panic(fmt.Sprintf("wlock(%d)\n", n)) + } + for i := 0; i < 100; i++ { + } + atomic.AddInt32(activity, -10000) + rwm.DowngradeLock() + n = atomic.AddInt32(activity, 1) + if n < 1 || n >= 10000 { + panic(fmt.Sprintf("wlock(%d)\n", n)) + } + for i := 0; i < 100; i++ { + } + n = atomic.AddInt32(activity, -1) + rwm.RUnlock() + } + cdone <- true +} + +func HammerDowngradableRWMutex(gomaxprocs, numReaders, numIterations int) { + runtime.GOMAXPROCS(gomaxprocs) + // Number of active readers + 10000 * number of active writers. 
+ var activity int32 + var rwm DowngradableRWMutex + cdone := make(chan bool) + go writer(&rwm, numIterations, &activity, cdone) + go downgradingWriter(&rwm, numIterations, &activity, cdone) + var i int + for i = 0; i < numReaders/2; i++ { + go reader(&rwm, numIterations, &activity, cdone) + } + go writer(&rwm, numIterations, &activity, cdone) + go downgradingWriter(&rwm, numIterations, &activity, cdone) + for ; i < numReaders; i++ { + go reader(&rwm, numIterations, &activity, cdone) + } + // Wait for the 4 writers and all readers to finish. + for i := 0; i < 4+numReaders; i++ { + <-cdone + } +} + +func TestDowngradableRWMutex(t *testing.T) { + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1)) + n := 1000 + if testing.Short() { + n = 5 + } + HammerDowngradableRWMutex(1, 1, n) + HammerDowngradableRWMutex(1, 3, n) + HammerDowngradableRWMutex(1, 10, n) + HammerDowngradableRWMutex(4, 1, n) + HammerDowngradableRWMutex(4, 3, n) + HammerDowngradableRWMutex(4, 10, n) + HammerDowngradableRWMutex(10, 1, n) + HammerDowngradableRWMutex(10, 3, n) + HammerDowngradableRWMutex(10, 10, n) + HammerDowngradableRWMutex(10, 5, n) +} diff --git a/pkg/sync/downgradable_rwmutex_unsafe.go b/pkg/sync/downgradable_rwmutex_unsafe.go new file mode 100644 index 000000000..9bb55cd3a --- /dev/null +++ b/pkg/sync/downgradable_rwmutex_unsafe.go @@ -0,0 +1,146 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Copyright 2019 The gVisor Authors. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.13 +// +build !go1.15 + +// Check go:linkname function signatures when updating Go version. + +// This is mostly copied from the standard library's sync/rwmutex.go. 
+// +// Happens-before relationships indicated to the race detector: +// - Unlock -> Lock (via writerSem) +// - Unlock -> RLock (via readerSem) +// - RUnlock -> Lock (via writerSem) +// - DowngradeLock -> RLock (via readerSem) + +package sync + +import ( + "sync" + "sync/atomic" + "unsafe" +) + +//go:linkname runtimeSemacquire sync.runtime_Semacquire +func runtimeSemacquire(s *uint32) + +//go:linkname runtimeSemrelease sync.runtime_Semrelease +func runtimeSemrelease(s *uint32, handoff bool, skipframes int) + +// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock +// method. +type DowngradableRWMutex struct { + w sync.Mutex // held if there are pending writers + writerSem uint32 // semaphore for writers to wait for completing readers + readerSem uint32 // semaphore for readers to wait for completing writers + readerCount int32 // number of pending readers + readerWait int32 // number of departing readers +} + +const rwmutexMaxReaders = 1 << 30 + +// RLock locks rw for reading. +func (rw *DowngradableRWMutex) RLock() { + if RaceEnabled { + RaceDisable() + } + if atomic.AddInt32(&rw.readerCount, 1) < 0 { + // A writer is pending, wait for it. + runtimeSemacquire(&rw.readerSem) + } + if RaceEnabled { + RaceEnable() + RaceAcquire(unsafe.Pointer(&rw.readerSem)) + } +} + +// RUnlock undoes a single RLock call. +func (rw *DowngradableRWMutex) RUnlock() { + if RaceEnabled { + RaceReleaseMerge(unsafe.Pointer(&rw.writerSem)) + RaceDisable() + } + if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 { + if r+1 == 0 || r+1 == -rwmutexMaxReaders { + panic("RUnlock of unlocked DowngradableRWMutex") + } + // A writer is pending. + if atomic.AddInt32(&rw.readerWait, -1) == 0 { + // The last reader unblocks the writer. + runtimeSemrelease(&rw.writerSem, false, 0) + } + } + if RaceEnabled { + RaceEnable() + } +} + +// Lock locks rw for writing. 
+func (rw *DowngradableRWMutex) Lock() { + if RaceEnabled { + RaceDisable() + } + // First, resolve competition with other writers. + rw.w.Lock() + // Announce to readers there is a pending writer. + r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders + // Wait for active readers. + if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 { + runtimeSemacquire(&rw.writerSem) + } + if RaceEnabled { + RaceEnable() + RaceAcquire(unsafe.Pointer(&rw.writerSem)) + } +} + +// Unlock unlocks rw for writing. +func (rw *DowngradableRWMutex) Unlock() { + if RaceEnabled { + RaceRelease(unsafe.Pointer(&rw.writerSem)) + RaceRelease(unsafe.Pointer(&rw.readerSem)) + RaceDisable() + } + // Announce to readers there is no active writer. + r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders) + if r >= rwmutexMaxReaders { + panic("Unlock of unlocked DowngradableRWMutex") + } + // Unblock blocked readers, if any. + for i := 0; i < int(r); i++ { + runtimeSemrelease(&rw.readerSem, false, 0) + } + // Allow other writers to proceed. + rw.w.Unlock() + if RaceEnabled { + RaceEnable() + } +} + +// DowngradeLock atomically unlocks rw for writing and locks it for reading. +func (rw *DowngradableRWMutex) DowngradeLock() { + if RaceEnabled { + RaceRelease(unsafe.Pointer(&rw.readerSem)) + RaceDisable() + } + // Announce to readers there is no active writer and one additional reader. + r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1) + if r >= rwmutexMaxReaders+1 { + panic("DowngradeLock of unlocked DowngradableRWMutex") + } + // Unblock blocked readers, if any. Note that this loop starts as 1 since r + // includes this goroutine. + for i := 1; i < int(r); i++ { + runtimeSemrelease(&rw.readerSem, false, 0) + } + // Allow other writers to proceed to rw.w.Lock(). Note that they will still + // block on rw.writerSem since at least this reader exists, such that + // DowngradeLock() is atomic with the previous write lock. 
+ rw.w.Unlock() + if RaceEnabled { + RaceEnable() + } +} diff --git a/pkg/sync/memmove_unsafe.go b/pkg/sync/memmove_unsafe.go new file mode 100644 index 000000000..ad4a3a37e --- /dev/null +++ b/pkg/sync/memmove_unsafe.go @@ -0,0 +1,28 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.12 +// +build !go1.15 + +// Check go:linkname function signatures when updating Go version. + +package sync + +import ( + "unsafe" +) + +//go:linkname memmove runtime.memmove +//go:noescape +func memmove(to, from unsafe.Pointer, n uintptr) + +// Memmove is exported for SeqAtomicLoad/SeqAtomicTryLoad, which can't +// define it because go_generics can't update the go:linkname annotation. +// Furthermore, go:linkname silently doesn't work if the local name is exported +// (this is of course undocumented), which is why this indirection is +// necessary. +func Memmove(to, from unsafe.Pointer, n uintptr) { + memmove(to, from, n) +} diff --git a/pkg/sync/norace_unsafe.go b/pkg/sync/norace_unsafe.go new file mode 100644 index 000000000..006055dd6 --- /dev/null +++ b/pkg/sync/norace_unsafe.go @@ -0,0 +1,35 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !race + +package sync + +import ( + "unsafe" +) + +// RaceEnabled is true if the Go data race detector is enabled. +const RaceEnabled = false + +// RaceDisable has the same semantics as runtime.RaceDisable. +func RaceDisable() { +} + +// RaceEnable has the same semantics as runtime.RaceEnable. +func RaceEnable() { +} + +// RaceAcquire has the same semantics as runtime.RaceAcquire. +func RaceAcquire(addr unsafe.Pointer) { +} + +// RaceRelease has the same semantics as runtime.RaceRelease. +func RaceRelease(addr unsafe.Pointer) { +} + +// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. 
+func RaceReleaseMerge(addr unsafe.Pointer) { +} diff --git a/pkg/sync/race_unsafe.go b/pkg/sync/race_unsafe.go new file mode 100644 index 000000000..31d8fa9a6 --- /dev/null +++ b/pkg/sync/race_unsafe.go @@ -0,0 +1,41 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build race + +package sync + +import ( + "runtime" + "unsafe" +) + +// RaceEnabled is true if the Go data race detector is enabled. +const RaceEnabled = true + +// RaceDisable has the same semantics as runtime.RaceDisable. +func RaceDisable() { + runtime.RaceDisable() +} + +// RaceEnable has the same semantics as runtime.RaceEnable. +func RaceEnable() { + runtime.RaceEnable() +} + +// RaceAcquire has the same semantics as runtime.RaceAcquire. +func RaceAcquire(addr unsafe.Pointer) { + runtime.RaceAcquire(addr) +} + +// RaceRelease has the same semantics as runtime.RaceRelease. +func RaceRelease(addr unsafe.Pointer) { + runtime.RaceRelease(addr) +} + +// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. +func RaceReleaseMerge(addr unsafe.Pointer) { + runtime.RaceReleaseMerge(addr) +} diff --git a/pkg/sync/seqatomic_unsafe.go b/pkg/sync/seqatomic_unsafe.go new file mode 100644 index 000000000..eda6fb131 --- /dev/null +++ b/pkg/sync/seqatomic_unsafe.go @@ -0,0 +1,72 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package template doesn't exist. This file must be instantiated using the +// go_template_instance rule in tools/go_generics/defs.bzl. +package template + +import ( + "fmt" + "reflect" + "strings" + "unsafe" + + "gvisor.dev/gvisor/pkg/sync" +) + +// Value is a required type parameter. 
+// +// Value must not contain any pointers, including interface objects, function +// objects, slices, maps, channels, unsafe.Pointer, and arrays or structs +// containing any of the above. An init() function will panic if this property +// does not hold. +type Value struct{} + +// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race +// with any writer critical sections in sc. +func SeqAtomicLoad(sc *sync.SeqCount, ptr *Value) Value { + // This function doesn't use SeqAtomicTryLoad because doing so is + // measurably, significantly (~20%) slower; Go is awful at inlining. + var val Value + for { + epoch := sc.BeginRead() + if sync.RaceEnabled { + // runtime.RaceDisable() doesn't actually stop the race detector, + // so it can't help us here. Instead, call runtime.memmove + // directly, which is not instrumented by the race detector. + sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) + } else { + // This is ~40% faster for short reads than going through memmove. + val = *ptr + } + if sc.ReadOk(epoch) { + break + } + } + return val +} + +// SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section +// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read +// would race with a writer critical section, SeqAtomicTryLoad returns +// (unspecified, false). 
+func SeqAtomicTryLoad(sc *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Value) (Value, bool) { + var val Value + if sync.RaceEnabled { + sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) + } else { + val = *ptr + } + return val, sc.ReadOk(epoch) +} + +func init() { + var val Value + typ := reflect.TypeOf(val) + name := typ.Name() + if ptrs := sync.PointersInType(typ, name); len(ptrs) != 0 { + panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n"))) + } +} diff --git a/pkg/sync/seqatomictest/BUILD b/pkg/sync/seqatomictest/BUILD new file mode 100644 index 000000000..eba21518d --- /dev/null +++ b/pkg/sync/seqatomictest/BUILD @@ -0,0 +1,33 @@ +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "seqatomic_int", + out = "seqatomic_int_unsafe.go", + package = "seqatomic", + suffix = "Int", + template = "//pkg/sync:generic_seqatomic", + types = { + "Value": "int", + }, +) + +go_library( + name = "seqatomic", + srcs = ["seqatomic_int_unsafe.go"], + importpath = "gvisor.dev/gvisor/pkg/sync/seqatomic", + deps = [ + "//pkg/sync", + ], +) + +go_test( + name = "seqatomic_test", + size = "small", + srcs = ["seqatomic_test.go"], + embed = [":seqatomic"], + deps = ["//pkg/sync"], +) diff --git a/pkg/sync/seqatomictest/seqatomic_test.go b/pkg/sync/seqatomictest/seqatomic_test.go new file mode 100644 index 000000000..2c4568b07 --- /dev/null +++ b/pkg/sync/seqatomictest/seqatomic_test.go @@ -0,0 +1,132 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seqatomic + +import ( + "sync/atomic" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/sync" +) + +func TestSeqAtomicLoadUncontended(t *testing.T) { + var seq sync.SeqCount + const want = 1 + data := want + if got := SeqAtomicLoadInt(&seq, &data); got != want { + t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) + } +} + +func TestSeqAtomicLoadAfterWrite(t *testing.T) { + var seq sync.SeqCount + var data int + const want = 1 + seq.BeginWrite() + data = want + seq.EndWrite() + if got := SeqAtomicLoadInt(&seq, &data); got != want { + t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) + } +} + +func TestSeqAtomicLoadDuringWrite(t *testing.T) { + var seq sync.SeqCount + var data int + const want = 1 + seq.BeginWrite() + go func() { + time.Sleep(time.Second) + data = want + seq.EndWrite() + }() + if got := SeqAtomicLoadInt(&seq, &data); got != want { + t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) + } +} + +func TestSeqAtomicTryLoadUncontended(t *testing.T) { + var seq sync.SeqCount + const want = 1 + data := want + epoch := seq.BeginRead() + if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want { + t.Errorf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want) + } +} + +func TestSeqAtomicTryLoadDuringWrite(t *testing.T) { + var seq sync.SeqCount + var data int + epoch := seq.BeginRead() + seq.BeginWrite() + if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok { + t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got) + } + seq.EndWrite() +} + +func 
TestSeqAtomicTryLoadAfterWrite(t *testing.T) { + var seq sync.SeqCount + var data int + epoch := seq.BeginRead() + seq.BeginWrite() + seq.EndWrite() + if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok { + t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got) + } +} + +func BenchmarkSeqAtomicLoadIntUncontended(b *testing.B) { + var seq sync.SeqCount + const want = 42 + data := want + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + if got := SeqAtomicLoadInt(&seq, &data); got != want { + b.Fatalf("SeqAtomicLoadInt: got %v, wanted %v", got, want) + } + } + }) +} + +func BenchmarkSeqAtomicTryLoadIntUncontended(b *testing.B) { + var seq sync.SeqCount + const want = 42 + data := want + b.RunParallel(func(pb *testing.PB) { + epoch := seq.BeginRead() + for pb.Next() { + if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want { + b.Fatalf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want) + } + } + }) +} + +// For comparison: +func BenchmarkAtomicValueLoadIntUncontended(b *testing.B) { + var a atomic.Value + const want = 42 + a.Store(int(want)) + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + if got := a.Load().(int); got != want { + b.Fatalf("atomic.Value.Load: got %v, wanted %v", got, want) + } + } + }) +} diff --git a/pkg/sync/seqcount.go b/pkg/sync/seqcount.go new file mode 100644 index 000000000..a1e895352 --- /dev/null +++ b/pkg/sync/seqcount.go @@ -0,0 +1,149 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sync + +import ( + "fmt" + "reflect" + "runtime" + "sync/atomic" +) + +// SeqCount is a synchronization primitive for optimistic reader/writer +// synchronization in cases where readers can work with stale data and +// therefore do not need to block writers. 
+// +// Compared to sync/atomic.Value: +// +// - Mutation of SeqCount-protected data does not require memory allocation, +// whereas atomic.Value generally does. This is a significant advantage when +// writes are common. +// +// - Atomic reads of SeqCount-protected data require copying. This is a +// disadvantage when atomic reads are common. +// +// - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other +// operations to be made atomic with reads of SeqCount-protected data. +// +// - SeqCount may be less flexible: as of this writing, SeqCount-protected data +// cannot include pointers. +// +// - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected +// data require instantiating function templates using go_generics (see +// seqatomic.go). +type SeqCount struct { + // epoch is incremented by BeginWrite and EndWrite, such that epoch is odd + // if a writer critical section is active, and a read from data protected + // by this SeqCount is atomic iff epoch is the same even value before and + // after the read. + epoch uint32 +} + +// SeqCountEpoch tracks writer critical sections in a SeqCount. +type SeqCountEpoch struct { + val uint32 +} + +// We assume that: +// +// - All functions in sync/atomic that perform a memory read are at least a +// read fence: memory reads before calls to such functions cannot be reordered +// after the call, and memory reads after calls to such functions cannot be +// reordered before the call, even if those reads do not use sync/atomic. +// +// - All functions in sync/atomic that perform a memory write are at least a +// write fence: memory writes before calls to such functions cannot be +// reordered after the call, and memory writes after calls to such functions +// cannot be reordered before the call, even if those writes do not use +// sync/atomic. 
+// +// As of this writing, the Go memory model completely fails to describe +// sync/atomic, but these properties are implied by +// https://groups.google.com/forum/#!topic/golang-nuts/7EnEhM3U7B8. + +// BeginRead indicates the beginning of a reader critical section. Reader +// critical sections DO NOT BLOCK writer critical sections, so operations in a +// reader critical section MAY RACE with writer critical sections. Races are +// detected by ReadOk at the end of the reader critical section. Thus, the +// low-level structure of readers is generally: +// +// for { +// epoch := seq.BeginRead() +// // do something idempotent with seq-protected data +// if seq.ReadOk(epoch) { +// break +// } +// } +// +// However, since reader critical sections may race with writer critical +// sections, the Go race detector will (accurately) flag data races in readers +// using this pattern. Most users of SeqCount will need to use the +// SeqAtomicLoad function template in seqatomic.go. +func (s *SeqCount) BeginRead() SeqCountEpoch { + epoch := atomic.LoadUint32(&s.epoch) + for epoch&1 != 0 { + runtime.Gosched() + epoch = atomic.LoadUint32(&s.epoch) + } + return SeqCountEpoch{epoch} +} + +// ReadOk returns true if the reader critical section initiated by a previous +// call to BeginRead() that returned epoch did not race with any writer critical +// sections. +// +// ReadOk may be called any number of times during a reader critical section. +// Reader critical sections do not need to be explicitly terminated; the last +// call to ReadOk is implicitly the end of the reader critical section. +func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool { + return atomic.LoadUint32(&s.epoch) == epoch.val +} + +// BeginWrite indicates the beginning of a writer critical section. +// +// SeqCount does not support concurrent writer critical sections; clients with +// concurrent writers must synchronize them using e.g. sync.Mutex. 
+func (s *SeqCount) BeginWrite() { + if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 == 0 { + panic("SeqCount.BeginWrite during writer critical section") + } +} + +// EndWrite ends the effect of a preceding BeginWrite. +func (s *SeqCount) EndWrite() { + if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 != 0 { + panic("SeqCount.EndWrite outside writer critical section") + } +} + +// PointersInType returns a list of pointers reachable from values named +// valName of the given type. +// +// PointersInType is not exhaustive, but it is guaranteed that if typ contains +// at least one pointer, then PointersInTypeOf returns a non-empty list. +func PointersInType(typ reflect.Type, valName string) []string { + switch kind := typ.Kind(); kind { + case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128: + return nil + + case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice, reflect.String, reflect.UnsafePointer: + return []string{valName} + + case reflect.Array: + return PointersInType(typ.Elem(), valName+"[]") + + case reflect.Struct: + var ptrs []string + for i, n := 0, typ.NumField(); i < n; i++ { + field := typ.Field(i) + ptrs = append(ptrs, PointersInType(field.Type, fmt.Sprintf("%s.%s", valName, field.Name))...) + } + return ptrs + + default: + return []string{fmt.Sprintf("%s (of type %s with unknown kind %s)", valName, typ, kind)} + } +} diff --git a/pkg/sync/seqcount_test.go b/pkg/sync/seqcount_test.go new file mode 100644 index 000000000..6eb7b4b59 --- /dev/null +++ b/pkg/sync/seqcount_test.go @@ -0,0 +1,153 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package sync + +import ( + "reflect" + "testing" + "time" +) + +func TestSeqCountWriteUncontended(t *testing.T) { + var seq SeqCount + seq.BeginWrite() + seq.EndWrite() +} + +func TestSeqCountReadUncontended(t *testing.T) { + var seq SeqCount + epoch := seq.BeginRead() + if !seq.ReadOk(epoch) { + t.Errorf("ReadOk: got false, wanted true") + } +} + +func TestSeqCountBeginReadAfterWrite(t *testing.T) { + var seq SeqCount + var data int32 + const want = 1 + seq.BeginWrite() + data = want + seq.EndWrite() + epoch := seq.BeginRead() + if data != want { + t.Errorf("Reader: got %v, wanted %v", data, want) + } + if !seq.ReadOk(epoch) { + t.Errorf("ReadOk: got false, wanted true") + } +} + +func TestSeqCountBeginReadDuringWrite(t *testing.T) { + var seq SeqCount + var data int + const want = 1 + seq.BeginWrite() + go func() { + time.Sleep(time.Second) + data = want + seq.EndWrite() + }() + epoch := seq.BeginRead() + if data != want { + t.Errorf("Reader: got %v, wanted %v", data, want) + } + if !seq.ReadOk(epoch) { + t.Errorf("ReadOk: got false, wanted true") + } +} + +func TestSeqCountReadOkAfterWrite(t *testing.T) { + var seq SeqCount + epoch := seq.BeginRead() + seq.BeginWrite() + seq.EndWrite() + if seq.ReadOk(epoch) { + t.Errorf("ReadOk: got true, wanted false") + } +} + +func TestSeqCountReadOkDuringWrite(t *testing.T) { + var seq SeqCount + epoch := seq.BeginRead() + seq.BeginWrite() + if seq.ReadOk(epoch) { + t.Errorf("ReadOk: got true, wanted false") + } + seq.EndWrite() +} + +func BenchmarkSeqCountWriteUncontended(b *testing.B) { + var seq SeqCount + for i := 0; i < b.N; i++ { + seq.BeginWrite() + seq.EndWrite() + } +} + +func BenchmarkSeqCountReadUncontended(b *testing.B) { + var seq SeqCount + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + epoch := seq.BeginRead() + if !seq.ReadOk(epoch) { + b.Fatalf("ReadOk: got false, wanted true") + } + } + }) +} + +func TestPointersInType(t *testing.T) { + for _, test := range []struct { + name string // used for 
both test and value name + val interface{} + ptrs []string + }{ + { + name: "EmptyStruct", + val: struct{}{}, + }, + { + name: "Int", + val: int(0), + }, + { + name: "MixedStruct", + val: struct { + b bool + I int + ExportedPtr *struct{} + unexportedPtr *struct{} + arr [2]int + ptrArr [2]*int + nestedStruct struct { + nestedNonptr int + nestedPtr *int + } + structArr [1]struct { + nonptr int + ptr *int + } + }{}, + ptrs: []string{ + "MixedStruct.ExportedPtr", + "MixedStruct.unexportedPtr", + "MixedStruct.ptrArr[]", + "MixedStruct.nestedStruct.nestedPtr", + "MixedStruct.structArr[].ptr", + }, + }, + } { + t.Run(test.name, func(t *testing.T) { + typ := reflect.TypeOf(test.val) + ptrs := PointersInType(typ, test.name) + t.Logf("Found pointers: %v", ptrs) + if (len(ptrs) != 0 || len(test.ptrs) != 0) && !reflect.DeepEqual(ptrs, test.ptrs) { + t.Errorf("Got %v, wanted %v", ptrs, test.ptrs) + } + }) + } +} diff --git a/pkg/sync/syncutil.go b/pkg/sync/syncutil.go new file mode 100644 index 000000000..b16cf5333 --- /dev/null +++ b/pkg/sync/syncutil.go @@ -0,0 +1,7 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package sync provides synchronization primitives. 
+package sync diff --git a/pkg/syncutil/BUILD b/pkg/syncutil/BUILD deleted file mode 100644 index cb1f41628..000000000 --- a/pkg/syncutil/BUILD +++ /dev/null @@ -1,52 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_generics:defs.bzl", "go_template") - -package( - default_visibility = ["//:sandbox"], - licenses = ["notice"], -) - -exports_files(["LICENSE"]) - -go_template( - name = "generic_atomicptr", - srcs = ["atomicptr_unsafe.go"], - types = [ - "Value", - ], -) - -go_template( - name = "generic_seqatomic", - srcs = ["seqatomic_unsafe.go"], - types = [ - "Value", - ], - deps = [ - ":sync", - ], -) - -go_library( - name = "syncutil", - srcs = [ - "downgradable_rwmutex_unsafe.go", - "memmove_unsafe.go", - "norace_unsafe.go", - "race_unsafe.go", - "seqcount.go", - "syncutil.go", - ], - importpath = "gvisor.dev/gvisor/pkg/syncutil", -) - -go_test( - name = "syncutil_test", - size = "small", - srcs = [ - "downgradable_rwmutex_test.go", - "seqcount_test.go", - ], - embed = [":syncutil"], -) diff --git a/pkg/syncutil/LICENSE b/pkg/syncutil/LICENSE deleted file mode 100644 index 6a66aea5e..000000000 --- a/pkg/syncutil/LICENSE +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pkg/syncutil/README.md b/pkg/syncutil/README.md deleted file mode 100644 index 2183c4e20..000000000 --- a/pkg/syncutil/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Syncutil - -This package provides additional synchronization primitives not provided by the -Go stdlib 'sync' package. It is partially derived from the upstream 'sync' -package from go1.10. diff --git a/pkg/syncutil/atomicptr_unsafe.go b/pkg/syncutil/atomicptr_unsafe.go deleted file mode 100644 index 525c4beed..000000000 --- a/pkg/syncutil/atomicptr_unsafe.go +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package template doesn't exist. This file must be instantiated using the -// go_template_instance rule in tools/go_generics/defs.bzl. -package template - -import ( - "sync/atomic" - "unsafe" -) - -// Value is a required type parameter. -type Value struct{} - -// An AtomicPtr is a pointer to a value of type Value that can be atomically -// loaded and stored. The zero value of an AtomicPtr represents nil. 
-// -// Note that copying AtomicPtr by value performs a non-atomic read of the -// stored pointer, which is unsafe if Store() can be called concurrently; in -// this case, do `dst.Store(src.Load())` instead. -// -// +stateify savable -type AtomicPtr struct { - ptr unsafe.Pointer `state:".(*Value)"` -} - -func (p *AtomicPtr) savePtr() *Value { - return p.Load() -} - -func (p *AtomicPtr) loadPtr(v *Value) { - p.Store(v) -} - -// Load returns the value set by the most recent Store. It returns nil if there -// has been no previous call to Store. -func (p *AtomicPtr) Load() *Value { - return (*Value)(atomic.LoadPointer(&p.ptr)) -} - -// Store sets the value returned by Load to x. -func (p *AtomicPtr) Store(x *Value) { - atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x)) -} diff --git a/pkg/syncutil/atomicptrtest/BUILD b/pkg/syncutil/atomicptrtest/BUILD deleted file mode 100644 index 63f411a90..000000000 --- a/pkg/syncutil/atomicptrtest/BUILD +++ /dev/null @@ -1,29 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_generics:defs.bzl", "go_template_instance") - -package(licenses = ["notice"]) - -go_template_instance( - name = "atomicptr_int", - out = "atomicptr_int_unsafe.go", - package = "atomicptr", - suffix = "Int", - template = "//pkg/syncutil:generic_atomicptr", - types = { - "Value": "int", - }, -) - -go_library( - name = "atomicptr", - srcs = ["atomicptr_int_unsafe.go"], - importpath = "gvisor.dev/gvisor/pkg/syncutil/atomicptr", -) - -go_test( - name = "atomicptr_test", - size = "small", - srcs = ["atomicptr_test.go"], - embed = [":atomicptr"], -) diff --git a/pkg/syncutil/atomicptrtest/atomicptr_test.go b/pkg/syncutil/atomicptrtest/atomicptr_test.go deleted file mode 100644 index 8fdc5112e..000000000 --- a/pkg/syncutil/atomicptrtest/atomicptr_test.go +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019 The gVisor Authors. 
-// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package atomicptr - -import ( - "testing" -) - -func newInt(val int) *int { - return &val -} - -func TestAtomicPtr(t *testing.T) { - var p AtomicPtrInt - if got := p.Load(); got != nil { - t.Errorf("initial value is %p (%v), wanted nil", got, got) - } - want := newInt(42) - p.Store(want) - if got := p.Load(); got != want { - t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want) - } - want = newInt(100) - p.Store(want) - if got := p.Load(); got != want { - t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want) - } -} diff --git a/pkg/syncutil/downgradable_rwmutex_test.go b/pkg/syncutil/downgradable_rwmutex_test.go deleted file mode 100644 index ffaf7ecc7..000000000 --- a/pkg/syncutil/downgradable_rwmutex_test.go +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Copyright 2019 The gVisor Authors. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// GOMAXPROCS=10 go test - -// Copy/pasted from the standard library's sync/rwmutex_test.go, except for the -// addition of downgradingWriter and the renaming of num_iterations to -// numIterations to shut up Golint. - -package syncutil - -import ( - "fmt" - "runtime" - "sync/atomic" - "testing" -) - -func parallelReader(m *DowngradableRWMutex, clocked, cunlock, cdone chan bool) { - m.RLock() - clocked <- true - <-cunlock - m.RUnlock() - cdone <- true -} - -func doTestParallelReaders(numReaders, gomaxprocs int) { - runtime.GOMAXPROCS(gomaxprocs) - var m DowngradableRWMutex - clocked := make(chan bool) - cunlock := make(chan bool) - cdone := make(chan bool) - for i := 0; i < numReaders; i++ { - go parallelReader(&m, clocked, cunlock, cdone) - } - // Wait for all parallel RLock()s to succeed. 
- for i := 0; i < numReaders; i++ { - <-clocked - } - for i := 0; i < numReaders; i++ { - cunlock <- true - } - // Wait for the goroutines to finish. - for i := 0; i < numReaders; i++ { - <-cdone - } -} - -func TestParallelReaders(t *testing.T) { - defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1)) - doTestParallelReaders(1, 4) - doTestParallelReaders(3, 4) - doTestParallelReaders(4, 2) -} - -func reader(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { - for i := 0; i < numIterations; i++ { - rwm.RLock() - n := atomic.AddInt32(activity, 1) - if n < 1 || n >= 10000 { - panic(fmt.Sprintf("wlock(%d)\n", n)) - } - for i := 0; i < 100; i++ { - } - atomic.AddInt32(activity, -1) - rwm.RUnlock() - } - cdone <- true -} - -func writer(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { - for i := 0; i < numIterations; i++ { - rwm.Lock() - n := atomic.AddInt32(activity, 10000) - if n != 10000 { - panic(fmt.Sprintf("wlock(%d)\n", n)) - } - for i := 0; i < 100; i++ { - } - atomic.AddInt32(activity, -10000) - rwm.Unlock() - } - cdone <- true -} - -func downgradingWriter(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { - for i := 0; i < numIterations; i++ { - rwm.Lock() - n := atomic.AddInt32(activity, 10000) - if n != 10000 { - panic(fmt.Sprintf("wlock(%d)\n", n)) - } - for i := 0; i < 100; i++ { - } - atomic.AddInt32(activity, -10000) - rwm.DowngradeLock() - n = atomic.AddInt32(activity, 1) - if n < 1 || n >= 10000 { - panic(fmt.Sprintf("wlock(%d)\n", n)) - } - for i := 0; i < 100; i++ { - } - n = atomic.AddInt32(activity, -1) - rwm.RUnlock() - } - cdone <- true -} - -func HammerDowngradableRWMutex(gomaxprocs, numReaders, numIterations int) { - runtime.GOMAXPROCS(gomaxprocs) - // Number of active readers + 10000 * number of active writers. 
- var activity int32 - var rwm DowngradableRWMutex - cdone := make(chan bool) - go writer(&rwm, numIterations, &activity, cdone) - go downgradingWriter(&rwm, numIterations, &activity, cdone) - var i int - for i = 0; i < numReaders/2; i++ { - go reader(&rwm, numIterations, &activity, cdone) - } - go writer(&rwm, numIterations, &activity, cdone) - go downgradingWriter(&rwm, numIterations, &activity, cdone) - for ; i < numReaders; i++ { - go reader(&rwm, numIterations, &activity, cdone) - } - // Wait for the 4 writers and all readers to finish. - for i := 0; i < 4+numReaders; i++ { - <-cdone - } -} - -func TestDowngradableRWMutex(t *testing.T) { - defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1)) - n := 1000 - if testing.Short() { - n = 5 - } - HammerDowngradableRWMutex(1, 1, n) - HammerDowngradableRWMutex(1, 3, n) - HammerDowngradableRWMutex(1, 10, n) - HammerDowngradableRWMutex(4, 1, n) - HammerDowngradableRWMutex(4, 3, n) - HammerDowngradableRWMutex(4, 10, n) - HammerDowngradableRWMutex(10, 1, n) - HammerDowngradableRWMutex(10, 3, n) - HammerDowngradableRWMutex(10, 10, n) - HammerDowngradableRWMutex(10, 5, n) -} diff --git a/pkg/syncutil/downgradable_rwmutex_unsafe.go b/pkg/syncutil/downgradable_rwmutex_unsafe.go deleted file mode 100644 index 51e11555d..000000000 --- a/pkg/syncutil/downgradable_rwmutex_unsafe.go +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Copyright 2019 The gVisor Authors. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build go1.13 -// +build !go1.15 - -// Check go:linkname function signatures when updating Go version. - -// This is mostly copied from the standard library's sync/rwmutex.go. 
-// -// Happens-before relationships indicated to the race detector: -// - Unlock -> Lock (via writerSem) -// - Unlock -> RLock (via readerSem) -// - RUnlock -> Lock (via writerSem) -// - DowngradeLock -> RLock (via readerSem) - -package syncutil - -import ( - "sync" - "sync/atomic" - "unsafe" -) - -//go:linkname runtimeSemacquire sync.runtime_Semacquire -func runtimeSemacquire(s *uint32) - -//go:linkname runtimeSemrelease sync.runtime_Semrelease -func runtimeSemrelease(s *uint32, handoff bool, skipframes int) - -// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock -// method. -type DowngradableRWMutex struct { - w sync.Mutex // held if there are pending writers - writerSem uint32 // semaphore for writers to wait for completing readers - readerSem uint32 // semaphore for readers to wait for completing writers - readerCount int32 // number of pending readers - readerWait int32 // number of departing readers -} - -const rwmutexMaxReaders = 1 << 30 - -// RLock locks rw for reading. -func (rw *DowngradableRWMutex) RLock() { - if RaceEnabled { - RaceDisable() - } - if atomic.AddInt32(&rw.readerCount, 1) < 0 { - // A writer is pending, wait for it. - runtimeSemacquire(&rw.readerSem) - } - if RaceEnabled { - RaceEnable() - RaceAcquire(unsafe.Pointer(&rw.readerSem)) - } -} - -// RUnlock undoes a single RLock call. -func (rw *DowngradableRWMutex) RUnlock() { - if RaceEnabled { - RaceReleaseMerge(unsafe.Pointer(&rw.writerSem)) - RaceDisable() - } - if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 { - if r+1 == 0 || r+1 == -rwmutexMaxReaders { - panic("RUnlock of unlocked DowngradableRWMutex") - } - // A writer is pending. - if atomic.AddInt32(&rw.readerWait, -1) == 0 { - // The last reader unblocks the writer. - runtimeSemrelease(&rw.writerSem, false, 0) - } - } - if RaceEnabled { - RaceEnable() - } -} - -// Lock locks rw for writing. 
-func (rw *DowngradableRWMutex) Lock() { - if RaceEnabled { - RaceDisable() - } - // First, resolve competition with other writers. - rw.w.Lock() - // Announce to readers there is a pending writer. - r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders - // Wait for active readers. - if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 { - runtimeSemacquire(&rw.writerSem) - } - if RaceEnabled { - RaceEnable() - RaceAcquire(unsafe.Pointer(&rw.writerSem)) - } -} - -// Unlock unlocks rw for writing. -func (rw *DowngradableRWMutex) Unlock() { - if RaceEnabled { - RaceRelease(unsafe.Pointer(&rw.writerSem)) - RaceRelease(unsafe.Pointer(&rw.readerSem)) - RaceDisable() - } - // Announce to readers there is no active writer. - r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders) - if r >= rwmutexMaxReaders { - panic("Unlock of unlocked DowngradableRWMutex") - } - // Unblock blocked readers, if any. - for i := 0; i < int(r); i++ { - runtimeSemrelease(&rw.readerSem, false, 0) - } - // Allow other writers to proceed. - rw.w.Unlock() - if RaceEnabled { - RaceEnable() - } -} - -// DowngradeLock atomically unlocks rw for writing and locks it for reading. -func (rw *DowngradableRWMutex) DowngradeLock() { - if RaceEnabled { - RaceRelease(unsafe.Pointer(&rw.readerSem)) - RaceDisable() - } - // Announce to readers there is no active writer and one additional reader. - r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1) - if r >= rwmutexMaxReaders+1 { - panic("DowngradeLock of unlocked DowngradableRWMutex") - } - // Unblock blocked readers, if any. Note that this loop starts as 1 since r - // includes this goroutine. - for i := 1; i < int(r); i++ { - runtimeSemrelease(&rw.readerSem, false, 0) - } - // Allow other writers to proceed to rw.w.Lock(). Note that they will still - // block on rw.writerSem since at least this reader exists, such that - // DowngradeLock() is atomic with the previous write lock. 
- rw.w.Unlock() - if RaceEnabled { - RaceEnable() - } -} diff --git a/pkg/syncutil/memmove_unsafe.go b/pkg/syncutil/memmove_unsafe.go deleted file mode 100644 index 348675baa..000000000 --- a/pkg/syncutil/memmove_unsafe.go +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build go1.12 -// +build !go1.15 - -// Check go:linkname function signatures when updating Go version. - -package syncutil - -import ( - "unsafe" -) - -//go:linkname memmove runtime.memmove -//go:noescape -func memmove(to, from unsafe.Pointer, n uintptr) - -// Memmove is exported for SeqAtomicLoad/SeqAtomicTryLoad, which can't -// define it because go_generics can't update the go:linkname annotation. -// Furthermore, go:linkname silently doesn't work if the local name is exported -// (this is of course undocumented), which is why this indirection is -// necessary. -func Memmove(to, from unsafe.Pointer, n uintptr) { - memmove(to, from, n) -} diff --git a/pkg/syncutil/norace_unsafe.go b/pkg/syncutil/norace_unsafe.go deleted file mode 100644 index 0a0a9deda..000000000 --- a/pkg/syncutil/norace_unsafe.go +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !race - -package syncutil - -import ( - "unsafe" -) - -// RaceEnabled is true if the Go data race detector is enabled. -const RaceEnabled = false - -// RaceDisable has the same semantics as runtime.RaceDisable. -func RaceDisable() { -} - -// RaceEnable has the same semantics as runtime.RaceEnable. -func RaceEnable() { -} - -// RaceAcquire has the same semantics as runtime.RaceAcquire. -func RaceAcquire(addr unsafe.Pointer) { -} - -// RaceRelease has the same semantics as runtime.RaceRelease. 
-func RaceRelease(addr unsafe.Pointer) { -} - -// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. -func RaceReleaseMerge(addr unsafe.Pointer) { -} diff --git a/pkg/syncutil/race_unsafe.go b/pkg/syncutil/race_unsafe.go deleted file mode 100644 index 206067ec1..000000000 --- a/pkg/syncutil/race_unsafe.go +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build race - -package syncutil - -import ( - "runtime" - "unsafe" -) - -// RaceEnabled is true if the Go data race detector is enabled. -const RaceEnabled = true - -// RaceDisable has the same semantics as runtime.RaceDisable. -func RaceDisable() { - runtime.RaceDisable() -} - -// RaceEnable has the same semantics as runtime.RaceEnable. -func RaceEnable() { - runtime.RaceEnable() -} - -// RaceAcquire has the same semantics as runtime.RaceAcquire. -func RaceAcquire(addr unsafe.Pointer) { - runtime.RaceAcquire(addr) -} - -// RaceRelease has the same semantics as runtime.RaceRelease. -func RaceRelease(addr unsafe.Pointer) { - runtime.RaceRelease(addr) -} - -// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. -func RaceReleaseMerge(addr unsafe.Pointer) { - runtime.RaceReleaseMerge(addr) -} diff --git a/pkg/syncutil/seqatomic_unsafe.go b/pkg/syncutil/seqatomic_unsafe.go deleted file mode 100644 index cb6d2eb22..000000000 --- a/pkg/syncutil/seqatomic_unsafe.go +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package template doesn't exist. This file must be instantiated using the -// go_template_instance rule in tools/go_generics/defs.bzl. -package template - -import ( - "fmt" - "reflect" - "strings" - "unsafe" - - "gvisor.dev/gvisor/pkg/syncutil" -) - -// Value is a required type parameter. 
-// -// Value must not contain any pointers, including interface objects, function -// objects, slices, maps, channels, unsafe.Pointer, and arrays or structs -// containing any of the above. An init() function will panic if this property -// does not hold. -type Value struct{} - -// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race -// with any writer critical sections in sc. -func SeqAtomicLoad(sc *syncutil.SeqCount, ptr *Value) Value { - // This function doesn't use SeqAtomicTryLoad because doing so is - // measurably, significantly (~20%) slower; Go is awful at inlining. - var val Value - for { - epoch := sc.BeginRead() - if syncutil.RaceEnabled { - // runtime.RaceDisable() doesn't actually stop the race detector, - // so it can't help us here. Instead, call runtime.memmove - // directly, which is not instrumented by the race detector. - syncutil.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) - } else { - // This is ~40% faster for short reads than going through memmove. - val = *ptr - } - if sc.ReadOk(epoch) { - break - } - } - return val -} - -// SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section -// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read -// would race with a writer critical section, SeqAtomicTryLoad returns -// (unspecified, false). 
-func SeqAtomicTryLoad(sc *syncutil.SeqCount, epoch syncutil.SeqCountEpoch, ptr *Value) (Value, bool) { - var val Value - if syncutil.RaceEnabled { - syncutil.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) - } else { - val = *ptr - } - return val, sc.ReadOk(epoch) -} - -func init() { - var val Value - typ := reflect.TypeOf(val) - name := typ.Name() - if ptrs := syncutil.PointersInType(typ, name); len(ptrs) != 0 { - panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n"))) - } -} diff --git a/pkg/syncutil/seqatomictest/BUILD b/pkg/syncutil/seqatomictest/BUILD deleted file mode 100644 index ba18f3238..000000000 --- a/pkg/syncutil/seqatomictest/BUILD +++ /dev/null @@ -1,35 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_generics:defs.bzl", "go_template_instance") - -package(licenses = ["notice"]) - -go_template_instance( - name = "seqatomic_int", - out = "seqatomic_int_unsafe.go", - package = "seqatomic", - suffix = "Int", - template = "//pkg/syncutil:generic_seqatomic", - types = { - "Value": "int", - }, -) - -go_library( - name = "seqatomic", - srcs = ["seqatomic_int_unsafe.go"], - importpath = "gvisor.dev/gvisor/pkg/syncutil/seqatomic", - deps = [ - "//pkg/syncutil", - ], -) - -go_test( - name = "seqatomic_test", - size = "small", - srcs = ["seqatomic_test.go"], - embed = [":seqatomic"], - deps = [ - "//pkg/syncutil", - ], -) diff --git a/pkg/syncutil/seqatomictest/seqatomic_test.go b/pkg/syncutil/seqatomictest/seqatomic_test.go deleted file mode 100644 index b0db44999..000000000 --- a/pkg/syncutil/seqatomictest/seqatomic_test.go +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package seqatomic - -import ( - "sync/atomic" - "testing" - "time" - - "gvisor.dev/gvisor/pkg/syncutil" -) - -func TestSeqAtomicLoadUncontended(t *testing.T) { - var seq syncutil.SeqCount - const want = 1 - data := want - if got := SeqAtomicLoadInt(&seq, &data); got != want { - t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) - } -} - -func TestSeqAtomicLoadAfterWrite(t *testing.T) { - var seq syncutil.SeqCount - var data int - const want = 1 - seq.BeginWrite() - data = want - seq.EndWrite() - if got := SeqAtomicLoadInt(&seq, &data); got != want { - t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) - } -} - -func TestSeqAtomicLoadDuringWrite(t *testing.T) { - var seq syncutil.SeqCount - var data int - const want = 1 - seq.BeginWrite() - go func() { - time.Sleep(time.Second) - data = want - seq.EndWrite() - }() - if got := SeqAtomicLoadInt(&seq, &data); got != want { - t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) - } -} - -func TestSeqAtomicTryLoadUncontended(t *testing.T) { - var seq syncutil.SeqCount - const want = 1 - data := want - epoch := seq.BeginRead() - if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want { - t.Errorf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want) - } -} - -func TestSeqAtomicTryLoadDuringWrite(t *testing.T) { - var seq syncutil.SeqCount - var data int - epoch := seq.BeginRead() - seq.BeginWrite() - if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok { - t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got) - } - 
seq.EndWrite() -} - -func TestSeqAtomicTryLoadAfterWrite(t *testing.T) { - var seq syncutil.SeqCount - var data int - epoch := seq.BeginRead() - seq.BeginWrite() - seq.EndWrite() - if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok { - t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got) - } -} - -func BenchmarkSeqAtomicLoadIntUncontended(b *testing.B) { - var seq syncutil.SeqCount - const want = 42 - data := want - b.RunParallel(func(pb *testing.PB) { - for pb.Next() { - if got := SeqAtomicLoadInt(&seq, &data); got != want { - b.Fatalf("SeqAtomicLoadInt: got %v, wanted %v", got, want) - } - } - }) -} - -func BenchmarkSeqAtomicTryLoadIntUncontended(b *testing.B) { - var seq syncutil.SeqCount - const want = 42 - data := want - b.RunParallel(func(pb *testing.PB) { - epoch := seq.BeginRead() - for pb.Next() { - if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want { - b.Fatalf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want) - } - } - }) -} - -// For comparison: -func BenchmarkAtomicValueLoadIntUncontended(b *testing.B) { - var a atomic.Value - const want = 42 - a.Store(int(want)) - b.RunParallel(func(pb *testing.PB) { - for pb.Next() { - if got := a.Load().(int); got != want { - b.Fatalf("atomic.Value.Load: got %v, wanted %v", got, want) - } - } - }) -} diff --git a/pkg/syncutil/seqcount.go b/pkg/syncutil/seqcount.go deleted file mode 100644 index 11d8dbfaa..000000000 --- a/pkg/syncutil/seqcount.go +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package syncutil - -import ( - "fmt" - "reflect" - "runtime" - "sync/atomic" -) - -// SeqCount is a synchronization primitive for optimistic reader/writer -// synchronization in cases where readers can work with stale data and -// therefore do not need to block writers. 
-// -// Compared to sync/atomic.Value: -// -// - Mutation of SeqCount-protected data does not require memory allocation, -// whereas atomic.Value generally does. This is a significant advantage when -// writes are common. -// -// - Atomic reads of SeqCount-protected data require copying. This is a -// disadvantage when atomic reads are common. -// -// - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other -// operations to be made atomic with reads of SeqCount-protected data. -// -// - SeqCount may be less flexible: as of this writing, SeqCount-protected data -// cannot include pointers. -// -// - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected -// data require instantiating function templates using go_generics (see -// seqatomic.go). -type SeqCount struct { - // epoch is incremented by BeginWrite and EndWrite, such that epoch is odd - // if a writer critical section is active, and a read from data protected - // by this SeqCount is atomic iff epoch is the same even value before and - // after the read. - epoch uint32 -} - -// SeqCountEpoch tracks writer critical sections in a SeqCount. -type SeqCountEpoch struct { - val uint32 -} - -// We assume that: -// -// - All functions in sync/atomic that perform a memory read are at least a -// read fence: memory reads before calls to such functions cannot be reordered -// after the call, and memory reads after calls to such functions cannot be -// reordered before the call, even if those reads do not use sync/atomic. -// -// - All functions in sync/atomic that perform a memory write are at least a -// write fence: memory writes before calls to such functions cannot be -// reordered after the call, and memory writes after calls to such functions -// cannot be reordered before the call, even if those writes do not use -// sync/atomic. 
-// -// As of this writing, the Go memory model completely fails to describe -// sync/atomic, but these properties are implied by -// https://groups.google.com/forum/#!topic/golang-nuts/7EnEhM3U7B8. - -// BeginRead indicates the beginning of a reader critical section. Reader -// critical sections DO NOT BLOCK writer critical sections, so operations in a -// reader critical section MAY RACE with writer critical sections. Races are -// detected by ReadOk at the end of the reader critical section. Thus, the -// low-level structure of readers is generally: -// -// for { -// epoch := seq.BeginRead() -// // do something idempotent with seq-protected data -// if seq.ReadOk(epoch) { -// break -// } -// } -// -// However, since reader critical sections may race with writer critical -// sections, the Go race detector will (accurately) flag data races in readers -// using this pattern. Most users of SeqCount will need to use the -// SeqAtomicLoad function template in seqatomic.go. -func (s *SeqCount) BeginRead() SeqCountEpoch { - epoch := atomic.LoadUint32(&s.epoch) - for epoch&1 != 0 { - runtime.Gosched() - epoch = atomic.LoadUint32(&s.epoch) - } - return SeqCountEpoch{epoch} -} - -// ReadOk returns true if the reader critical section initiated by a previous -// call to BeginRead() that returned epoch did not race with any writer critical -// sections. -// -// ReadOk may be called any number of times during a reader critical section. -// Reader critical sections do not need to be explicitly terminated; the last -// call to ReadOk is implicitly the end of the reader critical section. -func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool { - return atomic.LoadUint32(&s.epoch) == epoch.val -} - -// BeginWrite indicates the beginning of a writer critical section. -// -// SeqCount does not support concurrent writer critical sections; clients with -// concurrent writers must synchronize them using e.g. sync.Mutex. 
-func (s *SeqCount) BeginWrite() { - if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 == 0 { - panic("SeqCount.BeginWrite during writer critical section") - } -} - -// EndWrite ends the effect of a preceding BeginWrite. -func (s *SeqCount) EndWrite() { - if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 != 0 { - panic("SeqCount.EndWrite outside writer critical section") - } -} - -// PointersInType returns a list of pointers reachable from values named -// valName of the given type. -// -// PointersInType is not exhaustive, but it is guaranteed that if typ contains -// at least one pointer, then PointersInTypeOf returns a non-empty list. -func PointersInType(typ reflect.Type, valName string) []string { - switch kind := typ.Kind(); kind { - case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128: - return nil - - case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice, reflect.String, reflect.UnsafePointer: - return []string{valName} - - case reflect.Array: - return PointersInType(typ.Elem(), valName+"[]") - - case reflect.Struct: - var ptrs []string - for i, n := 0, typ.NumField(); i < n; i++ { - field := typ.Field(i) - ptrs = append(ptrs, PointersInType(field.Type, fmt.Sprintf("%s.%s", valName, field.Name))...) - } - return ptrs - - default: - return []string{fmt.Sprintf("%s (of type %s with unknown kind %s)", valName, typ, kind)} - } -} diff --git a/pkg/syncutil/seqcount_test.go b/pkg/syncutil/seqcount_test.go deleted file mode 100644 index 14d6aedea..000000000 --- a/pkg/syncutil/seqcount_test.go +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -package syncutil - -import ( - "reflect" - "testing" - "time" -) - -func TestSeqCountWriteUncontended(t *testing.T) { - var seq SeqCount - seq.BeginWrite() - seq.EndWrite() -} - -func TestSeqCountReadUncontended(t *testing.T) { - var seq SeqCount - epoch := seq.BeginRead() - if !seq.ReadOk(epoch) { - t.Errorf("ReadOk: got false, wanted true") - } -} - -func TestSeqCountBeginReadAfterWrite(t *testing.T) { - var seq SeqCount - var data int32 - const want = 1 - seq.BeginWrite() - data = want - seq.EndWrite() - epoch := seq.BeginRead() - if data != want { - t.Errorf("Reader: got %v, wanted %v", data, want) - } - if !seq.ReadOk(epoch) { - t.Errorf("ReadOk: got false, wanted true") - } -} - -func TestSeqCountBeginReadDuringWrite(t *testing.T) { - var seq SeqCount - var data int - const want = 1 - seq.BeginWrite() - go func() { - time.Sleep(time.Second) - data = want - seq.EndWrite() - }() - epoch := seq.BeginRead() - if data != want { - t.Errorf("Reader: got %v, wanted %v", data, want) - } - if !seq.ReadOk(epoch) { - t.Errorf("ReadOk: got false, wanted true") - } -} - -func TestSeqCountReadOkAfterWrite(t *testing.T) { - var seq SeqCount - epoch := seq.BeginRead() - seq.BeginWrite() - seq.EndWrite() - if seq.ReadOk(epoch) { - t.Errorf("ReadOk: got true, wanted false") - } -} - -func TestSeqCountReadOkDuringWrite(t *testing.T) { - var seq SeqCount - epoch := seq.BeginRead() - seq.BeginWrite() - if seq.ReadOk(epoch) { - t.Errorf("ReadOk: got true, wanted false") - } - seq.EndWrite() -} - -func BenchmarkSeqCountWriteUncontended(b *testing.B) { - var seq SeqCount - for i := 0; i < b.N; i++ { - seq.BeginWrite() - seq.EndWrite() - } -} - -func BenchmarkSeqCountReadUncontended(b *testing.B) { - var seq SeqCount - b.RunParallel(func(pb *testing.PB) { - for pb.Next() { - epoch := seq.BeginRead() - if !seq.ReadOk(epoch) { - b.Fatalf("ReadOk: got false, wanted true") - } - } - }) -} - -func TestPointersInType(t *testing.T) { - for _, test := range []struct { - name string // used 
for both test and value name - val interface{} - ptrs []string - }{ - { - name: "EmptyStruct", - val: struct{}{}, - }, - { - name: "Int", - val: int(0), - }, - { - name: "MixedStruct", - val: struct { - b bool - I int - ExportedPtr *struct{} - unexportedPtr *struct{} - arr [2]int - ptrArr [2]*int - nestedStruct struct { - nestedNonptr int - nestedPtr *int - } - structArr [1]struct { - nonptr int - ptr *int - } - }{}, - ptrs: []string{ - "MixedStruct.ExportedPtr", - "MixedStruct.unexportedPtr", - "MixedStruct.ptrArr[]", - "MixedStruct.nestedStruct.nestedPtr", - "MixedStruct.structArr[].ptr", - }, - }, - } { - t.Run(test.name, func(t *testing.T) { - typ := reflect.TypeOf(test.val) - ptrs := PointersInType(typ, test.name) - t.Logf("Found pointers: %v", ptrs) - if (len(ptrs) != 0 || len(test.ptrs) != 0) && !reflect.DeepEqual(ptrs, test.ptrs) { - t.Errorf("Got %v, wanted %v", ptrs, test.ptrs) - } - }) - } -} diff --git a/pkg/syncutil/syncutil.go b/pkg/syncutil/syncutil.go deleted file mode 100644 index 66e750d06..000000000 --- a/pkg/syncutil/syncutil.go +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package syncutil provides synchronization primitives. 
-package syncutil diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD index e07ebd153..db06d02c6 100644 --- a/pkg/tcpip/BUILD +++ b/pkg/tcpip/BUILD @@ -15,6 +15,7 @@ go_library( importpath = "gvisor.dev/gvisor/pkg/tcpip", visibility = ["//visibility:public"], deps = [ + "//pkg/sync", "//pkg/tcpip/buffer", "//pkg/tcpip/iptables", "//pkg/waiter", diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD index 78df5a0b1..3df7d18d3 100644 --- a/pkg/tcpip/adapters/gonet/BUILD +++ b/pkg/tcpip/adapters/gonet/BUILD @@ -9,6 +9,7 @@ go_library( importpath = "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet", visibility = ["//visibility:public"], deps = [ + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/stack", diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go index cd6ce930a..a2f44b496 100644 --- a/pkg/tcpip/adapters/gonet/gonet.go +++ b/pkg/tcpip/adapters/gonet/gonet.go @@ -20,9 +20,9 @@ import ( "errors" "io" "net" - "sync" "time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/stack" diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index 897c94821..66cc53ed4 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -16,6 +16,7 @@ go_library( importpath = "gvisor.dev/gvisor/pkg/tcpip/link/fdbased", visibility = ["//visibility:public"], deps = [ + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index fa8a703d9..b7f60178e 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -41,10 +41,10 @@ package fdbased import ( "fmt" - "sync" "syscall" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/link/sharedmem/BUILD 
b/pkg/tcpip/link/sharedmem/BUILD index a4f9cdd69..09165dd4c 100644 --- a/pkg/tcpip/link/sharedmem/BUILD +++ b/pkg/tcpip/link/sharedmem/BUILD @@ -15,6 +15,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/log", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", @@ -31,6 +32,7 @@ go_test( ], embed = [":sharedmem"], deps = [ + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/link/sharedmem/pipe/BUILD b/pkg/tcpip/link/sharedmem/pipe/BUILD index 6b5bc542c..a0d4ad0be 100644 --- a/pkg/tcpip/link/sharedmem/pipe/BUILD +++ b/pkg/tcpip/link/sharedmem/pipe/BUILD @@ -21,4 +21,5 @@ go_test( "pipe_test.go", ], embed = [":pipe"], + deps = ["//pkg/sync"], ) diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go index 59ef69a8b..dc239a0d0 100644 --- a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go +++ b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go @@ -18,8 +18,9 @@ import ( "math/rand" "reflect" "runtime" - "sync" "testing" + + "gvisor.dev/gvisor/pkg/sync" ) func TestSimpleReadWrite(t *testing.T) { diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go index 080f9d667..655e537c4 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem.go +++ b/pkg/tcpip/link/sharedmem/sharedmem.go @@ -23,11 +23,11 @@ package sharedmem import ( - "sync" "sync/atomic" "syscall" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go index 89603c48f..5c729a439 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem_test.go +++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go @@ -22,11 +22,11 @@ import ( "math/rand" "os" "strings" - "sync" "syscall" "testing" "time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" 
"gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD index acf1e022c..ed16076fd 100644 --- a/pkg/tcpip/network/fragmentation/BUILD +++ b/pkg/tcpip/network/fragmentation/BUILD @@ -28,6 +28,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/log", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", ], diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go index 6da5238ec..92f2aa13a 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation.go +++ b/pkg/tcpip/network/fragmentation/fragmentation.go @@ -19,9 +19,9 @@ package fragmentation import ( "fmt" "log" - "sync" "time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip/buffer" ) diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go index 9e002e396..0a83d81f2 100644 --- a/pkg/tcpip/network/fragmentation/reassembler.go +++ b/pkg/tcpip/network/fragmentation/reassembler.go @@ -18,9 +18,9 @@ import ( "container/heap" "fmt" "math" - "sync" "time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip/buffer" ) diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD index e156b01f6..a6ef3bdcc 100644 --- a/pkg/tcpip/ports/BUILD +++ b/pkg/tcpip/ports/BUILD @@ -9,6 +9,7 @@ go_library( importpath = "gvisor.dev/gvisor/pkg/tcpip/ports", visibility = ["//visibility:public"], deps = [ + "//pkg/sync", "//pkg/tcpip", ], ) diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go index 6c5e19e8f..b937cb84b 100644 --- a/pkg/tcpip/ports/ports.go +++ b/pkg/tcpip/ports/ports.go @@ -18,9 +18,9 @@ package ports import ( "math" "math/rand" - "sync" "sync/atomic" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" ) diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD index 826fca4de..6a8654105 100644 --- a/pkg/tcpip/stack/BUILD +++ b/pkg/tcpip/stack/BUILD @@ 
-36,6 +36,7 @@ go_library( "//pkg/ilist", "//pkg/rand", "//pkg/sleep", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/hash/jenkins", @@ -80,6 +81,7 @@ go_test( embed = [":stack"], deps = [ "//pkg/sleep", + "//pkg/sync", "//pkg/tcpip", ], ) diff --git a/pkg/tcpip/stack/linkaddrcache.go b/pkg/tcpip/stack/linkaddrcache.go index 267df60d1..403557fd7 100644 --- a/pkg/tcpip/stack/linkaddrcache.go +++ b/pkg/tcpip/stack/linkaddrcache.go @@ -16,10 +16,10 @@ package stack import ( "fmt" - "sync" "time" "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" ) diff --git a/pkg/tcpip/stack/linkaddrcache_test.go b/pkg/tcpip/stack/linkaddrcache_test.go index 9946b8fe8..1baa498d0 100644 --- a/pkg/tcpip/stack/linkaddrcache_test.go +++ b/pkg/tcpip/stack/linkaddrcache_test.go @@ -16,12 +16,12 @@ package stack import ( "fmt" - "sync" "sync/atomic" "testing" "time" "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" ) diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 3810c6602..fe557ccbd 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -16,9 +16,9 @@ package stack import ( "strings" - "sync" "sync/atomic" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 41bf9fd9b..a47ceba54 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -21,13 +21,13 @@ package stack import ( "encoding/binary" - "sync" "sync/atomic" "time" "golang.org/x/time/rate" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go index 67c21be42..f384a91de 100644 --- 
a/pkg/tcpip/stack/transport_demuxer.go +++ b/pkg/tcpip/stack/transport_demuxer.go @@ -18,8 +18,8 @@ import ( "fmt" "math/rand" "sort" - "sync" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 72b5ce179..4a090ac86 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -35,10 +35,10 @@ import ( "reflect" "strconv" "strings" - "sync" "sync/atomic" "time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/iptables" "gvisor.dev/gvisor/pkg/waiter" diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD index d8c5b5058..3aa23d529 100644 --- a/pkg/tcpip/transport/icmp/BUILD +++ b/pkg/tcpip/transport/icmp/BUILD @@ -28,6 +28,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/sleep", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index c7ce74cdd..330786f4c 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -15,8 +15,7 @@ package icmp import ( - "sync" - + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/transport/packet/BUILD b/pkg/tcpip/transport/packet/BUILD index 44b58ff6b..4858d150c 100644 --- a/pkg/tcpip/transport/packet/BUILD +++ b/pkg/tcpip/transport/packet/BUILD @@ -28,6 +28,7 @@ go_library( deps = [ "//pkg/log", "//pkg/sleep", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go index 07ffa8aba..fc5bc69fa 100644 --- a/pkg/tcpip/transport/packet/endpoint.go +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -25,8 +25,7 @@ package packet import ( - "sync" - 
+ "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD index 00991ac8e..2f2131ff7 100644 --- a/pkg/tcpip/transport/raw/BUILD +++ b/pkg/tcpip/transport/raw/BUILD @@ -29,6 +29,7 @@ go_library( deps = [ "//pkg/log", "//pkg/sleep", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index 85f7eb76b..ee9c4c58b 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -26,8 +26,7 @@ package raw import ( - "sync" - + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 3b353d56c..353bd06f4 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -48,6 +48,7 @@ go_library( "//pkg/log", "//pkg/rand", "//pkg/sleep", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/hash/jenkins", diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index 5422ae80c..1ea996936 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -19,11 +19,11 @@ import ( "encoding/binary" "hash" "io" - "sync" "time" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index cdd69f360..613ec1775 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -16,11 +16,11 @@ package tcp import ( "encoding/binary" - "sync" "time" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sleep" + 
"gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 830bc1e3e..cca511fb9 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -19,12 +19,12 @@ import ( "fmt" "math" "strings" - "sync" "sync/atomic" "time" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index 7aa4c3f0e..4b8d867bc 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -16,9 +16,9 @@ package tcp import ( "fmt" - "sync" "time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go index 4983bca81..7eb613be5 100644 --- a/pkg/tcpip/transport/tcp/forwarder.go +++ b/pkg/tcpip/transport/tcp/forwarder.go @@ -15,8 +15,7 @@ package tcp import ( - "sync" - + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index bc718064c..9a8f64aa6 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -22,9 +22,9 @@ package tcp import ( "strings" - "sync" "time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go index e0759225e..bd20a7ee9 100644 --- 
a/pkg/tcpip/transport/tcp/segment_queue.go +++ b/pkg/tcpip/transport/tcp/segment_queue.go @@ -15,7 +15,7 @@ package tcp import ( - "sync" + "gvisor.dev/gvisor/pkg/sync" ) // segmentQueue is a bounded, thread-safe queue of TCP segments. diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 8a947dc66..79f2d274b 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -16,11 +16,11 @@ package tcp import ( "math" - "sync" "sync/atomic" "time" "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index 97e4d5825..57ff123e3 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -30,6 +30,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/sleep", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 864dc8733..a4ff29a7d 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -15,8 +15,7 @@ package udp import ( - "sync" - + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tmutex/BUILD b/pkg/tmutex/BUILD index 6afdb29b7..07778e4f7 100644 --- a/pkg/tmutex/BUILD +++ b/pkg/tmutex/BUILD @@ -15,4 +15,5 @@ go_test( size = "medium", srcs = ["tmutex_test.go"], embed = [":tmutex"], + deps = ["//pkg/sync"], ) diff --git a/pkg/tmutex/tmutex_test.go b/pkg/tmutex/tmutex_test.go index ce34c7962..05540696a 100644 --- a/pkg/tmutex/tmutex_test.go +++ b/pkg/tmutex/tmutex_test.go @@ -17,10 +17,11 @@ package tmutex import ( "fmt" "runtime" - "sync" "sync/atomic" "testing" "time" + + "gvisor.dev/gvisor/pkg/sync" ) func TestBasicLock(t *testing.T) { diff --git 
a/pkg/unet/BUILD b/pkg/unet/BUILD index 8f6f180e5..d1885ae66 100644 --- a/pkg/unet/BUILD +++ b/pkg/unet/BUILD @@ -24,4 +24,5 @@ go_test( "unet_test.go", ], embed = [":unet"], + deps = ["//pkg/sync"], ) diff --git a/pkg/unet/unet_test.go b/pkg/unet/unet_test.go index a3cc6f5d3..5c4b9e8e9 100644 --- a/pkg/unet/unet_test.go +++ b/pkg/unet/unet_test.go @@ -19,10 +19,11 @@ import ( "os" "path/filepath" "reflect" - "sync" "syscall" "testing" "time" + + "gvisor.dev/gvisor/pkg/sync" ) func randomFilename() (string, error) { diff --git a/pkg/urpc/BUILD b/pkg/urpc/BUILD index b6bbb0ea2..b8fdc3125 100644 --- a/pkg/urpc/BUILD +++ b/pkg/urpc/BUILD @@ -11,6 +11,7 @@ go_library( deps = [ "//pkg/fd", "//pkg/log", + "//pkg/sync", "//pkg/unet", ], ) diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go index df59ffab1..13b2ea314 100644 --- a/pkg/urpc/urpc.go +++ b/pkg/urpc/urpc.go @@ -27,10 +27,10 @@ import ( "os" "reflect" "runtime" - "sync" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) diff --git a/pkg/waiter/BUILD b/pkg/waiter/BUILD index 0427bc41f..1c6890e52 100644 --- a/pkg/waiter/BUILD +++ b/pkg/waiter/BUILD @@ -24,6 +24,7 @@ go_library( ], importpath = "gvisor.dev/gvisor/pkg/waiter", visibility = ["//visibility:public"], + deps = ["//pkg/sync"], ) go_test( diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go index 8a65ed164..f708e95fa 100644 --- a/pkg/waiter/waiter.go +++ b/pkg/waiter/waiter.go @@ -58,7 +58,7 @@ package waiter import ( - "sync" + "gvisor.dev/gvisor/pkg/sync" ) // EventMask represents io events as used in the poll() syscall. 
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 6226b63f8..3e20f8f2f 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -74,6 +74,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/sentry/watchdog", + "//pkg/sync", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/link/fdbased", @@ -114,6 +115,7 @@ go_test( "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", + "//pkg/sync", "//pkg/unet", "//runsc/fsgofer", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 352e710d2..9c23b9553 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -17,7 +17,6 @@ package boot import ( "fmt" "os" - "sync" "syscall" "github.com/golang/protobuf/proto" @@ -27,6 +26,7 @@ import ( ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" "gvisor.dev/gvisor/pkg/sentry/strace" spb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" + "gvisor.dev/gvisor/pkg/sync" ) func initCompatLogs(fd int) error { diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index d1c0bb9b5..ce62236e5 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -16,12 +16,12 @@ package boot import ( "fmt" - "sync" "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sync" ) // Mapping from linux resource names to limits.LimitType. 
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index bc1d0c1bb..fad72f4ab 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -20,7 +20,6 @@ import ( mrand "math/rand" "os" "runtime" - "sync" "sync/atomic" "syscall" gtime "time" @@ -46,6 +45,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/watchdog" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/arp" diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 147ff7703..bec0dc292 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -19,7 +19,6 @@ import ( "math/rand" "os" "reflect" - "sync" "syscall" "testing" "time" @@ -30,6 +29,7 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/runsc/fsgofer" ) diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 250845ad7..b94bc4fa0 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -44,6 +44,7 @@ go_library( "//pkg/sentry/control", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sync", "//pkg/unet", "//pkg/urpc", "//runsc/boot", diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index a4e3071b3..1815c93b9 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -16,6 +16,7 @@ package cmd import ( "context" + "flag" "github.com/google/subcommands" "gvisor.dev/gvisor/runsc/boot" diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 4831210c0..7df7995f0 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -21,7 +21,6 @@ import ( "os" "path/filepath" "strings" - "sync" "syscall" "flag" @@ -30,6 +29,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/runsc/boot" 
"gvisor.dev/gvisor/runsc/fsgofer" diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go index de2115dff..5e9bc53ab 100644 --- a/runsc/cmd/start.go +++ b/runsc/cmd/start.go @@ -16,6 +16,7 @@ package cmd import ( "context" + "flag" "github.com/google/subcommands" "gvisor.dev/gvisor/runsc/boot" diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 2bd12120d..6dea179e4 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -18,6 +18,7 @@ go_library( deps = [ "//pkg/log", "//pkg/sentry/control", + "//pkg/sync", "//runsc/boot", "//runsc/cgroup", "//runsc/sandbox", @@ -53,6 +54,7 @@ go_test( "//pkg/sentry/control", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sync", "//pkg/unet", "//pkg/urpc", "//runsc/boot", diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 5ed131a7f..060b63bf3 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -20,7 +20,6 @@ import ( "io" "os" "path/filepath" - "sync" "syscall" "testing" "time" @@ -29,6 +28,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/testutil" diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index c10f85992..b54d8f712 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -26,7 +26,6 @@ import ( "reflect" "strconv" "strings" - "sync" "syscall" "testing" "time" @@ -39,6 +38,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/boot/platforms" "gvisor.dev/gvisor/runsc/specutils" diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 4ad09ceab..2da93ec5b 100644 --- 
a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -22,7 +22,6 @@ import ( "path" "path/filepath" "strings" - "sync" "syscall" "testing" "time" @@ -30,6 +29,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/specutils" "gvisor.dev/gvisor/runsc/testutil" diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go index d95151ea5..17a251530 100644 --- a/runsc/container/state_file.go +++ b/runsc/container/state_file.go @@ -20,10 +20,10 @@ import ( "io/ioutil" "os" "path/filepath" - "sync" "github.com/gofrs/flock" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" ) const stateFileExtension = ".state" diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index afcb41801..a9582d92b 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -19,6 +19,7 @@ go_library( "//pkg/fd", "//pkg/log", "//pkg/p9", + "//pkg/sync", "//pkg/syserr", "//runsc/specutils", "@org_golang_x_sys//unix:go_default_library", diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index b59e1a70e..93606d051 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -29,7 +29,6 @@ import ( "path/filepath" "runtime" "strconv" - "sync" "syscall" "golang.org/x/sys/unix" @@ -37,6 +36,7 @@ import ( "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/runsc/specutils" ) diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 8001949d5..ddbc37456 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -19,6 +19,7 @@ go_library( "//pkg/log", "//pkg/sentry/control", "//pkg/sentry/platform", + "//pkg/sync", "//pkg/tcpip/header", "//pkg/tcpip/stack", "//pkg/urpc", diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 
ce1452b87..ec72bdbfd 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -22,7 +22,6 @@ import ( "os" "os/exec" "strconv" - "sync" "syscall" "time" @@ -34,6 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/boot/platforms" diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD index c96ca2eb6..3c3027cb5 100644 --- a/runsc/testutil/BUILD +++ b/runsc/testutil/BUILD @@ -10,6 +10,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/log", + "//pkg/sync", "//runsc/boot", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go index 9632776d2..fb22eae39 100644 --- a/runsc/testutil/testutil.go +++ b/runsc/testutil/testutil.go @@ -34,7 +34,6 @@ import ( "path/filepath" "strconv" "strings" - "sync" "sync/atomic" "syscall" "time" @@ -42,6 +41,7 @@ import ( "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/specutils" ) -- cgit v1.2.3 From 376a777c55680f8139313d87bd460248fd251941 Mon Sep 17 00:00:00 2001 From: "chris.zn" Date: Tue, 14 Jan 2020 18:59:33 +0800 Subject: Fix "unlock of unlocked mutex" crash when getting tty This patch holds taskset.mu when getting tty. If we don't do this, it may cause an "unlock of unlocked mutex" problem, since signalHandlers may be replaced by CopyForExec() in runSyscallAfterExecStop after the signalHandlers.mu has been held in TTY(). The problem is easy to reproduce by repeatedly running "runsc ps". 
The crash log is : fatal error: sync: unlock of unlocked mutex goroutine 5801304 [running]: runtime.throw(0xfd019c, 0x1e) GOROOT/src/runtime/panic.go:774 +0x72 fp=0xc001ba47b0 sp=0xc001ba4780 pc=0x431702 sync.throw(0xfd019c, 0x1e) GOROOT/src/runtime/panic.go:760 +0x35 fp=0xc001ba47d0 sp=0xc001ba47b0 pc=0x431685 sync.(*Mutex).unlockSlow(0xc00cf94a30, 0xc0ffffffff) GOROOT/src/sync/mutex.go:196 +0xd6 fp=0xc001ba47f8 sp=0xc001ba47d0 pc=0x4707d6 sync.(*Mutex).Unlock(0xc00cf94a30) GOROOT/src/sync/mutex.go:190 +0x48 fp=0xc001ba4818 sp=0xc001ba47f8 pc=0x4706e8 gvisor.dev/gvisor/pkg/sentry/kernel.(*ThreadGroup).TTY(0xc011a9e800, 0x0) pkg/sentry/kernel/tty.go:38 +0x88 fp=0xc001ba4868 sp=0xc001ba4818 pc=0x835fa8 gvisor.dev/gvisor/pkg/sentry/control.Processes(0xc00025ae00, 0xc013e397c0, 0x40, 0xc0137b9800, 0x1, 0x7f292e9a4cc0) pkg/sentry/control/proc.go:366 +0x355 fp=0xc001ba49a0 sp=0xc001ba4868 pc=0x9ac4a5 gvisor.dev/gvisor/runsc/boot.(*containerManager).Processes(0xc0003b62c0, 0xc0051423d0, 0xc0137b9800, 0x0, 0x0) runsc/boot/controller.go:228 +0xdf fp=0xc001ba49e8 sp=0xc001ba49a0 pc=0xaf06cf Signed-off-by: chris.zn --- pkg/sentry/kernel/tty.go | 2 ++ 1 file changed, 2 insertions(+) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go index 464d2306a..d0e0810e8 100644 --- a/pkg/sentry/kernel/tty.go +++ b/pkg/sentry/kernel/tty.go @@ -33,6 +33,8 @@ type TTY struct { // TTY returns the thread group's controlling terminal. If nil, there is no // controlling terminal. func (tg *ThreadGroup) TTY() *TTY { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() tg.signalHandlers.mu.Lock() defer tg.signalHandlers.mu.Unlock() return tg.tty -- cgit v1.2.3 From 5ab1213a6c405071546c783d6d93b4e9af52842e Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 22 Jan 2020 12:27:16 -0800 Subject: Move VFS2 handling of FD readability/writability to vfs.FileDescription. 
PiperOrigin-RevId: 291006713 --- pkg/sentry/fsimpl/ext/inode.go | 8 +++- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 11 +++-- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 11 ++++- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 18 +++++-- pkg/sentry/fsimpl/tmpfs/filesystem.go | 15 ++---- pkg/sentry/fsimpl/tmpfs/named_pipe.go | 5 +- pkg/sentry/fsimpl/tmpfs/regular_file.go | 14 +----- pkg/sentry/kernel/pipe/vfs.go | 12 ++--- pkg/sentry/vfs/file_description.go | 66 ++++++++++++++++++++++++-- pkg/sentry/vfs/permissions.go | 5 +- 10 files changed, 111 insertions(+), 54 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 8608805bf..191b39970 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -157,7 +157,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v switch in.impl.(type) { case *regularFile: var fd regularFileFD - fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) + if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } return &fd.vfsfd, nil case *directory: // Can't open directories writably. 
This check is not necessary for a read @@ -166,7 +168,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.EISDIR } var fd directoryFD - fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) + if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } return &fd.vfsfd, nil case *symlink: if flags&linux.O_PATH == 0 { diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 606ca692d..75624e0b1 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -55,7 +55,9 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.Dy // Open implements Inode.Open. func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { fd := &DynamicBytesFD{} - fd.Init(rp.Mount(), vfsd, f.data, flags) + if err := fd.Init(rp.Mount(), vfsd, f.data, flags); err != nil { + return nil, err + } return &fd.vfsfd, nil } @@ -80,10 +82,13 @@ type DynamicBytesFD struct { } // Init initializes a DynamicBytesFD. -func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) { +func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) error { + if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + return err + } fd.inode = d.Impl().(*Dentry).inode fd.SetDataSource(data) - fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}) + return nil } // Seek implements vfs.FileDescriptionImpl.Seek. 
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index bcf069b5f..5fa1fa67b 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -43,9 +43,16 @@ type GenericDirectoryFD struct { } // Init initializes a GenericDirectoryFD. -func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) { +func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) error { + if vfs.AccessTypesForOpenFlags(flags)&vfs.MayWrite != 0 { + // Can't open directories for writing. + return syserror.EISDIR + } + if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + return err + } fd.children = children - fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}) + return nil } // VFSFileDescription returns a pointer to the vfs.FileDescription representing diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index a5fdfbde5..aa3fe76ee 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -115,7 +115,9 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags) + if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags); err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } @@ -225,7 +227,9 @@ func TestReadStaticFile(t *testing.T) { defer sys.Destroy() pop := sys.PathOpAtRoot("file1") - fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{}) + fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) if err != nil { t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } @@ 
-258,7 +262,9 @@ func TestCreateNewFileInStaticDir(t *testing.T) { // Close the file. The file should persist. fd.DecRef() - fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{}) + fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) if err != nil { t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) } @@ -272,7 +278,9 @@ func TestDirFDReadWrite(t *testing.T) { defer sys.Destroy() pop := sys.PathOpAtRoot("/") - fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{}) + fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) if err != nil { t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } @@ -282,7 +290,7 @@ func TestDirFDReadWrite(t *testing.T) { if _, err := fd.Read(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR { t.Fatalf("Read for directory FD failed with unexpected error: %v", err) } - if _, err := fd.Write(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EISDIR { + if _, err := fd.Write(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EBADF { t.Fatalf("Write for directory FD failed with unexpected error: %v", err) } } diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 4cd7e9aea..a9f66a42a 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -337,19 +337,12 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, return nil, err } } - mnt := rp.Mount() switch impl := d.inode.impl.(type) { case *regularFile: var fd regularFileFD - fd.readable = vfs.MayReadFileWithOpenFlags(flags) - fd.writable = vfs.MayWriteFileWithOpenFlags(flags) - if fd.writable { - if err := mnt.CheckBeginWrite(); err != nil { - return nil, err - } - // mnt.EndWrite() is called by regularFileFD.Release(). 
+ if err := fd.vfsfd.Init(&fd, flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err } - fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) if flags&linux.O_TRUNC != 0 { impl.mu.Lock() impl.data.Truncate(0, impl.memFile) @@ -363,7 +356,9 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, return nil, syserror.EISDIR } var fd directoryFD - fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) + if err := fd.vfsfd.Init(&fd, flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } return &fd.vfsfd, nil case *symlink: // Can't open symlinks without O_PATH (which is unimplemented). diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go index 40bde54de..482aabd52 100644 --- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -50,11 +50,10 @@ type namedPipeFD struct { func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { var err error var fd namedPipeFD - fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, rp, vfsd, &fd.vfsfd, flags) + fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, vfsd, &fd.vfsfd, flags) if err != nil { return nil, err } - mnt := rp.Mount() - fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) + fd.vfsfd.Init(&fd, flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 5fa70cc6d..7c633c1b0 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -101,10 +101,6 @@ func (rf *regularFile) truncate(size uint64) (bool, error) { type regularFileFD struct { fileDescription - // These are immutable. - readable bool - writable bool - // off is the file offset. 
off is accessed using atomic memory operations. // offMu serializes operations that may mutate off. off int64 @@ -113,16 +109,11 @@ type regularFileFD struct { // Release implements vfs.FileDescriptionImpl.Release. func (fd *regularFileFD) Release() { - if fd.writable { - fd.vfsfd.VirtualDentry().Mount().EndWrite() - } + // noop } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - if !fd.readable { - return 0, syserror.EINVAL - } if offset < 0 { return 0, syserror.EINVAL } @@ -147,9 +138,6 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - if !fd.writable { - return 0, syserror.EINVAL - } if offset < 0 { return 0, syserror.EINVAL } diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index bf7461cbb..6f83e3cee 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -66,7 +66,7 @@ func NewVFSPipe(sizeBytes, atomicIOBytes int64) *VFSPipe { // for read and write will succeed both in blocking and nonblocking mode. POSIX // leaves this behavior undefined. This can be used to open a FIFO for writing // while there are no readers available." 
- fifo(7) -func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { +func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { vp.mu.Lock() defer vp.mu.Unlock() @@ -76,7 +76,7 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, rp *vfs.ResolvingPath, vfsd return nil, syserror.EINVAL } - vfd, err := vp.open(rp, vfsd, vfsfd, flags) + vfd, err := vp.open(vfsd, vfsfd, flags) if err != nil { return nil, err } @@ -118,19 +118,13 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, rp *vfs.ResolvingPath, vfsd } // Preconditions: vp.mu must be held. -func (vp *VFSPipe) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { +func (vp *VFSPipe) open(vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { var fd VFSPipeFD fd.flags = flags fd.readable = vfs.MayReadFileWithOpenFlags(flags) fd.writable = vfs.MayWriteFileWithOpenFlags(flags) fd.vfsfd = vfsfd fd.pipe = &vp.pipe - if fd.writable { - // The corresponding Mount.EndWrite() is in VFSPipe.Release(). - if err := rp.Mount().CheckBeginWrite(); err != nil { - return nil, err - } - } switch { case fd.readable && fd.writable: diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 6afe280bc..51c95c2d9 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -49,8 +49,23 @@ type FileDescription struct { // A reference is held on vd. vd is immutable. vd VirtualDentry + // opts contains options passed to FileDescription.Init(). opts is + // immutable. opts FileDescriptionOptions + // readable is MayReadFileWithOpenFlags(statusFlags). readable is + // immutable. + // + // readable is analogous to Linux's FMODE_READ. + readable bool + + // writable is MayWriteFileWithOpenFlags(statusFlags). 
If writable is true, + // the FileDescription holds a write count on vd.mount. writable is + // immutable. + // + // writable is analogous to Linux's FMODE_WRITE. + writable bool + // impl is the FileDescriptionImpl associated with this Filesystem. impl is // immutable. This should be the last field in FileDescription. impl FileDescriptionImpl @@ -77,10 +92,17 @@ type FileDescriptionOptions struct { UseDentryMetadata bool } -// Init must be called before first use of fd. It takes references on mnt and -// d. statusFlags is the initial file description status flags, which is -// usually the full set of flags passed to open(2). -func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) { +// Init must be called before first use of fd. If it succeeds, it takes +// references on mnt and d. statusFlags is the initial file description status +// flags, which is usually the full set of flags passed to open(2). +func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error { + writable := MayWriteFileWithOpenFlags(statusFlags) + if writable { + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + } + fd.refs = 1 fd.statusFlags = statusFlags | linux.O_LARGEFILE fd.vd = VirtualDentry{ @@ -89,7 +111,10 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn } fd.vd.IncRef() fd.opts = *opts + fd.readable = MayReadFileWithOpenFlags(statusFlags) + fd.writable = writable fd.impl = impl + return nil } // IncRef increments fd's reference count. 
@@ -117,6 +142,9 @@ func (fd *FileDescription) TryIncRef() bool { func (fd *FileDescription) DecRef() { if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { fd.impl.Release() + if fd.writable { + fd.vd.mount.EndWrite() + } fd.vd.DecRef() } else if refs < 0 { panic("FileDescription.DecRef() called without holding a reference") @@ -194,6 +222,16 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede return nil } +// IsReadable returns true if fd was opened for reading. +func (fd *FileDescription) IsReadable() bool { + return fd.readable +} + +// IsWritable returns true if fd was opened for writing. +func (fd *FileDescription) IsWritable() bool { + return fd.writable +} + // Impl returns the FileDescriptionImpl associated with fd. func (fd *FileDescription) Impl() FileDescriptionImpl { return fd.impl @@ -241,6 +279,8 @@ type FileDescriptionImpl interface { // Errors: // // - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP. + // + // Preconditions: The FileDescription was opened for reading. PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) // Read is similar to PRead, but does not specify an offset. @@ -254,6 +294,8 @@ type FileDescriptionImpl interface { // Errors: // // - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP. + // + // Preconditions: The FileDescription was opened for reading. Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) // PWrite writes src to the file, starting at the given offset, and returns @@ -268,6 +310,8 @@ type FileDescriptionImpl interface { // // - If opts.Flags specifies unsupported options, PWrite returns // EOPNOTSUPP. + // + // Preconditions: The FileDescription was opened for writing. 
PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) // Write is similar to PWrite, but does not specify an offset, which is @@ -281,6 +325,8 @@ type FileDescriptionImpl interface { // Errors: // // - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP. + // + // Preconditions: The FileDescription was opened for writing. Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) // IterDirents invokes cb on each entry in the directory represented by the @@ -411,11 +457,17 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { // offset, and returns the number of bytes read. PRead is permitted to return // partial reads with a nil error. func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + if !fd.readable { + return 0, syserror.EBADF + } return fd.impl.PRead(ctx, dst, offset, opts) } // Read is similar to PRead, but does not specify an offset. func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + if !fd.readable { + return 0, syserror.EBADF + } return fd.impl.Read(ctx, dst, opts) } @@ -423,11 +475,17 @@ func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opt // offset, and returns the number of bytes written. PWrite is permitted to // return partial writes with a nil error. func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + if !fd.writable { + return 0, syserror.EBADF + } return fd.impl.PWrite(ctx, src, offset, opts) } // Write is similar to PWrite, but does not specify an offset. 
func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + if !fd.writable { + return 0, syserror.EBADF + } return fd.impl.Write(ctx, src, opts) } diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index d279d05ca..f664581f4 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -94,14 +94,13 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo // the set of accesses permitted for the opened file: // // - O_TRUNC causes MayWrite to be set in the returned AccessTypes (since it -// mutates the file), but does not permit the opened to write to the file +// mutates the file), but does not permit writing to the open file description // thereafter. // // - "Linux reserves the special, nonstandard access mode 3 (binary 11) in // flags to mean: check for read and write permission on the file and return a // file descriptor that can't be used for reading or writing." - open(2). Thus -// AccessTypesForOpenFlags returns MayRead|MayWrite in this case, but -// filesystems are responsible for ensuring that access is denied. +// AccessTypesForOpenFlags returns MayRead|MayWrite in this case. // // Use May{Read,Write}FileWithOpenFlags() for these checks instead. func AccessTypesForOpenFlags(flags uint32) AccessTypes { -- cgit v1.2.3 From 3db317390b5cc491d680fc4a5fc7b8372890b4da Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 23 Jan 2020 16:17:50 -0800 Subject: Remove epoll entry from map when dropping it. This pattern (delete from map when dropping) is also used in epoll.RemoveEntry, and seems like generally a good idea. 
PiperOrigin-RevId: 291268208 --- pkg/sentry/kernel/epoll/epoll.go | 1 + 1 file changed, 1 insertion(+) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 430311cc0..e84742993 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -174,6 +174,7 @@ func (e *EventPoll) Release() { entry.id.File.EventUnregister(&entry.waiter) entry.file.Drop() } + e.files = nil } // Read implements fs.FileOperations.Read. -- cgit v1.2.3 From 390bb9c241c2b05c311579562d95cc39d899157b Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 24 Jan 2020 11:58:13 -0800 Subject: Ignore external SIGURG Go 1.14+ sends SIGURG to Ms to attempt asynchronous preemption of a G. Since it can't guarantee that a SIGURG is only related to preemption, it continues to forward them to signal.Notify (see runtime.sighandler). We should ignore these signals, as applications shouldn't receive them. Note that this means that truly external SIGURG can no longer be sent to the application (as with SIGCHLD). PiperOrigin-RevId: 291415357 --- pkg/sentry/kernel/signal.go | 3 +++ 1 file changed, 3 insertions(+) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go index 02eede93d..e8cce37d0 100644 --- a/pkg/sentry/kernel/signal.go +++ b/pkg/sentry/kernel/signal.go @@ -38,6 +38,9 @@ const SignalPanic = linux.SIGUSR2 // Preconditions: Kernel must have an init process. func (k *Kernel) sendExternalSignal(info *arch.SignalInfo, context string) { switch linux.Signal(info.Signo) { + case linux.SIGURG: + // Sent by the Go 1.14+ runtime for asynchronous goroutine preemption. + case platform.SignalInterrupt: // Assume that a call to platform.Context.Interrupt() misfired. -- cgit v1.2.3 From d29e59af9fbd420e34378bcbf7ae543134070217 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 27 Jan 2020 10:04:07 -0800 Subject: Standardize on tools directory. 
PiperOrigin-RevId: 291745021 --- .bazelrc | 8 +- BUILD | 49 ++++++- benchmarks/defs.bzl | 18 --- benchmarks/harness/BUILD | 74 +++++----- benchmarks/harness/machine_producers/BUILD | 4 +- benchmarks/runner/BUILD | 24 ++-- benchmarks/tcp/BUILD | 3 +- benchmarks/workloads/ab/BUILD | 19 ++- benchmarks/workloads/absl/BUILD | 19 ++- benchmarks/workloads/curl/BUILD | 2 +- benchmarks/workloads/ffmpeg/BUILD | 2 +- benchmarks/workloads/fio/BUILD | 19 ++- benchmarks/workloads/httpd/BUILD | 2 +- benchmarks/workloads/iperf/BUILD | 19 ++- benchmarks/workloads/netcat/BUILD | 2 +- benchmarks/workloads/nginx/BUILD | 2 +- benchmarks/workloads/node/BUILD | 2 +- benchmarks/workloads/node_template/BUILD | 2 +- benchmarks/workloads/redis/BUILD | 2 +- benchmarks/workloads/redisbenchmark/BUILD | 19 ++- benchmarks/workloads/ruby/BUILD | 2 +- benchmarks/workloads/ruby_template/BUILD | 2 +- benchmarks/workloads/sleep/BUILD | 2 +- benchmarks/workloads/sysbench/BUILD | 19 ++- benchmarks/workloads/syscall/BUILD | 19 ++- benchmarks/workloads/tensorflow/BUILD | 2 +- benchmarks/workloads/true/BUILD | 2 +- pkg/abi/BUILD | 3 +- pkg/abi/linux/BUILD | 6 +- pkg/amutex/BUILD | 6 +- pkg/atomicbitops/BUILD | 6 +- pkg/binary/BUILD | 6 +- pkg/bits/BUILD | 6 +- pkg/bpf/BUILD | 6 +- pkg/compressio/BUILD | 6 +- pkg/control/client/BUILD | 3 +- pkg/control/server/BUILD | 3 +- pkg/cpuid/BUILD | 8 +- pkg/eventchannel/BUILD | 16 +-- pkg/fd/BUILD | 6 +- pkg/fdchannel/BUILD | 8 +- pkg/fdnotifier/BUILD | 3 +- pkg/flipcall/BUILD | 8 +- pkg/fspath/BUILD | 13 +- pkg/gate/BUILD | 4 +- pkg/goid/BUILD | 6 +- pkg/ilist/BUILD | 6 +- pkg/linewriter/BUILD | 6 +- pkg/log/BUILD | 6 +- pkg/memutil/BUILD | 3 +- pkg/metric/BUILD | 23 +-- pkg/p9/BUILD | 6 +- pkg/p9/p9test/BUILD | 6 +- pkg/procid/BUILD | 8 +- pkg/rand/BUILD | 3 +- pkg/refs/BUILD | 6 +- pkg/seccomp/BUILD | 6 +- pkg/secio/BUILD | 6 +- pkg/segment/test/BUILD | 6 +- pkg/sentry/BUILD | 2 + pkg/sentry/arch/BUILD | 20 +-- pkg/sentry/context/BUILD | 3 +- 
pkg/sentry/context/contexttest/BUILD | 3 +- pkg/sentry/control/BUILD | 8 +- pkg/sentry/device/BUILD | 6 +- pkg/sentry/fs/BUILD | 6 +- pkg/sentry/fs/anon/BUILD | 3 +- pkg/sentry/fs/dev/BUILD | 3 +- pkg/sentry/fs/fdpipe/BUILD | 6 +- pkg/sentry/fs/filetest/BUILD | 3 +- pkg/sentry/fs/fsutil/BUILD | 6 +- pkg/sentry/fs/gofer/BUILD | 6 +- pkg/sentry/fs/host/BUILD | 6 +- pkg/sentry/fs/lock/BUILD | 6 +- pkg/sentry/fs/proc/BUILD | 6 +- pkg/sentry/fs/proc/device/BUILD | 3 +- pkg/sentry/fs/proc/seqfile/BUILD | 6 +- pkg/sentry/fs/ramfs/BUILD | 6 +- pkg/sentry/fs/sys/BUILD | 3 +- pkg/sentry/fs/timerfd/BUILD | 3 +- pkg/sentry/fs/tmpfs/BUILD | 6 +- pkg/sentry/fs/tty/BUILD | 6 +- pkg/sentry/fsimpl/ext/BUILD | 6 +- pkg/sentry/fsimpl/ext/benchmark/BUILD | 2 +- pkg/sentry/fsimpl/ext/disklayout/BUILD | 6 +- pkg/sentry/fsimpl/kernfs/BUILD | 6 +- pkg/sentry/fsimpl/proc/BUILD | 8 +- pkg/sentry/fsimpl/sys/BUILD | 6 +- pkg/sentry/fsimpl/testutil/BUILD | 5 +- pkg/sentry/fsimpl/tmpfs/BUILD | 8 +- pkg/sentry/hostcpu/BUILD | 6 +- pkg/sentry/hostmm/BUILD | 3 +- pkg/sentry/inet/BUILD | 3 +- pkg/sentry/kernel/BUILD | 24 +--- pkg/sentry/kernel/auth/BUILD | 3 +- pkg/sentry/kernel/contexttest/BUILD | 3 +- pkg/sentry/kernel/epoll/BUILD | 6 +- pkg/sentry/kernel/eventfd/BUILD | 6 +- pkg/sentry/kernel/fasync/BUILD | 3 +- pkg/sentry/kernel/futex/BUILD | 6 +- pkg/sentry/kernel/memevent/BUILD | 20 +-- pkg/sentry/kernel/pipe/BUILD | 6 +- pkg/sentry/kernel/sched/BUILD | 6 +- pkg/sentry/kernel/semaphore/BUILD | 6 +- pkg/sentry/kernel/shm/BUILD | 3 +- pkg/sentry/kernel/signalfd/BUILD | 5 +- pkg/sentry/kernel/time/BUILD | 3 +- pkg/sentry/limits/BUILD | 6 +- pkg/sentry/loader/BUILD | 4 +- pkg/sentry/memmap/BUILD | 6 +- pkg/sentry/mm/BUILD | 6 +- pkg/sentry/pgalloc/BUILD | 6 +- pkg/sentry/platform/BUILD | 3 +- pkg/sentry/platform/interrupt/BUILD | 6 +- pkg/sentry/platform/kvm/BUILD | 6 +- pkg/sentry/platform/kvm/testutil/BUILD | 3 +- pkg/sentry/platform/ptrace/BUILD | 3 +- pkg/sentry/platform/ring0/BUILD | 3 +- 
pkg/sentry/platform/ring0/gen_offsets/BUILD | 2 +- pkg/sentry/platform/ring0/pagetables/BUILD | 16 +-- pkg/sentry/platform/safecopy/BUILD | 6 +- pkg/sentry/safemem/BUILD | 6 +- pkg/sentry/sighandling/BUILD | 3 +- pkg/sentry/socket/BUILD | 3 +- pkg/sentry/socket/control/BUILD | 3 +- pkg/sentry/socket/hostinet/BUILD | 3 +- pkg/sentry/socket/netfilter/BUILD | 3 +- pkg/sentry/socket/netlink/BUILD | 3 +- pkg/sentry/socket/netlink/port/BUILD | 6 +- pkg/sentry/socket/netlink/route/BUILD | 3 +- pkg/sentry/socket/netlink/uevent/BUILD | 3 +- pkg/sentry/socket/netstack/BUILD | 3 +- pkg/sentry/socket/unix/BUILD | 3 +- pkg/sentry/socket/unix/transport/BUILD | 3 +- pkg/sentry/state/BUILD | 3 +- pkg/sentry/strace/BUILD | 20 +-- pkg/sentry/syscalls/BUILD | 3 +- pkg/sentry/syscalls/linux/BUILD | 3 +- pkg/sentry/time/BUILD | 6 +- pkg/sentry/unimpl/BUILD | 21 +-- pkg/sentry/uniqueid/BUILD | 3 +- pkg/sentry/usage/BUILD | 5 +- pkg/sentry/usermem/BUILD | 7 +- pkg/sentry/vfs/BUILD | 8 +- pkg/sentry/watchdog/BUILD | 3 +- pkg/sleep/BUILD | 6 +- pkg/state/BUILD | 17 +-- pkg/state/statefile/BUILD | 6 +- pkg/sync/BUILD | 6 +- pkg/sync/atomicptrtest/BUILD | 6 +- pkg/sync/seqatomictest/BUILD | 6 +- pkg/syserr/BUILD | 3 +- pkg/syserror/BUILD | 4 +- pkg/tcpip/BUILD | 6 +- pkg/tcpip/adapters/gonet/BUILD | 6 +- pkg/tcpip/buffer/BUILD | 6 +- pkg/tcpip/checker/BUILD | 3 +- pkg/tcpip/hash/jenkins/BUILD | 6 +- pkg/tcpip/header/BUILD | 6 +- pkg/tcpip/iptables/BUILD | 3 +- pkg/tcpip/link/channel/BUILD | 3 +- pkg/tcpip/link/fdbased/BUILD | 6 +- pkg/tcpip/link/loopback/BUILD | 3 +- pkg/tcpip/link/muxed/BUILD | 6 +- pkg/tcpip/link/rawfile/BUILD | 3 +- pkg/tcpip/link/sharedmem/BUILD | 6 +- pkg/tcpip/link/sharedmem/pipe/BUILD | 6 +- pkg/tcpip/link/sharedmem/queue/BUILD | 6 +- pkg/tcpip/link/sniffer/BUILD | 3 +- pkg/tcpip/link/tun/BUILD | 3 +- pkg/tcpip/link/waitable/BUILD | 6 +- pkg/tcpip/network/BUILD | 2 +- pkg/tcpip/network/arp/BUILD | 4 +- pkg/tcpip/network/fragmentation/BUILD | 6 +- 
pkg/tcpip/network/hash/BUILD | 3 +- pkg/tcpip/network/ipv4/BUILD | 4 +- pkg/tcpip/network/ipv6/BUILD | 6 +- pkg/tcpip/ports/BUILD | 6 +- pkg/tcpip/sample/tun_tcp_connect/BUILD | 2 +- pkg/tcpip/sample/tun_tcp_echo/BUILD | 2 +- pkg/tcpip/seqnum/BUILD | 3 +- pkg/tcpip/stack/BUILD | 6 +- pkg/tcpip/transport/icmp/BUILD | 3 +- pkg/tcpip/transport/packet/BUILD | 3 +- pkg/tcpip/transport/raw/BUILD | 3 +- pkg/tcpip/transport/tcp/BUILD | 4 +- pkg/tcpip/transport/tcp/testing/context/BUILD | 3 +- pkg/tcpip/transport/tcpconntrack/BUILD | 4 +- pkg/tcpip/transport/udp/BUILD | 4 +- pkg/tmutex/BUILD | 6 +- pkg/unet/BUILD | 6 +- pkg/urpc/BUILD | 6 +- pkg/waiter/BUILD | 6 +- runsc/BUILD | 27 ++-- runsc/boot/BUILD | 5 +- runsc/boot/filter/BUILD | 3 +- runsc/boot/platforms/BUILD | 3 +- runsc/cgroup/BUILD | 5 +- runsc/cmd/BUILD | 5 +- runsc/console/BUILD | 3 +- runsc/container/BUILD | 5 +- runsc/container/test_app/BUILD | 4 +- runsc/criutil/BUILD | 3 +- runsc/dockerutil/BUILD | 3 +- runsc/fsgofer/BUILD | 9 +- runsc/fsgofer/filter/BUILD | 3 +- runsc/sandbox/BUILD | 3 +- runsc/specutils/BUILD | 5 +- runsc/testutil/BUILD | 3 +- runsc/version_test.sh | 2 +- scripts/common.sh | 6 +- scripts/common_bazel.sh | 99 ------------- scripts/common_build.sh | 99 +++++++++++++ test/BUILD | 45 +----- test/e2e/BUILD | 5 +- test/image/BUILD | 5 +- test/iptables/BUILD | 5 +- test/iptables/runner/BUILD | 12 +- test/root/BUILD | 5 +- test/root/testdata/BUILD | 3 +- test/runtimes/BUILD | 4 +- test/runtimes/build_defs.bzl | 5 +- test/runtimes/images/proctor/BUILD | 4 +- test/syscalls/BUILD | 2 +- test/syscalls/build_defs.bzl | 6 +- test/syscalls/gtest/BUILD | 7 +- test/syscalls/linux/BUILD | 23 ++- test/syscalls/linux/arch_prctl.cc | 2 + test/syscalls/linux/rseq/BUILD | 5 +- .../linux/udp_socket_errqueue_test_case.cc | 4 + test/uds/BUILD | 3 +- test/util/BUILD | 27 ++-- test/util/save_util_linux.cc | 4 + test/util/save_util_other.cc | 4 + test/util/test_util_runfiles.cc | 4 + tools/BUILD | 3 + 
tools/build/BUILD | 10 ++ tools/build/defs.bzl | 91 ++++++++++++ tools/checkunsafe/BUILD | 3 +- tools/defs.bzl | 154 +++++++++++++++++++++ tools/go_generics/BUILD | 2 +- tools/go_generics/globals/BUILD | 4 +- tools/go_generics/go_merge/BUILD | 2 +- tools/go_generics/rules_tests/BUILD | 2 +- tools/go_marshal/BUILD | 4 +- tools/go_marshal/README.md | 52 +------ tools/go_marshal/analysis/BUILD | 5 +- tools/go_marshal/defs.bzl | 112 ++------------- tools/go_marshal/gomarshal/BUILD | 6 +- tools/go_marshal/gomarshal/generator.go | 20 ++- tools/go_marshal/gomarshal/generator_tests.go | 6 +- tools/go_marshal/main.go | 11 +- tools/go_marshal/marshal/BUILD | 5 +- tools/go_marshal/test/BUILD | 7 +- tools/go_marshal/test/external/BUILD | 6 +- tools/go_stateify/BUILD | 2 +- tools/go_stateify/defs.bzl | 79 +---------- tools/images/BUILD | 2 +- tools/images/defs.bzl | 6 +- tools/issue_reviver/BUILD | 2 +- tools/issue_reviver/github/BUILD | 3 +- tools/issue_reviver/reviver/BUILD | 5 +- tools/workspace_status.sh | 2 +- vdso/BUILD | 33 ++--- 264 files changed, 1012 insertions(+), 1380 deletions(-) delete mode 100644 benchmarks/defs.bzl delete mode 100755 scripts/common_bazel.sh create mode 100755 scripts/common_build.sh create mode 100644 tools/BUILD create mode 100644 tools/build/BUILD create mode 100644 tools/build/defs.bzl create mode 100644 tools/defs.bzl (limited to 'pkg/sentry/kernel') diff --git a/.bazelrc b/.bazelrc index 9c35c5e7b..ef214bcfa 100644 --- a/.bazelrc +++ b/.bazelrc @@ -30,10 +30,10 @@ build:remote --auth_scope="https://www.googleapis.com/auth/cloud-source-tools" # Add a custom platform and toolchain that builds in a privileged docker # container, which is required by our syscall tests. 
-build:remote --host_platform=//test:rbe_ubuntu1604 -build:remote --extra_toolchains=//test:cc-toolchain-clang-x86_64-default -build:remote --extra_execution_platforms=//test:rbe_ubuntu1604 -build:remote --platforms=//test:rbe_ubuntu1604 +build:remote --host_platform=//:rbe_ubuntu1604 +build:remote --extra_toolchains=//:cc-toolchain-clang-x86_64-default +build:remote --extra_execution_platforms=//:rbe_ubuntu1604 +build:remote --platforms=//:rbe_ubuntu1604 build:remote --crosstool_top=@rbe_default//cc:toolchain build:remote --jobs=50 build:remote --remote_timeout=3600 diff --git a/BUILD b/BUILD index 76286174f..5fd929378 100644 --- a/BUILD +++ b/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_path", "nogo") load("@bazel_gazelle//:def.bzl", "gazelle") +package(licenses = ["notice"]) + # The sandbox filegroup is used for sandbox-internal dependencies. package_group( name = "sandbox", @@ -49,9 +49,52 @@ gazelle(name = "gazelle") # live in the tools subdirectory (unless they are standard). nogo( name = "nogo", - config = "tools/nogo.js", + config = "//tools:nogo.js", visibility = ["//visibility:public"], deps = [ "//tools/checkunsafe", ], ) + +# We need to define a bazel platform and toolchain to specify dockerPrivileged +# and dockerRunAsRoot options, they are required to run tests on the RBE +# cluster in Kokoro. 
+alias( + name = "rbe_ubuntu1604", + actual = ":rbe_ubuntu1604_r346485", +) + +platform( + name = "rbe_ubuntu1604_r346485", + constraint_values = [ + "@bazel_tools//platforms:x86_64", + "@bazel_tools//platforms:linux", + "@bazel_tools//tools/cpp:clang", + "@bazel_toolchains//constraints:xenial", + "@bazel_toolchains//constraints/sanitizers:support_msan", + ], + remote_execution_properties = """ + properties: { + name: "container-image" + value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:93f7e127196b9b653d39830c50f8b05d49ef6fd8739a9b5b8ab16e1df5399e50" + } + properties: { + name: "dockerAddCapabilities" + value: "SYS_ADMIN" + } + properties: { + name: "dockerPrivileged" + value: "true" + } + """, +) + +toolchain( + name = "cc-toolchain-clang-x86_64-default", + exec_compatible_with = [ + ], + target_compatible_with = [ + ], + toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/10.0.0/bazel_2.0.0/cc:cc-compiler-k8", + toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", +) diff --git a/benchmarks/defs.bzl b/benchmarks/defs.bzl deleted file mode 100644 index 79e6cdbc8..000000000 --- a/benchmarks/defs.bzl +++ /dev/null @@ -1,18 +0,0 @@ -"""Provides python helper functions.""" - -load("@pydeps//:requirements.bzl", _requirement = "requirement") - -def filter_deps(deps = None): - if deps == None: - deps = [] - return [dep for dep in deps if dep] - -def py_library(deps = None, **kwargs): - return native.py_library(deps = filter_deps(deps), **kwargs) - -def py_test(deps = None, **kwargs): - return native.py_test(deps = filter_deps(deps), **kwargs) - -def requirement(name, direct = True): - """ requirement returns the required dependency. 
""" - return _requirement(name) diff --git a/benchmarks/harness/BUILD b/benchmarks/harness/BUILD index 081a74243..52d4e42f8 100644 --- a/benchmarks/harness/BUILD +++ b/benchmarks/harness/BUILD @@ -1,4 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "requirement") +load("//tools:defs.bzl", "py_library", "py_requirement") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -25,16 +25,16 @@ py_library( srcs = ["container.py"], deps = [ "//benchmarks/workloads", - requirement("asn1crypto", False), - requirement("chardet", False), - requirement("certifi", False), - requirement("docker", True), - requirement("docker-pycreds", False), - requirement("idna", False), - requirement("ptyprocess", False), - requirement("requests", False), - requirement("urllib3", False), - requirement("websocket-client", False), + py_requirement("asn1crypto", False), + py_requirement("chardet", False), + py_requirement("certifi", False), + py_requirement("docker", True), + py_requirement("docker-pycreds", False), + py_requirement("idna", False), + py_requirement("ptyprocess", False), + py_requirement("requests", False), + py_requirement("urllib3", False), + py_requirement("websocket-client", False), ], ) @@ -47,17 +47,17 @@ py_library( "//benchmarks/harness:ssh_connection", "//benchmarks/harness:tunnel_dispatcher", "//benchmarks/harness/machine_mocks", - requirement("asn1crypto", False), - requirement("chardet", False), - requirement("certifi", False), - requirement("docker", True), - requirement("docker-pycreds", False), - requirement("idna", False), - requirement("ptyprocess", False), - requirement("requests", False), - requirement("six", False), - requirement("urllib3", False), - requirement("websocket-client", False), + py_requirement("asn1crypto", False), + py_requirement("chardet", False), + py_requirement("certifi", False), + py_requirement("docker", True), + py_requirement("docker-pycreds", False), + py_requirement("idna", False), + py_requirement("ptyprocess", 
False), + py_requirement("requests", False), + py_requirement("six", False), + py_requirement("urllib3", False), + py_requirement("websocket-client", False), ], ) @@ -66,10 +66,10 @@ py_library( srcs = ["ssh_connection.py"], deps = [ "//benchmarks/harness", - requirement("bcrypt", False), - requirement("cffi", True), - requirement("paramiko", True), - requirement("cryptography", False), + py_requirement("bcrypt", False), + py_requirement("cffi", True), + py_requirement("paramiko", True), + py_requirement("cryptography", False), ], ) @@ -77,16 +77,16 @@ py_library( name = "tunnel_dispatcher", srcs = ["tunnel_dispatcher.py"], deps = [ - requirement("asn1crypto", False), - requirement("chardet", False), - requirement("certifi", False), - requirement("docker", True), - requirement("docker-pycreds", False), - requirement("idna", False), - requirement("pexpect", True), - requirement("ptyprocess", False), - requirement("requests", False), - requirement("urllib3", False), - requirement("websocket-client", False), + py_requirement("asn1crypto", False), + py_requirement("chardet", False), + py_requirement("certifi", False), + py_requirement("docker", True), + py_requirement("docker-pycreds", False), + py_requirement("idna", False), + py_requirement("pexpect", True), + py_requirement("ptyprocess", False), + py_requirement("requests", False), + py_requirement("urllib3", False), + py_requirement("websocket-client", False), ], ) diff --git a/benchmarks/harness/machine_producers/BUILD b/benchmarks/harness/machine_producers/BUILD index c4e943882..48ea0ef39 100644 --- a/benchmarks/harness/machine_producers/BUILD +++ b/benchmarks/harness/machine_producers/BUILD @@ -1,4 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "requirement") +load("//tools:defs.bzl", "py_library", "py_requirement") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -31,7 +31,7 @@ py_library( deps = [ "//benchmarks/harness:machine", "//benchmarks/harness/machine_producers:machine_producer", 
- requirement("PyYAML", False), + py_requirement("PyYAML", False), ], ) diff --git a/benchmarks/runner/BUILD b/benchmarks/runner/BUILD index e1b2ea550..fae0ca800 100644 --- a/benchmarks/runner/BUILD +++ b/benchmarks/runner/BUILD @@ -1,4 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") +load("//tools:defs.bzl", "py_library", "py_requirement", "py_test") package(licenses = ["notice"]) @@ -28,7 +28,7 @@ py_library( "//benchmarks/suites:startup", "//benchmarks/suites:sysbench", "//benchmarks/suites:syscall", - requirement("click", True), + py_requirement("click", True), ], ) @@ -36,7 +36,7 @@ py_library( name = "commands", srcs = ["commands.py"], deps = [ - requirement("click", True), + py_requirement("click", True), ], ) @@ -50,14 +50,14 @@ py_test( ], deps = [ ":runner", - requirement("click", True), - requirement("attrs", False), - requirement("atomicwrites", False), - requirement("more-itertools", False), - requirement("pathlib2", False), - requirement("pluggy", False), - requirement("py", False), - requirement("pytest", True), - requirement("six", False), + py_requirement("click", True), + py_requirement("attrs", False), + py_requirement("atomicwrites", False), + py_requirement("more-itertools", False), + py_requirement("pathlib2", False), + py_requirement("pluggy", False), + py_requirement("py", False), + py_requirement("pytest", True), + py_requirement("six", False), ], ) diff --git a/benchmarks/tcp/BUILD b/benchmarks/tcp/BUILD index 735d7127f..d5e401acc 100644 --- a/benchmarks/tcp/BUILD +++ b/benchmarks/tcp/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") -load("@rules_cc//cc:defs.bzl", "cc_binary") +load("//tools:defs.bzl", "cc_binary", "go_binary") package(licenses = ["notice"]) diff --git a/benchmarks/workloads/ab/BUILD b/benchmarks/workloads/ab/BUILD index 4fc0ab735..4dd91ceb3 100644 --- a/benchmarks/workloads/ab/BUILD +++ b/benchmarks/workloads/ab/BUILD @@ -1,5 +1,4 @@ -load("//benchmarks:defs.bzl", 
"py_library", "py_test", "requirement") -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -17,14 +16,14 @@ py_test( python_version = "PY3", deps = [ ":ab", - requirement("attrs", False), - requirement("atomicwrites", False), - requirement("more-itertools", False), - requirement("pathlib2", False), - requirement("pluggy", False), - requirement("py", False), - requirement("pytest", True), - requirement("six", False), + py_requirement("attrs", False), + py_requirement("atomicwrites", False), + py_requirement("more-itertools", False), + py_requirement("pathlib2", False), + py_requirement("pluggy", False), + py_requirement("py", False), + py_requirement("pytest", True), + py_requirement("six", False), ], ) diff --git a/benchmarks/workloads/absl/BUILD b/benchmarks/workloads/absl/BUILD index 61e010096..55dae3baa 100644 --- a/benchmarks/workloads/absl/BUILD +++ b/benchmarks/workloads/absl/BUILD @@ -1,5 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -17,14 +16,14 @@ py_test( python_version = "PY3", deps = [ ":absl", - requirement("attrs", False), - requirement("atomicwrites", False), - requirement("more-itertools", False), - requirement("pathlib2", False), - requirement("pluggy", False), - requirement("py", False), - requirement("pytest", True), - requirement("six", False), + py_requirement("attrs", False), + py_requirement("atomicwrites", False), + py_requirement("more-itertools", False), + py_requirement("pathlib2", False), + py_requirement("pluggy", False), + py_requirement("py", False), + py_requirement("pytest", True), + py_requirement("six", False), ], ) diff --git a/benchmarks/workloads/curl/BUILD 
b/benchmarks/workloads/curl/BUILD index eb0fb6165..a70873065 100644 --- a/benchmarks/workloads/curl/BUILD +++ b/benchmarks/workloads/curl/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/ffmpeg/BUILD b/benchmarks/workloads/ffmpeg/BUILD index be472dfb2..7c41ba631 100644 --- a/benchmarks/workloads/ffmpeg/BUILD +++ b/benchmarks/workloads/ffmpeg/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/fio/BUILD b/benchmarks/workloads/fio/BUILD index de257adad..7b78e8e75 100644 --- a/benchmarks/workloads/fio/BUILD +++ b/benchmarks/workloads/fio/BUILD @@ -1,5 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -17,14 +16,14 @@ py_test( python_version = "PY3", deps = [ ":fio", - requirement("attrs", False), - requirement("atomicwrites", False), - requirement("more-itertools", False), - requirement("pathlib2", False), - requirement("pluggy", False), - requirement("py", False), - requirement("pytest", True), - requirement("six", False), + py_requirement("attrs", False), + py_requirement("atomicwrites", False), + py_requirement("more-itertools", False), + py_requirement("pathlib2", False), + py_requirement("pluggy", False), + py_requirement("py", False), + py_requirement("pytest", True), + py_requirement("six", False), ], ) diff --git a/benchmarks/workloads/httpd/BUILD b/benchmarks/workloads/httpd/BUILD index eb0fb6165..a70873065 100644 --- a/benchmarks/workloads/httpd/BUILD +++ b/benchmarks/workloads/httpd/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") 
+load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/iperf/BUILD b/benchmarks/workloads/iperf/BUILD index 8832a996c..570f40148 100644 --- a/benchmarks/workloads/iperf/BUILD +++ b/benchmarks/workloads/iperf/BUILD @@ -1,5 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -17,14 +16,14 @@ py_test( python_version = "PY3", deps = [ ":iperf", - requirement("attrs", False), - requirement("atomicwrites", False), - requirement("more-itertools", False), - requirement("pathlib2", False), - requirement("pluggy", False), - requirement("py", False), - requirement("pytest", True), - requirement("six", False), + py_requirement("attrs", False), + py_requirement("atomicwrites", False), + py_requirement("more-itertools", False), + py_requirement("pathlib2", False), + py_requirement("pluggy", False), + py_requirement("py", False), + py_requirement("pytest", True), + py_requirement("six", False), ], ) diff --git a/benchmarks/workloads/netcat/BUILD b/benchmarks/workloads/netcat/BUILD index eb0fb6165..a70873065 100644 --- a/benchmarks/workloads/netcat/BUILD +++ b/benchmarks/workloads/netcat/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/nginx/BUILD b/benchmarks/workloads/nginx/BUILD index eb0fb6165..a70873065 100644 --- a/benchmarks/workloads/nginx/BUILD +++ b/benchmarks/workloads/nginx/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/node/BUILD b/benchmarks/workloads/node/BUILD index 
71cd9f519..bfcf78cf9 100644 --- a/benchmarks/workloads/node/BUILD +++ b/benchmarks/workloads/node/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/node_template/BUILD b/benchmarks/workloads/node_template/BUILD index ca996f068..e142f082a 100644 --- a/benchmarks/workloads/node_template/BUILD +++ b/benchmarks/workloads/node_template/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/redis/BUILD b/benchmarks/workloads/redis/BUILD index eb0fb6165..a70873065 100644 --- a/benchmarks/workloads/redis/BUILD +++ b/benchmarks/workloads/redis/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/redisbenchmark/BUILD b/benchmarks/workloads/redisbenchmark/BUILD index f5994a815..f472a4443 100644 --- a/benchmarks/workloads/redisbenchmark/BUILD +++ b/benchmarks/workloads/redisbenchmark/BUILD @@ -1,5 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -17,14 +16,14 @@ py_test( python_version = "PY3", deps = [ ":redisbenchmark", - requirement("attrs", False), - requirement("atomicwrites", False), - requirement("more-itertools", False), - requirement("pathlib2", False), - requirement("pluggy", False), - requirement("py", False), - requirement("pytest", True), - requirement("six", False), + py_requirement("attrs", False), + py_requirement("atomicwrites", False), + py_requirement("more-itertools", False), + py_requirement("pathlib2", False), + 
py_requirement("pluggy", False), + py_requirement("py", False), + py_requirement("pytest", True), + py_requirement("six", False), ], ) diff --git a/benchmarks/workloads/ruby/BUILD b/benchmarks/workloads/ruby/BUILD index e37d77804..a3be4fe92 100644 --- a/benchmarks/workloads/ruby/BUILD +++ b/benchmarks/workloads/ruby/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/ruby_template/BUILD b/benchmarks/workloads/ruby_template/BUILD index 27f7c0c46..59443b14a 100644 --- a/benchmarks/workloads/ruby_template/BUILD +++ b/benchmarks/workloads/ruby_template/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/sleep/BUILD b/benchmarks/workloads/sleep/BUILD index eb0fb6165..a70873065 100644 --- a/benchmarks/workloads/sleep/BUILD +++ b/benchmarks/workloads/sleep/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/sysbench/BUILD b/benchmarks/workloads/sysbench/BUILD index fd2f8f03d..3834af7ed 100644 --- a/benchmarks/workloads/sysbench/BUILD +++ b/benchmarks/workloads/sysbench/BUILD @@ -1,5 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -17,14 +16,14 @@ py_test( python_version = "PY3", deps = [ ":sysbench", - requirement("attrs", False), - requirement("atomicwrites", False), - requirement("more-itertools", False), - requirement("pathlib2", False), - requirement("pluggy", False), - requirement("py", False), - requirement("pytest", 
True), - requirement("six", False), + py_requirement("attrs", False), + py_requirement("atomicwrites", False), + py_requirement("more-itertools", False), + py_requirement("pathlib2", False), + py_requirement("pluggy", False), + py_requirement("py", False), + py_requirement("pytest", True), + py_requirement("six", False), ], ) diff --git a/benchmarks/workloads/syscall/BUILD b/benchmarks/workloads/syscall/BUILD index 5100cbb21..dba4bb1e7 100644 --- a/benchmarks/workloads/syscall/BUILD +++ b/benchmarks/workloads/syscall/BUILD @@ -1,5 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -17,14 +16,14 @@ py_test( python_version = "PY3", deps = [ ":syscall", - requirement("attrs", False), - requirement("atomicwrites", False), - requirement("more-itertools", False), - requirement("pathlib2", False), - requirement("pluggy", False), - requirement("py", False), - requirement("pytest", True), - requirement("six", False), + py_requirement("attrs", False), + py_requirement("atomicwrites", False), + py_requirement("more-itertools", False), + py_requirement("pathlib2", False), + py_requirement("pluggy", False), + py_requirement("py", False), + py_requirement("pytest", True), + py_requirement("six", False), ], ) diff --git a/benchmarks/workloads/tensorflow/BUILD b/benchmarks/workloads/tensorflow/BUILD index 026c3b316..a7b7742f4 100644 --- a/benchmarks/workloads/tensorflow/BUILD +++ b/benchmarks/workloads/tensorflow/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/true/BUILD b/benchmarks/workloads/true/BUILD index 221c4b9a7..eba23d325 100644 --- a/benchmarks/workloads/true/BUILD +++ b/benchmarks/workloads/true/BUILD @@ -1,4 
+1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/pkg/abi/BUILD b/pkg/abi/BUILD index f5c08ea06..839f822eb 100644 --- a/pkg/abi/BUILD +++ b/pkg/abi/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -9,6 +9,5 @@ go_library( "abi_linux.go", "flag.go", ], - importpath = "gvisor.dev/gvisor/pkg/abi", visibility = ["//:sandbox"], ) diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 716ff22d2..1f3c0c687 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") # Package linux contains the constants and types needed to interface with a # Linux kernel. It should be used instead of syscall or golang.org/x/sys/unix @@ -60,7 +59,6 @@ go_library( "wait.go", "xattr.go", ], - importpath = "gvisor.dev/gvisor/pkg/abi/linux", visibility = ["//visibility:public"], deps = [ "//pkg/abi", @@ -73,7 +71,7 @@ go_test( name = "linux_test", size = "small", srcs = ["netfilter_test.go"], - embed = [":linux"], + library = ":linux", deps = [ "//pkg/binary", ], diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD index d99e37b40..9612f072e 100644 --- a/pkg/amutex/BUILD +++ b/pkg/amutex/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "amutex", srcs = ["amutex.go"], - importpath = "gvisor.dev/gvisor/pkg/amutex", visibility = ["//:sandbox"], ) @@ -14,6 +12,6 @@ go_test( name = "amutex_test", size = "small", srcs = ["amutex_test.go"], - embed = [":amutex"], + library = ":amutex", deps = ["//pkg/sync"], ) diff --git a/pkg/atomicbitops/BUILD 
b/pkg/atomicbitops/BUILD index 6403c60c2..3948074ba 100644 --- a/pkg/atomicbitops/BUILD +++ b/pkg/atomicbitops/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -11,7 +10,6 @@ go_library( "atomic_bitops_arm64.s", "atomic_bitops_common.go", ], - importpath = "gvisor.dev/gvisor/pkg/atomicbitops", visibility = ["//:sandbox"], ) @@ -19,6 +17,6 @@ go_test( name = "atomicbitops_test", size = "small", srcs = ["atomic_bitops_test.go"], - embed = [":atomicbitops"], + library = ":atomicbitops", deps = ["//pkg/sync"], ) diff --git a/pkg/binary/BUILD b/pkg/binary/BUILD index 543fb54bf..7ca2fda90 100644 --- a/pkg/binary/BUILD +++ b/pkg/binary/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "binary", srcs = ["binary.go"], - importpath = "gvisor.dev/gvisor/pkg/binary", visibility = ["//:sandbox"], ) @@ -14,5 +12,5 @@ go_test( name = "binary_test", size = "small", srcs = ["binary_test.go"], - embed = [":binary"], + library = ":binary", ) diff --git a/pkg/bits/BUILD b/pkg/bits/BUILD index 93b88a29a..63f4670d7 100644 --- a/pkg/bits/BUILD +++ b/pkg/bits/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") package(licenses = ["notice"]) @@ -15,7 +14,6 @@ go_library( "uint64_arch_arm64_asm.s", "uint64_arch_generic.go", ], - importpath = "gvisor.dev/gvisor/pkg/bits", visibility = ["//:sandbox"], ) @@ -53,5 +51,5 @@ go_test( name = "bits_test", size = "small", srcs = ["uint64_test.go"], - embed = [":bits"], + library = ":bits", ) diff --git a/pkg/bpf/BUILD 
b/pkg/bpf/BUILD index fba5643e8..2a6977f85 100644 --- a/pkg/bpf/BUILD +++ b/pkg/bpf/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -12,7 +11,6 @@ go_library( "interpreter.go", "program_builder.go", ], - importpath = "gvisor.dev/gvisor/pkg/bpf", visibility = ["//visibility:public"], deps = ["//pkg/abi/linux"], ) @@ -25,7 +23,7 @@ go_test( "interpreter_test.go", "program_builder_test.go", ], - embed = [":bpf"], + library = ":bpf", deps = [ "//pkg/abi/linux", "//pkg/binary", diff --git a/pkg/compressio/BUILD b/pkg/compressio/BUILD index 2bb581b18..1f75319a7 100644 --- a/pkg/compressio/BUILD +++ b/pkg/compressio/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "compressio", srcs = ["compressio.go"], - importpath = "gvisor.dev/gvisor/pkg/compressio", visibility = ["//:sandbox"], deps = [ "//pkg/binary", @@ -18,5 +16,5 @@ go_test( name = "compressio_test", size = "medium", srcs = ["compressio_test.go"], - embed = [":compressio"], + library = ":compressio", ) diff --git a/pkg/control/client/BUILD b/pkg/control/client/BUILD index 066d7b1a1..1b9e10ee7 100644 --- a/pkg/control/client/BUILD +++ b/pkg/control/client/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -7,7 +7,6 @@ go_library( srcs = [ "client.go", ], - importpath = "gvisor.dev/gvisor/pkg/control/client", visibility = ["//:sandbox"], deps = [ "//pkg/unet", diff --git a/pkg/control/server/BUILD b/pkg/control/server/BUILD index adbd1e3f8..002d2ef44 100644 --- a/pkg/control/server/BUILD +++ b/pkg/control/server/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") 
+load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "server", srcs = ["server.go"], - importpath = "gvisor.dev/gvisor/pkg/control/server", visibility = ["//:sandbox"], deps = [ "//pkg/log", diff --git a/pkg/cpuid/BUILD b/pkg/cpuid/BUILD index ed111fd2a..43a432190 100644 --- a/pkg/cpuid/BUILD +++ b/pkg/cpuid/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( "cpu_amd64.s", "cpuid.go", ], - importpath = "gvisor.dev/gvisor/pkg/cpuid", visibility = ["//:sandbox"], deps = ["//pkg/log"], ) @@ -18,7 +16,7 @@ go_test( name = "cpuid_test", size = "small", srcs = ["cpuid_test.go"], - embed = [":cpuid"], + library = ":cpuid", ) go_test( @@ -27,6 +25,6 @@ go_test( srcs = [ "cpuid_parse_test.go", ], - embed = [":cpuid"], + library = ":cpuid", tags = ["manual"], ) diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD index 9d68682c7..bee28b68d 100644 --- a/pkg/eventchannel/BUILD +++ b/pkg/eventchannel/BUILD @@ -1,6 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test", "proto_library") package(licenses = ["notice"]) @@ -10,7 +8,6 @@ go_library( "event.go", "rate.go", ], - importpath = "gvisor.dev/gvisor/pkg/eventchannel", visibility = ["//:sandbox"], deps = [ ":eventchannel_go_proto", @@ -24,22 +21,15 @@ go_library( ) proto_library( - name = "eventchannel_proto", + name = "eventchannel", srcs = ["event.proto"], visibility = ["//:sandbox"], ) -go_proto_library( - name = "eventchannel_go_proto", - importpath = "gvisor.dev/gvisor/pkg/eventchannel/eventchannel_go_proto", - proto = ":eventchannel_proto", - visibility = ["//:sandbox"], -) - go_test( name = "eventchannel_test", srcs = 
["event_test.go"], - embed = [":eventchannel"], + library = ":eventchannel", deps = [ "//pkg/sync", "@com_github_golang_protobuf//proto:go_default_library", diff --git a/pkg/fd/BUILD b/pkg/fd/BUILD index afa8f7659..872361546 100644 --- a/pkg/fd/BUILD +++ b/pkg/fd/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "fd", srcs = ["fd.go"], - importpath = "gvisor.dev/gvisor/pkg/fd", visibility = ["//visibility:public"], ) @@ -14,5 +12,5 @@ go_test( name = "fd_test", size = "small", srcs = ["fd_test.go"], - embed = [":fd"], + library = ":fd", ) diff --git a/pkg/fdchannel/BUILD b/pkg/fdchannel/BUILD index b0478c672..d9104ef02 100644 --- a/pkg/fdchannel/BUILD +++ b/pkg/fdchannel/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "fdchannel", srcs = ["fdchannel_unsafe.go"], - importpath = "gvisor.dev/gvisor/pkg/fdchannel", visibility = ["//visibility:public"], ) @@ -14,6 +12,6 @@ go_test( name = "fdchannel_test", size = "small", srcs = ["fdchannel_test.go"], - embed = [":fdchannel"], + library = ":fdchannel", deps = ["//pkg/sync"], ) diff --git a/pkg/fdnotifier/BUILD b/pkg/fdnotifier/BUILD index 91a202a30..235dcc490 100644 --- a/pkg/fdnotifier/BUILD +++ b/pkg/fdnotifier/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "fdnotifier.go", "poll_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/fdnotifier", visibility = ["//:sandbox"], deps = [ "//pkg/sync", diff --git a/pkg/flipcall/BUILD b/pkg/flipcall/BUILD index 85bd83af1..9c5ad500b 100644 --- a/pkg/flipcall/BUILD +++ 
b/pkg/flipcall/BUILD @@ -1,7 +1,6 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "flipcall", @@ -13,7 +12,6 @@ go_library( "io.go", "packet_window_allocator.go", ], - importpath = "gvisor.dev/gvisor/pkg/flipcall", visibility = ["//visibility:public"], deps = [ "//pkg/abi/linux", @@ -30,6 +28,6 @@ go_test( "flipcall_example_test.go", "flipcall_test.go", ], - embed = [":flipcall"], + library = ":flipcall", deps = ["//pkg/sync"], ) diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD index ca540363c..ee84471b2 100644 --- a/pkg/fspath/BUILD +++ b/pkg/fspath/BUILD @@ -1,10 +1,8 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") -package( - default_visibility = ["//visibility:public"], - licenses = ["notice"], -) +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) go_library( name = "fspath", @@ -13,7 +11,6 @@ go_library( "builder_unsafe.go", "fspath.go", ], - importpath = "gvisor.dev/gvisor/pkg/fspath", ) go_test( @@ -23,5 +20,5 @@ go_test( "builder_test.go", "fspath_test.go", ], - embed = [":fspath"], + library = ":fspath", ) diff --git a/pkg/gate/BUILD b/pkg/gate/BUILD index f22bd070d..dd3141143 100644 --- a/pkg/gate/BUILD +++ b/pkg/gate/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -8,7 +7,6 @@ go_library( srcs = [ "gate.go", ], - importpath = "gvisor.dev/gvisor/pkg/gate", visibility = ["//visibility:public"], ) diff --git a/pkg/goid/BUILD b/pkg/goid/BUILD index 5d31e5366..ea8d2422c 100644 --- a/pkg/goid/BUILD +++ b/pkg/goid/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") 
-load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -11,7 +10,6 @@ go_library( "goid_race.go", "goid_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/goid", visibility = ["//visibility:public"], ) @@ -22,5 +20,5 @@ go_test( "empty_test.go", "goid_test.go", ], - embed = [":goid"], + library = ":goid", ) diff --git a/pkg/ilist/BUILD b/pkg/ilist/BUILD index 34d2673ef..3f6eb07df 100644 --- a/pkg/ilist/BUILD +++ b/pkg/ilist/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( srcs = [ "interface_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/ilist", visibility = ["//visibility:public"], ) @@ -41,7 +39,7 @@ go_test( "list_test.go", "test_list.go", ], - embed = [":ilist"], + library = ":ilist", ) go_template( diff --git a/pkg/linewriter/BUILD b/pkg/linewriter/BUILD index bcde6d308..41bf104d0 100644 --- a/pkg/linewriter/BUILD +++ b/pkg/linewriter/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "linewriter", srcs = ["linewriter.go"], - importpath = "gvisor.dev/gvisor/pkg/linewriter", visibility = ["//visibility:public"], deps = ["//pkg/sync"], ) @@ -14,5 +12,5 @@ go_library( go_test( name = "linewriter_test", srcs = ["linewriter_test.go"], - embed = [":linewriter"], + library = ":linewriter", ) diff --git a/pkg/log/BUILD b/pkg/log/BUILD index 0df0f2849..935d06963 100644 --- a/pkg/log/BUILD +++ b/pkg/log/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", 
"go_library", "go_test") package(licenses = ["notice"]) @@ -12,7 +11,6 @@ go_library( "json_k8s.go", "log.go", ], - importpath = "gvisor.dev/gvisor/pkg/log", visibility = [ "//visibility:public", ], @@ -29,5 +27,5 @@ go_test( "json_test.go", "log_test.go", ], - embed = [":log"], + library = ":log", ) diff --git a/pkg/memutil/BUILD b/pkg/memutil/BUILD index 7b50e2b28..9d07d98b4 100644 --- a/pkg/memutil/BUILD +++ b/pkg/memutil/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "memutil", srcs = ["memutil_unsafe.go"], - importpath = "gvisor.dev/gvisor/pkg/memutil", visibility = ["//visibility:public"], deps = ["@org_golang_x_sys//unix:go_default_library"], ) diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD index 9145f3233..58305009d 100644 --- a/pkg/metric/BUILD +++ b/pkg/metric/BUILD @@ -1,14 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("@rules_cc//cc:defs.bzl", "cc_proto_library") +load("//tools:defs.bzl", "go_library", "go_test", "proto_library") package(licenses = ["notice"]) go_library( name = "metric", srcs = ["metric.go"], - importpath = "gvisor.dev/gvisor/pkg/metric", visibility = ["//:sandbox"], deps = [ ":metric_go_proto", @@ -19,28 +15,15 @@ go_library( ) proto_library( - name = "metric_proto", + name = "metric", srcs = ["metric.proto"], visibility = ["//:sandbox"], ) -cc_proto_library( - name = "metric_cc_proto", - visibility = ["//:sandbox"], - deps = [":metric_proto"], -) - -go_proto_library( - name = "metric_go_proto", - importpath = "gvisor.dev/gvisor/pkg/metric/metric_go_proto", - proto = ":metric_proto", - visibility = ["//:sandbox"], -) - go_test( name = "metric_test", srcs = ["metric_test.go"], - embed = [":metric"], + library = ":metric", deps = [ ":metric_go_proto", "//pkg/eventchannel", diff --git 
a/pkg/p9/BUILD b/pkg/p9/BUILD index a3e05c96d..4ccc1de86 100644 --- a/pkg/p9/BUILD +++ b/pkg/p9/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package( default_visibility = ["//visibility:public"], @@ -23,7 +22,6 @@ go_library( "transport_flipcall.go", "version.go", ], - importpath = "gvisor.dev/gvisor/pkg/p9", deps = [ "//pkg/fd", "//pkg/fdchannel", @@ -47,7 +45,7 @@ go_test( "transport_test.go", "version_test.go", ], - embed = [":p9"], + library = ":p9", deps = [ "//pkg/fd", "//pkg/unet", diff --git a/pkg/p9/p9test/BUILD b/pkg/p9/p9test/BUILD index f4edd68b2..7ca67cb19 100644 --- a/pkg/p9/p9test/BUILD +++ b/pkg/p9/p9test/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_test") +load("//tools:defs.bzl", "go_binary", "go_library", "go_test") package(licenses = ["notice"]) @@ -64,7 +63,6 @@ go_library( "mocks.go", "p9test.go", ], - importpath = "gvisor.dev/gvisor/pkg/p9/p9test", visibility = ["//:sandbox"], deps = [ "//pkg/fd", @@ -80,7 +78,7 @@ go_test( name = "client_test", size = "medium", srcs = ["client_test.go"], - embed = [":p9test"], + library = ":p9test", deps = [ "//pkg/fd", "//pkg/p9", diff --git a/pkg/procid/BUILD b/pkg/procid/BUILD index b506813f0..aa3e3ac0b 100644 --- a/pkg/procid/BUILD +++ b/pkg/procid/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -10,7 +9,6 @@ go_library( "procid_amd64.s", "procid_arm64.s", ], - importpath = "gvisor.dev/gvisor/pkg/procid", visibility = ["//visibility:public"], ) @@ -20,7 +18,7 @@ go_test( srcs = [ "procid_test.go", ], - embed = [":procid"], + library = ":procid", deps = ["//pkg/sync"], ) @@ -31,6 +29,6 @@ go_test( "procid_net_test.go", 
"procid_test.go", ], - embed = [":procid"], + library = ":procid", deps = ["//pkg/sync"], ) diff --git a/pkg/rand/BUILD b/pkg/rand/BUILD index 9d5b4859b..80b8ceb02 100644 --- a/pkg/rand/BUILD +++ b/pkg/rand/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "rand.go", "rand_linux.go", ], - importpath = "gvisor.dev/gvisor/pkg/rand", visibility = ["//:sandbox"], deps = [ "//pkg/sync", diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD index 974d9af9b..74affc887 100644 --- a/pkg/refs/BUILD +++ b/pkg/refs/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -23,7 +22,6 @@ go_library( "refcounter_state.go", "weak_ref_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/refs", visibility = ["//:sandbox"], deps = [ "//pkg/log", @@ -35,6 +33,6 @@ go_test( name = "refs_test", size = "small", srcs = ["refcounter_test.go"], - embed = [":refs"], + library = ":refs", deps = ["//pkg/sync"], ) diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD index af94e944d..742c8b79b 100644 --- a/pkg/seccomp/BUILD +++ b/pkg/seccomp/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_embed_data", "go_test") +load("//tools:defs.bzl", "go_binary", "go_embed_data", "go_library", "go_test") package(licenses = ["notice"]) @@ -27,7 +26,6 @@ go_library( "seccomp_rules.go", "seccomp_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/seccomp", visibility = ["//visibility:public"], deps = [ "//pkg/abi/linux", @@ -43,7 +41,7 @@ go_test( "seccomp_test.go", ":victim_data", ], - embed = [":seccomp"], + library = ":seccomp", deps = [ "//pkg/abi/linux", "//pkg/binary", diff --git a/pkg/secio/BUILD 
b/pkg/secio/BUILD index 22abdc69f..60f63c7a6 100644 --- a/pkg/secio/BUILD +++ b/pkg/secio/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( "full_reader.go", "secio.go", ], - importpath = "gvisor.dev/gvisor/pkg/secio", visibility = ["//pkg/sentry:internal"], ) @@ -17,5 +15,5 @@ go_test( name = "secio_test", size = "small", srcs = ["secio_test.go"], - embed = [":secio"], + library = ":secio", ) diff --git a/pkg/segment/test/BUILD b/pkg/segment/test/BUILD index a27c35e21..f2d8462d8 100644 --- a/pkg/segment/test/BUILD +++ b/pkg/segment/test/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") package( @@ -38,7 +37,6 @@ go_library( "int_set.go", "set_functions.go", ], - importpath = "gvisor.dev/gvisor/pkg/segment/segment", deps = [ "//pkg/state", ], @@ -48,5 +46,5 @@ go_test( name = "segment_test", size = "small", srcs = ["segment_test.go"], - embed = [":segment"], + library = ":segment", ) diff --git a/pkg/sentry/BUILD b/pkg/sentry/BUILD index 2d6379c86..e8b794179 100644 --- a/pkg/sentry/BUILD +++ b/pkg/sentry/BUILD @@ -6,6 +6,8 @@ package(licenses = ["notice"]) package_group( name = "internal", packages = [ + "//cloud/gvisor/gopkg/sentry/...", + "//cloud/gvisor/sentry/...", "//pkg/sentry/...", "//runsc/...", # Code generated by go_marshal relies on go_marshal libraries. 
diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 65f22af2b..51ca09b24 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -1,6 +1,4 @@ -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@rules_cc//cc:defs.bzl", "cc_proto_library") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "proto_library") package(licenses = ["notice"]) @@ -27,7 +25,6 @@ go_library( "syscalls_amd64.go", "syscalls_arm64.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/arch", visibility = ["//:sandbox"], deps = [ ":registers_go_proto", @@ -44,20 +41,7 @@ go_library( ) proto_library( - name = "registers_proto", + name = "registers", srcs = ["registers.proto"], visibility = ["//visibility:public"], ) - -cc_proto_library( - name = "registers_cc_proto", - visibility = ["//visibility:public"], - deps = [":registers_proto"], -) - -go_proto_library( - name = "registers_go_proto", - importpath = "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto", - proto = ":registers_proto", - visibility = ["//visibility:public"], -) diff --git a/pkg/sentry/context/BUILD b/pkg/sentry/context/BUILD index 8dc1a77b1..e13a9ce20 100644 --- a/pkg/sentry/context/BUILD +++ b/pkg/sentry/context/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "context", srcs = ["context.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/context", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/amutex", diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD index 581e7aa96..f91a6d4ed 100644 --- a/pkg/sentry/context/contexttest/BUILD +++ b/pkg/sentry/context/contexttest/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -6,7 +6,6 @@ go_library( name = "contexttest", testonly = 1, srcs = 
["contexttest.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/context/contexttest", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/memutil", diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index 2561a6109..e69496477 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -12,9 +11,8 @@ go_library( "proc.go", "state.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/control", visibility = [ - "//pkg/sentry:internal", + "//:sandbox", ], deps = [ "//pkg/abi/linux", @@ -40,7 +38,7 @@ go_test( name = "control_test", size = "small", srcs = ["proc_test.go"], - embed = [":control"], + library = ":control", deps = [ "//pkg/log", "//pkg/sentry/kernel/time", diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD index 97fa1512c..e403cbd8b 100644 --- a/pkg/sentry/device/BUILD +++ b/pkg/sentry/device/BUILD @@ -1,12 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "device", srcs = ["device.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/device", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -18,5 +16,5 @@ go_test( name = "device_test", size = "small", srcs = ["device_test.go"], - embed = [":device"], + library = ":device", ) diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 7d5d72d5a..605d61dbe 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -44,7 +43,6 @@ 
go_library( "splice.go", "sync.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -129,7 +127,7 @@ go_test( "mount_test.go", "path_test.go", ], - embed = [":fs"], + library = ":fs", deps = [ "//pkg/sentry/context", "//pkg/sentry/context/contexttest", diff --git a/pkg/sentry/fs/anon/BUILD b/pkg/sentry/fs/anon/BUILD index ae1c9cf76..c14e5405e 100644 --- a/pkg/sentry/fs/anon/BUILD +++ b/pkg/sentry/fs/anon/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "anon.go", "device.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/anon", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index a0d9e8496..0c7247bd7 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -13,7 +13,6 @@ go_library( "random.go", "tty.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/dev", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD index cc43de69d..25ef96299 100644 --- a/pkg/sentry/fs/fdpipe/BUILD +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -10,7 +9,6 @@ go_library( "pipe_opener.go", "pipe_state.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/fdpipe", imports = ["gvisor.dev/gvisor/pkg/sentry/fs"], visibility = ["//pkg/sentry:internal"], deps = [ @@ -36,7 +34,7 @@ go_test( "pipe_opener_test.go", "pipe_test.go", ], - embed = [":fdpipe"], + library = ":fdpipe", deps = [ 
"//pkg/fd", "//pkg/fdnotifier", diff --git a/pkg/sentry/fs/filetest/BUILD b/pkg/sentry/fs/filetest/BUILD index 358dc2be3..9a7608cae 100644 --- a/pkg/sentry/fs/filetest/BUILD +++ b/pkg/sentry/fs/filetest/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -6,7 +6,6 @@ go_library( name = "filetest", testonly = 1, srcs = ["filetest.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/filetest", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/sentry/context", diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 945b6270d..9142f5bdf 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -75,7 +74,6 @@ go_library( "inode.go", "inode_cached.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/fsutil", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -106,7 +104,7 @@ go_test( "dirty_set_test.go", "inode_cached_test.go", ], - embed = [":fsutil"], + library = ":fsutil", deps = [ "//pkg/sentry/context", "//pkg/sentry/context/contexttest", diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index fd870e8e1..cf48e7c03 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -22,7 +21,6 @@ go_library( "socket.go", "util.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/gofer", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -56,7 +54,7 @@ go_test( name = "gofer_test", size = "small", srcs = 
["gofer_test.go"], - embed = [":gofer"], + library = ":gofer", deps = [ "//pkg/p9", "//pkg/p9/p9test", diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 2b581aa69..f586f47c1 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -25,7 +24,6 @@ go_library( "util_arm64_unsafe.go", "util_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/host", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -69,7 +67,7 @@ go_test( "socket_test.go", "wait_test.go", ], - embed = [":host"], + library = ":host", deps = [ "//pkg/fd", "//pkg/fdnotifier", diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD index 2c332a82a..ae3331737 100644 --- a/pkg/sentry/fs/lock/BUILD +++ b/pkg/sentry/fs/lock/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -40,7 +39,6 @@ go_library( "lock_set.go", "lock_set_functions.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/lock", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/log", @@ -56,5 +54,5 @@ go_test( "lock_range_test.go", "lock_test.go", ], - embed = [":lock"], + library = ":lock", ) diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index cb37c6c6b..b06bead41 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -27,7 +26,6 @@ go_library( "uptime.go", "version.go", ], - importpath = 
"gvisor.dev/gvisor/pkg/sentry/fs/proc", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -63,7 +61,7 @@ go_test( "net_test.go", "sys_net_test.go", ], - embed = [":proc"], + library = ":proc", deps = [ "//pkg/abi/linux", "//pkg/sentry/context", diff --git a/pkg/sentry/fs/proc/device/BUILD b/pkg/sentry/fs/proc/device/BUILD index 0394451d4..52c9aa93d 100644 --- a/pkg/sentry/fs/proc/device/BUILD +++ b/pkg/sentry/fs/proc/device/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "device", srcs = ["device.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/proc/device", visibility = ["//pkg/sentry:internal"], deps = ["//pkg/sentry/device"], ) diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD index 38b246dff..310d8dd52 100644 --- a/pkg/sentry/fs/proc/seqfile/BUILD +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -1,12 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "seqfile", srcs = ["seqfile.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -26,7 +24,7 @@ go_test( name = "seqfile_test", size = "small", srcs = ["seqfile_test.go"], - embed = [":seqfile"], + library = ":seqfile", deps = [ "//pkg/sentry/context", "//pkg/sentry/context/contexttest", diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index 3fb7b0633..39c4b84f8 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -11,7 +10,6 @@ go_library( "symlink.go", "tree.go", ], - 
importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ramfs", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -31,7 +29,7 @@ go_test( name = "ramfs_test", size = "small", srcs = ["tree_test.go"], - embed = [":ramfs"], + library = ":ramfs", deps = [ "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD index 25f0f124e..cc6b3bfbf 100644 --- a/pkg/sentry/fs/sys/BUILD +++ b/pkg/sentry/fs/sys/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -10,7 +10,6 @@ go_library( "fs.go", "sys.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/sys", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD index a215c1b95..092668e8d 100644 --- a/pkg/sentry/fs/timerfd/BUILD +++ b/pkg/sentry/fs/timerfd/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "timerfd", srcs = ["timerfd.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/timerfd", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/sentry/context", diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index 3400b940c..04776555f 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -12,7 +11,6 @@ go_library( "inode_file.go", "tmpfs.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -41,7 +39,7 @@ go_test( name = "tmpfs_test", size = "small", srcs = ["file_test.go"], - embed = [":tmpfs"], + library = ":tmpfs", deps = [ 
"//pkg/sentry/context", "//pkg/sentry/fs", diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index f6f60d0cf..29f804c6c 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -14,7 +13,6 @@ go_library( "slave.go", "terminal.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/tty", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -40,7 +38,7 @@ go_test( name = "tty_test", size = "small", srcs = ["tty_test.go"], - embed = [":tty"], + library = ":tty", deps = [ "//pkg/abi/linux", "//pkg/sentry/context/contexttest", diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index 903874141..a718920d5 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) @@ -32,7 +31,6 @@ go_library( "symlink.go", "utils.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -71,7 +69,7 @@ go_test( "//pkg/sentry/fsimpl/ext:assets/tiny.ext3", "//pkg/sentry/fsimpl/ext:assets/tiny.ext4", ], - embed = [":ext"], + library = ":ext", deps = [ "//pkg/abi/linux", "//pkg/binary", diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD index 4fc8296ef..12f3990c1 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/BUILD +++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_test") package(licenses = ["notice"]) diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD 
b/pkg/sentry/fsimpl/ext/disklayout/BUILD index fcfaf5c3e..9bd9c76c0 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/BUILD +++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -23,7 +22,6 @@ go_library( "superblock_old.go", "test_utils.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -44,6 +42,6 @@ go_test( "inode_test.go", "superblock_test.go", ], - embed = [":disklayout"], + library = ":disklayout", deps = ["//pkg/sentry/kernel/time"], ) diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index 66d409785..7bf83ccba 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -1,8 +1,7 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -package(licenses = ["notice"]) +licenses(["notice"]) go_template_instance( name = "slot_list", @@ -27,7 +26,6 @@ go_library( "slot_list.go", "symlink.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index c5b79fb38..3768f55b2 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -1,7 +1,6 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "proc", @@ -15,7 +14,6 @@ go_library( "tasks_net.go", "tasks_sys.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc", deps = 
[ "//pkg/abi/linux", "//pkg/log", @@ -47,7 +45,7 @@ go_test( "tasks_sys_test.go", "tasks_test.go", ], - embed = [":proc"], + library = ":proc", deps = [ "//pkg/abi/linux", "//pkg/fspath", diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index ee3c842bd..beda141f1 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -1,14 +1,12 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "sys", srcs = [ "sys.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys", deps = [ "//pkg/abi/linux", "//pkg/sentry/context", diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD index 4e70d84a7..12053a5b6 100644 --- a/pkg/sentry/fsimpl/testutil/BUILD +++ b/pkg/sentry/fsimpl/testutil/BUILD @@ -1,6 +1,6 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "testutil", @@ -9,7 +9,6 @@ go_library( "kernel.go", "testutil.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 691476b4f..857e98bc5 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -1,8 +1,7 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -package(licenses = ["notice"]) +licenses(["notice"]) go_template_instance( name = "dentry_list", @@ -28,7 +27,6 @@ go_library( "symlink.go", "tmpfs.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs", deps = [ "//pkg/abi/linux", "//pkg/amutex", @@ 
-81,7 +79,7 @@ go_test( "regular_file_test.go", "stat_test.go", ], - embed = [":tmpfs"], + library = ":tmpfs", deps = [ "//pkg/abi/linux", "//pkg/fspath", diff --git a/pkg/sentry/hostcpu/BUILD b/pkg/sentry/hostcpu/BUILD index 359468ccc..e6933aa70 100644 --- a/pkg/sentry/hostcpu/BUILD +++ b/pkg/sentry/hostcpu/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -10,7 +9,6 @@ go_library( "getcpu_arm64.s", "hostcpu.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/hostcpu", visibility = ["//:sandbox"], ) @@ -18,5 +16,5 @@ go_test( name = "hostcpu_test", size = "small", srcs = ["hostcpu_test.go"], - embed = [":hostcpu"], + library = ":hostcpu", ) diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD index 67831d5a1..a145a5ca3 100644 --- a/pkg/sentry/hostmm/BUILD +++ b/pkg/sentry/hostmm/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "cgroup.go", "hostmm.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/hostmm", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/fd", diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD index 8d60ad4ad..aa621b724 100644 --- a/pkg/sentry/inet/BUILD +++ b/pkg/sentry/inet/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package( default_visibility = ["//:sandbox"], @@ -12,7 +12,6 @@ go_library( "inet.go", "test_stack.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/inet", deps = [ "//pkg/sentry/context", "//pkg/tcpip/stack", diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index ac85ba0c8..cebaccd92 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -1,8 +1,5 @@ -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") 
-load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("@rules_cc//cc:defs.bzl", "cc_proto_library") +load("//tools:defs.bzl", "go_library", "go_test", "proto_library") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -78,26 +75,12 @@ go_template_instance( ) proto_library( - name = "uncaught_signal_proto", + name = "uncaught_signal", srcs = ["uncaught_signal.proto"], visibility = ["//visibility:public"], deps = ["//pkg/sentry/arch:registers_proto"], ) -cc_proto_library( - name = "uncaught_signal_cc_proto", - visibility = ["//visibility:public"], - deps = [":uncaught_signal_proto"], -) - -go_proto_library( - name = "uncaught_signal_go_proto", - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto", - proto = ":uncaught_signal_proto", - visibility = ["//visibility:public"], - deps = ["//pkg/sentry/arch:registers_go_proto"], -) - go_library( name = "kernel", srcs = [ @@ -156,7 +139,6 @@ go_library( "vdso.go", "version.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel", imports = [ "gvisor.dev/gvisor/pkg/bpf", "gvisor.dev/gvisor/pkg/sentry/device", @@ -227,7 +209,7 @@ go_test( "task_test.go", "timekeeper_test.go", ], - embed = [":kernel"], + library = ":kernel", deps = [ "//pkg/abi", "//pkg/sentry/arch", diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 1aa72fa47..64537c9be 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -1,5 +1,5 @@ +load("//tools:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -57,7 +57,6 @@ go_library( "id_map_set.go", "user_namespace.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/auth", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/kernel/contexttest/BUILD 
b/pkg/sentry/kernel/contexttest/BUILD index 3a88a585c..daff608d7 100644 --- a/pkg/sentry/kernel/contexttest/BUILD +++ b/pkg/sentry/kernel/contexttest/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -6,7 +6,6 @@ go_library( name = "contexttest", testonly = 1, srcs = ["contexttest.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/sentry/context", diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index c47f6b6fc..19e16ab3a 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -23,7 +22,6 @@ go_library( "epoll_list.go", "epoll_state.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/epoll", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/refs", @@ -43,7 +41,7 @@ go_test( srcs = [ "epoll_test.go", ], - embed = [":epoll"], + library = ":epoll", deps = [ "//pkg/sentry/context/contexttest", "//pkg/sentry/fs/filetest", diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index c831fbab2..ee2d74864 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -1,12 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "eventfd", srcs = ["eventfd.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -26,7 +24,7 @@ go_test( name = "eventfd_test", size = "small", srcs = 
["eventfd_test.go"], - embed = [":eventfd"], + library = ":eventfd", deps = [ "//pkg/sentry/context/contexttest", "//pkg/sentry/usermem", diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index 6b36bc63e..b9126e946 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "fasync", srcs = ["fasync.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/fasync", visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index 50db443ce..f413d8ae2 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -34,7 +33,6 @@ go_library( "futex.go", "waiter_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/futex", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -51,7 +49,7 @@ go_test( name = "futex_test", size = "small", srcs = ["futex_test.go"], - embed = [":futex"], + library = ":futex", deps = [ "//pkg/sentry/usermem", "//pkg/sync", diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD index 7f36252a9..4486848d2 100644 --- a/pkg/sentry/kernel/memevent/BUILD +++ b/pkg/sentry/kernel/memevent/BUILD @@ -1,13 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@rules_cc//cc:defs.bzl", "cc_proto_library") +load("//tools:defs.bzl", "go_library", "proto_library") package(licenses = ["notice"]) go_library( name = "memevent", srcs = ["memory_events.go"], - importpath = 
"gvisor.dev/gvisor/pkg/sentry/kernel/memevent", visibility = ["//:sandbox"], deps = [ ":memory_events_go_proto", @@ -21,20 +18,7 @@ go_library( ) proto_library( - name = "memory_events_proto", + name = "memory_events", srcs = ["memory_events.proto"], visibility = ["//visibility:public"], ) - -cc_proto_library( - name = "memory_events_cc_proto", - visibility = ["//visibility:public"], - deps = [":memory_events_proto"], -) - -go_proto_library( - name = "memory_events_go_proto", - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto", - proto = ":memory_events_proto", - visibility = ["//visibility:public"], -) diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 5eeaeff66..2c7b6206f 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -30,7 +29,6 @@ go_library( "vfs.go", "writer.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/pipe", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -57,7 +55,7 @@ go_test( "node_test.go", "pipe_test.go", ], - embed = [":pipe"], + library = ":pipe", deps = [ "//pkg/sentry/context", "//pkg/sentry/context/contexttest", diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD index 98ea7a0d8..1b82e087b 100644 --- a/pkg/sentry/kernel/sched/BUILD +++ b/pkg/sentry/kernel/sched/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( "cpuset.go", "sched.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/sched", visibility = ["//pkg/sentry:internal"], ) @@ -17,5 +15,5 @@ 
go_test( name = "sched_test", size = "small", srcs = ["cpuset_test.go"], - embed = [":sched"], + library = ":sched", ) diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index 13a961594..76e19b551 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -22,7 +21,6 @@ go_library( "semaphore.go", "waiter_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -40,7 +38,7 @@ go_test( name = "semaphore_test", size = "small", srcs = ["semaphore_test.go"], - embed = [":semaphore"], + library = ":semaphore", deps = [ "//pkg/abi/linux", "//pkg/sentry/context", diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index 7321b22ed..5547c5abf 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "device.go", "shm.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/shm", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD index 89e4d84b1..5d44773d4 100644 --- a/pkg/sentry/kernel/signalfd/BUILD +++ b/pkg/sentry/kernel/signalfd/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "signalfd", srcs = ["signalfd.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/signalfd", visibility = ["//pkg/sentry:internal"], deps = [ 
"//pkg/abi/linux", diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index 4e4de0512..d49594d9f 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "context.go", "time.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/time", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD index 9fa841e8b..67869757f 100644 --- a/pkg/sentry/limits/BUILD +++ b/pkg/sentry/limits/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -10,7 +9,6 @@ go_library( "limits.go", "linux.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/limits", visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", @@ -25,5 +23,5 @@ go_test( srcs = [ "limits_test.go", ], - embed = [":limits"], + library = ":limits", ) diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 2890393bd..d4ad2bd6c 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_embed_data") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_embed_data", "go_library") package(licenses = ["notice"]) @@ -20,7 +19,6 @@ go_library( "vdso_state.go", ":vdso_bin", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/loader", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi", diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index 112794e9c..f9a65f086 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") 
load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -37,7 +36,6 @@ go_library( "mapping_set_impl.go", "memmap.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/memmap", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/log", @@ -52,6 +50,6 @@ go_test( name = "memmap_test", size = "small", srcs = ["mapping_set_test.go"], - embed = [":memmap"], + library = ":memmap", deps = ["//pkg/sentry/usermem"], ) diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 83e248431..bd6399fa2 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -96,7 +95,6 @@ go_library( "vma.go", "vma_set.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/mm", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -128,7 +126,7 @@ go_test( name = "mm_test", size = "small", srcs = ["mm_test.go"], - embed = [":mm"], + library = ":mm", deps = [ "//pkg/sentry/arch", "//pkg/sentry/context", diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index a9a2642c5..02385a3ce 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -60,7 +59,6 @@ go_library( "save_restore.go", "usage_set.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/pgalloc", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/log", @@ -82,6 +80,6 @@ go_test( name = "pgalloc_test", size = "small", srcs = ["pgalloc_test.go"], - embed = [":pgalloc"], + library 
= ":pgalloc", deps = ["//pkg/sentry/usermem"], ) diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index 157bffa81..006450b2d 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -1,5 +1,5 @@ +load("//tools:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -22,7 +22,6 @@ go_library( "mmap_min_addr.go", "platform.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/platform/interrupt/BUILD b/pkg/sentry/platform/interrupt/BUILD index 85e882df9..83b385f14 100644 --- a/pkg/sentry/platform/interrupt/BUILD +++ b/pkg/sentry/platform/interrupt/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -8,7 +7,6 @@ go_library( srcs = [ "interrupt.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform/interrupt", visibility = ["//pkg/sentry:internal"], deps = ["//pkg/sync"], ) @@ -17,5 +15,5 @@ go_test( name = "interrupt_test", size = "small", srcs = ["interrupt_test.go"], - embed = [":interrupt"], + library = ":interrupt", ) diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 6a358d1d4..a4532a766 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -38,7 +37,6 @@ go_library( "physical_map_arm64.go", "virtual_map.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform/kvm", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -65,7 +63,7 @@ go_test( "kvm_test.go", 
"virtual_map_test.go", ], - embed = [":kvm"], + library = ":kvm", tags = [ "manual", "nogotsan", diff --git a/pkg/sentry/platform/kvm/testutil/BUILD b/pkg/sentry/platform/kvm/testutil/BUILD index b0e45f159..f7605df8a 100644 --- a/pkg/sentry/platform/kvm/testutil/BUILD +++ b/pkg/sentry/platform/kvm/testutil/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -12,6 +12,5 @@ go_library( "testutil_arm64.go", "testutil_arm64.s", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil", visibility = ["//pkg/sentry/platform/kvm:__pkg__"], ) diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index cd13390c3..3bcc5e040 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -20,7 +20,6 @@ go_library( "subprocess_linux_unsafe.go", "subprocess_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ptrace", visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD index 87f4552b5..6dee8fcc5 100644 --- a/pkg/sentry/platform/ring0/BUILD +++ b/pkg/sentry/platform/ring0/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") package(licenses = ["notice"]) @@ -74,7 +74,6 @@ go_library( "lib_arm64.s", "ring0.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ring0", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/cpuid", diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD index 42076fb04..147311ed3 100644 --- a/pkg/sentry/platform/ring0/gen_offsets/BUILD +++ 
b/pkg/sentry/platform/ring0/gen_offsets/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index 387a7f6c3..8b5cdd6c1 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -1,17 +1,14 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test", "select_arch") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") package(licenses = ["notice"]) -config_setting( - name = "aarch64", - constraint_values = ["@bazel_tools//platforms:aarch64"], -) - go_template( name = "generic_walker", - srcs = ["walker_amd64.go"], + srcs = select_arch( + amd64 = ["walker_amd64.go"], + arm64 = ["walker_amd64.go"], + ), opt_types = [ "Visitor", ], @@ -91,7 +88,6 @@ go_library( "walker_map.go", "walker_unmap.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables", visibility = [ "//pkg/sentry/platform/kvm:__subpackages__", "//pkg/sentry/platform/ring0:__subpackages__", @@ -111,6 +107,6 @@ go_test( "pagetables_test.go", "walker_check.go", ], - embed = [":pagetables"], + library = ":pagetables", deps = ["//pkg/sentry/usermem"], ) diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD index 6769cd0a5..b8747585b 100644 --- a/pkg/sentry/platform/safecopy/BUILD +++ b/pkg/sentry/platform/safecopy/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -17,7 +16,6 @@ go_library( "sighandler_amd64.s", "sighandler_arm64.s", ], - importpath = 
"gvisor.dev/gvisor/pkg/sentry/platform/safecopy", visibility = ["//pkg/sentry:internal"], deps = ["//pkg/syserror"], ) @@ -27,5 +25,5 @@ go_test( srcs = [ "safecopy_test.go", ], - embed = [":safecopy"], + library = ":safecopy", ) diff --git a/pkg/sentry/safemem/BUILD b/pkg/sentry/safemem/BUILD index 884020f7b..3ab76da97 100644 --- a/pkg/sentry/safemem/BUILD +++ b/pkg/sentry/safemem/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -11,7 +10,6 @@ go_library( "safemem.go", "seq_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/safemem", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/sentry/platform/safecopy", @@ -25,5 +23,5 @@ go_test( "io_test.go", "seq_test.go", ], - embed = [":safemem"], + library = ":safemem", ) diff --git a/pkg/sentry/sighandling/BUILD b/pkg/sentry/sighandling/BUILD index f561670c7..6c38a3f44 100644 --- a/pkg/sentry/sighandling/BUILD +++ b/pkg/sentry/sighandling/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "sighandling.go", "sighandling_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/sighandling", visibility = ["//pkg/sentry:internal"], deps = ["//pkg/abi/linux"], ) diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 26176b10d..8e2b97afb 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "socket", srcs = ["socket.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index 357517ed4..3850f6345 
100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "control", srcs = ["control.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/control", imports = [ "gvisor.dev/gvisor/pkg/sentry/fs", ], diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 4c44c7c0f..42bf7be6a 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -12,7 +12,6 @@ go_library( "socket_unsafe.go", "stack.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/hostinet", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD index b70047d81..ed34a8308 100644 --- a/pkg/sentry/socket/netfilter/BUILD +++ b/pkg/sentry/socket/netfilter/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -7,7 +7,6 @@ go_library( srcs = [ "netfilter.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netfilter", # This target depends on netstack and should only be used by epsocket, # which is allowed to depend on netstack. 
visibility = ["//pkg/sentry:internal"], diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 103933144..baaac13c6 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -9,7 +9,6 @@ go_library( "provider.go", "socket.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD index 2d9f4ba9b..3a22923d8 100644 --- a/pkg/sentry/socket/netlink/port/BUILD +++ b/pkg/sentry/socket/netlink/port/BUILD @@ -1,12 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "port", srcs = ["port.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port", visibility = ["//pkg/sentry:internal"], deps = ["//pkg/sync"], ) @@ -14,5 +12,5 @@ go_library( go_test( name = "port_test", srcs = ["port_test.go"], - embed = [":port"], + library = ":port", ) diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD index 1d4912753..2137c7aeb 100644 --- a/pkg/sentry/socket/netlink/route/BUILD +++ b/pkg/sentry/socket/netlink/route/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "route", srcs = ["protocol.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/socket/netlink/uevent/BUILD b/pkg/sentry/socket/netlink/uevent/BUILD index 0777f3baf..73fbdf1eb 100644 --- a/pkg/sentry/socket/netlink/uevent/BUILD +++ 
b/pkg/sentry/socket/netlink/uevent/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "uevent", srcs = ["protocol.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index f78784569..e3d1f90cb 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -11,7 +11,6 @@ go_library( "save_restore.go", "stack.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netstack", visibility = [ "//pkg/sentry:internal", ], diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 5b6a154f6..bade18686 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -9,7 +9,6 @@ go_library( "io.go", "unix.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/unix", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD index d7ba95dff..4bdfc9208 100644 --- a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) @@ -25,7 +25,6 @@ go_library( "transport_message_list.go", "unix.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport", visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", diff --git 
a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD index 88765f4d6..0ea4aab8b 100644 --- a/pkg/sentry/state/BUILD +++ b/pkg/sentry/state/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -9,7 +9,6 @@ go_library( "state_metadata.go", "state_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/state", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index aa1ac720c..ff6fafa63 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -1,6 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@rules_cc//cc:defs.bzl", "cc_proto_library") +load("//tools:defs.bzl", "go_library", "proto_library") package(licenses = ["notice"]) @@ -21,7 +19,6 @@ go_library( "strace.go", "syscalls.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/strace", visibility = ["//:sandbox"], deps = [ ":strace_go_proto", @@ -42,20 +39,7 @@ go_library( ) proto_library( - name = "strace_proto", + name = "strace", srcs = ["strace.proto"], visibility = ["//visibility:public"], ) - -cc_proto_library( - name = "strace_cc_proto", - visibility = ["//visibility:public"], - deps = [":strace_proto"], -) - -go_proto_library( - name = "strace_go_proto", - importpath = "gvisor.dev/gvisor/pkg/sentry/strace/strace_go_proto", - proto = ":strace_proto", - visibility = ["//visibility:public"], -) diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD index 79d972202..b8d1bd415 100644 --- a/pkg/sentry/syscalls/BUILD +++ b/pkg/sentry/syscalls/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "epoll.go", "syscalls.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/syscalls", visibility = ["//:sandbox"], deps = [ 
"//pkg/abi/linux", diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 917f74e07..7d74e0f70 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -57,7 +57,6 @@ go_library( "sys_xattr.go", "timespec.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/syscalls/linux", visibility = ["//:sandbox"], deps = [ "//pkg/abi", diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index 3cde3a0be..04f81a35b 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) @@ -31,7 +30,6 @@ go_library( "tsc_amd64.s", "tsc_arm64.s", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/time", visibility = ["//:sandbox"], deps = [ "//pkg/log", @@ -48,5 +46,5 @@ go_test( "parameters_test.go", "sampler_test.go", ], - embed = [":time"], + library = ":time", ) diff --git a/pkg/sentry/unimpl/BUILD b/pkg/sentry/unimpl/BUILD index fc7614fff..370fa6ec5 100644 --- a/pkg/sentry/unimpl/BUILD +++ b/pkg/sentry/unimpl/BUILD @@ -1,34 +1,17 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@rules_cc//cc:defs.bzl", "cc_proto_library") +load("//tools:defs.bzl", "go_library", "proto_library") package(licenses = ["notice"]) proto_library( - name = "unimplemented_syscall_proto", + name = "unimplemented_syscall", srcs = ["unimplemented_syscall.proto"], visibility = ["//visibility:public"], deps = ["//pkg/sentry/arch:registers_proto"], ) -cc_proto_library( - name = "unimplemented_syscall_cc_proto", - visibility = ["//visibility:public"], - deps = 
[":unimplemented_syscall_proto"], -) - -go_proto_library( - name = "unimplemented_syscall_go_proto", - importpath = "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto", - proto = ":unimplemented_syscall_proto", - visibility = ["//visibility:public"], - deps = ["//pkg/sentry/arch:registers_go_proto"], -) - go_library( name = "unimpl", srcs = ["events.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/unimpl", visibility = ["//:sandbox"], deps = [ "//pkg/log", diff --git a/pkg/sentry/uniqueid/BUILD b/pkg/sentry/uniqueid/BUILD index 86a87edd4..e9c18f170 100644 --- a/pkg/sentry/uniqueid/BUILD +++ b/pkg/sentry/uniqueid/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "uniqueid", srcs = ["context.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/uniqueid", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/sentry/context", diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD index 5518ac3d0..099315613 100644 --- a/pkg/sentry/usage/BUILD +++ b/pkg/sentry/usage/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -11,9 +11,8 @@ go_library( "memory_unsafe.go", "usage.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/usage", visibility = [ - "//pkg/sentry:internal", + "//:sandbox", ], deps = [ "//pkg/bits", diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD index 684f59a6b..c8322e29e 100644 --- a/pkg/sentry/usermem/BUILD +++ b/pkg/sentry/usermem/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -29,7 +28,6 @@ go_library( "usermem_unsafe.go", "usermem_x86.go", ], - importpath = 
"gvisor.dev/gvisor/pkg/sentry/usermem", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/atomicbitops", @@ -38,7 +36,6 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/safemem", "//pkg/syserror", - "//pkg/tcpip/buffer", ], ) @@ -49,7 +46,7 @@ go_test( "addr_range_seq_test.go", "usermem_test.go", ], - embed = [":usermem"], + library = ":usermem", deps = [ "//pkg/sentry/context", "//pkg/sentry/safemem", diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 35c7be259..51acdc4e9 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -1,7 +1,6 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "vfs", @@ -24,7 +23,6 @@ go_library( "testutil.go", "vfs.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/vfs", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -47,7 +45,7 @@ go_test( "file_description_impl_util_test.go", "mount_test.go", ], - embed = [":vfs"], + library = ":vfs", deps = [ "//pkg/abi/linux", "//pkg/sentry/context", diff --git a/pkg/sentry/watchdog/BUILD b/pkg/sentry/watchdog/BUILD index 28f21f13d..1c5a1c9b6 100644 --- a/pkg/sentry/watchdog/BUILD +++ b/pkg/sentry/watchdog/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "watchdog", srcs = ["watchdog.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/watchdog", visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sleep/BUILD b/pkg/sleep/BUILD index a23c86fb1..e131455f7 100644 --- a/pkg/sleep/BUILD +++ b/pkg/sleep/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -12,7 +11,6 @@ go_library( 
"commit_noasm.go", "sleep_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/sleep", visibility = ["//:sandbox"], ) @@ -22,5 +20,5 @@ go_test( srcs = [ "sleep_test.go", ], - embed = [":sleep"], + library = ":sleep", ) diff --git a/pkg/state/BUILD b/pkg/state/BUILD index be93750bf..921af9d63 100644 --- a/pkg/state/BUILD +++ b/pkg/state/BUILD @@ -1,6 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test", "proto_library") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) @@ -49,7 +47,7 @@ go_library( "state.go", "stats.go", ], - importpath = "gvisor.dev/gvisor/pkg/state", + stateify = False, visibility = ["//:sandbox"], deps = [ ":object_go_proto", @@ -58,21 +56,14 @@ go_library( ) proto_library( - name = "object_proto", + name = "object", srcs = ["object.proto"], visibility = ["//:sandbox"], ) -go_proto_library( - name = "object_go_proto", - importpath = "gvisor.dev/gvisor/pkg/state/object_go_proto", - proto = ":object_proto", - visibility = ["//:sandbox"], -) - go_test( name = "state_test", timeout = "long", srcs = ["state_test.go"], - embed = [":state"], + library = ":state", ) diff --git a/pkg/state/statefile/BUILD b/pkg/state/statefile/BUILD index 8a865d229..e7581c09b 100644 --- a/pkg/state/statefile/BUILD +++ b/pkg/state/statefile/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "statefile", srcs = ["statefile.go"], - importpath = "gvisor.dev/gvisor/pkg/state/statefile", visibility = ["//:sandbox"], deps = [ "//pkg/binary", @@ -18,6 +16,6 @@ go_test( name = "statefile_test", size = "small", srcs = ["statefile_test.go"], - embed = [":statefile"], + library = ":statefile", deps 
= ["//pkg/compressio"], ) diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD index 97c4b3b1e..5340cf0d6 100644 --- a/pkg/sync/BUILD +++ b/pkg/sync/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template") package( @@ -40,7 +39,6 @@ go_library( "syncutil.go", "tmutex_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/sync", ) go_test( @@ -51,5 +49,5 @@ go_test( "seqcount_test.go", "tmutex_test.go", ], - embed = [":sync"], + library = ":sync", ) diff --git a/pkg/sync/atomicptrtest/BUILD b/pkg/sync/atomicptrtest/BUILD index 418eda29c..e97553254 100644 --- a/pkg/sync/atomicptrtest/BUILD +++ b/pkg/sync/atomicptrtest/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) @@ -18,12 +17,11 @@ go_template_instance( go_library( name = "atomicptr", srcs = ["atomicptr_int_unsafe.go"], - importpath = "gvisor.dev/gvisor/pkg/sync/atomicptr", ) go_test( name = "atomicptr_test", size = "small", srcs = ["atomicptr_test.go"], - embed = [":atomicptr"], + library = ":atomicptr", ) diff --git a/pkg/sync/seqatomictest/BUILD b/pkg/sync/seqatomictest/BUILD index eba21518d..5c38c783e 100644 --- a/pkg/sync/seqatomictest/BUILD +++ b/pkg/sync/seqatomictest/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) @@ -18,7 +17,6 @@ go_template_instance( go_library( name = "seqatomic", srcs = ["seqatomic_int_unsafe.go"], - importpath = "gvisor.dev/gvisor/pkg/sync/seqatomic", deps = [ "//pkg/sync", ], @@ -28,6 +26,6 @@ 
go_test( name = "seqatomic_test", size = "small", srcs = ["seqatomic_test.go"], - embed = [":seqatomic"], + library = ":seqatomic", deps = ["//pkg/sync"], ) diff --git a/pkg/syserr/BUILD b/pkg/syserr/BUILD index 5665ad4ee..7d760344a 100644 --- a/pkg/syserr/BUILD +++ b/pkg/syserr/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -9,7 +9,6 @@ go_library( "netstack.go", "syserr.go", ], - importpath = "gvisor.dev/gvisor/pkg/syserr", visibility = ["//visibility:public"], deps = [ "//pkg/abi/linux", diff --git a/pkg/syserror/BUILD b/pkg/syserror/BUILD index bd3f9fd28..b13c15d9b 100644 --- a/pkg/syserror/BUILD +++ b/pkg/syserror/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "syserror", srcs = ["syserror.go"], - importpath = "gvisor.dev/gvisor/pkg/syserror", visibility = ["//visibility:public"], ) diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD index 23e4b09e7..26f7ba86b 100644 --- a/pkg/tcpip/BUILD +++ b/pkg/tcpip/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -12,7 +11,6 @@ go_library( "time_unsafe.go", "timer.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip", visibility = ["//visibility:public"], deps = [ "//pkg/sync", @@ -25,7 +23,7 @@ go_test( name = "tcpip_test", size = "small", srcs = ["tcpip_test.go"], - embed = [":tcpip"], + library = ":tcpip", ) go_test( diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD index 3df7d18d3..a984f1712 100644 --- a/pkg/tcpip/adapters/gonet/BUILD +++ b/pkg/tcpip/adapters/gonet/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") 
-load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "gonet", srcs = ["gonet.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet", visibility = ["//visibility:public"], deps = [ "//pkg/sync", @@ -23,7 +21,7 @@ go_test( name = "gonet_test", size = "small", srcs = ["gonet_test.go"], - embed = [":gonet"], + library = ":gonet", deps = [ "//pkg/tcpip", "//pkg/tcpip/header", diff --git a/pkg/tcpip/buffer/BUILD b/pkg/tcpip/buffer/BUILD index d6c31bfa2..563bc78ea 100644 --- a/pkg/tcpip/buffer/BUILD +++ b/pkg/tcpip/buffer/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( "prependable.go", "view.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/buffer", visibility = ["//visibility:public"], ) @@ -17,5 +15,5 @@ go_test( name = "buffer_test", size = "small", srcs = ["view_test.go"], - embed = [":buffer"], + library = ":buffer", ) diff --git a/pkg/tcpip/checker/BUILD b/pkg/tcpip/checker/BUILD index b6fa6fc37..ed434807f 100644 --- a/pkg/tcpip/checker/BUILD +++ b/pkg/tcpip/checker/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -6,7 +6,6 @@ go_library( name = "checker", testonly = 1, srcs = ["checker.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/checker", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", diff --git a/pkg/tcpip/hash/jenkins/BUILD b/pkg/tcpip/hash/jenkins/BUILD index e648efa71..ff2719291 100644 --- a/pkg/tcpip/hash/jenkins/BUILD +++ b/pkg/tcpip/hash/jenkins/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = 
["notice"]) go_library( name = "jenkins", srcs = ["jenkins.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins", visibility = ["//visibility:public"], ) @@ -16,5 +14,5 @@ go_test( srcs = [ "jenkins_test.go", ], - embed = [":jenkins"], + library = ":jenkins", ) diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD index cd747d100..9da0d71f8 100644 --- a/pkg/tcpip/header/BUILD +++ b/pkg/tcpip/header/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -24,7 +23,6 @@ go_library( "tcp.go", "udp.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/header", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", @@ -59,7 +57,7 @@ go_test( "eth_test.go", "ndp_test.go", ], - embed = [":header"], + library = ":header", deps = [ "//pkg/tcpip", "@com_github_google_go-cmp//cmp:go_default_library", diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD index 297eaccaf..d1b73cfdf 100644 --- a/pkg/tcpip/iptables/BUILD +++ b/pkg/tcpip/iptables/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -9,7 +9,6 @@ go_library( "targets.go", "types.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/iptables", visibility = ["//visibility:public"], deps = [ "//pkg/log", diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD index 7dbc05754..3974c464e 100644 --- a/pkg/tcpip/link/channel/BUILD +++ b/pkg/tcpip/link/channel/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "channel", srcs = ["channel.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/channel", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", diff --git a/pkg/tcpip/link/fdbased/BUILD 
b/pkg/tcpip/link/fdbased/BUILD index 66cc53ed4..abe725548 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -13,7 +12,6 @@ go_library( "mmap_unsafe.go", "packet_dispatchers.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/fdbased", visibility = ["//visibility:public"], deps = [ "//pkg/sync", @@ -30,7 +28,7 @@ go_test( name = "fdbased_test", size = "small", srcs = ["endpoint_test.go"], - embed = [":fdbased"], + library = ":fdbased", deps = [ "//pkg/tcpip", "//pkg/tcpip/buffer", diff --git a/pkg/tcpip/link/loopback/BUILD b/pkg/tcpip/link/loopback/BUILD index f35fcdff4..6bf3805b7 100644 --- a/pkg/tcpip/link/loopback/BUILD +++ b/pkg/tcpip/link/loopback/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "loopback", srcs = ["loopback.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/loopback", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", diff --git a/pkg/tcpip/link/muxed/BUILD b/pkg/tcpip/link/muxed/BUILD index 1ac7948b6..82b441b79 100644 --- a/pkg/tcpip/link/muxed/BUILD +++ b/pkg/tcpip/link/muxed/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "muxed", srcs = ["injectable.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/muxed", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", @@ -19,7 +17,7 @@ go_test( name = "muxed_test", size = "small", srcs = ["injectable_test.go"], - embed = [":muxed"], + library = ":muxed", deps = [ "//pkg/tcpip", "//pkg/tcpip/buffer", diff --git a/pkg/tcpip/link/rawfile/BUILD 
b/pkg/tcpip/link/rawfile/BUILD index d8211e93d..14b527bc2 100644 --- a/pkg/tcpip/link/rawfile/BUILD +++ b/pkg/tcpip/link/rawfile/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -12,7 +12,6 @@ go_library( "errors.go", "rawfile_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/rawfile", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", diff --git a/pkg/tcpip/link/sharedmem/BUILD b/pkg/tcpip/link/sharedmem/BUILD index 09165dd4c..13243ebbb 100644 --- a/pkg/tcpip/link/sharedmem/BUILD +++ b/pkg/tcpip/link/sharedmem/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -11,7 +10,6 @@ go_library( "sharedmem_unsafe.go", "tx.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem", visibility = ["//visibility:public"], deps = [ "//pkg/log", @@ -30,7 +28,7 @@ go_test( srcs = [ "sharedmem_test.go", ], - embed = [":sharedmem"], + library = ":sharedmem", deps = [ "//pkg/sync", "//pkg/tcpip", diff --git a/pkg/tcpip/link/sharedmem/pipe/BUILD b/pkg/tcpip/link/sharedmem/pipe/BUILD index a0d4ad0be..87020ec08 100644 --- a/pkg/tcpip/link/sharedmem/pipe/BUILD +++ b/pkg/tcpip/link/sharedmem/pipe/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -11,7 +10,6 @@ go_library( "rx.go", "tx.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe", visibility = ["//visibility:public"], ) @@ -20,6 +18,6 @@ go_test( srcs = [ "pipe_test.go", ], - embed = [":pipe"], + library = ":pipe", deps = ["//pkg/sync"], ) diff --git a/pkg/tcpip/link/sharedmem/queue/BUILD b/pkg/tcpip/link/sharedmem/queue/BUILD index 8c9234d54..3ba06af73 100644 --- 
a/pkg/tcpip/link/sharedmem/queue/BUILD +++ b/pkg/tcpip/link/sharedmem/queue/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( "rx.go", "tx.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue", visibility = ["//visibility:public"], deps = [ "//pkg/log", @@ -22,7 +20,7 @@ go_test( srcs = [ "queue_test.go", ], - embed = [":queue"], + library = ":queue", deps = [ "//pkg/tcpip/link/sharedmem/pipe", ], diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD index d6ae0368a..230a8d53a 100644 --- a/pkg/tcpip/link/sniffer/BUILD +++ b/pkg/tcpip/link/sniffer/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "pcap.go", "sniffer.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sniffer", visibility = ["//visibility:public"], deps = [ "//pkg/log", diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD index a71a493fc..e5096ea38 100644 --- a/pkg/tcpip/link/tun/BUILD +++ b/pkg/tcpip/link/tun/BUILD @@ -1,10 +1,9 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "tun", srcs = ["tun_unsafe.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/tun", visibility = ["//visibility:public"], ) diff --git a/pkg/tcpip/link/waitable/BUILD b/pkg/tcpip/link/waitable/BUILD index 134837943..0956d2c65 100644 --- a/pkg/tcpip/link/waitable/BUILD +++ b/pkg/tcpip/link/waitable/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -8,7 +7,6 @@ go_library( srcs = [ "waitable.go", ], - importpath = 
"gvisor.dev/gvisor/pkg/tcpip/link/waitable", visibility = ["//visibility:public"], deps = [ "//pkg/gate", @@ -23,7 +21,7 @@ go_test( srcs = [ "waitable_test.go", ], - embed = [":waitable"], + library = ":waitable", deps = [ "//pkg/tcpip", "//pkg/tcpip/buffer", diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD index 9d16ff8c9..6a4839fb8 100644 --- a/pkg/tcpip/network/BUILD +++ b/pkg/tcpip/network/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_test") package(licenses = ["notice"]) diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD index e7617229b..eddf7b725 100644 --- a/pkg/tcpip/network/arp/BUILD +++ b/pkg/tcpip/network/arp/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "arp", srcs = ["arp.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/network/arp", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD index ed16076fd..d1c728ccf 100644 --- a/pkg/tcpip/network/fragmentation/BUILD +++ b/pkg/tcpip/network/fragmentation/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -24,7 +23,6 @@ go_library( "reassembler.go", "reassembler_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/network/fragmentation", visibility = ["//visibility:public"], deps = [ "//pkg/log", @@ -42,6 +40,6 @@ go_test( "fragmentation_test.go", "reassembler_test.go", ], - embed = [":fragmentation"], + library = ":fragmentation", deps = ["//pkg/tcpip/buffer"], ) diff --git a/pkg/tcpip/network/hash/BUILD 
b/pkg/tcpip/network/hash/BUILD index e6db5c0b0..872165866 100644 --- a/pkg/tcpip/network/hash/BUILD +++ b/pkg/tcpip/network/hash/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "hash", srcs = ["hash.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/network/hash", visibility = ["//visibility:public"], deps = [ "//pkg/rand", diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD index 4e2aae9a3..0fef2b1f1 100644 --- a/pkg/tcpip/network/ipv4/BUILD +++ b/pkg/tcpip/network/ipv4/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( "icmp.go", "ipv4.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/network/ipv4", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD index e4e273460..fb11874c6 100644 --- a/pkg/tcpip/network/ipv6/BUILD +++ b/pkg/tcpip/network/ipv6/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( "icmp.go", "ipv6.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/network/ipv6", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", @@ -27,7 +25,7 @@ go_test( "ipv6_test.go", "ndp_test.go", ], - embed = [":ipv6"], + library = ":ipv6", deps = [ "//pkg/tcpip", "//pkg/tcpip/buffer", diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD index a6ef3bdcc..2bad05a2e 100644 --- a/pkg/tcpip/ports/BUILD +++ b/pkg/tcpip/ports/BUILD @@ -1,12 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", 
"go_library", "go_test") package(licenses = ["notice"]) go_library( name = "ports", srcs = ["ports.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/ports", visibility = ["//visibility:public"], deps = [ "//pkg/sync", @@ -17,7 +15,7 @@ go_library( go_test( name = "ports_test", srcs = ["ports_test.go"], - embed = [":ports"], + library = ":ports", deps = [ "//pkg/tcpip", ], diff --git a/pkg/tcpip/sample/tun_tcp_connect/BUILD b/pkg/tcpip/sample/tun_tcp_connect/BUILD index d7496fde6..cf0a5fefe 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/BUILD +++ b/pkg/tcpip/sample/tun_tcp_connect/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") package(licenses = ["notice"]) diff --git a/pkg/tcpip/sample/tun_tcp_echo/BUILD b/pkg/tcpip/sample/tun_tcp_echo/BUILD index 875561566..43264b76d 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/BUILD +++ b/pkg/tcpip/sample/tun_tcp_echo/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") package(licenses = ["notice"]) diff --git a/pkg/tcpip/seqnum/BUILD b/pkg/tcpip/seqnum/BUILD index b31ddba2f..45f503845 100644 --- a/pkg/tcpip/seqnum/BUILD +++ b/pkg/tcpip/seqnum/BUILD @@ -1,10 +1,9 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "seqnum", srcs = ["seqnum.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/seqnum", visibility = ["//visibility:public"], ) diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD index 783351a69..f5b750046 100644 --- a/pkg/tcpip/stack/BUILD +++ b/pkg/tcpip/stack/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -30,7 +29,6 @@ go_library( "stack_global_state.go", 
"transport_demuxer.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/stack", visibility = ["//visibility:public"], deps = [ "//pkg/ilist", @@ -81,7 +79,7 @@ go_test( name = "stack_test", size = "small", srcs = ["linkaddrcache_test.go"], - embed = [":stack"], + library = ":stack", deps = [ "//pkg/sleep", "//pkg/sync", diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD index 3aa23d529..ac18ec5b1 100644 --- a/pkg/tcpip/transport/icmp/BUILD +++ b/pkg/tcpip/transport/icmp/BUILD @@ -1,5 +1,5 @@ +load("//tools:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -23,7 +23,6 @@ go_library( "icmp_packet_list.go", "protocol.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/icmp", imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ diff --git a/pkg/tcpip/transport/packet/BUILD b/pkg/tcpip/transport/packet/BUILD index 4858d150c..d22de6b26 100644 --- a/pkg/tcpip/transport/packet/BUILD +++ b/pkg/tcpip/transport/packet/BUILD @@ -1,5 +1,5 @@ +load("//tools:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -22,7 +22,6 @@ go_library( "endpoint_state.go", "packet_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/packet", imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD index 2f2131ff7..c9baf4600 100644 --- a/pkg/tcpip/transport/raw/BUILD +++ b/pkg/tcpip/transport/raw/BUILD @@ -1,5 +1,5 @@ +load("//tools:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -23,7 +23,6 @@ go_library( "protocol.go", 
"raw_packet_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/raw", imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 0e3ab05ad..4acd9fb9a 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -55,7 +54,6 @@ go_library( "tcp_segment_list.go", "timer.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/tcp", imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ diff --git a/pkg/tcpip/transport/tcp/testing/context/BUILD b/pkg/tcpip/transport/tcp/testing/context/BUILD index b33ec2087..ce6a2c31d 100644 --- a/pkg/tcpip/transport/tcp/testing/context/BUILD +++ b/pkg/tcpip/transport/tcp/testing/context/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -6,7 +6,6 @@ go_library( name = "context", testonly = 1, srcs = ["context.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context", visibility = [ "//visibility:public", ], diff --git a/pkg/tcpip/transport/tcpconntrack/BUILD b/pkg/tcpip/transport/tcpconntrack/BUILD index 43fcc27f0..3ad6994a7 100644 --- a/pkg/tcpip/transport/tcpconntrack/BUILD +++ b/pkg/tcpip/transport/tcpconntrack/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "tcpconntrack", srcs = ["tcp_conntrack.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/tcpconntrack", visibility = 
["//visibility:public"], deps = [ "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index 57ff123e3..adc908e24 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -25,7 +24,6 @@ go_library( "protocol.go", "udp_packet_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/udp", imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ diff --git a/pkg/tmutex/BUILD b/pkg/tmutex/BUILD index 07778e4f7..2dcba84ae 100644 --- a/pkg/tmutex/BUILD +++ b/pkg/tmutex/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "tmutex", srcs = ["tmutex.go"], - importpath = "gvisor.dev/gvisor/pkg/tmutex", visibility = ["//:sandbox"], ) @@ -14,6 +12,6 @@ go_test( name = "tmutex_test", size = "medium", srcs = ["tmutex_test.go"], - embed = [":tmutex"], + library = ":tmutex", deps = ["//pkg/sync"], ) diff --git a/pkg/unet/BUILD b/pkg/unet/BUILD index d1885ae66..a86501fa2 100644 --- a/pkg/unet/BUILD +++ b/pkg/unet/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( "unet.go", "unet_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/unet", visibility = ["//visibility:public"], deps = [ "//pkg/gate", @@ -23,6 +21,6 @@ go_test( srcs = [ "unet_test.go", ], - embed = [":unet"], + library = ":unet", deps = ["//pkg/sync"], ) diff --git a/pkg/urpc/BUILD 
b/pkg/urpc/BUILD index b8fdc3125..850c34ed0 100644 --- a/pkg/urpc/BUILD +++ b/pkg/urpc/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "urpc", srcs = ["urpc.go"], - importpath = "gvisor.dev/gvisor/pkg/urpc", visibility = ["//:sandbox"], deps = [ "//pkg/fd", @@ -20,6 +18,6 @@ go_test( name = "urpc_test", size = "small", srcs = ["urpc_test.go"], - embed = [":urpc"], + library = ":urpc", deps = ["//pkg/unet"], ) diff --git a/pkg/waiter/BUILD b/pkg/waiter/BUILD index 1c6890e52..852480a09 100644 --- a/pkg/waiter/BUILD +++ b/pkg/waiter/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -22,7 +21,6 @@ go_library( "waiter.go", "waiter_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/waiter", visibility = ["//visibility:public"], deps = ["//pkg/sync"], ) @@ -33,5 +31,5 @@ go_test( srcs = [ "waiter_test.go", ], - embed = [":waiter"], + library = ":waiter", ) diff --git a/runsc/BUILD b/runsc/BUILD index e5587421d..b35b41d81 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -1,7 +1,6 @@ -package(licenses = ["notice"]) # Apache 2.0 +load("//tools:defs.bzl", "go_binary", "pkg_deb", "pkg_tar") -load("@io_bazel_rules_go//go:def.bzl", "go_binary") -load("@rules_pkg//:pkg.bzl", "pkg_deb", "pkg_tar") +package(licenses = ["notice"]) go_binary( name = "runsc", @@ -9,7 +8,7 @@ go_binary( "main.go", "version.go", ], - pure = "on", + pure = True, visibility = [ "//visibility:public", ], @@ -26,10 +25,12 @@ go_binary( ) # The runsc-race target is a race-compatible BUILD target. 
This must be built -# via "bazel build --features=race //runsc:runsc-race", since the race feature -# must apply to all dependencies due a bug in gazelle file selection. The pure -# attribute must be off because the race detector requires linking with non-Go -# components, although we still require a static binary. +# via: bazel build --features=race //runsc:runsc-race +# +# This is neccessary because the race feature must apply to all dependencies +# due a bug in gazelle file selection. The pure attribute must be off because +# the race detector requires linking with non-Go components, although we still +# require a static binary. # # Note that in the future this might be convertible to a compatible target by # using the pure and static attributes within a select function, but select is @@ -42,7 +43,7 @@ go_binary( "main.go", "version.go", ], - static = "on", + static = True, visibility = [ "//visibility:public", ], @@ -82,7 +83,12 @@ genrule( # because they are assumes to be hermetic). srcs = [":runsc"], outs = ["version.txt"], - cmd = "$(location :runsc) -version | grep 'runsc version' | sed 's/^[^0-9]*//' > $@", + # Note that the little dance here is necessary because files in the $(SRCS) + # attribute are not executable by default, and we can't touch in place. 
+ cmd = "cp $(location :runsc) $(@D)/runsc && \ + chmod a+x $(@D)/runsc && \ + $(@D)/runsc -version | grep version | sed 's/^[^0-9]*//' > $@ && \ + rm -f $(@D)/runsc", stamp = 1, ) @@ -109,5 +115,6 @@ sh_test( name = "version_test", size = "small", srcs = ["version_test.sh"], + args = ["$(location :runsc)"], data = [":runsc"], ) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 3e20f8f2f..f3ebc0231 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -23,7 +23,6 @@ go_library( "strace.go", "user.go", ], - importpath = "gvisor.dev/gvisor/runsc/boot", visibility = [ "//runsc:__subpackages__", "//test:__subpackages__", @@ -107,7 +106,7 @@ go_test( "loader_test.go", "user_test.go", ], - embed = [":boot"], + library = ":boot", deps = [ "//pkg/control/server", "//pkg/log", diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index 3a9dcfc04..ce30f6c53 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -13,7 +13,6 @@ go_library( "extra_filters_race.go", "filter.go", ], - importpath = "gvisor.dev/gvisor/runsc/boot/filter", visibility = [ "//runsc/boot:__subpackages__", ], diff --git a/runsc/boot/platforms/BUILD b/runsc/boot/platforms/BUILD index 03391cdca..77774f43c 100644 --- a/runsc/boot/platforms/BUILD +++ b/runsc/boot/platforms/BUILD @@ -1,11 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "platforms", srcs = ["platforms.go"], - importpath = "gvisor.dev/gvisor/runsc/boot/platforms", visibility = [ "//runsc:__subpackages__", ], diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD index d6165f9e5..d4c7bdfbb 100644 --- 
a/runsc/cgroup/BUILD +++ b/runsc/cgroup/BUILD @@ -1,11 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "cgroup", srcs = ["cgroup.go"], - importpath = "gvisor.dev/gvisor/runsc/cgroup", visibility = ["//:sandbox"], deps = [ "//pkg/log", @@ -19,6 +18,6 @@ go_test( name = "cgroup_test", size = "small", srcs = ["cgroup_test.go"], - embed = [":cgroup"], + library = ":cgroup", tags = ["local"], ) diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index b94bc4fa0..09aa46434 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -34,7 +34,6 @@ go_library( "syscalls.go", "wait.go", ], - importpath = "gvisor.dev/gvisor/runsc/cmd", visibility = [ "//runsc:__subpackages__", ], @@ -73,7 +72,7 @@ go_test( data = [ "//runsc", ], - embed = [":cmd"], + library = ":cmd", deps = [ "//pkg/abi/linux", "//pkg/log", diff --git a/runsc/console/BUILD b/runsc/console/BUILD index e623c1a0f..06924bccd 100644 --- a/runsc/console/BUILD +++ b/runsc/console/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -7,7 +7,6 @@ go_library( srcs = [ "console.go", ], - importpath = "gvisor.dev/gvisor/runsc/console", visibility = [ "//runsc:__subpackages__", ], diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 6dea179e4..e21431e4c 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -10,7 +10,6 @@ go_library( "state_file.go", "status.go", ], - importpath = "gvisor.dev/gvisor/runsc/container", visibility = [ "//runsc:__subpackages__", 
"//test:__subpackages__", @@ -42,7 +41,7 @@ go_test( "//runsc", "//runsc/container/test_app", ], - embed = [":container"], + library = ":container", shard_count = 5, tags = [ "requires-kvm", diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD index bfd338bb6..e200bafd9 100644 --- a/runsc/container/test_app/BUILD +++ b/runsc/container/test_app/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") package(licenses = ["notice"]) @@ -9,7 +9,7 @@ go_binary( "fds.go", "test_app.go", ], - pure = "on", + pure = True, visibility = ["//runsc/container:__pkg__"], deps = [ "//pkg/unet", diff --git a/runsc/criutil/BUILD b/runsc/criutil/BUILD index 558133a0e..8a571a000 100644 --- a/runsc/criutil/BUILD +++ b/runsc/criutil/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -6,7 +6,6 @@ go_library( name = "criutil", testonly = 1, srcs = ["criutil.go"], - importpath = "gvisor.dev/gvisor/runsc/criutil", visibility = ["//:sandbox"], deps = ["//runsc/testutil"], ) diff --git a/runsc/dockerutil/BUILD b/runsc/dockerutil/BUILD index 0e0423504..8621af901 100644 --- a/runsc/dockerutil/BUILD +++ b/runsc/dockerutil/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -6,7 +6,6 @@ go_library( name = "dockerutil", testonly = 1, srcs = ["dockerutil.go"], - importpath = "gvisor.dev/gvisor/runsc/dockerutil", visibility = ["//:sandbox"], deps = [ "//runsc/testutil", diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index a9582d92b..64a406ae2 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -10,10 +10,7 @@ go_library( "fsgofer_arm64_unsafe.go", 
"fsgofer_unsafe.go", ], - importpath = "gvisor.dev/gvisor/runsc/fsgofer", - visibility = [ - "//runsc:__subpackages__", - ], + visibility = ["//runsc:__subpackages__"], deps = [ "//pkg/abi/linux", "//pkg/fd", @@ -30,7 +27,7 @@ go_test( name = "fsgofer_test", size = "small", srcs = ["fsgofer_test.go"], - embed = [":fsgofer"], + library = ":fsgofer", deps = [ "//pkg/log", "//pkg/p9", diff --git a/runsc/fsgofer/filter/BUILD b/runsc/fsgofer/filter/BUILD index bac73f89d..82b48ef32 100644 --- a/runsc/fsgofer/filter/BUILD +++ b/runsc/fsgofer/filter/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -13,7 +13,6 @@ go_library( "extra_filters_race.go", "filter.go", ], - importpath = "gvisor.dev/gvisor/runsc/fsgofer/filter", visibility = [ "//runsc:__subpackages__", ], diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index ddbc37456..c95d50294 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -9,7 +9,6 @@ go_library( "network_unsafe.go", "sandbox.go", ], - importpath = "gvisor.dev/gvisor/runsc/sandbox", visibility = [ "//runsc:__subpackages__", ], diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index 205638803..4ccd77f63 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -10,7 +10,6 @@ go_library( "namespace.go", "specutils.go", ], - importpath = "gvisor.dev/gvisor/runsc/specutils", visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", @@ -28,6 +27,6 @@ go_test( name = "specutils_test", size = "small", srcs = ["specutils_test.go"], - embed = [":specutils"], + library = ":specutils", deps = 
["@com_github_opencontainers_runtime-spec//specs-go:go_default_library"], ) diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD index 3c3027cb5..f845120b0 100644 --- a/runsc/testutil/BUILD +++ b/runsc/testutil/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -6,7 +6,6 @@ go_library( name = "testutil", testonly = 1, srcs = ["testutil.go"], - importpath = "gvisor.dev/gvisor/runsc/testutil", visibility = ["//:sandbox"], deps = [ "//pkg/log", diff --git a/runsc/version_test.sh b/runsc/version_test.sh index cc0ca3f05..747350654 100755 --- a/runsc/version_test.sh +++ b/runsc/version_test.sh @@ -16,7 +16,7 @@ set -euf -x -o pipefail -readonly runsc="${TEST_SRCDIR}/__main__/runsc/linux_amd64_pure_stripped/runsc" +readonly runsc="$1" readonly version=$($runsc --version) # Version should should not match VERSION, which is the default and which will diff --git a/scripts/common.sh b/scripts/common.sh index fdb1aa142..cd91b9f8e 100755 --- a/scripts/common.sh +++ b/scripts/common.sh @@ -16,11 +16,7 @@ set -xeou pipefail -if [[ -f $(dirname $0)/common_google.sh ]]; then - source $(dirname $0)/common_google.sh -else - source $(dirname $0)/common_bazel.sh -fi +source $(dirname $0)/common_build.sh # Ensure it attempts to collect logs in all cases. trap collect_logs EXIT diff --git a/scripts/common_bazel.sh b/scripts/common_bazel.sh deleted file mode 100755 index a473a88a4..000000000 --- a/scripts/common_bazel.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/bin/bash - -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Install the latest version of Bazel and log the version. -(which use_bazel.sh && use_bazel.sh latest) || which bazel -bazel version - -# Switch into the workspace; only necessary if run with kokoro. -if [[ -v KOKORO_GIT_COMMIT ]] && [[ -d git/repo ]]; then - cd git/repo -elif [[ -v KOKORO_GIT_COMMIT ]] && [[ -d github/repo ]]; then - cd github/repo -fi - -# Set the standard bazel flags. -declare -r BAZEL_FLAGS=( - "--show_timestamps" - "--test_output=errors" - "--keep_going" - "--verbose_failures=true" -) -if [[ -v KOKORO_BAZEL_AUTH_CREDENTIAL ]]; then - declare -r BAZEL_RBE_AUTH_FLAGS=( - "--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}" - ) - declare -r BAZEL_RBE_FLAGS=("--config=remote") -fi - -# Wrap bazel. -function build() { - bazel build "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@" 2>&1 | - tee /dev/fd/2 | grep -E '^ bazel-bin/' | awk '{ print $1; }' -} - -function test() { - bazel test "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@" -} - -function run() { - local binary=$1 - shift - bazel run "${binary}" -- "$@" -} - -function run_as_root() { - local binary=$1 - shift - bazel run --run_under="sudo" "${binary}" -- "$@" -} - -function collect_logs() { - # Zip out everything into a convenient form. - if [[ -v KOKORO_ARTIFACTS_DIR ]] && [[ -e bazel-testlogs ]]; then - # Merge results files of all shards for each test suite. 
- for d in `find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs dirname | sort | uniq`; do - junitparser merge `find $d -name test.xml` $d/test.xml - cat $d/shard_*_of_*/test.log > $d/test.log - ls -l $d/shard_*_of_*/test.outputs/outputs.zip && zip -r -1 $d/outputs.zip $d/shard_*_of_*/test.outputs/outputs.zip - done - find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs rm -rf - # Move test logs to Kokoro directory. tar is used to conveniently perform - # renames while moving files. - find -L "bazel-testlogs" -name "test.xml" -o -name "test.log" -o -name "outputs.zip" | - tar --create --files-from - --transform 's/test\./sponge_log./' | - tar --extract --directory ${KOKORO_ARTIFACTS_DIR} - - # Collect sentry logs, if any. - if [[ -v RUNSC_LOGS_DIR ]] && [[ -d "${RUNSC_LOGS_DIR}" ]]; then - # Check if the directory is empty or not (only the first line it needed). - local -r logs=$(ls "${RUNSC_LOGS_DIR}" | head -n1) - if [[ "${logs}" ]]; then - local -r archive=runsc_logs_"${RUNTIME}".tar.gz - if [[ -v KOKORO_BUILD_ARTIFACTS_SUBDIR ]]; then - echo "runsc logs will be uploaded to:" - echo " gsutil cp gs://gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive} /tmp" - echo " https://storage.cloud.google.com/gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive}" - fi - tar --create --gzip --file="${KOKORO_ARTIFACTS_DIR}/${archive}" -C "${RUNSC_LOGS_DIR}" . - fi - fi - fi -} - -function find_branch_name() { - git branch --show-current || git rev-parse HEAD || bazel info workspace | xargs basename -} diff --git a/scripts/common_build.sh b/scripts/common_build.sh new file mode 100755 index 000000000..a473a88a4 --- /dev/null +++ b/scripts/common_build.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +# Copyright 2019 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Install the latest version of Bazel and log the version. +(which use_bazel.sh && use_bazel.sh latest) || which bazel +bazel version + +# Switch into the workspace; only necessary if run with kokoro. +if [[ -v KOKORO_GIT_COMMIT ]] && [[ -d git/repo ]]; then + cd git/repo +elif [[ -v KOKORO_GIT_COMMIT ]] && [[ -d github/repo ]]; then + cd github/repo +fi + +# Set the standard bazel flags. +declare -r BAZEL_FLAGS=( + "--show_timestamps" + "--test_output=errors" + "--keep_going" + "--verbose_failures=true" +) +if [[ -v KOKORO_BAZEL_AUTH_CREDENTIAL ]]; then + declare -r BAZEL_RBE_AUTH_FLAGS=( + "--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}" + ) + declare -r BAZEL_RBE_FLAGS=("--config=remote") +fi + +# Wrap bazel. +function build() { + bazel build "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@" 2>&1 | + tee /dev/fd/2 | grep -E '^ bazel-bin/' | awk '{ print $1; }' +} + +function test() { + bazel test "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@" +} + +function run() { + local binary=$1 + shift + bazel run "${binary}" -- "$@" +} + +function run_as_root() { + local binary=$1 + shift + bazel run --run_under="sudo" "${binary}" -- "$@" +} + +function collect_logs() { + # Zip out everything into a convenient form. + if [[ -v KOKORO_ARTIFACTS_DIR ]] && [[ -e bazel-testlogs ]]; then + # Merge results files of all shards for each test suite. 
+ for d in `find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs dirname | sort | uniq`; do + junitparser merge `find $d -name test.xml` $d/test.xml + cat $d/shard_*_of_*/test.log > $d/test.log + ls -l $d/shard_*_of_*/test.outputs/outputs.zip && zip -r -1 $d/outputs.zip $d/shard_*_of_*/test.outputs/outputs.zip + done + find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs rm -rf + # Move test logs to Kokoro directory. tar is used to conveniently perform + # renames while moving files. + find -L "bazel-testlogs" -name "test.xml" -o -name "test.log" -o -name "outputs.zip" | + tar --create --files-from - --transform 's/test\./sponge_log./' | + tar --extract --directory ${KOKORO_ARTIFACTS_DIR} + + # Collect sentry logs, if any. + if [[ -v RUNSC_LOGS_DIR ]] && [[ -d "${RUNSC_LOGS_DIR}" ]]; then + # Check if the directory is empty or not (only the first line it needed). + local -r logs=$(ls "${RUNSC_LOGS_DIR}" | head -n1) + if [[ "${logs}" ]]; then + local -r archive=runsc_logs_"${RUNTIME}".tar.gz + if [[ -v KOKORO_BUILD_ARTIFACTS_SUBDIR ]]; then + echo "runsc logs will be uploaded to:" + echo " gsutil cp gs://gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive} /tmp" + echo " https://storage.cloud.google.com/gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive}" + fi + tar --create --gzip --file="${KOKORO_ARTIFACTS_DIR}/${archive}" -C "${RUNSC_LOGS_DIR}" . + fi + fi + fi +} + +function find_branch_name() { + git branch --show-current || git rev-parse HEAD || bazel info workspace | xargs basename +} diff --git a/test/BUILD b/test/BUILD index bf834d994..34b950644 100644 --- a/test/BUILD +++ b/test/BUILD @@ -1,44 +1 @@ -package(licenses = ["notice"]) # Apache 2.0 - -# We need to define a bazel platform and toolchain to specify dockerPrivileged -# and dockerRunAsRoot options, they are required to run tests on the RBE -# cluster in Kokoro. 
-alias( - name = "rbe_ubuntu1604", - actual = ":rbe_ubuntu1604_r346485", -) - -platform( - name = "rbe_ubuntu1604_r346485", - constraint_values = [ - "@bazel_tools//platforms:x86_64", - "@bazel_tools//platforms:linux", - "@bazel_tools//tools/cpp:clang", - "@bazel_toolchains//constraints:xenial", - "@bazel_toolchains//constraints/sanitizers:support_msan", - ], - remote_execution_properties = """ - properties: { - name: "container-image" - value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:93f7e127196b9b653d39830c50f8b05d49ef6fd8739a9b5b8ab16e1df5399e50" - } - properties: { - name: "dockerAddCapabilities" - value: "SYS_ADMIN" - } - properties: { - name: "dockerPrivileged" - value: "true" - } - """, -) - -toolchain( - name = "cc-toolchain-clang-x86_64-default", - exec_compatible_with = [ - ], - target_compatible_with = [ - ], - toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/10.0.0/bazel_2.0.0/cc:cc-compiler-k8", - toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", -) +package(licenses = ["notice"]) diff --git a/test/e2e/BUILD b/test/e2e/BUILD index 4fe03a220..76e04f878 100644 --- a/test/e2e/BUILD +++ b/test/e2e/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -10,7 +10,7 @@ go_test( "integration_test.go", "regression_test.go", ], - embed = [":integration"], + library = ":integration", tags = [ # Requires docker and runsc to be configured before the test runs. 
"manual", @@ -29,5 +29,4 @@ go_test( go_library( name = "integration", srcs = ["integration.go"], - importpath = "gvisor.dev/gvisor/test/integration", ) diff --git a/test/image/BUILD b/test/image/BUILD index 09b0a0ad5..7392ac54e 100644 --- a/test/image/BUILD +++ b/test/image/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -14,7 +14,7 @@ go_test( "ruby.rb", "ruby.sh", ], - embed = [":image"], + library = ":image", tags = [ # Requires docker and runsc to be configured before the test runs. "manual", @@ -30,5 +30,4 @@ go_test( go_library( name = "image", srcs = ["image.go"], - importpath = "gvisor.dev/gvisor/test/image", ) diff --git a/test/iptables/BUILD b/test/iptables/BUILD index 22f470092..6bb3b82b5 100644 --- a/test/iptables/BUILD +++ b/test/iptables/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -12,7 +12,6 @@ go_library( "iptables_util.go", "nat.go", ], - importpath = "gvisor.dev/gvisor/test/iptables", visibility = ["//test/iptables:__subpackages__"], deps = [ "//runsc/testutil", @@ -24,7 +23,7 @@ go_test( srcs = [ "iptables_test.go", ], - embed = [":iptables"], + library = ":iptables", tags = [ "local", "manual", diff --git a/test/iptables/runner/BUILD b/test/iptables/runner/BUILD index a5b6f082c..b9199387a 100644 --- a/test/iptables/runner/BUILD +++ b/test/iptables/runner/BUILD @@ -1,15 +1,21 @@ -load("@io_bazel_rules_docker//go:image.bzl", "go_image") -load("@io_bazel_rules_docker//container:container.bzl", "container_image") +load("//tools:defs.bzl", "container_image", "go_binary", "go_image") package(licenses = ["notice"]) +go_binary( + name = "runner", + testonly = 1, + srcs = ["main.go"], + deps = ["//test/iptables"], +) + container_image( name = "iptables-base", base = "@iptables-test//image", ) go_image( - 
name = "runner", + name = "runner-image", testonly = 1, srcs = ["main.go"], base = ":iptables-base", diff --git a/test/root/BUILD b/test/root/BUILD index d5dd9bca2..23ce2a70f 100644 --- a/test/root/BUILD +++ b/test/root/BUILD @@ -1,11 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "root", srcs = ["root.go"], - importpath = "gvisor.dev/gvisor/test/root", ) go_test( @@ -21,7 +20,7 @@ go_test( data = [ "//runsc", ], - embed = [":root"], + library = ":root", tags = [ # Requires docker and runsc to be configured before the test runs. # Also test only runs as root. diff --git a/test/root/testdata/BUILD b/test/root/testdata/BUILD index 125633680..bca5f9cab 100644 --- a/test/root/testdata/BUILD +++ b/test/root/testdata/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -12,7 +12,6 @@ go_library( "sandbox.go", "simple.go", ], - importpath = "gvisor.dev/gvisor/test/root/testdata", visibility = [ "//visibility:public", ], diff --git a/test/runtimes/BUILD b/test/runtimes/BUILD index 367295206..2c472bf8d 100644 --- a/test/runtimes/BUILD +++ b/test/runtimes/BUILD @@ -1,6 +1,6 @@ # These packages are used to run language runtime tests inside gVisor sandboxes. 
-load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_test") +load("//tools:defs.bzl", "go_binary", "go_test") load("//test/runtimes:build_defs.bzl", "runtime_test") package(licenses = ["notice"]) @@ -49,5 +49,5 @@ go_test( name = "blacklist_test", size = "small", srcs = ["blacklist_test.go"], - embed = [":runner"], + library = ":runner", ) diff --git a/test/runtimes/build_defs.bzl b/test/runtimes/build_defs.bzl index 6f84ca852..92e275a76 100644 --- a/test/runtimes/build_defs.bzl +++ b/test/runtimes/build_defs.bzl @@ -1,6 +1,6 @@ """Defines a rule for runtime test targets.""" -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_test", "loopback") def runtime_test( name, @@ -34,6 +34,7 @@ def runtime_test( ] data = [ ":runner", + loopback, ] if blacklist_file: args += ["--blacklist_file", "test/runtimes/" + blacklist_file] @@ -61,7 +62,7 @@ def blacklist_test(name, blacklist_file): """Test that a blacklist parses correctly.""" go_test( name = name + "_blacklist_test", - embed = [":runner"], + library = ":runner", srcs = ["blacklist_test.go"], args = ["--blacklist_file", "test/runtimes/" + blacklist_file], data = [blacklist_file], diff --git a/test/runtimes/images/proctor/BUILD b/test/runtimes/images/proctor/BUILD index 09dc6c42f..85e004c45 100644 --- a/test/runtimes/images/proctor/BUILD +++ b/test/runtimes/images/proctor/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_test") +load("//tools:defs.bzl", "go_binary", "go_test") package(licenses = ["notice"]) @@ -19,7 +19,7 @@ go_test( name = "proctor_test", size = "small", srcs = ["proctor_test.go"], - embed = [":proctor"], + library = ":proctor", deps = [ "//runsc/testutil", ], diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 90d52e73b..40e974314 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") 
load("//test/syscalls:build_defs.bzl", "syscall_test") package(licenses = ["notice"]) diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl index aaf77c65b..1df761dd0 100644 --- a/test/syscalls/build_defs.bzl +++ b/test/syscalls/build_defs.bzl @@ -1,5 +1,7 @@ """Defines a rule for syscall test targets.""" +load("//tools:defs.bzl", "loopback") + # syscall_test is a macro that will create targets to run the given test target # on the host (native) and runsc. def syscall_test( @@ -135,6 +137,7 @@ def _syscall_test( name = name, data = [ ":syscall_test_runner", + loopback, test, ], args = args, @@ -148,6 +151,3 @@ def sh_test(**kwargs): native.sh_test( **kwargs ) - -def select_for_linux(for_linux, for_others = []): - return for_linux diff --git a/test/syscalls/gtest/BUILD b/test/syscalls/gtest/BUILD index 9293f25cb..de4b2727c 100644 --- a/test/syscalls/gtest/BUILD +++ b/test/syscalls/gtest/BUILD @@ -1,12 +1,9 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "gtest", srcs = ["gtest.go"], - importpath = "gvisor.dev/gvisor/test/syscalls/gtest", - visibility = [ - "//test:__subpackages__", - ], + visibility = ["//:sandbox"], ) diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 4c7ec3f06..c2ef50c1d 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1,5 +1,4 @@ -load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") -load("//test/syscalls:build_defs.bzl", "select_for_linux") +load("//tools:defs.bzl", "cc_binary", "cc_library", "default_net_util", "select_system") package( default_visibility = ["//:sandbox"], @@ -126,13 +125,11 @@ cc_library( testonly = 1, srcs = [ "socket_test_util.cc", - ] + select_for_linux( - [ - "socket_test_util_impl.cc", - ], - ), + "socket_test_util_impl.cc", + ], hdrs = ["socket_test_util.h"], - deps = [ + defines = select_system(), + deps = default_net_util() + [ 
"@com_google_googletest//:gtest", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", @@ -143,8 +140,7 @@ cc_library( "//test/util:temp_path", "//test/util:test_util", "//test/util:thread_util", - ] + select_for_linux([ - ]), + ], ) cc_library( @@ -1443,6 +1439,7 @@ cc_binary( srcs = ["arch_prctl.cc"], linkstatic = 1, deps = [ + "//test/util:file_descriptor", "//test/util:test_main", "//test/util:test_util", "@com_google_googletest//:gtest", @@ -3383,11 +3380,11 @@ cc_library( name = "udp_socket_test_cases", testonly = 1, srcs = [ - "udp_socket_test_cases.cc", - ] + select_for_linux([ "udp_socket_errqueue_test_case.cc", - ]), + "udp_socket_test_cases.cc", + ], hdrs = ["udp_socket_test_cases.h"], + defines = select_system(), deps = [ ":socket_test_util", ":unix_domain_socket_test_util", diff --git a/test/syscalls/linux/arch_prctl.cc b/test/syscalls/linux/arch_prctl.cc index 81bf5a775..3a901faf5 100644 --- a/test/syscalls/linux/arch_prctl.cc +++ b/test/syscalls/linux/arch_prctl.cc @@ -14,8 +14,10 @@ #include #include +#include #include "gtest/gtest.h" +#include "test/util/file_descriptor.h" #include "test/util/test_util.h" // glibc does not provide a prototype for arch_prctl() so declare it here. diff --git a/test/syscalls/linux/rseq/BUILD b/test/syscalls/linux/rseq/BUILD index 5cfe4e56f..ed488dbc2 100644 --- a/test/syscalls/linux/rseq/BUILD +++ b/test/syscalls/linux/rseq/BUILD @@ -1,8 +1,7 @@ # This package contains a standalone rseq test binary. This binary must not # depend on libc, which might use rseq itself. 
-load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", "cc_flags_supplier") -load("@rules_cc//cc:defs.bzl", "cc_library") +load("//tools:defs.bzl", "cc_flags_supplier", "cc_library", "cc_toolchain") package(licenses = ["notice"]) @@ -37,8 +36,8 @@ genrule( "$(location start.S)", ]), toolchains = [ + cc_toolchain, ":no_pie_cc_flags", - "@bazel_tools//tools/cpp:current_cc_toolchain", ], visibility = ["//:sandbox"], ) diff --git a/test/syscalls/linux/udp_socket_errqueue_test_case.cc b/test/syscalls/linux/udp_socket_errqueue_test_case.cc index 147978f46..9a24e1df0 100644 --- a/test/syscalls/linux/udp_socket_errqueue_test_case.cc +++ b/test/syscalls/linux/udp_socket_errqueue_test_case.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifndef __fuchsia__ + #include "test/syscalls/linux/udp_socket_test_cases.h" #include @@ -52,3 +54,5 @@ TEST_P(UdpSocketTest, ErrorQueue) { } // namespace testing } // namespace gvisor + +#endif // __fuchsia__ diff --git a/test/uds/BUILD b/test/uds/BUILD index a3843e699..51e2c7ce8 100644 --- a/test/uds/BUILD +++ b/test/uds/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package( default_visibility = ["//:sandbox"], @@ -9,7 +9,6 @@ go_library( name = "uds", testonly = 1, srcs = ["uds.go"], - importpath = "gvisor.dev/gvisor/test/uds", deps = [ "//pkg/log", "//pkg/unet", diff --git a/test/util/BUILD b/test/util/BUILD index cbc728159..3c732be62 100644 --- a/test/util/BUILD +++ b/test/util/BUILD @@ -1,5 +1,4 @@ -load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test") -load("//test/syscalls:build_defs.bzl", "select_for_linux") +load("//tools:defs.bzl", "cc_library", "cc_test", "select_system") package( default_visibility = ["//:sandbox"], @@ -142,12 +141,13 @@ cc_library( cc_library( name = "save_util", testonly = 1, - srcs = ["save_util.cc"] + - select_for_linux( - ["save_util_linux.cc"], - 
["save_util_other.cc"], - ), + srcs = [ + "save_util.cc", + "save_util_linux.cc", + "save_util_other.cc", + ], hdrs = ["save_util.h"], + defines = select_system(), ) cc_library( @@ -234,13 +234,16 @@ cc_library( testonly = 1, srcs = [ "test_util.cc", - ] + select_for_linux( - [ - "test_util_impl.cc", - "test_util_runfiles.cc", + "test_util_impl.cc", + "test_util_runfiles.cc", + ], + hdrs = ["test_util.h"], + defines = select_system( + fuchsia = [ + "__opensource__", + "__fuchsia__", ], ), - hdrs = ["test_util.h"], deps = [ ":fs_util", ":logging", diff --git a/test/util/save_util_linux.cc b/test/util/save_util_linux.cc index cd56118c0..d0aea8e6a 100644 --- a/test/util/save_util_linux.cc +++ b/test/util/save_util_linux.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifdef __linux__ + #include #include #include @@ -43,3 +45,5 @@ void MaybeSave() { } // namespace testing } // namespace gvisor + +#endif diff --git a/test/util/save_util_other.cc b/test/util/save_util_other.cc index 1aca663b7..931af2c29 100644 --- a/test/util/save_util_other.cc +++ b/test/util/save_util_other.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifndef __linux__ + namespace gvisor { namespace testing { @@ -21,3 +23,5 @@ void MaybeSave() { } // namespace testing } // namespace gvisor + +#endif diff --git a/test/util/test_util_runfiles.cc b/test/util/test_util_runfiles.cc index 7210094eb..694d21692 100644 --- a/test/util/test_util_runfiles.cc +++ b/test/util/test_util_runfiles.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifndef __fuchsia__ + #include #include @@ -44,3 +46,5 @@ std::string RunfilePath(std::string path) { } // namespace testing } // namespace gvisor + +#endif // __fuchsia__ diff --git a/tools/BUILD b/tools/BUILD new file mode 100644 index 000000000..e73a9c885 --- /dev/null +++ b/tools/BUILD @@ -0,0 +1,3 @@ +package(licenses = ["notice"]) + +exports_files(["nogo.js"]) diff --git a/tools/build/BUILD b/tools/build/BUILD new file mode 100644 index 000000000..0c0ce3f4d --- /dev/null +++ b/tools/build/BUILD @@ -0,0 +1,10 @@ +package(licenses = ["notice"]) + +# In bazel, no special support is required for loopback networking. This is +# just a dummy data target that does not change the test environment. +genrule( + name = "loopback", + outs = ["loopback.txt"], + cmd = "touch $@", + visibility = ["//visibility:public"], +) diff --git a/tools/build/defs.bzl b/tools/build/defs.bzl new file mode 100644 index 000000000..d0556abd1 --- /dev/null +++ b/tools/build/defs.bzl @@ -0,0 +1,91 @@ +"""Bazel implementations of standard rules.""" + +load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", _cc_flags_supplier = "cc_flags_supplier") +load("@io_bazel_rules_go//go:def.bzl", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_library = "go_library", _go_test = "go_test", _go_tool_library = "go_tool_library") +load("@io_bazel_rules_go//proto:def.bzl", _go_proto_library = "go_proto_library") +load("@rules_cc//cc:defs.bzl", _cc_binary = "cc_binary", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test") +load("@rules_pkg//:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar") +load("@io_bazel_rules_docker//go:image.bzl", _go_image = "go_image") +load("@io_bazel_rules_docker//container:container.bzl", _container_image = "container_image") +load("@pydeps//:requirements.bzl", _py_requirement = "requirement") + +container_image = _container_image +cc_binary = _cc_binary +cc_library = _cc_library +cc_flags_supplier = _cc_flags_supplier 
+cc_proto_library = _cc_proto_library +cc_test = _cc_test +cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain" +go_image = _go_image +go_embed_data = _go_embed_data +loopback = "//tools/build:loopback" +proto_library = native.proto_library +pkg_deb = _pkg_deb +pkg_tar = _pkg_tar +py_library = native.py_library +py_binary = native.py_binary +py_test = native.py_test + +def go_binary(name, static = False, pure = False, **kwargs): + if static: + kwargs["static"] = "on" + if pure: + kwargs["pure"] = "on" + _go_binary( + name = name, + **kwargs + ) + +def go_library(name, **kwargs): + _go_library( + name = name, + importpath = "gvisor.dev/gvisor/" + native.package_name(), + **kwargs + ) + +def go_tool_library(name, **kwargs): + _go_tool_library( + name = name, + importpath = "gvisor.dev/gvisor/" + native.package_name(), + **kwargs + ) + +def go_proto_library(name, proto, **kwargs): + deps = kwargs.pop("deps", []) + _go_proto_library( + name = name, + importpath = "gvisor.dev/gvisor/" + native.package_name() + "/" + name, + proto = proto, + deps = [dep.replace("_proto", "_go_proto") for dep in deps], + **kwargs + ) + +def go_test(name, **kwargs): + library = kwargs.pop("library", None) + if library: + kwargs["embed"] = [library] + _go_test( + name = name, + **kwargs + ) + +def py_requirement(name, direct = False): + return _py_requirement(name) + +def select_arch(amd64 = "amd64", arm64 = "arm64", default = None, **kwargs): + values = { + "@bazel_tools//src/conditions:linux_x86_64": amd64, + "@bazel_tools//src/conditions:linux_aarch64": arm64, + } + if default: + values["//conditions:default"] = default + return select(values, **kwargs) + +def select_system(linux = ["__linux__"], **kwargs): + return linux # Only Linux supported. + +def default_installer(): + return None + +def default_net_util(): + return [] # Nothing needed. 
diff --git a/tools/checkunsafe/BUILD b/tools/checkunsafe/BUILD index d85c56131..92ba8ab06 100644 --- a/tools/checkunsafe/BUILD +++ b/tools/checkunsafe/BUILD @@ -1,11 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_tool_library") +load("//tools:defs.bzl", "go_tool_library") package(licenses = ["notice"]) go_tool_library( name = "checkunsafe", srcs = ["check_unsafe.go"], - importpath = "checkunsafe", visibility = ["//visibility:public"], deps = [ "@org_golang_x_tools//go/analysis:go_tool_library", diff --git a/tools/defs.bzl b/tools/defs.bzl new file mode 100644 index 000000000..819f12b0d --- /dev/null +++ b/tools/defs.bzl @@ -0,0 +1,154 @@ +"""Wrappers for common build rules. + +These wrappers apply common BUILD configurations (e.g., proto_library +automagically creating cc_ and go_ proto targets) and act as a single point of +change for Google-internal and bazel-compatible rules. +""" + +load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps") +load("//tools/build:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system") + +# Delegate directly. 
+cc_binary = _cc_binary +cc_library = _cc_library +cc_test = _cc_test +cc_toolchain = _cc_toolchain +cc_flags_supplier = _cc_flags_supplier +container_image = _container_image +go_embed_data = _go_embed_data +go_image = _go_image +go_test = _go_test +go_tool_library = _go_tool_library +pkg_deb = _pkg_deb +pkg_tar = _pkg_tar +py_library = _py_library +py_binary = _py_binary +py_test = _py_test +py_requirement = _py_requirement +select_arch = _select_arch +select_system = _select_system +loopback = _loopback +default_installer = _default_installer +default_net_util = _default_net_util + +def go_binary(name, **kwargs): + """Wraps the standard go_binary. + + Args: + name: the rule name. + **kwargs: standard go_binary arguments. + """ + _go_binary( + name = name, + **kwargs + ) + +def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, **kwargs): + """Wraps the standard go_library and does stateification and marshalling. + + The recommended way is to use this rule with mostly identical configuration as the native + go_library rule. + + These definitions provide additional flags (stateify, marshal) that can be used + with the generators to automatically supplement the library code. + + load("//tools:defs.bzl", "go_library") + + go_library( + name = "foo", + srcs = ["foo.go"], + ) + + Args: + name: the rule name. + srcs: the library sources. + deps: the library dependencies. + imports: imports required for stateify. + stateify: whether statify is enabled (default: true). + marshal: whether marshal is enabled (default: false). + **kwargs: standard go_library arguments. + """ + if stateify: + # Only do stateification for non-state packages without manual autogen. 
+ go_stateify( + name = name + "_state_autogen", + srcs = [src for src in srcs if src.endswith(".go")], + imports = imports, + package = name, + arch = select_arch(), + out = name + "_state_autogen.go", + ) + all_srcs = srcs + [name + "_state_autogen.go"] + if "//pkg/state" not in deps: + all_deps = deps + ["//pkg/state"] + else: + all_deps = deps + else: + all_deps = deps + all_srcs = srcs + if marshal: + go_marshal( + name = name + "_abi_autogen", + srcs = [src for src in srcs if src.endswith(".go")], + debug = False, + imports = imports, + package = name, + ) + extra_deps = [ + dep + for dep in marshal_deps + if not dep in all_deps + ] + all_deps = all_deps + extra_deps + all_srcs = srcs + [name + "_abi_autogen_unsafe.go"] + + _go_library( + name = name, + srcs = all_srcs, + deps = all_deps, + **kwargs + ) + + if marshal: + # Ignore importpath for go_test. + kwargs.pop("importpath", None) + + _go_test( + name = name + "_abi_autogen_test", + srcs = [name + "_abi_autogen_test.go"], + library = ":" + name, + deps = marshal_test_deps, + **kwargs + ) + +def proto_library(name, srcs, **kwargs): + """Wraps the standard proto_library. + + Given a proto_library named "foo", this produces three different targets: + - foo_proto: proto_library rule. + - foo_go_proto: go_proto_library rule. + - foo_cc_proto: cc_proto_library rule. + + Args: + srcs: the proto sources. + **kwargs: standard proto_library arguments. 
+ """ + deps = kwargs.pop("deps", []) + _proto_library( + name = name + "_proto", + srcs = srcs, + deps = deps, + **kwargs + ) + _go_proto_library( + name = name + "_go_proto", + proto = ":" + name + "_proto", + deps = deps, + **kwargs + ) + _cc_proto_library( + name = name + "_cc_proto", + deps = [":" + name + "_proto"], + **kwargs + ) diff --git a/tools/go_generics/BUILD b/tools/go_generics/BUILD index 39318b877..069df3856 100644 --- a/tools/go_generics/BUILD +++ b/tools/go_generics/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") package(licenses = ["notice"]) diff --git a/tools/go_generics/globals/BUILD b/tools/go_generics/globals/BUILD index 74853c7d2..38caa3ce7 100644 --- a/tools/go_generics/globals/BUILD +++ b/tools/go_generics/globals/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,6 +8,6 @@ go_library( "globals_visitor.go", "scope.go", ], - importpath = "gvisor.dev/gvisor/tools/go_generics/globals", + stateify = False, visibility = ["//tools/go_generics:__pkg__"], ) diff --git a/tools/go_generics/go_merge/BUILD b/tools/go_generics/go_merge/BUILD index 02b09120e..b7d35e272 100644 --- a/tools/go_generics/go_merge/BUILD +++ b/tools/go_generics/go_merge/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") package(licenses = ["notice"]) diff --git a/tools/go_generics/rules_tests/BUILD b/tools/go_generics/rules_tests/BUILD index 9d26a88b7..8a329dfc6 100644 --- a/tools/go_generics/rules_tests/BUILD +++ b/tools/go_generics/rules_tests/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") package(licenses = ["notice"]) diff --git a/tools/go_marshal/BUILD b/tools/go_marshal/BUILD index c862b277c..80d9c0504 
100644 --- a/tools/go_marshal/BUILD +++ b/tools/go_marshal/BUILD @@ -1,6 +1,6 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") -package(licenses = ["notice"]) +licenses(["notice"]) go_binary( name = "go_marshal", diff --git a/tools/go_marshal/README.md b/tools/go_marshal/README.md index 481575bd3..4886efddf 100644 --- a/tools/go_marshal/README.md +++ b/tools/go_marshal/README.md @@ -20,19 +20,7 @@ comment `// +marshal`. # Usage -See `defs.bzl`: two new rules are provided, `go_marshal` and `go_library`. - -The recommended way to generate a go library with marshalling is to use the -`go_library` with mostly identical configuration as the native go_library rule. - -``` -load("/gvisor/tools/go_marshal:defs.bzl", "go_library") - -go_library( - name = "foo", - srcs = ["foo.go"], -) -``` +See `defs.bzl`: a new rule is provided, `go_marshal`. Under the hood, the `go_marshal` rule is used to generate a file that will appear in a Go target; the output file should appear explicitly in a srcs list. @@ -54,11 +42,7 @@ go_library( "foo.go", "foo_abi.go", ], - deps = [ - "/gvisor/pkg/abi", - "/gvisor/pkg/sentry/safemem/safemem", - "/gvisor/pkg/sentry/usermem/usermem", - ], + ... ) ``` @@ -69,22 +53,6 @@ These tests use reflection to verify properties of the ABI struct, and should be considered part of the generated interfaces (but are too expensive to execute at runtime). Ensure these tests run at some point. -``` -$ cat BUILD -load("/gvisor/tools/go_marshal:defs.bzl", "go_library") - -go_library( - name = "foo", - srcs = ["foo.go"], -) -$ blaze build :foo -$ blaze query ... -:foo_abi_autogen -:foo_abi_autogen_test -$ blaze test :foo_abi_autogen_test - -``` - # Restrictions Not all valid go type definitions can be used with `go_marshal`. `go_marshal` is @@ -131,22 +99,6 @@ for embedded structs that are not aligned. Because of this, it's generally best to avoid using `marshal:"unaligned"` and insert explicit padding fields instead. 
-## Debugging go_marshal - -To enable debugging output from the go marshal tool, pass the `-debug` flag to -the tool. When using the build rules from above, add a `debug = True` field to -the build rule like this: - -``` -load("/gvisor/tools/go_marshal:defs.bzl", "go_library") - -go_library( - name = "foo", - srcs = ["foo.go"], - debug = True, -) -``` - ## Modifying the `go_marshal` Tool The following are some guidelines for modifying the `go_marshal` tool: diff --git a/tools/go_marshal/analysis/BUILD b/tools/go_marshal/analysis/BUILD index c859ced77..c2a4d45c4 100644 --- a/tools/go_marshal/analysis/BUILD +++ b/tools/go_marshal/analysis/BUILD @@ -1,12 +1,11 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "analysis", testonly = 1, srcs = ["analysis_unsafe.go"], - importpath = "gvisor.dev/gvisor/tools/go_marshal/analysis", visibility = [ "//:sandbox", ], diff --git a/tools/go_marshal/defs.bzl b/tools/go_marshal/defs.bzl index c32eb559f..2918ceffe 100644 --- a/tools/go_marshal/defs.bzl +++ b/tools/go_marshal/defs.bzl @@ -1,57 +1,14 @@ -"""Marshal is a tool for generating marshalling interfaces for Go types. - -The recommended way is to use the go_library rule defined below with mostly -identical configuration as the native go_library rule. - -load("//tools/go_marshal:defs.bzl", "go_library") - -go_library( - name = "foo", - srcs = ["foo.go"], -) - -Under the hood, the go_marshal rule is used to generate a file that will -appear in a Go target; the output file should appear explicitly in a srcs list. 
-For example (the above is still the preferred way): - -load("//tools/go_marshal:defs.bzl", "go_marshal") - -go_marshal( - name = "foo_abi", - srcs = ["foo.go"], - out = "foo_abi.go", - package = "foo", -) - -go_library( - name = "foo", - srcs = [ - "foo.go", - "foo_abi.go", - ], - deps = [ - "//tools/go_marshal:marshal", - "//pkg/sentry/platform/safecopy", - "//pkg/sentry/usermem", - ], -) -""" - -load("@io_bazel_rules_go//go:def.bzl", _go_library = "go_library", _go_test = "go_test") +"""Marshal is a tool for generating marshalling interfaces for Go types.""" def _go_marshal_impl(ctx): """Execute the go_marshal tool.""" output = ctx.outputs.lib output_test = ctx.outputs.test - (build_dir, _, _) = ctx.build_file_path.rpartition("/BUILD") - - decl = "/".join(["gvisor.dev/gvisor", build_dir]) # Run the marshal command. args = ["-output=%s" % output.path] args += ["-pkg=%s" % ctx.attr.package] args += ["-output_test=%s" % output_test.path] - args += ["-declarationPkg=%s" % decl] if ctx.attr.debug: args += ["-debug"] @@ -83,7 +40,6 @@ go_marshal = rule( implementation = _go_marshal_impl, attrs = { "srcs": attr.label_list(mandatory = True, allow_files = True), - "libname": attr.string(mandatory = True), "imports": attr.string_list(mandatory = False), "package": attr.string(mandatory = True), "debug": attr.bool(doc = "enable debugging output from the go_marshal tool"), @@ -95,58 +51,14 @@ go_marshal = rule( }, ) -def go_library(name, srcs, deps = [], imports = [], debug = False, **kwargs): - """wraps the standard go_library and does mashalling interface generation. - - Args: - name: Same as native go_library. - srcs: Same as native go_library. - deps: Same as native go_library. - imports: Extra import paths to pass to the go_marshal tool. - debug: Enables debugging output from the go_marshal tool. - **kwargs: Remaining args to pass to the native go_library rule unmodified. 
- """ - go_marshal( - name = name + "_abi_autogen", - libname = name, - srcs = [src for src in srcs if src.endswith(".go")], - debug = debug, - imports = imports, - package = name, - ) - - extra_deps = [ - "//tools/go_marshal/marshal", - "//pkg/sentry/platform/safecopy", - "//pkg/sentry/usermem", - ] - - all_srcs = srcs + [name + "_abi_autogen_unsafe.go"] - all_deps = deps + [] # + extra_deps - - for extra in extra_deps: - if extra not in deps: - all_deps.append(extra) - - _go_library( - name = name, - srcs = all_srcs, - deps = all_deps, - **kwargs - ) - - # Don't pass importpath arg to go_test. - kwargs.pop("importpath", "") - - _go_test( - name = name + "_abi_autogen_test", - srcs = [name + "_abi_autogen_test.go"], - # Generated test has a fixed set of dependencies since we generate these - # tests. They should only depend on the library generated above, and the - # Marshallable interface. - deps = [ - ":" + name, - "//tools/go_marshal/analysis", - ], - **kwargs - ) +# marshal_deps are the dependencies requied by generated code. +marshal_deps = [ + "//tools/go_marshal/marshal", + "//pkg/sentry/platform/safecopy", + "//pkg/sentry/usermem", +] + +# marshal_test_deps are required by test targets. 
+marshal_test_deps = [ + "//tools/go_marshal/analysis", +] diff --git a/tools/go_marshal/gomarshal/BUILD b/tools/go_marshal/gomarshal/BUILD index a0eae6492..c92b59dd6 100644 --- a/tools/go_marshal/gomarshal/BUILD +++ b/tools/go_marshal/gomarshal/BUILD @@ -1,6 +1,6 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "gomarshal", @@ -10,7 +10,7 @@ go_library( "generator_tests.go", "util.go", ], - importpath = "gvisor.dev/gvisor/tools/go_marshal/gomarshal", + stateify = False, visibility = [ "//:sandbox", ], diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go index 641ccd938..8392f3f6d 100644 --- a/tools/go_marshal/gomarshal/generator.go +++ b/tools/go_marshal/gomarshal/generator.go @@ -62,15 +62,12 @@ type Generator struct { outputTest *os.File // Package name for the generated file. pkg string - // Go import path for package we're processing. This package should directly - // declare the type we're generating code for. - declaration string // Set of extra packages to import in the generated file. imports *importTable } // NewGenerator creates a new code Generator. 
-func NewGenerator(srcs []string, out, outTest, pkg, declaration string, imports []string) (*Generator, error) { +func NewGenerator(srcs []string, out, outTest, pkg string, imports []string) (*Generator, error) { f, err := os.OpenFile(out, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) if err != nil { return nil, fmt.Errorf("Couldn't open output file %q: %v", out, err) @@ -80,12 +77,11 @@ func NewGenerator(srcs []string, out, outTest, pkg, declaration string, imports return nil, fmt.Errorf("Couldn't open test output file %q: %v", out, err) } g := Generator{ - inputs: srcs, - output: f, - outputTest: fTest, - pkg: pkg, - declaration: declaration, - imports: newImportTable(), + inputs: srcs, + output: f, + outputTest: fTest, + pkg: pkg, + imports: newImportTable(), } for _, i := range imports { // All imports on the extra imports list are unconditionally marked as @@ -264,7 +260,7 @@ func (g *Generator) generateOne(t *ast.TypeSpec, fset *token.FileSet) *interface // generateOneTestSuite generates a test suite for the automatically generated // implementations type t. func (g *Generator) generateOneTestSuite(t *ast.TypeSpec) *testGenerator { - i := newTestGenerator(t, g.declaration) + i := newTestGenerator(t) i.emitTests() return i } @@ -359,7 +355,7 @@ func (g *Generator) Run() error { // source file. 
func (g *Generator) writeTests(ts []*testGenerator) error { var b sourceBuffer - b.emit("package %s_test\n\n", g.pkg) + b.emit("package %s\n\n", g.pkg) if err := b.write(g.outputTest); err != nil { return err } diff --git a/tools/go_marshal/gomarshal/generator_tests.go b/tools/go_marshal/gomarshal/generator_tests.go index df25cb5b2..bcda17c3b 100644 --- a/tools/go_marshal/gomarshal/generator_tests.go +++ b/tools/go_marshal/gomarshal/generator_tests.go @@ -46,7 +46,7 @@ type testGenerator struct { decl *importStmt } -func newTestGenerator(t *ast.TypeSpec, declaration string) *testGenerator { +func newTestGenerator(t *ast.TypeSpec) *testGenerator { if _, ok := t.Type.(*ast.StructType); !ok { panic(fmt.Sprintf("Attempting to generate code for a not struct type %v", t)) } @@ -59,14 +59,12 @@ func newTestGenerator(t *ast.TypeSpec, declaration string) *testGenerator { for _, i := range standardImports { g.imports.add(i).markUsed() } - g.decl = g.imports.add(declaration) - g.decl.markUsed() return g } func (g *testGenerator) typeName() string { - return fmt.Sprintf("%s.%s", g.decl.name, g.t.Name.Name) + return g.t.Name.Name } func (g *testGenerator) forEachField(fn func(f *ast.Field)) { diff --git a/tools/go_marshal/main.go b/tools/go_marshal/main.go index 3d12eb93c..e1a97b311 100644 --- a/tools/go_marshal/main.go +++ b/tools/go_marshal/main.go @@ -31,11 +31,10 @@ import ( ) var ( - pkg = flag.String("pkg", "", "output package") - output = flag.String("output", "", "output file") - outputTest = flag.String("output_test", "", "output file for tests") - imports = flag.String("imports", "", "comma-separated list of extra packages to import in generated code") - declarationPkg = flag.String("declarationPkg", "", "import path of target declaring the types we're generating on") + pkg = flag.String("pkg", "", "output package") + output = flag.String("output", "", "output file") + outputTest = flag.String("output_test", "", "output file for tests") + imports = 
flag.String("imports", "", "comma-separated list of extra packages to import in generated code") ) func main() { @@ -62,7 +61,7 @@ func main() { // as an import. extraImports = strings.Split(*imports, ",") } - g, err := gomarshal.NewGenerator(flag.Args(), *output, *outputTest, *pkg, *declarationPkg, extraImports) + g, err := gomarshal.NewGenerator(flag.Args(), *output, *outputTest, *pkg, extraImports) if err != nil { panic(err) } diff --git a/tools/go_marshal/marshal/BUILD b/tools/go_marshal/marshal/BUILD index 47dda97a1..ad508c72f 100644 --- a/tools/go_marshal/marshal/BUILD +++ b/tools/go_marshal/marshal/BUILD @@ -1,13 +1,12 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "marshal", srcs = [ "marshal.go", ], - importpath = "gvisor.dev/gvisor/tools/go_marshal/marshal", visibility = [ "//:sandbox", ], diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD index d412e1ccf..38ba49fed 100644 --- a/tools/go_marshal/test/BUILD +++ b/tools/go_marshal/test/BUILD @@ -1,7 +1,6 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_marshal:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) +licenses(["notice"]) package_group( name = "gomarshal_test", @@ -25,6 +24,6 @@ go_library( name = "test", testonly = 1, srcs = ["test.go"], - importpath = "gvisor.dev/gvisor/tools/go_marshal/test", + marshal = True, deps = ["//tools/go_marshal/test/external"], ) diff --git a/tools/go_marshal/test/external/BUILD b/tools/go_marshal/test/external/BUILD index 9bb89e1da..0cf6da603 100644 --- a/tools/go_marshal/test/external/BUILD +++ b/tools/go_marshal/test/external/BUILD @@ -1,11 +1,11 @@ -load("//tools/go_marshal:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "external", testonly = 1, srcs = 
["external.go"], - importpath = "gvisor.dev/gvisor/tools/go_marshal/test/external", + marshal = True, visibility = ["//tools/go_marshal/test:gomarshal_test"], ) diff --git a/tools/go_stateify/BUILD b/tools/go_stateify/BUILD index bb53f8ae9..a133d6f8b 100644 --- a/tools/go_stateify/BUILD +++ b/tools/go_stateify/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") package(licenses = ["notice"]) diff --git a/tools/go_stateify/defs.bzl b/tools/go_stateify/defs.bzl index 33267c074..0f261d89f 100644 --- a/tools/go_stateify/defs.bzl +++ b/tools/go_stateify/defs.bzl @@ -1,41 +1,4 @@ -"""Stateify is a tool for generating state wrappers for Go types. - -The recommended way is to use the go_library rule defined below with mostly -identical configuration as the native go_library rule. - -load("//tools/go_stateify:defs.bzl", "go_library") - -go_library( - name = "foo", - srcs = ["foo.go"], -) - -Under the hood, the go_stateify rule is used to generate a file that will -appear in a Go target; the output file should appear explicitly in a srcs list. -For example (the above is still the preferred way): - -load("//tools/go_stateify:defs.bzl", "go_stateify") - -go_stateify( - name = "foo_state", - srcs = ["foo.go"], - out = "foo_state.go", - package = "foo", -) - -go_library( - name = "foo", - srcs = [ - "foo.go", - "foo_state.go", - ], - deps = [ - "//pkg/state", - ], -) -""" - -load("@io_bazel_rules_go//go:def.bzl", _go_library = "go_library") +"""Stateify is a tool for generating state wrappers for Go types.""" def _go_stateify_impl(ctx): """Implementation for the stateify tool.""" @@ -103,43 +66,3 @@ files and must be added to the srcs of the relevant go_library. "_statepkg": attr.string(default = "gvisor.dev/gvisor/pkg/state"), }, ) - -def go_library(name, srcs, deps = [], imports = [], **kwargs): - """Standard go_library wrapped which generates state source files. - - Args: - name: the name of the go_library rule. 
- srcs: sources of the go_library. Each will be processed for stateify - annotations. - deps: dependencies for the go_library. - imports: an optional list of extra non-aliased, Go-style absolute import - paths required for stateified types. - **kwargs: passed to go_library. - """ - if "encode_unsafe.go" not in srcs and (name + "_state_autogen.go") not in srcs: - # Only do stateification for non-state packages without manual autogen. - go_stateify( - name = name + "_state_autogen", - srcs = [src for src in srcs if src.endswith(".go")], - imports = imports, - package = name, - arch = select({ - "@bazel_tools//src/conditions:linux_aarch64": "arm64", - "//conditions:default": "amd64", - }), - out = name + "_state_autogen.go", - ) - all_srcs = srcs + [name + "_state_autogen.go"] - if "//pkg/state" not in deps: - all_deps = deps + ["//pkg/state"] - else: - all_deps = deps - else: - all_deps = deps - all_srcs = srcs - _go_library( - name = name, - srcs = all_srcs, - deps = all_deps, - **kwargs - ) diff --git a/tools/images/BUILD b/tools/images/BUILD index 2b77c2737..f1699b184 100644 --- a/tools/images/BUILD +++ b/tools/images/BUILD @@ -1,4 +1,4 @@ -load("@rules_cc//cc:defs.bzl", "cc_binary") +load("//tools:defs.bzl", "cc_binary") load("//tools/images:defs.bzl", "vm_image", "vm_test") package( diff --git a/tools/images/defs.bzl b/tools/images/defs.bzl index d8e422a5d..32235813a 100644 --- a/tools/images/defs.bzl +++ b/tools/images/defs.bzl @@ -28,6 +28,8 @@ The vm_test rule can be used to execute a command remotely. 
For example, ) """ +load("//tools:defs.bzl", "default_installer") + def _vm_image_impl(ctx): script_paths = [] for script in ctx.files.scripts: @@ -165,8 +167,8 @@ def vm_test( targets = kwargs.pop("targets", []) if installer: targets = [installer] + targets - targets = [ - ] + targets + if default_installer(): + targets = [default_installer()] + targets _vm_test( tags = [ "local", diff --git a/tools/issue_reviver/BUILD b/tools/issue_reviver/BUILD index ee7ea11fd..4ef1a3124 100644 --- a/tools/issue_reviver/BUILD +++ b/tools/issue_reviver/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") package(licenses = ["notice"]) diff --git a/tools/issue_reviver/github/BUILD b/tools/issue_reviver/github/BUILD index 6da22ba1c..da4133472 100644 --- a/tools/issue_reviver/github/BUILD +++ b/tools/issue_reviver/github/BUILD @@ -1,11 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "github", srcs = ["github.go"], - importpath = "gvisor.dev/gvisor/tools/issue_reviver/github", visibility = [ "//tools/issue_reviver:__subpackages__", ], diff --git a/tools/issue_reviver/reviver/BUILD b/tools/issue_reviver/reviver/BUILD index 2c3675977..d262932bd 100644 --- a/tools/issue_reviver/reviver/BUILD +++ b/tools/issue_reviver/reviver/BUILD @@ -1,11 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "reviver", srcs = ["reviver.go"], - importpath = "gvisor.dev/gvisor/tools/issue_reviver/reviver", visibility = [ "//tools/issue_reviver:__subpackages__", ], @@ -15,5 +14,5 @@ go_test( name = "reviver_test", size = "small", srcs = ["reviver_test.go"], - embed = [":reviver"], + library = ":reviver", ) diff --git a/tools/workspace_status.sh b/tools/workspace_status.sh index fb09ff331..a22c8c9f2 100755 --- 
a/tools/workspace_status.sh +++ b/tools/workspace_status.sh @@ -15,4 +15,4 @@ # limitations under the License. # The STABLE_ prefix will trigger a re-link if it changes. -echo STABLE_VERSION $(git describe --always --tags --abbrev=12 --dirty) +echo STABLE_VERSION $(git describe --always --tags --abbrev=12 --dirty || echo 0.0.0) diff --git a/vdso/BUILD b/vdso/BUILD index 2b6744c26..d37d4266d 100644 --- a/vdso/BUILD +++ b/vdso/BUILD @@ -3,20 +3,10 @@ # normal system VDSO (time, gettimeofday, clock_gettimeofday) but which uses # timekeeping parameters managed by the sandbox kernel. -load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", "cc_flags_supplier") +load("//tools:defs.bzl", "cc_flags_supplier", "cc_toolchain", "select_arch") package(licenses = ["notice"]) -config_setting( - name = "x86_64", - constraint_values = ["@bazel_tools//platforms:x86_64"], -) - -config_setting( - name = "aarch64", - constraint_values = ["@bazel_tools//platforms:aarch64"], -) - genrule( name = "vdso", srcs = [ @@ -39,14 +29,15 @@ genrule( "-O2 " + "-std=c++11 " + "-fPIC " + + "-fno-sanitize=all " + # Some toolchains enable stack protector by default. Disable it, the # VDSO has no hooks to handle failures. 
"-fno-stack-protector " + "-fuse-ld=gold " + - select({ - ":x86_64": "-m64 ", - "//conditions:default": "", - }) + + select_arch( + amd64 = "-m64 ", + arm64 = "", + ) + "-shared " + "-nostdlib " + "-Wl,-soname=linux-vdso.so.1 " + @@ -55,12 +46,10 @@ genrule( "-Wl,-Bsymbolic " + "-Wl,-z,max-page-size=4096 " + "-Wl,-z,common-page-size=4096 " + - select( - { - ":x86_64": "-Wl,-T$(location vdso_amd64.lds) ", - ":aarch64": "-Wl,-T$(location vdso_arm64.lds) ", - }, - no_match_error = "Unsupported architecture", + select_arch( + amd64 = "-Wl,-T$(location vdso_amd64.lds) ", + arm64 = "-Wl,-T$(location vdso_arm64.lds) ", + no_match_error = "unsupported architecture", ) + "-o $(location vdso.so) " + "$(location vdso.cc) " + @@ -73,7 +62,7 @@ genrule( ], features = ["-pie"], toolchains = [ - "@bazel_tools//tools/cpp:current_cc_toolchain", + cc_toolchain, ":no_pie_cc_flags", ], visibility = ["//:sandbox"], -- cgit v1.2.3 From 90ec5961667a1c4a21702e64adb383403af8ad25 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 27 Jan 2020 13:22:50 -0800 Subject: Fix licenses. The preferred Copyright holder is "The gVisor Authors". PiperOrigin-RevId: 291786657 --- pkg/sentry/kernel/fd_table.go | 2 +- pkg/sentry/kernel/fd_table_test.go | 2 +- pkg/sentry/kernel/fd_table_unsafe.go | 2 +- pkg/sentry/platform/ring0/entry_arm64.go | 2 +- pkg/sentry/platform/ring0/kernel_arm64.go | 2 +- pkg/sentry/platform/ring0/lib_arm64.go | 2 +- pkg/sentry/platform/ring0/offsets_arm64.go | 2 +- pkg/tcpip/iptables/iptables.go | 2 +- pkg/tcpip/iptables/types.go | 2 +- runsc/cmd/help.go | 2 +- tools/go_marshal/main.go | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index cd1501f85..0ad4135b3 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go index eccb7d1e7..86164df49 100644 --- a/pkg/sentry/kernel/fd_table_test.go +++ b/pkg/sentry/kernel/fd_table_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go index e009df974..e9fdb0917 100644 --- a/pkg/sentry/kernel/fd_table_unsafe.go +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/entry_arm64.go b/pkg/sentry/platform/ring0/entry_arm64.go index 0dfa42c36..62a93f3d6 100644 --- a/pkg/sentry/platform/ring0/entry_arm64.go +++ b/pkg/sentry/platform/ring0/entry_arm64.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google Inc. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go index ed82a131e..c3d341998 100644 --- a/pkg/sentry/platform/ring0/kernel_arm64.go +++ b/pkg/sentry/platform/ring0/kernel_arm64.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google Inc. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go index 8bcfe1032..af075aae4 100644 --- a/pkg/sentry/platform/ring0/lib_arm64.go +++ b/pkg/sentry/platform/ring0/lib_arm64.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google Inc. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go index cd2a65f97..8c960c749 100644 --- a/pkg/sentry/platform/ring0/offsets_arm64.go +++ b/pkg/sentry/platform/ring0/offsets_arm64.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google Inc. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go index fc06b5b87..4bfb3149e 100644 --- a/pkg/tcpip/iptables/iptables.go +++ b/pkg/tcpip/iptables/iptables.go @@ -1,4 +1,4 @@ -// Copyright 2019 The gVisor authors. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go index a8b972f1b..50893cc55 100644 --- a/pkg/tcpip/iptables/types.go +++ b/pkg/tcpip/iptables/types.go @@ -1,4 +1,4 @@ -// Copyright 2019 The gVisor authors. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/help.go b/runsc/cmd/help.go index ff4f901cb..930e8454f 100644 --- a/runsc/cmd/help.go +++ b/runsc/cmd/help.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_marshal/main.go b/tools/go_marshal/main.go index e1a97b311..f74be5c29 100644 --- a/tools/go_marshal/main.go +++ b/tools/go_marshal/main.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. -- cgit v1.2.3 From 0e2f1b7abd219f39d67cc2cecd00c441a13eeb29 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 27 Jan 2020 15:17:58 -0800 Subject: Update package locations. Because the abi will depend on the core types for marshalling (usermem, context, safemem, safecopy), these need to be flattened from the sentry directory. These packages contain no sentry-specific details. PiperOrigin-RevId: 291811289 --- pkg/abi/abi.go | 4 + pkg/context/BUILD | 13 + pkg/context/context.go | 141 +++++ pkg/safecopy/BUILD | 29 + pkg/safecopy/LICENSE | 27 + pkg/safecopy/atomic_amd64.s | 136 +++++ pkg/safecopy/atomic_arm64.s | 126 +++++ pkg/safecopy/memclr_amd64.s | 147 +++++ pkg/safecopy/memclr_arm64.s | 74 +++ pkg/safecopy/memcpy_amd64.s | 250 +++++++++ pkg/safecopy/memcpy_arm64.s | 78 +++ pkg/safecopy/safecopy.go | 144 +++++ pkg/safecopy/safecopy_test.go | 617 +++++++++++++++++++++ pkg/safecopy/safecopy_unsafe.go | 335 +++++++++++ pkg/safecopy/sighandler_amd64.s | 133 +++++ pkg/safecopy/sighandler_arm64.s | 143 +++++ pkg/safemem/BUILD | 27 + pkg/safemem/block_unsafe.go | 279 ++++++++++ pkg/safemem/io.go | 392 +++++++++++++ pkg/safemem/io_test.go | 199 +++++++ pkg/safemem/safemem.go | 16 + pkg/safemem/seq_test.go | 196 +++++++ pkg/safemem/seq_unsafe.go | 299 ++++++++++ pkg/sentry/arch/BUILD | 4 +- pkg/sentry/arch/arch.go | 2 +- pkg/sentry/arch/arch_aarch64.go | 2 +- pkg/sentry/arch/arch_amd64.go | 2 +- pkg/sentry/arch/arch_arm64.go | 2 +- 
pkg/sentry/arch/arch_state_x86.go | 2 +- pkg/sentry/arch/arch_x86.go | 2 +- pkg/sentry/arch/auxv.go | 2 +- pkg/sentry/arch/signal.go | 2 +- pkg/sentry/arch/signal_amd64.go | 2 +- pkg/sentry/arch/signal_arm64.go | 2 +- pkg/sentry/arch/signal_stack.go | 2 +- pkg/sentry/arch/stack.go | 4 +- pkg/sentry/context/BUILD | 13 - pkg/sentry/context/context.go | 141 ----- pkg/sentry/context/contexttest/BUILD | 21 - pkg/sentry/context/contexttest/contexttest.go | 188 ------- pkg/sentry/contexttest/BUILD | 21 + pkg/sentry/contexttest/contexttest.go | 188 +++++++ pkg/sentry/fs/BUILD | 12 +- pkg/sentry/fs/anon/BUILD | 4 +- pkg/sentry/fs/anon/anon.go | 4 +- pkg/sentry/fs/attr.go | 2 +- pkg/sentry/fs/context.go | 2 +- pkg/sentry/fs/copy_up.go | 4 +- pkg/sentry/fs/copy_up_test.go | 2 +- pkg/sentry/fs/dev/BUILD | 6 +- pkg/sentry/fs/dev/dev.go | 4 +- pkg/sentry/fs/dev/fs.go | 2 +- pkg/sentry/fs/dev/full.go | 4 +- pkg/sentry/fs/dev/null.go | 2 +- pkg/sentry/fs/dev/random.go | 6 +- pkg/sentry/fs/dev/tty.go | 2 +- pkg/sentry/fs/dirent.go | 2 +- pkg/sentry/fs/dirent_refs_test.go | 4 +- pkg/sentry/fs/fdpipe/BUILD | 12 +- pkg/sentry/fs/fdpipe/pipe.go | 6 +- pkg/sentry/fs/fdpipe/pipe_opener.go | 2 +- pkg/sentry/fs/fdpipe/pipe_opener_test.go | 6 +- pkg/sentry/fs/fdpipe/pipe_state.go | 2 +- pkg/sentry/fs/fdpipe/pipe_test.go | 4 +- pkg/sentry/fs/file.go | 4 +- pkg/sentry/fs/file_operations.go | 4 +- pkg/sentry/fs/file_overlay.go | 4 +- pkg/sentry/fs/file_overlay_test.go | 2 +- pkg/sentry/fs/filesystems.go | 2 +- pkg/sentry/fs/filetest/BUILD | 6 +- pkg/sentry/fs/filetest/filetest.go | 6 +- pkg/sentry/fs/fs.go | 2 +- pkg/sentry/fs/fsutil/BUILD | 14 +- pkg/sentry/fs/fsutil/dirty_set.go | 6 +- pkg/sentry/fs/fsutil/dirty_set_test.go | 2 +- pkg/sentry/fs/fsutil/file.go | 4 +- pkg/sentry/fs/fsutil/file_range_set.go | 6 +- pkg/sentry/fs/fsutil/host_file_mapper.go | 4 +- pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go | 2 +- pkg/sentry/fs/fsutil/host_mappable.go | 6 +- pkg/sentry/fs/fsutil/inode.go | 2 
+- pkg/sentry/fs/fsutil/inode_cached.go | 6 +- pkg/sentry/fs/fsutil/inode_cached_test.go | 8 +- pkg/sentry/fs/gofer/BUILD | 10 +- pkg/sentry/fs/gofer/attr.go | 4 +- pkg/sentry/fs/gofer/cache_policy.go | 2 +- pkg/sentry/fs/gofer/context_file.go | 2 +- pkg/sentry/fs/gofer/file.go | 4 +- pkg/sentry/fs/gofer/file_state.go | 2 +- pkg/sentry/fs/gofer/fs.go | 2 +- pkg/sentry/fs/gofer/gofer_test.go | 4 +- pkg/sentry/fs/gofer/handles.go | 4 +- pkg/sentry/fs/gofer/inode.go | 4 +- pkg/sentry/fs/gofer/inode_state.go | 2 +- pkg/sentry/fs/gofer/path.go | 2 +- pkg/sentry/fs/gofer/session.go | 2 +- pkg/sentry/fs/gofer/session_state.go | 2 +- pkg/sentry/fs/gofer/socket.go | 2 +- pkg/sentry/fs/gofer/util.go | 2 +- pkg/sentry/fs/host/BUILD | 12 +- pkg/sentry/fs/host/control.go | 2 +- pkg/sentry/fs/host/file.go | 6 +- pkg/sentry/fs/host/fs.go | 2 +- pkg/sentry/fs/host/fs_test.go | 4 +- pkg/sentry/fs/host/inode.go | 4 +- pkg/sentry/fs/host/inode_state.go | 2 +- pkg/sentry/fs/host/inode_test.go | 2 +- pkg/sentry/fs/host/socket.go | 2 +- pkg/sentry/fs/host/socket_test.go | 4 +- pkg/sentry/fs/host/tty.go | 4 +- pkg/sentry/fs/host/wait_test.go | 2 +- pkg/sentry/fs/inode.go | 2 +- pkg/sentry/fs/inode_operations.go | 2 +- pkg/sentry/fs/inode_overlay.go | 2 +- pkg/sentry/fs/inode_overlay_test.go | 2 +- pkg/sentry/fs/inotify.go | 4 +- pkg/sentry/fs/inotify_event.go | 4 +- pkg/sentry/fs/mock.go | 2 +- pkg/sentry/fs/mount.go | 2 +- pkg/sentry/fs/mount_overlay.go | 2 +- pkg/sentry/fs/mount_test.go | 2 +- pkg/sentry/fs/mounts.go | 2 +- pkg/sentry/fs/mounts_test.go | 2 +- pkg/sentry/fs/offset.go | 2 +- pkg/sentry/fs/overlay.go | 4 +- pkg/sentry/fs/proc/BUILD | 8 +- pkg/sentry/fs/proc/cgroup.go | 2 +- pkg/sentry/fs/proc/cpuinfo.go | 2 +- pkg/sentry/fs/proc/exec_args.go | 4 +- pkg/sentry/fs/proc/fds.go | 2 +- pkg/sentry/fs/proc/filesystems.go | 2 +- pkg/sentry/fs/proc/fs.go | 2 +- pkg/sentry/fs/proc/inode.go | 4 +- pkg/sentry/fs/proc/loadavg.go | 2 +- pkg/sentry/fs/proc/meminfo.go | 4 +- 
pkg/sentry/fs/proc/mounts.go | 2 +- pkg/sentry/fs/proc/net.go | 4 +- pkg/sentry/fs/proc/proc.go | 2 +- pkg/sentry/fs/proc/seqfile/BUILD | 10 +- pkg/sentry/fs/proc/seqfile/seqfile.go | 4 +- pkg/sentry/fs/proc/seqfile/seqfile_test.go | 6 +- pkg/sentry/fs/proc/stat.go | 2 +- pkg/sentry/fs/proc/sys.go | 4 +- pkg/sentry/fs/proc/sys_net.go | 4 +- pkg/sentry/fs/proc/sys_net_test.go | 4 +- pkg/sentry/fs/proc/task.go | 4 +- pkg/sentry/fs/proc/uid_gid_map.go | 4 +- pkg/sentry/fs/proc/uptime.go | 4 +- pkg/sentry/fs/proc/version.go | 2 +- pkg/sentry/fs/ramfs/BUILD | 6 +- pkg/sentry/fs/ramfs/dir.go | 2 +- pkg/sentry/fs/ramfs/socket.go | 2 +- pkg/sentry/fs/ramfs/symlink.go | 2 +- pkg/sentry/fs/ramfs/tree.go | 4 +- pkg/sentry/fs/ramfs/tree_test.go | 2 +- pkg/sentry/fs/splice.go | 2 +- pkg/sentry/fs/sys/BUILD | 4 +- pkg/sentry/fs/sys/devices.go | 2 +- pkg/sentry/fs/sys/fs.go | 2 +- pkg/sentry/fs/sys/sys.go | 4 +- pkg/sentry/fs/timerfd/BUILD | 4 +- pkg/sentry/fs/timerfd/timerfd.go | 4 +- pkg/sentry/fs/tmpfs/BUILD | 10 +- pkg/sentry/fs/tmpfs/file_regular.go | 4 +- pkg/sentry/fs/tmpfs/file_test.go | 4 +- pkg/sentry/fs/tmpfs/fs.go | 2 +- pkg/sentry/fs/tmpfs/inode_file.go | 6 +- pkg/sentry/fs/tmpfs/tmpfs.go | 4 +- pkg/sentry/fs/tty/BUILD | 10 +- pkg/sentry/fs/tty/dir.go | 4 +- pkg/sentry/fs/tty/fs.go | 2 +- pkg/sentry/fs/tty/line_discipline.go | 4 +- pkg/sentry/fs/tty/master.go | 4 +- pkg/sentry/fs/tty/queue.go | 6 +- pkg/sentry/fs/tty/slave.go | 4 +- pkg/sentry/fs/tty/terminal.go | 4 +- pkg/sentry/fs/tty/tty_test.go | 4 +- pkg/sentry/fsimpl/ext/BUILD | 12 +- pkg/sentry/fsimpl/ext/benchmark/BUILD | 4 +- pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 4 +- pkg/sentry/fsimpl/ext/directory.go | 2 +- pkg/sentry/fsimpl/ext/ext.go | 2 +- pkg/sentry/fsimpl/ext/ext_test.go | 6 +- pkg/sentry/fsimpl/ext/file_description.go | 2 +- pkg/sentry/fsimpl/ext/filesystem.go | 2 +- pkg/sentry/fsimpl/ext/regular_file.go | 6 +- pkg/sentry/fsimpl/ext/symlink.go | 4 +- pkg/sentry/fsimpl/kernfs/BUILD | 10 
+- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 4 +- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 4 +- pkg/sentry/fsimpl/kernfs/filesystem.go | 2 +- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 6 +- pkg/sentry/fsimpl/kernfs/symlink.go | 2 +- pkg/sentry/fsimpl/proc/BUILD | 12 +- pkg/sentry/fsimpl/proc/filesystem.go | 2 +- pkg/sentry/fsimpl/proc/subtasks.go | 2 +- pkg/sentry/fsimpl/proc/task.go | 2 +- pkg/sentry/fsimpl/proc/task_files.go | 6 +- pkg/sentry/fsimpl/proc/tasks.go | 2 +- pkg/sentry/fsimpl/proc/tasks_files.go | 4 +- pkg/sentry/fsimpl/proc/tasks_net.go | 4 +- pkg/sentry/fsimpl/proc/tasks_sys.go | 2 +- pkg/sentry/fsimpl/proc/tasks_sys_test.go | 2 +- pkg/sentry/fsimpl/proc/tasks_test.go | 4 +- pkg/sentry/fsimpl/sys/BUILD | 2 +- pkg/sentry/fsimpl/sys/sys.go | 2 +- pkg/sentry/fsimpl/testutil/BUILD | 4 +- pkg/sentry/fsimpl/testutil/kernel.go | 2 +- pkg/sentry/fsimpl/testutil/testutil.go | 4 +- pkg/sentry/fsimpl/tmpfs/BUILD | 16 +- pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 4 +- pkg/sentry/fsimpl/tmpfs/directory.go | 2 +- pkg/sentry/fsimpl/tmpfs/filesystem.go | 2 +- pkg/sentry/fsimpl/tmpfs/named_pipe.go | 4 +- pkg/sentry/fsimpl/tmpfs/pipe_test.go | 6 +- pkg/sentry/fsimpl/tmpfs/regular_file.go | 6 +- pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 4 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 2 +- pkg/sentry/hostmm/BUILD | 2 +- pkg/sentry/hostmm/hostmm.go | 2 +- pkg/sentry/inet/BUILD | 2 +- pkg/sentry/inet/context.go | 2 +- pkg/sentry/kernel/BUILD | 12 +- pkg/sentry/kernel/auth/BUILD | 2 +- pkg/sentry/kernel/auth/context.go | 2 +- pkg/sentry/kernel/auth/id_map.go | 2 +- pkg/sentry/kernel/context.go | 2 +- pkg/sentry/kernel/contexttest/BUILD | 4 +- pkg/sentry/kernel/contexttest/contexttest.go | 4 +- pkg/sentry/kernel/epoll/BUILD | 6 +- pkg/sentry/kernel/epoll/epoll.go | 4 +- pkg/sentry/kernel/epoll/epoll_test.go | 2 +- pkg/sentry/kernel/eventfd/BUILD | 8 +- 
pkg/sentry/kernel/eventfd/eventfd.go | 4 +- pkg/sentry/kernel/eventfd/eventfd_test.go | 4 +- pkg/sentry/kernel/fd_table.go | 2 +- pkg/sentry/kernel/fd_table_test.go | 4 +- pkg/sentry/kernel/futex/BUILD | 6 +- pkg/sentry/kernel/futex/futex.go | 2 +- pkg/sentry/kernel/futex/futex_test.go | 2 +- pkg/sentry/kernel/kernel.go | 2 +- pkg/sentry/kernel/pipe/BUILD | 12 +- pkg/sentry/kernel/pipe/buffer.go | 2 +- pkg/sentry/kernel/pipe/buffer_test.go | 2 +- pkg/sentry/kernel/pipe/node.go | 2 +- pkg/sentry/kernel/pipe/node_test.go | 6 +- pkg/sentry/kernel/pipe/pipe.go | 2 +- pkg/sentry/kernel/pipe/pipe_test.go | 4 +- pkg/sentry/kernel/pipe/pipe_util.go | 4 +- pkg/sentry/kernel/pipe/reader_writer.go | 4 +- pkg/sentry/kernel/pipe/vfs.go | 4 +- pkg/sentry/kernel/ptrace.go | 2 +- pkg/sentry/kernel/ptrace_amd64.go | 2 +- pkg/sentry/kernel/ptrace_arm64.go | 2 +- pkg/sentry/kernel/rseq.go | 2 +- pkg/sentry/kernel/seccomp.go | 2 +- pkg/sentry/kernel/semaphore/BUILD | 6 +- pkg/sentry/kernel/semaphore/semaphore.go | 2 +- pkg/sentry/kernel/semaphore/semaphore_test.go | 4 +- pkg/sentry/kernel/shm/BUILD | 4 +- pkg/sentry/kernel/shm/shm.go | 4 +- pkg/sentry/kernel/signalfd/BUILD | 4 +- pkg/sentry/kernel/signalfd/signalfd.go | 4 +- pkg/sentry/kernel/syscalls.go | 2 +- pkg/sentry/kernel/task.go | 4 +- pkg/sentry/kernel/task_clone.go | 2 +- pkg/sentry/kernel/task_context.go | 4 +- pkg/sentry/kernel/task_futex.go | 2 +- pkg/sentry/kernel/task_log.go | 2 +- pkg/sentry/kernel/task_run.go | 2 +- pkg/sentry/kernel/task_signals.go | 2 +- pkg/sentry/kernel/task_start.go | 2 +- pkg/sentry/kernel/task_syscall.go | 2 +- pkg/sentry/kernel/task_usermem.go | 2 +- pkg/sentry/kernel/time/BUILD | 2 +- pkg/sentry/kernel/time/context.go | 2 +- pkg/sentry/kernel/timekeeper_test.go | 4 +- pkg/sentry/kernel/vdso.go | 4 +- pkg/sentry/limits/BUILD | 2 +- pkg/sentry/limits/context.go | 2 +- pkg/sentry/loader/BUILD | 6 +- pkg/sentry/loader/elf.go | 4 +- pkg/sentry/loader/interpreter.go | 4 +- 
pkg/sentry/loader/loader.go | 4 +- pkg/sentry/loader/vdso.go | 6 +- pkg/sentry/memmap/BUILD | 6 +- pkg/sentry/memmap/mapping_set.go | 2 +- pkg/sentry/memmap/mapping_set_test.go | 2 +- pkg/sentry/memmap/memmap.go | 4 +- pkg/sentry/mm/BUILD | 18 +- pkg/sentry/mm/address_space.go | 2 +- pkg/sentry/mm/aio_context.go | 4 +- pkg/sentry/mm/debug.go | 2 +- pkg/sentry/mm/io.go | 6 +- pkg/sentry/mm/lifecycle.go | 4 +- pkg/sentry/mm/metadata.go | 2 +- pkg/sentry/mm/mm.go | 4 +- pkg/sentry/mm/mm_test.go | 6 +- pkg/sentry/mm/pma.go | 8 +- pkg/sentry/mm/procfs.go | 4 +- pkg/sentry/mm/save_restore.go | 2 +- pkg/sentry/mm/shm.go | 4 +- pkg/sentry/mm/special_mappable.go | 4 +- pkg/sentry/mm/syscalls.go | 4 +- pkg/sentry/mm/vma.go | 4 +- pkg/sentry/pgalloc/BUILD | 8 +- pkg/sentry/pgalloc/context.go | 2 +- pkg/sentry/pgalloc/pgalloc.go | 6 +- pkg/sentry/pgalloc/pgalloc_test.go | 2 +- pkg/sentry/pgalloc/save_restore.go | 2 +- pkg/sentry/platform/BUILD | 8 +- pkg/sentry/platform/context.go | 2 +- pkg/sentry/platform/kvm/BUILD | 6 +- pkg/sentry/platform/kvm/address_space.go | 2 +- pkg/sentry/platform/kvm/bluepill.go | 2 +- pkg/sentry/platform/kvm/bluepill_fault.go | 2 +- pkg/sentry/platform/kvm/context.go | 2 +- pkg/sentry/platform/kvm/kvm.go | 2 +- pkg/sentry/platform/kvm/kvm_test.go | 2 +- pkg/sentry/platform/kvm/machine.go | 2 +- pkg/sentry/platform/kvm/machine_amd64.go | 2 +- pkg/sentry/platform/kvm/machine_arm64.go | 2 +- pkg/sentry/platform/kvm/machine_arm64_unsafe.go | 2 +- pkg/sentry/platform/kvm/physical_map.go | 2 +- pkg/sentry/platform/kvm/virtual_map.go | 2 +- pkg/sentry/platform/kvm/virtual_map_test.go | 2 +- pkg/sentry/platform/mmap_min_addr.go | 2 +- pkg/sentry/platform/platform.go | 4 +- pkg/sentry/platform/ptrace/BUILD | 4 +- pkg/sentry/platform/ptrace/ptrace.go | 2 +- pkg/sentry/platform/ptrace/ptrace_unsafe.go | 2 +- pkg/sentry/platform/ptrace/stub_unsafe.go | 4 +- pkg/sentry/platform/ptrace/subprocess.go | 2 +- pkg/sentry/platform/ring0/BUILD | 2 +- 
pkg/sentry/platform/ring0/defs_amd64.go | 2 +- pkg/sentry/platform/ring0/defs_arm64.go | 2 +- pkg/sentry/platform/ring0/gen_offsets/BUILD | 2 +- pkg/sentry/platform/ring0/pagetables/BUILD | 4 +- .../platform/ring0/pagetables/allocator_unsafe.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables.go | 2 +- .../ring0/pagetables/pagetables_aarch64.go | 2 +- .../ring0/pagetables/pagetables_amd64_test.go | 2 +- .../ring0/pagetables/pagetables_arm64_test.go | 2 +- .../platform/ring0/pagetables/pagetables_test.go | 2 +- .../platform/ring0/pagetables/pagetables_x86.go | 2 +- pkg/sentry/platform/safecopy/BUILD | 29 - pkg/sentry/platform/safecopy/LICENSE | 27 - pkg/sentry/platform/safecopy/atomic_amd64.s | 136 ----- pkg/sentry/platform/safecopy/atomic_arm64.s | 126 ----- pkg/sentry/platform/safecopy/memclr_amd64.s | 147 ----- pkg/sentry/platform/safecopy/memclr_arm64.s | 74 --- pkg/sentry/platform/safecopy/memcpy_amd64.s | 250 --------- pkg/sentry/platform/safecopy/memcpy_arm64.s | 78 --- pkg/sentry/platform/safecopy/safecopy.go | 144 ----- pkg/sentry/platform/safecopy/safecopy_test.go | 617 --------------------- pkg/sentry/platform/safecopy/safecopy_unsafe.go | 335 ----------- pkg/sentry/platform/safecopy/sighandler_amd64.s | 133 ----- pkg/sentry/platform/safecopy/sighandler_arm64.s | 143 ----- pkg/sentry/safemem/BUILD | 27 - pkg/sentry/safemem/block_unsafe.go | 279 ---------- pkg/sentry/safemem/io.go | 392 ------------- pkg/sentry/safemem/io_test.go | 199 ------- pkg/sentry/safemem/safemem.go | 16 - pkg/sentry/safemem/seq_test.go | 196 ------- pkg/sentry/safemem/seq_unsafe.go | 299 ---------- pkg/sentry/socket/BUILD | 4 +- pkg/sentry/socket/control/BUILD | 4 +- pkg/sentry/socket/control/control.go | 4 +- pkg/sentry/socket/hostinet/BUILD | 6 +- pkg/sentry/socket/hostinet/socket.go | 6 +- pkg/sentry/socket/hostinet/socket_unsafe.go | 4 +- pkg/sentry/socket/hostinet/stack.go | 4 +- pkg/sentry/socket/netfilter/BUILD | 2 +- pkg/sentry/socket/netfilter/netfilter.go | 2 +- 
pkg/sentry/socket/netlink/BUILD | 4 +- pkg/sentry/socket/netlink/message.go | 2 +- pkg/sentry/socket/netlink/provider.go | 2 +- pkg/sentry/socket/netlink/route/BUILD | 2 +- pkg/sentry/socket/netlink/route/protocol.go | 2 +- pkg/sentry/socket/netlink/socket.go | 4 +- pkg/sentry/socket/netlink/uevent/BUILD | 2 +- pkg/sentry/socket/netlink/uevent/protocol.go | 2 +- pkg/sentry/socket/netstack/BUILD | 6 +- pkg/sentry/socket/netstack/netstack.go | 6 +- pkg/sentry/socket/netstack/provider.go | 2 +- pkg/sentry/socket/socket.go | 4 +- pkg/sentry/socket/unix/BUILD | 6 +- pkg/sentry/socket/unix/io.go | 4 +- pkg/sentry/socket/unix/transport/BUILD | 2 +- pkg/sentry/socket/unix/transport/connectioned.go | 2 +- pkg/sentry/socket/unix/transport/connectionless.go | 2 +- pkg/sentry/socket/unix/transport/unix.go | 2 +- pkg/sentry/socket/unix/unix.go | 4 +- pkg/sentry/strace/BUILD | 2 +- pkg/sentry/strace/poll.go | 2 +- pkg/sentry/strace/select.go | 2 +- pkg/sentry/strace/signal.go | 2 +- pkg/sentry/strace/socket.go | 2 +- pkg/sentry/strace/strace.go | 2 +- pkg/sentry/syscalls/linux/BUILD | 6 +- pkg/sentry/syscalls/linux/linux64_amd64.go | 2 +- pkg/sentry/syscalls/linux/linux64_arm64.go | 2 +- pkg/sentry/syscalls/linux/sigset.go | 2 +- pkg/sentry/syscalls/linux/sys_aio.go | 2 +- pkg/sentry/syscalls/linux/sys_epoll.go | 2 +- pkg/sentry/syscalls/linux/sys_file.go | 4 +- pkg/sentry/syscalls/linux/sys_futex.go | 2 +- pkg/sentry/syscalls/linux/sys_getdents.go | 2 +- pkg/sentry/syscalls/linux/sys_mempolicy.go | 2 +- pkg/sentry/syscalls/linux/sys_mmap.go | 2 +- pkg/sentry/syscalls/linux/sys_mount.go | 2 +- pkg/sentry/syscalls/linux/sys_pipe.go | 2 +- pkg/sentry/syscalls/linux/sys_poll.go | 2 +- pkg/sentry/syscalls/linux/sys_random.go | 4 +- pkg/sentry/syscalls/linux/sys_read.go | 2 +- pkg/sentry/syscalls/linux/sys_rlimit.go | 2 +- pkg/sentry/syscalls/linux/sys_seccomp.go | 2 +- pkg/sentry/syscalls/linux/sys_sem.go | 2 +- pkg/sentry/syscalls/linux/sys_signal.go | 2 +- 
pkg/sentry/syscalls/linux/sys_socket.go | 2 +- pkg/sentry/syscalls/linux/sys_stat.go | 2 +- pkg/sentry/syscalls/linux/sys_stat_amd64.go | 2 +- pkg/sentry/syscalls/linux/sys_stat_arm64.go | 2 +- pkg/sentry/syscalls/linux/sys_thread.go | 2 +- pkg/sentry/syscalls/linux/sys_time.go | 2 +- pkg/sentry/syscalls/linux/sys_timer.go | 2 +- pkg/sentry/syscalls/linux/sys_write.go | 2 +- pkg/sentry/syscalls/linux/sys_xattr.go | 2 +- pkg/sentry/syscalls/linux/timespec.go | 2 +- pkg/sentry/unimpl/BUILD | 2 +- pkg/sentry/unimpl/events.go | 2 +- pkg/sentry/uniqueid/BUILD | 2 +- pkg/sentry/uniqueid/context.go | 2 +- pkg/sentry/usermem/BUILD | 55 -- pkg/sentry/usermem/README.md | 31 -- pkg/sentry/usermem/access_type.go | 128 ----- pkg/sentry/usermem/addr.go | 108 ---- pkg/sentry/usermem/addr_range_seq_test.go | 197 ------- pkg/sentry/usermem/addr_range_seq_unsafe.go | 277 --------- pkg/sentry/usermem/bytes_io.go | 141 ----- pkg/sentry/usermem/bytes_io_unsafe.go | 47 -- pkg/sentry/usermem/usermem.go | 597 -------------------- pkg/sentry/usermem/usermem_arm64.go | 53 -- pkg/sentry/usermem/usermem_test.go | 424 -------------- pkg/sentry/usermem/usermem_unsafe.go | 27 - pkg/sentry/usermem/usermem_x86.go | 38 -- pkg/sentry/vfs/BUILD | 10 +- pkg/sentry/vfs/context.go | 2 +- pkg/sentry/vfs/device.go | 2 +- pkg/sentry/vfs/file_description.go | 4 +- pkg/sentry/vfs/file_description_impl_util.go | 4 +- pkg/sentry/vfs/file_description_impl_util_test.go | 6 +- pkg/sentry/vfs/filesystem.go | 2 +- pkg/sentry/vfs/filesystem_type.go | 2 +- pkg/sentry/vfs/mount.go | 2 +- pkg/sentry/vfs/pathname.go | 2 +- pkg/sentry/vfs/testutil.go | 2 +- pkg/sentry/vfs/vfs.go | 2 +- pkg/usermem/BUILD | 55 ++ pkg/usermem/README.md | 31 ++ pkg/usermem/access_type.go | 128 +++++ pkg/usermem/addr.go | 108 ++++ pkg/usermem/addr_range_seq_test.go | 197 +++++++ pkg/usermem/addr_range_seq_unsafe.go | 277 +++++++++ pkg/usermem/bytes_io.go | 141 +++++ pkg/usermem/bytes_io_unsafe.go | 47 ++ pkg/usermem/usermem.go | 597 
++++++++++++++++++++ pkg/usermem/usermem_arm64.go | 53 ++ pkg/usermem/usermem_test.go | 424 ++++++++++++++ pkg/usermem/usermem_unsafe.go | 27 + pkg/usermem/usermem_x86.go | 38 ++ runsc/boot/BUILD | 6 +- runsc/boot/fds.go | 2 +- runsc/boot/fs.go | 2 +- runsc/boot/loader_test.go | 2 +- runsc/boot/user.go | 4 +- runsc/boot/user_test.go | 2 +- tools/go_marshal/defs.bzl | 4 +- tools/go_marshal/gomarshal/generator.go | 4 +- tools/go_marshal/test/BUILD | 2 +- tools/go_marshal/test/benchmark_test.go | 2 +- 483 files changed, 6839 insertions(+), 6835 deletions(-) create mode 100644 pkg/context/BUILD create mode 100644 pkg/context/context.go create mode 100644 pkg/safecopy/BUILD create mode 100644 pkg/safecopy/LICENSE create mode 100644 pkg/safecopy/atomic_amd64.s create mode 100644 pkg/safecopy/atomic_arm64.s create mode 100644 pkg/safecopy/memclr_amd64.s create mode 100644 pkg/safecopy/memclr_arm64.s create mode 100644 pkg/safecopy/memcpy_amd64.s create mode 100644 pkg/safecopy/memcpy_arm64.s create mode 100644 pkg/safecopy/safecopy.go create mode 100644 pkg/safecopy/safecopy_test.go create mode 100644 pkg/safecopy/safecopy_unsafe.go create mode 100644 pkg/safecopy/sighandler_amd64.s create mode 100644 pkg/safecopy/sighandler_arm64.s create mode 100644 pkg/safemem/BUILD create mode 100644 pkg/safemem/block_unsafe.go create mode 100644 pkg/safemem/io.go create mode 100644 pkg/safemem/io_test.go create mode 100644 pkg/safemem/safemem.go create mode 100644 pkg/safemem/seq_test.go create mode 100644 pkg/safemem/seq_unsafe.go delete mode 100644 pkg/sentry/context/BUILD delete mode 100644 pkg/sentry/context/context.go delete mode 100644 pkg/sentry/context/contexttest/BUILD delete mode 100644 pkg/sentry/context/contexttest/contexttest.go create mode 100644 pkg/sentry/contexttest/BUILD create mode 100644 pkg/sentry/contexttest/contexttest.go delete mode 100644 pkg/sentry/platform/safecopy/BUILD delete mode 100644 pkg/sentry/platform/safecopy/LICENSE delete mode 100644 
pkg/sentry/platform/safecopy/atomic_amd64.s delete mode 100644 pkg/sentry/platform/safecopy/atomic_arm64.s delete mode 100644 pkg/sentry/platform/safecopy/memclr_amd64.s delete mode 100644 pkg/sentry/platform/safecopy/memclr_arm64.s delete mode 100644 pkg/sentry/platform/safecopy/memcpy_amd64.s delete mode 100644 pkg/sentry/platform/safecopy/memcpy_arm64.s delete mode 100644 pkg/sentry/platform/safecopy/safecopy.go delete mode 100644 pkg/sentry/platform/safecopy/safecopy_test.go delete mode 100644 pkg/sentry/platform/safecopy/safecopy_unsafe.go delete mode 100644 pkg/sentry/platform/safecopy/sighandler_amd64.s delete mode 100644 pkg/sentry/platform/safecopy/sighandler_arm64.s delete mode 100644 pkg/sentry/safemem/BUILD delete mode 100644 pkg/sentry/safemem/block_unsafe.go delete mode 100644 pkg/sentry/safemem/io.go delete mode 100644 pkg/sentry/safemem/io_test.go delete mode 100644 pkg/sentry/safemem/safemem.go delete mode 100644 pkg/sentry/safemem/seq_test.go delete mode 100644 pkg/sentry/safemem/seq_unsafe.go delete mode 100644 pkg/sentry/usermem/BUILD delete mode 100644 pkg/sentry/usermem/README.md delete mode 100644 pkg/sentry/usermem/access_type.go delete mode 100644 pkg/sentry/usermem/addr.go delete mode 100644 pkg/sentry/usermem/addr_range_seq_test.go delete mode 100644 pkg/sentry/usermem/addr_range_seq_unsafe.go delete mode 100644 pkg/sentry/usermem/bytes_io.go delete mode 100644 pkg/sentry/usermem/bytes_io_unsafe.go delete mode 100644 pkg/sentry/usermem/usermem.go delete mode 100644 pkg/sentry/usermem/usermem_arm64.go delete mode 100644 pkg/sentry/usermem/usermem_test.go delete mode 100644 pkg/sentry/usermem/usermem_unsafe.go delete mode 100644 pkg/sentry/usermem/usermem_x86.go create mode 100644 pkg/usermem/BUILD create mode 100644 pkg/usermem/README.md create mode 100644 pkg/usermem/access_type.go create mode 100644 pkg/usermem/addr.go create mode 100644 pkg/usermem/addr_range_seq_test.go create mode 100644 pkg/usermem/addr_range_seq_unsafe.go create 
mode 100644 pkg/usermem/bytes_io.go create mode 100644 pkg/usermem/bytes_io_unsafe.go create mode 100644 pkg/usermem/usermem.go create mode 100644 pkg/usermem/usermem_arm64.go create mode 100644 pkg/usermem/usermem_test.go create mode 100644 pkg/usermem/usermem_unsafe.go create mode 100644 pkg/usermem/usermem_x86.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/abi/abi.go b/pkg/abi/abi.go index d56c481c9..e6be93c3a 100644 --- a/pkg/abi/abi.go +++ b/pkg/abi/abi.go @@ -39,3 +39,7 @@ func (o OS) String() string { return fmt.Sprintf("OS(%d)", o) } } + +// ABI is an interface that defines OS-specific interactions. +type ABI interface { +} diff --git a/pkg/context/BUILD b/pkg/context/BUILD new file mode 100644 index 000000000..239f31149 --- /dev/null +++ b/pkg/context/BUILD @@ -0,0 +1,13 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "context", + srcs = ["context.go"], + visibility = ["//:sandbox"], + deps = [ + "//pkg/amutex", + "//pkg/log", + ], +) diff --git a/pkg/context/context.go b/pkg/context/context.go new file mode 100644 index 000000000..23e009ef3 --- /dev/null +++ b/pkg/context/context.go @@ -0,0 +1,141 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package context defines an internal context type. +// +// The given Context conforms to the standard Go context, but mandates +// additional methods that are specific to the kernel internals. 
Note however, +// that the Context described by this package carries additional constraints +// regarding concurrent access and retaining beyond the scope of a call. +// +// See the Context type for complete details. +package context + +import ( + "context" + "time" + + "gvisor.dev/gvisor/pkg/amutex" + "gvisor.dev/gvisor/pkg/log" +) + +type contextID int + +// Globally accessible values from a context. These keys are defined in the +// context package to resolve dependency cycles by not requiring the caller to +// import packages usually required to get this information. +const ( + // CtxThreadGroupID is the current thread group ID when a context represents + // a task context. The value is represented as an int32. + CtxThreadGroupID contextID = iota +) + +// ThreadGroupIDFromContext returns the current thread group ID when ctx +// represents a task context. +func ThreadGroupIDFromContext(ctx Context) (tgid int32, ok bool) { + if tgid := ctx.Value(CtxThreadGroupID); tgid != nil { + return tgid.(int32), true + } + return 0, false +} + +// A Context represents a thread of execution (hereafter "goroutine" to reflect +// Go idiosyncrasy). It carries state associated with the goroutine across API +// boundaries. +// +// While Context exists for essentially the same reasons as Go's standard +// context.Context, the standard type represents the state of an operation +// rather than that of a goroutine. This is a critical distinction: +// +// - Unlike context.Context, which "may be passed to functions running in +// different goroutines", it is *not safe* to use the same Context in multiple +// concurrent goroutines. +// +// - It is *not safe* to retain a Context passed to a function beyond the scope +// of that function call. +// +// In both cases, values extracted from the Context should be used instead.
+type Context interface { + log.Logger + amutex.Sleeper + context.Context + + // UninterruptibleSleepStart indicates the beginning of an uninterruptible + // sleep state (equivalent to Linux's TASK_UNINTERRUPTIBLE). If deactivate + // is true and the Context represents a Task, the Task's AddressSpace is + // deactivated. + UninterruptibleSleepStart(deactivate bool) + + // UninterruptibleSleepFinish indicates the end of an uninterruptible sleep + // state that was begun by a previous call to UninterruptibleSleepStart. If + // activate is true and the Context represents a Task, the Task's + // AddressSpace is activated. Normally activate is the same value as the + // deactivate parameter passed to UninterruptibleSleepStart. + UninterruptibleSleepFinish(activate bool) +} + +// NoopSleeper is a noop implementation of amutex.Sleeper and UninterruptibleSleep +// methods for anonymous embedding in other types that do not implement sleeps. +type NoopSleeper struct { + amutex.NoopSleeper +} + +// UninterruptibleSleepStart does nothing. +func (NoopSleeper) UninterruptibleSleepStart(bool) {} + +// UninterruptibleSleepFinish does nothing. +func (NoopSleeper) UninterruptibleSleepFinish(bool) {} + +// Deadline returns zero values, meaning no deadline. +func (NoopSleeper) Deadline() (time.Time, bool) { + return time.Time{}, false +} + +// Done returns nil. +func (NoopSleeper) Done() <-chan struct{} { + return nil +} + +// Err returns nil. +func (NoopSleeper) Err() error { + return nil +} + +// logContext implements basic logging. +type logContext struct { + log.Logger + NoopSleeper +} + +// Value implements Context.Value. +func (logContext) Value(key interface{}) interface{} { + return nil +} + +// bgContext is the context returned by context.Background. +var bgContext = &logContext{Logger: log.Log()} + +// Background returns an empty context using the default logger. +// +// Users should be wary of using a Background context. 
Please tag any use with +// FIXME(b/38173783) and a note to remove this use. +// +// Generally, one should use the Task as their context when available, or avoid +// having to use a context in places where a Task is unavailable. +// +// Using a Background context for tests is fine, as long as no values are +// needed from the context in the tested code paths. +func Background() Context { + return bgContext +} diff --git a/pkg/safecopy/BUILD b/pkg/safecopy/BUILD new file mode 100644 index 000000000..426ef30c9 --- /dev/null +++ b/pkg/safecopy/BUILD @@ -0,0 +1,29 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "safecopy", + srcs = [ + "atomic_amd64.s", + "atomic_arm64.s", + "memclr_amd64.s", + "memclr_arm64.s", + "memcpy_amd64.s", + "memcpy_arm64.s", + "safecopy.go", + "safecopy_unsafe.go", + "sighandler_amd64.s", + "sighandler_arm64.s", + ], + visibility = ["//:sandbox"], + deps = ["//pkg/syserror"], +) + +go_test( + name = "safecopy_test", + srcs = [ + "safecopy_test.go", + ], + library = ":safecopy", +) diff --git a/pkg/safecopy/LICENSE b/pkg/safecopy/LICENSE new file mode 100644 index 000000000..6a66aea5e --- /dev/null +++ b/pkg/safecopy/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pkg/safecopy/atomic_amd64.s b/pkg/safecopy/atomic_amd64.s new file mode 100644 index 000000000..a0cd78f33 --- /dev/null +++ b/pkg/safecopy/atomic_amd64.s @@ -0,0 +1,136 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// handleSwapUint32Fault returns the value stored in DI. Control is transferred +// to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal +// number stored in DI. +// +// It must have the same frame configuration as swapUint32 so that it can undo +// any potential call frame set up by the assembler. 
+TEXT handleSwapUint32Fault(SB), NOSPLIT, $0-24 + MOVL DI, sig+20(FP) + RET + +// swapUint32 atomically stores new into *addr and returns (the previous *addr +// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the +// value of old is unspecified, and sig is the number of the signal that was +// received. +// +// Preconditions: addr must be aligned to a 4-byte boundary. +// +//func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) +TEXT ·swapUint32(SB), NOSPLIT, $0-24 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleSwapUint32Fault will store a different value in this address. + MOVL $0, sig+20(FP) + + MOVQ addr+0(FP), DI + MOVL new+8(FP), AX + XCHGL AX, 0(DI) + MOVL AX, old+16(FP) + RET + +// handleSwapUint64Fault returns the value stored in DI. Control is transferred +// to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal +// number stored in DI. +// +// It must have the same frame configuration as swapUint64 so that it can undo +// any potential call frame set up by the assembler. +TEXT handleSwapUint64Fault(SB), NOSPLIT, $0-28 + MOVL DI, sig+24(FP) + RET + +// swapUint64 atomically stores new into *addr and returns (the previous *addr +// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the +// value of old is unspecified, and sig is the number of the signal that was +// received. +// +// Preconditions: addr must be aligned to a 8-byte boundary. +// +//func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) +TEXT ·swapUint64(SB), NOSPLIT, $0-28 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleSwapUint64Fault will store a different value in this address. 
+ MOVL $0, sig+24(FP) + + MOVQ addr+0(FP), DI + MOVQ new+8(FP), AX + XCHGQ AX, 0(DI) + MOVQ AX, old+16(FP) + RET + +// handleCompareAndSwapUint32Fault returns the value stored in DI. Control is +// transferred to it when compareAndSwapUint32 below receives SIGSEGV or SIGBUS, +// with the signal number stored in DI. +// +// It must have the same frame configuration as compareAndSwapUint32 so that it +// can undo any potential call frame set up by the assembler. +TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT, $0-24 + MOVL DI, sig+20(FP) + RET + +// compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns +// (the value previously stored at addr, 0). If a SIGSEGV or SIGBUS signal is +// received during the operation, the value of prev is unspecified, and sig is +// the number of the signal that was received. +// +// Preconditions: addr must be aligned to a 4-byte boundary. +// +//func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) +TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24 + // Store 0 as the returned signal number. If we run to completion, this is + // the value the caller will see; if a signal is received, + // handleCompareAndSwapUint32Fault will store a different value in this + // address. + MOVL $0, sig+20(FP) + + MOVQ addr+0(FP), DI + MOVL old+8(FP), AX + MOVL new+12(FP), DX + LOCK + CMPXCHGL DX, 0(DI) + MOVL AX, prev+16(FP) + RET + +// handleLoadUint32Fault returns the value stored in DI. Control is transferred +// to it when loadUint32 below receives SIGSEGV or SIGBUS, with the signal +// number stored in DI. +// +// It must have the same frame configuration as loadUint32 so that it can undo +// any potential call frame set up by the assembler. +TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16 + MOVL DI, sig+12(FP) + RET + +// loadUint32 atomically loads *addr and returns it.
If a SIGSEGV or SIGBUS +// signal is received, the value returned is unspecified, and sig is the number +// of the signal that was received. +// +// Preconditions: addr must be aligned to a 4-byte boundary. +// +//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) +TEXT ·loadUint32(SB), NOSPLIT, $0-16 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleLoadUint32Fault will store a different value in this address. + MOVL $0, sig+12(FP) + + MOVQ addr+0(FP), AX + MOVL (AX), BX + MOVL BX, val+8(FP) + RET diff --git a/pkg/safecopy/atomic_arm64.s b/pkg/safecopy/atomic_arm64.s new file mode 100644 index 000000000..d58ed71f7 --- /dev/null +++ b/pkg/safecopy/atomic_arm64.s @@ -0,0 +1,126 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// handleSwapUint32Fault returns the value stored in R1. Control is transferred +// to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal +// number stored in R1. +// +// It must have the same frame configuration as swapUint32 so that it can undo +// any potential call frame set up by the assembler. +TEXT handleSwapUint32Fault(SB), NOSPLIT, $0-24 + MOVW R1, sig+20(FP) + RET + +// See the corresponding doc in safecopy_unsafe.go +// +// The code is derived from Go source runtime/internal/atomic.Xchg. +// +//func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) +TEXT ·swapUint32(SB), NOSPLIT, $0-24 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleSwapUint32Fault will store a different value in this address. 
+ MOVW $0, sig+20(FP) +again: + MOVD addr+0(FP), R0 + MOVW new+8(FP), R1 + LDAXRW (R0), R2 + STLXRW R1, (R0), R3 + CBNZ R3, again + MOVW R2, old+16(FP) + RET + +// handleSwapUint64Fault returns the value stored in R1. Control is transferred +// to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal +// number stored in R1. +// +// It must have the same frame configuration as swapUint64 so that it can undo +// any potential call frame set up by the assembler. +TEXT handleSwapUint64Fault(SB), NOSPLIT, $0-28 + MOVW R1, sig+24(FP) + RET + +// See the corresponding doc in safecopy_unsafe.go +// +// The code is derived from Go source runtime/internal/atomic.Xchg64. +// +//func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) +TEXT ·swapUint64(SB), NOSPLIT, $0-28 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleSwapUint64Fault will store a different value in this address. + MOVW $0, sig+24(FP) +again: + MOVD addr+0(FP), R0 + MOVD new+8(FP), R1 + LDAXR (R0), R2 + STLXR R1, (R0), R3 + CBNZ R3, again + MOVD R2, old+16(FP) + RET + +// handleCompareAndSwapUint32Fault returns the value stored in R1. Control is +// transferred to it when compareAndSwapUint32 below receives SIGSEGV or SIGBUS, +// with the signal number stored in R1. +// +// It must have the same frame configuration as compareAndSwapUint32 so that it +// can undo any potential call frame set up by the assembler. +TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT, $0-24 + MOVW R1, sig+20(FP) + RET + +// See the corresponding doc in safecopy_unsafe.go +// +// The code is derived from Go source runtime/internal/atomic.Cas. +// +//func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) +TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24 + // Store 0 as the returned signal number. 
If we run to completion, this is + // the value the caller will see; if a signal is received, + // handleCompareAndSwapUint32Fault will store a different value in this + // address. + MOVW $0, sig+20(FP) + + MOVD addr+0(FP), R0 + MOVW old+8(FP), R1 + MOVW new+12(FP), R2 +again: + LDAXRW (R0), R3 + CMPW R1, R3 + BNE done + STLXRW R2, (R0), R4 + CBNZ R4, again +done: + MOVW R3, prev+16(FP) + RET + +// handleLoadUint32Fault returns the value stored in R1. Control is transferred +// to it when loadUint32 below receives SIGSEGV or SIGBUS, with the signal +// number stored in R1. +// +// It must have the same frame configuration as loadUint32 so that it can undo +// any potential call frame set up by the assembler. +TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16 + MOVW R1, sig+12(FP) + RET + +// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS +// signal is received, the value returned is unspecified, and sig is the number +// of the signal that was received. +// +// Preconditions: addr must be aligned to a 4-byte boundary. +// +//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) +TEXT ·loadUint32(SB), NOSPLIT, $0-16 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleLoadUint32Fault will store a different value in this address. + MOVW $0, sig+12(FP) + + MOVD addr+0(FP), R0 + LDARW (R0), R1 + MOVW R1, val+8(FP) + RET diff --git a/pkg/safecopy/memclr_amd64.s b/pkg/safecopy/memclr_amd64.s new file mode 100644 index 000000000..64cf32f05 --- /dev/null +++ b/pkg/safecopy/memclr_amd64.s @@ -0,0 +1,147 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// handleMemclrFault returns (the value stored in AX, the value stored in DI).
+// Control is transferred to it when memclr below receives SIGSEGV or SIGBUS, +// with the faulting address stored in AX and the signal number stored in DI. +// +// It must have the same frame configuration as memclr so that it can undo any +// potential call frame set up by the assembler. +TEXT handleMemclrFault(SB), NOSPLIT, $0-28 + MOVQ AX, addr+16(FP) + MOVL DI, sig+24(FP) + RET + +// memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS +// signal is received during the write, it returns the address that caused the +// fault and the number of the signal that was received. Otherwise, it returns +// an unspecified address and a signal number of 0. +// +// Data is written in order, such that if a fault happens at address p, it is +// safe to assume that all data before p-maxRegisterSize has already been +// successfully written. +// +// The code is derived from runtime.memclrNoHeapPointers. +// +// func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) +TEXT ·memclr(SB), NOSPLIT, $0-28 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleMemclrFault will store a different value in this address. + MOVL $0, sig+24(FP) + + MOVQ ptr+0(FP), DI + MOVQ n+8(FP), BX + XORQ AX, AX + + // MOVOU seems always faster than REP STOSQ. +tail: + TESTQ BX, BX + JEQ _0 + CMPQ BX, $2 + JBE _1or2 + CMPQ BX, $4 + JBE _3or4 + CMPQ BX, $8 + JB _5through7 + JE _8 + CMPQ BX, $16 + JBE _9through16 + PXOR X0, X0 + CMPQ BX, $32 + JBE _17through32 + CMPQ BX, $64 + JBE _33through64 + CMPQ BX, $128 + JBE _65through128 + CMPQ BX, $256 + JBE _129through256 + // TODO: use branch table and BSR to make this just a single dispatch + // TODO: for really big clears, use MOVNTDQ, even without AVX2. 
+ +loop: + MOVOU X0, 0(DI) + MOVOU X0, 16(DI) + MOVOU X0, 32(DI) + MOVOU X0, 48(DI) + MOVOU X0, 64(DI) + MOVOU X0, 80(DI) + MOVOU X0, 96(DI) + MOVOU X0, 112(DI) + MOVOU X0, 128(DI) + MOVOU X0, 144(DI) + MOVOU X0, 160(DI) + MOVOU X0, 176(DI) + MOVOU X0, 192(DI) + MOVOU X0, 208(DI) + MOVOU X0, 224(DI) + MOVOU X0, 240(DI) + SUBQ $256, BX + ADDQ $256, DI + CMPQ BX, $256 + JAE loop + JMP tail + +_1or2: + MOVB AX, (DI) + MOVB AX, -1(DI)(BX*1) + RET +_0: + RET +_3or4: + MOVW AX, (DI) + MOVW AX, -2(DI)(BX*1) + RET +_5through7: + MOVL AX, (DI) + MOVL AX, -4(DI)(BX*1) + RET +_8: + // We need a separate case for 8 to make sure we clear pointers atomically. + MOVQ AX, (DI) + RET +_9through16: + MOVQ AX, (DI) + MOVQ AX, -8(DI)(BX*1) + RET +_17through32: + MOVOU X0, (DI) + MOVOU X0, -16(DI)(BX*1) + RET +_33through64: + MOVOU X0, (DI) + MOVOU X0, 16(DI) + MOVOU X0, -32(DI)(BX*1) + MOVOU X0, -16(DI)(BX*1) + RET +_65through128: + MOVOU X0, (DI) + MOVOU X0, 16(DI) + MOVOU X0, 32(DI) + MOVOU X0, 48(DI) + MOVOU X0, -64(DI)(BX*1) + MOVOU X0, -48(DI)(BX*1) + MOVOU X0, -32(DI)(BX*1) + MOVOU X0, -16(DI)(BX*1) + RET +_129through256: + MOVOU X0, (DI) + MOVOU X0, 16(DI) + MOVOU X0, 32(DI) + MOVOU X0, 48(DI) + MOVOU X0, 64(DI) + MOVOU X0, 80(DI) + MOVOU X0, 96(DI) + MOVOU X0, 112(DI) + MOVOU X0, -128(DI)(BX*1) + MOVOU X0, -112(DI)(BX*1) + MOVOU X0, -96(DI)(BX*1) + MOVOU X0, -80(DI)(BX*1) + MOVOU X0, -64(DI)(BX*1) + MOVOU X0, -48(DI)(BX*1) + MOVOU X0, -32(DI)(BX*1) + MOVOU X0, -16(DI)(BX*1) + RET diff --git a/pkg/safecopy/memclr_arm64.s b/pkg/safecopy/memclr_arm64.s new file mode 100644 index 000000000..7361b9067 --- /dev/null +++ b/pkg/safecopy/memclr_arm64.s @@ -0,0 +1,74 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// handleMemclrFault returns (the value stored in R0, the value stored in R1). 
+// Control is transferred to it when memclr below receives SIGSEGV or SIGBUS, +// with the faulting address stored in R0 and the signal number stored in R1. +// +// It must have the same frame configuration as memclr so that it can undo any +// potential call frame set up by the assembler. +TEXT handleMemclrFault(SB), NOSPLIT, $0-28 + MOVD R0, addr+16(FP) + MOVW R1, sig+24(FP) + RET + +// See the corresponding doc in safecopy_unsafe.go +// +// The code is derived from runtime.memclrNoHeapPointers. +// +// func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) +TEXT ·memclr(SB), NOSPLIT, $0-28 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleMemclrFault will store a different value in this address. + MOVW $0, sig+24(FP) + MOVD ptr+0(FP), R0 + MOVD n+8(FP), R1 + + // If size is less than 16 bytes, use tail_zero to zero what remains + CMP $16, R1 + BLT tail_zero + // Get buffer offset into 16 byte aligned address for better performance + ANDS $15, R0, ZR + BNE unaligned_to_16 +aligned_to_16: + LSR $4, R1, R2 +zero_by_16: + STP.P (ZR, ZR), 16(R0) // Store pair with post index. 
+ SUBS $1, R2, R2 + BNE zero_by_16 + ANDS $15, R1, R1 + BEQ end + + // Zero buffer with size=R1 < 16 +tail_zero: + TBZ $3, R1, tail_zero_4 + MOVD.P ZR, 8(R0) +tail_zero_4: + TBZ $2, R1, tail_zero_2 + MOVW.P ZR, 4(R0) +tail_zero_2: + TBZ $1, R1, tail_zero_1 + MOVH.P ZR, 2(R0) +tail_zero_1: + TBZ $0, R1, end + MOVB ZR, (R0) +end: + RET + +unaligned_to_16: + MOVD R0, R2 +head_loop: + MOVBU.P ZR, 1(R0) + ANDS $15, R0, ZR + BNE head_loop + // Adjust length for what remains + SUB R2, R0, R3 + SUB R3, R1 + // If size is less than 16 bytes, use tail_zero to zero what remains + CMP $16, R1 + BLT tail_zero + B aligned_to_16 diff --git a/pkg/safecopy/memcpy_amd64.s b/pkg/safecopy/memcpy_amd64.s new file mode 100644 index 000000000..129691d68 --- /dev/null +++ b/pkg/safecopy/memcpy_amd64.s @@ -0,0 +1,250 @@ +// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. +// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. +// Portions Copyright 2009 The Go Authors. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "textflag.h" + +// handleMemcpyFault returns (the value stored in AX, the value stored in DI). +// Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS, +// with the faulting address stored in AX and the signal number stored in DI. +// +// It must have the same frame configuration as memcpy so that it can undo any +// potential call frame set up by the assembler. +TEXT handleMemcpyFault(SB), NOSPLIT, $0-36 + MOVQ AX, addr+24(FP) + MOVL DI, sig+32(FP) + RET + +// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received +// during the copy, it returns the address that caused the fault and the number +// of the signal that was received. Otherwise, it returns an unspecified address +// and a signal number of 0. +// +// Data is copied in order, such that if a fault happens at address p, it is +// safe to assume that all data before p-maxRegisterSize has already been +// successfully copied. +// +// The code is derived from the forward copying part of runtime.memmove. +// +// func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) +TEXT ·memcpy(SB), NOSPLIT, $0-36 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleMemcpyFault will store a different value in this address. + MOVL $0, sig+32(FP) + + MOVQ to+0(FP), DI + MOVQ from+8(FP), SI + MOVQ n+16(FP), BX + + // REP instructions have a high startup cost, so we handle small sizes + // with some straightline code. The REP MOVSQ instruction is really fast + // for large sizes. The cutover is approximately 2K. 
+tail: + // move_129through256 or smaller work whether or not the source and the + // destination memory regions overlap because they load all data into + // registers before writing it back. move_256through2048 on the other + // hand can be used only when the memory regions don't overlap or the copy + // direction is forward. + TESTQ BX, BX + JEQ move_0 + CMPQ BX, $2 + JBE move_1or2 + CMPQ BX, $4 + JBE move_3or4 + CMPQ BX, $8 + JB move_5through7 + JE move_8 + CMPQ BX, $16 + JBE move_9through16 + CMPQ BX, $32 + JBE move_17through32 + CMPQ BX, $64 + JBE move_33through64 + CMPQ BX, $128 + JBE move_65through128 + CMPQ BX, $256 + JBE move_129through256 + // TODO: use branch table and BSR to make this just a single dispatch + +/* + * forward copy loop + */ + CMPQ BX, $2048 + JLS move_256through2048 + + // Check alignment + MOVL SI, AX + ORL DI, AX + TESTL $7, AX + JEQ fwdBy8 + + // Do 1 byte at a time + MOVQ BX, CX + REP; MOVSB + RET + +fwdBy8: + // Do 8 bytes at a time + MOVQ BX, CX + SHRQ $3, CX + ANDQ $7, BX + REP; MOVSQ + JMP tail + +move_1or2: + MOVB (SI), AX + MOVB AX, (DI) + MOVB -1(SI)(BX*1), CX + MOVB CX, -1(DI)(BX*1) + RET +move_0: + RET +move_3or4: + MOVW (SI), AX + MOVW AX, (DI) + MOVW -2(SI)(BX*1), CX + MOVW CX, -2(DI)(BX*1) + RET +move_5through7: + MOVL (SI), AX + MOVL AX, (DI) + MOVL -4(SI)(BX*1), CX + MOVL CX, -4(DI)(BX*1) + RET +move_8: + // We need a separate case for 8 to make sure we write pointers atomically. 
+ MOVQ (SI), AX + MOVQ AX, (DI) + RET +move_9through16: + MOVQ (SI), AX + MOVQ AX, (DI) + MOVQ -8(SI)(BX*1), CX + MOVQ CX, -8(DI)(BX*1) + RET +move_17through32: + MOVOU (SI), X0 + MOVOU X0, (DI) + MOVOU -16(SI)(BX*1), X1 + MOVOU X1, -16(DI)(BX*1) + RET +move_33through64: + MOVOU (SI), X0 + MOVOU X0, (DI) + MOVOU 16(SI), X1 + MOVOU X1, 16(DI) + MOVOU -32(SI)(BX*1), X2 + MOVOU X2, -32(DI)(BX*1) + MOVOU -16(SI)(BX*1), X3 + MOVOU X3, -16(DI)(BX*1) + RET +move_65through128: + MOVOU (SI), X0 + MOVOU X0, (DI) + MOVOU 16(SI), X1 + MOVOU X1, 16(DI) + MOVOU 32(SI), X2 + MOVOU X2, 32(DI) + MOVOU 48(SI), X3 + MOVOU X3, 48(DI) + MOVOU -64(SI)(BX*1), X4 + MOVOU X4, -64(DI)(BX*1) + MOVOU -48(SI)(BX*1), X5 + MOVOU X5, -48(DI)(BX*1) + MOVOU -32(SI)(BX*1), X6 + MOVOU X6, -32(DI)(BX*1) + MOVOU -16(SI)(BX*1), X7 + MOVOU X7, -16(DI)(BX*1) + RET +move_129through256: + MOVOU (SI), X0 + MOVOU X0, (DI) + MOVOU 16(SI), X1 + MOVOU X1, 16(DI) + MOVOU 32(SI), X2 + MOVOU X2, 32(DI) + MOVOU 48(SI), X3 + MOVOU X3, 48(DI) + MOVOU 64(SI), X4 + MOVOU X4, 64(DI) + MOVOU 80(SI), X5 + MOVOU X5, 80(DI) + MOVOU 96(SI), X6 + MOVOU X6, 96(DI) + MOVOU 112(SI), X7 + MOVOU X7, 112(DI) + MOVOU -128(SI)(BX*1), X8 + MOVOU X8, -128(DI)(BX*1) + MOVOU -112(SI)(BX*1), X9 + MOVOU X9, -112(DI)(BX*1) + MOVOU -96(SI)(BX*1), X10 + MOVOU X10, -96(DI)(BX*1) + MOVOU -80(SI)(BX*1), X11 + MOVOU X11, -80(DI)(BX*1) + MOVOU -64(SI)(BX*1), X12 + MOVOU X12, -64(DI)(BX*1) + MOVOU -48(SI)(BX*1), X13 + MOVOU X13, -48(DI)(BX*1) + MOVOU -32(SI)(BX*1), X14 + MOVOU X14, -32(DI)(BX*1) + MOVOU -16(SI)(BX*1), X15 + MOVOU X15, -16(DI)(BX*1) + RET +move_256through2048: + SUBQ $256, BX + MOVOU (SI), X0 + MOVOU X0, (DI) + MOVOU 16(SI), X1 + MOVOU X1, 16(DI) + MOVOU 32(SI), X2 + MOVOU X2, 32(DI) + MOVOU 48(SI), X3 + MOVOU X3, 48(DI) + MOVOU 64(SI), X4 + MOVOU X4, 64(DI) + MOVOU 80(SI), X5 + MOVOU X5, 80(DI) + MOVOU 96(SI), X6 + MOVOU X6, 96(DI) + MOVOU 112(SI), X7 + MOVOU X7, 112(DI) + MOVOU 128(SI), X8 + MOVOU X8, 128(DI) + MOVOU 144(SI), X9 + 
MOVOU X9, 144(DI) + MOVOU 160(SI), X10 + MOVOU X10, 160(DI) + MOVOU 176(SI), X11 + MOVOU X11, 176(DI) + MOVOU 192(SI), X12 + MOVOU X12, 192(DI) + MOVOU 208(SI), X13 + MOVOU X13, 208(DI) + MOVOU 224(SI), X14 + MOVOU X14, 224(DI) + MOVOU 240(SI), X15 + MOVOU X15, 240(DI) + CMPQ BX, $256 + LEAQ 256(SI), SI + LEAQ 256(DI), DI + JGE move_256through2048 + JMP tail diff --git a/pkg/safecopy/memcpy_arm64.s b/pkg/safecopy/memcpy_arm64.s new file mode 100644 index 000000000..e7e541565 --- /dev/null +++ b/pkg/safecopy/memcpy_arm64.s @@ -0,0 +1,78 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// handleMemcpyFault returns (the value stored in R0, the value stored in R1). +// Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS, +// with the faulting address stored in R0 and the signal number stored in R1. +// +// It must have the same frame configuration as memcpy so that it can undo any +// potential call frame set up by the assembler. +TEXT handleMemcpyFault(SB), NOSPLIT, $0-36 + MOVD R0, addr+24(FP) + MOVW R1, sig+32(FP) + RET + +// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received +// during the copy, it returns the address that caused the fault and the number +// of the signal that was received. Otherwise, it returns an unspecified address +// and a signal number of 0. +// +// Data is copied in order, such that if a fault happens at address p, it is +// safe to assume that all data before p-maxRegisterSize has already been +// successfully copied. +// +// The code is derived from the Go source runtime.memmove. +// +// func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) +TEXT ·memcpy(SB), NOSPLIT, $-8-36 + // Store 0 as the returned signal number. 
If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleMemcpyFault will store a different value in this address. + MOVW $0, sig+32(FP) + + MOVD to+0(FP), R3 + MOVD from+8(FP), R4 + MOVD n+16(FP), R5 + CMP $0, R5 + BNE check + RET + +check: + AND $~7, R5, R7 // R7 is N&~7. + SUB R7, R5, R6 // R6 is N&7. + + // Copying forward proceeds by copying R7/8 words then copying R6 bytes. + // R3 and R4 are advanced as we copy. + + // (There may be implementations of armv8 where copying by bytes until + // at least one of source or dest is word aligned is a worthwhile + // optimization, but on the one tested so far (xgene) it did not + // make a significant difference.) + + CMP $0, R7 // Do we need to do any word-by-word copying? + BEQ noforwardlarge + ADD R3, R7, R9 // R9 points just past where we copy by word. + +forwardlargeloop: + MOVD.P 8(R4), R8 // R8 is just a scratch register. + MOVD.P R8, 8(R3) + CMP R3, R9 + BNE forwardlargeloop + +noforwardlarge: + CMP $0, R6 // Do we need to do any byte-by-byte copying? + BNE forwardtail + RET + +forwardtail: + ADD R3, R6, R9 // R9 points just past the destination memory. + +forwardtailloop: + MOVBU.P 1(R4), R8 + MOVBU.P R8, 1(R3) + CMP R3, R9 + BNE forwardtailloop + RET diff --git a/pkg/safecopy/safecopy.go b/pkg/safecopy/safecopy.go new file mode 100644 index 000000000..2fb7e5809 --- /dev/null +++ b/pkg/safecopy/safecopy.go @@ -0,0 +1,144 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Package safecopy provides an efficient implementation of functions to access +// memory that may result in SIGSEGV or SIGBUS being sent to the accessor. +package safecopy + +import ( + "fmt" + "reflect" + "runtime" + "syscall" + + "gvisor.dev/gvisor/pkg/syserror" +) + +// SegvError is returned when a safecopy function receives SIGSEGV. +type SegvError struct { + // Addr is the address at which the SIGSEGV occurred. + Addr uintptr +} + +// Error implements error.Error. +func (e SegvError) Error() string { + return fmt.Sprintf("SIGSEGV at %#x", e.Addr) +} + +// BusError is returned when a safecopy function receives SIGBUS. +type BusError struct { + // Addr is the address at which the SIGBUS occurred. + Addr uintptr +} + +// Error implements error.Error. +func (e BusError) Error() string { + return fmt.Sprintf("SIGBUS at %#x", e.Addr) +} + +// AlignmentError is returned when a safecopy function is passed an address +// that does not meet alignment requirements. +type AlignmentError struct { + // Addr is the invalid address. + Addr uintptr + + // Alignment is the required alignment. + Alignment uintptr +} + +// Error implements error.Error. +func (e AlignmentError) Error() string { + return fmt.Sprintf("address %#x is not aligned to a %d-byte boundary", e.Addr, e.Alignment) +} + +var ( + // The begin and end addresses below are for the functions that are + // checked by the signal handler. + memcpyBegin uintptr + memcpyEnd uintptr + memclrBegin uintptr + memclrEnd uintptr + swapUint32Begin uintptr + swapUint32End uintptr + swapUint64Begin uintptr + swapUint64End uintptr + compareAndSwapUint32Begin uintptr + compareAndSwapUint32End uintptr + loadUint32Begin uintptr + loadUint32End uintptr + + // savedSigSegVHandler is a pointer to the SIGSEGV handler that was + // configured before we replaced it with our own. 
We still call into it + // when we get a SIGSEGV that is not interesting to us. + savedSigSegVHandler uintptr + + // same as above, but for SIGBUS signals. + savedSigBusHandler uintptr +) + +// signalHandler is our replacement signal handler for SIGSEGV and SIGBUS +// signals. +func signalHandler() + +// FindEndAddress returns the end address (one byte beyond the last) of the +// function that contains the specified address (begin). +func FindEndAddress(begin uintptr) uintptr { + f := runtime.FuncForPC(begin) + if f != nil { + for p := begin; ; p++ { + g := runtime.FuncForPC(p) + if f != g { + return p + } + } + } + return begin +} + +// initializeAddresses initializes the addresses used by the signal handler. +func initializeAddresses() { + // The following functions are written in assembly language, so they won't + // be inlined by the existing compiler/linker. Tests will fail if this + // assumption is violated. + memcpyBegin = reflect.ValueOf(memcpy).Pointer() + memcpyEnd = FindEndAddress(memcpyBegin) + memclrBegin = reflect.ValueOf(memclr).Pointer() + memclrEnd = FindEndAddress(memclrBegin) + swapUint32Begin = reflect.ValueOf(swapUint32).Pointer() + swapUint32End = FindEndAddress(swapUint32Begin) + swapUint64Begin = reflect.ValueOf(swapUint64).Pointer() + swapUint64End = FindEndAddress(swapUint64Begin) + compareAndSwapUint32Begin = reflect.ValueOf(compareAndSwapUint32).Pointer() + compareAndSwapUint32End = FindEndAddress(compareAndSwapUint32Begin) + loadUint32Begin = reflect.ValueOf(loadUint32).Pointer() + loadUint32End = FindEndAddress(loadUint32Begin) +} + +func init() { + initializeAddresses() + if err := ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(signalHandler).Pointer(), &savedSigSegVHandler); err != nil { + panic(fmt.Sprintf("Unable to set handler for SIGSEGV: %v", err)) + } + if err := ReplaceSignalHandler(syscall.SIGBUS, reflect.ValueOf(signalHandler).Pointer(), &savedSigBusHandler); err != nil { + panic(fmt.Sprintf("Unable to set handler for
SIGBUS: %v", err)) + } + syserror.AddErrorUnwrapper(func(e error) (syscall.Errno, bool) { + switch e.(type) { + case SegvError, BusError, AlignmentError: + return syscall.EFAULT, true + default: + return 0, false + } + }) +} diff --git a/pkg/safecopy/safecopy_test.go b/pkg/safecopy/safecopy_test.go new file mode 100644 index 000000000..5818f7f9b --- /dev/null +++ b/pkg/safecopy/safecopy_test.go @@ -0,0 +1,617 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package safecopy + +import ( + "bytes" + "fmt" + "io/ioutil" + "math/rand" + "os" + "runtime/debug" + "syscall" + "testing" + "unsafe" +) + +// Size of a page in bytes. Cloned from usermem.PageSize to avoid a circular +// dependency. +const pageSize = 4096 + +func initRandom(b []byte) { + for i := range b { + b[i] = byte(rand.Intn(256)) + } +} + +func randBuf(size int) []byte { + b := make([]byte, size) + initRandom(b) + return b +} + +func TestCopyInSuccess(t *testing.T) { + // Test that CopyIn does not return an error when all pages are accessible. 
+ const bufLen = 8192 + a := randBuf(bufLen) + b := make([]byte, bufLen) + + n, err := CopyIn(b, unsafe.Pointer(&a[0])) + if n != bufLen { + t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) + } + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if !bytes.Equal(a, b) { + t.Errorf("Buffers are not equal when they should be: %v %v", a, b) + } +} + +func TestCopyOutSuccess(t *testing.T) { + // Test that CopyOut does not return an error when all pages are + // accessible. + const bufLen = 8192 + a := randBuf(bufLen) + b := make([]byte, bufLen) + + n, err := CopyOut(unsafe.Pointer(&b[0]), a) + if n != bufLen { + t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) + } + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if !bytes.Equal(a, b) { + t.Errorf("Buffers are not equal when they should be: %v %v", a, b) + } +} + +func TestCopySuccess(t *testing.T) { + // Test that Copy does not return an error when all pages are accessible. + const bufLen = 8192 + a := randBuf(bufLen) + b := make([]byte, bufLen) + + n, err := Copy(unsafe.Pointer(&b[0]), unsafe.Pointer(&a[0]), bufLen) + if n != bufLen { + t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) + } + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if !bytes.Equal(a, b) { + t.Errorf("Buffers are not equal when they should be: %v %v", a, b) + } +} + +func TestZeroOutSuccess(t *testing.T) { + // Test that ZeroOut does not return an error when all pages are + // accessible. 
+ const bufLen = 8192 + a := make([]byte, bufLen) + b := randBuf(bufLen) + + n, err := ZeroOut(unsafe.Pointer(&b[0]), bufLen) + if n != bufLen { + t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) + } + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if !bytes.Equal(a, b) { + t.Errorf("Buffers are not equal when they should be: %v %v", a, b) + } +} + +func TestSwapUint32Success(t *testing.T) { + // Test that SwapUint32 does not return an error when the page is + // accessible. + before := uint32(rand.Int31()) + after := uint32(rand.Int31()) + val := before + + old, err := SwapUint32(unsafe.Pointer(&val), after) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if old != before { + t.Errorf("Unexpected old value: got %v, want %v", old, before) + } + if val != after { + t.Errorf("Unexpected new value: got %v, want %v", val, after) + } +} + +func TestSwapUint32AlignmentError(t *testing.T) { + // Test that SwapUint32 returns an AlignmentError when passed an unaligned + // address. + data := new(struct{ val uint64 }) + addr := uintptr(unsafe.Pointer(&data.val)) + 1 + want := AlignmentError{Addr: addr, Alignment: 4} + if _, err := SwapUint32(unsafe.Pointer(addr), 1); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } +} + +func TestSwapUint64Success(t *testing.T) { + // Test that SwapUint64 does not return an error when the page is + // accessible. + before := uint64(rand.Int63()) + after := uint64(rand.Int63()) + // "The first word in ... an allocated struct or slice can be relied upon + // to be 64-bit aligned." 
- sync/atomic docs + data := new(struct{ val uint64 }) + data.val = before + + old, err := SwapUint64(unsafe.Pointer(&data.val), after) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if old != before { + t.Errorf("Unexpected old value: got %v, want %v", old, before) + } + if data.val != after { + t.Errorf("Unexpected new value: got %v, want %v", data.val, after) + } +} + +func TestSwapUint64AlignmentError(t *testing.T) { + // Test that SwapUint64 returns an AlignmentError when passed an unaligned + // address. + data := new(struct{ val1, val2 uint64 }) + addr := uintptr(unsafe.Pointer(&data.val1)) + 1 + want := AlignmentError{Addr: addr, Alignment: 8} + if _, err := SwapUint64(unsafe.Pointer(addr), 1); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } +} + +func TestCompareAndSwapUint32Success(t *testing.T) { + // Test that CompareAndSwapUint32 does not return an error when the page is + // accessible. + before := uint32(rand.Int31()) + after := uint32(rand.Int31()) + val := before + + old, err := CompareAndSwapUint32(unsafe.Pointer(&val), before, after) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if old != before { + t.Errorf("Unexpected old value: got %v, want %v", old, before) + } + if val != after { + t.Errorf("Unexpected new value: got %v, want %v", val, after) + } +} + +func TestCompareAndSwapUint32AlignmentError(t *testing.T) { + // Test that CompareAndSwapUint32 returns an AlignmentError when passed an + // unaligned address. + data := new(struct{ val uint64 }) + addr := uintptr(unsafe.Pointer(&data.val)) + 1 + want := AlignmentError{Addr: addr, Alignment: 4} + if _, err := CompareAndSwapUint32(unsafe.Pointer(addr), 0, 1); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } +} + +// withSegvErrorTestMapping calls fn with a two-page mapping. The first page +// contains random data, and the second page generates SIGSEGV when accessed. 
+func withSegvErrorTestMapping(t *testing.T, fn func(m []byte)) { + mapping, err := syscall.Mmap(-1, 0, 2*pageSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE) + if err != nil { + t.Fatalf("Mmap failed: %v", err) + } + defer syscall.Munmap(mapping) + if err := syscall.Mprotect(mapping[pageSize:], syscall.PROT_NONE); err != nil { + t.Fatalf("Mprotect failed: %v", err) + } + initRandom(mapping[:pageSize]) + + fn(mapping) +} + +// withBusErrorTestMapping calls fn with a two-page mapping. The first page +// contains random data, and the second page generates SIGBUS when accessed. +func withBusErrorTestMapping(t *testing.T, fn func(m []byte)) { + f, err := ioutil.TempFile("", "sigbus_test") + if err != nil { + t.Fatalf("TempFile failed: %v", err) + } + defer f.Close() + if err := f.Truncate(pageSize); err != nil { + t.Fatalf("Truncate failed: %v", err) + } + mapping, err := syscall.Mmap(int(f.Fd()), 0, 2*pageSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) + if err != nil { + t.Fatalf("Mmap failed: %v", err) + } + defer syscall.Munmap(mapping) + initRandom(mapping[:pageSize]) + + fn(mapping) +} + +func TestCopyInSegvError(t *testing.T) { + // Test that CopyIn returns a SegvError when reaching a page that signals + // SIGSEGV. 
+ for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + dst := randBuf(pageSize) + n, err := CopyIn(dst, src) + if n != bytesBeforeFault { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopyInBusError(t *testing.T) { + // Test that CopyIn returns a BusError when reaching a page that signals + // SIGBUS. + for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + dst := randBuf(pageSize) + n, err := CopyIn(dst, src) + if n != bytesBeforeFault { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopyOutSegvError(t *testing.T) { + // Test that CopyOut returns a SegvError when reaching a page that signals + // SIGSEGV. 
+ for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + src := randBuf(pageSize) + n, err := CopyOut(dst, src) + if n != bytesBeforeFault { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopyOutBusError(t *testing.T) { + // Test that CopyOut returns a BusError when reaching a page that signals + // SIGBUS. + for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + src := randBuf(pageSize) + n, err := CopyOut(dst, src) + if n != bytesBeforeFault { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopySourceSegvError(t *testing.T) { + // Test that Copy returns a SegvError when copying from a page that signals + // SIGSEGV.
+ for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + dst := randBuf(pageSize) + n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopySourceBusError(t *testing.T) { + // Test that Copy returns a BusError when copying from a page that signals + // SIGBUS. 
+ for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + dst := randBuf(pageSize) + n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopyDestinationSegvError(t *testing.T) { + // Test that Copy returns a SegvError when copying to a page that signals + // SIGSEGV. 
+ for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + src := randBuf(pageSize) + n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopyDestinationBusError(t *testing.T) { + // Test that Copy returns a BusError when copying to a page that signals + // SIGBUS. 
+ for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + src := randBuf(pageSize) + n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestZeroOutSegvError(t *testing.T) { + // Test that ZeroOut returns a SegvError when reaching a page that signals + // SIGSEGV. 
+ for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting write %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + n, err := ZeroOut(dst, pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault) + } + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := mapping[pageSize-bytesBeforeFault:pageSize], make([]byte, bytesBeforeFault); !bytes.Equal(got, want) { + t.Errorf("Non-zero bytes in written part of mapping: %v", got) + } + }) + }) + } +} + +func TestZeroOutBusError(t *testing.T) { + // Test that ZeroOut returns a BusError when reaching a page that signals + // SIGBUS. + for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting write %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + n, err := ZeroOut(dst, pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault) + } + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := mapping[pageSize-bytesBeforeFault:pageSize], make([]byte, bytesBeforeFault); !bytes.Equal(got, want) { + t.Errorf("Non-zero bytes in written part of mapping: %v", got) + } + }) + }) + } +} + +func TestSwapUint32SegvError(t *testing.T) { + // Test that SwapUint32 returns a SegvError when reaching a page that + // signals SIGSEGV. 
+ withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := SwapUint32(unsafe.Pointer(secondPage), 1) + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func TestSwapUint32BusError(t *testing.T) { + // Test that SwapUint32 returns a BusError when reaching a page that + // signals SIGBUS. + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := SwapUint32(unsafe.Pointer(secondPage), 1) + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func TestSwapUint64SegvError(t *testing.T) { + // Test that SwapUint64 returns a SegvError when reaching a page that + // signals SIGSEGV. + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := SwapUint64(unsafe.Pointer(secondPage), 1) + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func TestSwapUint64BusError(t *testing.T) { + // Test that SwapUint64 returns a BusError when reaching a page that + // signals SIGBUS. + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := SwapUint64(unsafe.Pointer(secondPage), 1) + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func TestCompareAndSwapUint32SegvError(t *testing.T) { + // Test that CompareAndSwapUint32 returns a SegvError when reaching a page + // that signals SIGSEGV. 
+ withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1) + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func TestCompareAndSwapUint32BusError(t *testing.T) { + // Test that CompareAndSwapUint32 returns a BusError when reaching a page + // that signals SIGBUS. + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1) + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func testCopy(dst, src []byte) (panicked bool) { + defer func() { + if r := recover(); r != nil { + panicked = true + } + }() + debug.SetPanicOnFault(true) + copy(dst, src) + return +} + +func TestSegVOnMemmove(t *testing.T) { + // Test that SIGSEGVs received by runtime.memmove when *not* doing + // CopyIn or CopyOut work gets propagated to the runtime. + const bufLen = pageSize + a, err := syscall.Mmap(-1, 0, bufLen, syscall.PROT_NONE, syscall.MAP_ANON|syscall.MAP_PRIVATE) + if err != nil { + t.Fatalf("Mmap failed: %v", err) + + } + defer syscall.Munmap(a) + b := randBuf(bufLen) + + if !testCopy(b, a) { + t.Fatalf("testCopy didn't panic when it should have") + } + + if !testCopy(a, b) { + t.Fatalf("testCopy didn't panic when it should have") + } +} + +func TestSigbusOnMemmove(t *testing.T) { + // Test that SIGBUS received by runtime.memmove when *not* doing + // CopyIn or CopyOut work gets propagated to the runtime. 
+ const bufLen = pageSize + f, err := ioutil.TempFile("", "sigbus_test") + if err != nil { + t.Fatalf("TempFile failed: %v", err) + } + os.Remove(f.Name()) + defer f.Close() + + a, err := syscall.Mmap(int(f.Fd()), 0, bufLen, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) + if err != nil { + t.Fatalf("Mmap failed: %v", err) + + } + defer syscall.Munmap(a) + b := randBuf(bufLen) + + if !testCopy(b, a) { + t.Fatalf("testCopy didn't panic when it should have") + } + + if !testCopy(a, b) { + t.Fatalf("testCopy didn't panic when it should have") + } +} diff --git a/pkg/safecopy/safecopy_unsafe.go b/pkg/safecopy/safecopy_unsafe.go new file mode 100644 index 000000000..eef028e68 --- /dev/null +++ b/pkg/safecopy/safecopy_unsafe.go @@ -0,0 +1,335 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package safecopy + +import ( + "fmt" + "syscall" + "unsafe" +) + +// maxRegisterSize is the maximum register size used in memcpy and memclr. It +// is used to decide by how much to rewind the copy (for memcpy) or zeroing +// (for memclr) before proceeding. +const maxRegisterSize = 16 + +// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received +// during the copy, it returns the address that caused the fault and the number +// of the signal that was received. Otherwise, it returns an unspecified address +// and a signal number of 0. 
+// +// Data is copied in order, such that if a fault happens at address p, it is +// safe to assume that all data before p-maxRegisterSize has already been +// successfully copied. +// +//go:noescape +func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) + +// memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS +// signal is received during the write, it returns the address that caused the +// fault and the number of the signal that was received. Otherwise, it returns +// an unspecified address and a signal number of 0. +// +// Data is written in order, such that if a fault happens at address p, it is +// safe to assume that all data before p-maxRegisterSize has already been +// successfully written. +// +//go:noescape +func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) + +// swapUint32 atomically stores new into *ptr and returns (the previous *ptr +// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the +// value of old is unspecified, and sig is the number of the signal that was +// received. +// +// Preconditions: ptr must be aligned to a 4-byte boundary. +// +//go:noescape +func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) + +// swapUint64 atomically stores new into *ptr and returns (the previous *ptr +// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the +// value of old is unspecified, and sig is the number of the signal that was +// received. +// +// Preconditions: ptr must be aligned to a 8-byte boundary. +// +//go:noescape +func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) + +// compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns +// (the value previously stored at ptr, 0). If a SIGSEGV or SIGBUS signal is +// received during the operation, the value of prev is unspecified, and sig is +// the number of the signal that was received. 
+// +// Preconditions: ptr must be aligned to a 4-byte boundary. +// +//go:noescape +func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) + +// loadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It +// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr. +// +// Preconditions: ptr must be aligned to a 4-byte boundary. +// +//go:noescape +func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) + +// CopyIn copies len(dst) bytes from src to dst. It returns the number of bytes +// copied and an error if SIGSEGV or SIGBUS is received while reading from src. +func CopyIn(dst []byte, src unsafe.Pointer) (int, error) { + toCopy := uintptr(len(dst)) + if len(dst) == 0 { + return 0, nil + } + + fault, sig := memcpy(unsafe.Pointer(&dst[0]), src, toCopy) + if sig == 0 { + return len(dst), nil + } + + faultN, srcN := uintptr(fault), uintptr(src) + if faultN < srcN || faultN >= srcN+toCopy { + panic(fmt.Sprintf("CopyIn raised signal %d at %#x, which is outside source [%#x, %#x)", sig, faultN, srcN, srcN+toCopy)) + } + + // memcpy might have ended the copy up to maxRegisterSize bytes before + // fault, if an instruction caused a memory access that straddled two + // pages, and the second one faulted. Try to copy up to the fault. + var done int + if faultN-srcN > maxRegisterSize { + done = int(faultN - srcN - maxRegisterSize) + } + n, err := CopyIn(dst[done:int(faultN-srcN)], unsafe.Pointer(srcN+uintptr(done))) + done += n + if err != nil { + return done, err + } + return done, errorFromFaultSignal(fault, sig) +} + +// CopyOut copies len(src) bytes from src to dst. It returns the number of +// bytes done and an error if SIGSEGV or SIGBUS is received while writing to +// dst.
+func CopyOut(dst unsafe.Pointer, src []byte) (int, error) { + toCopy := uintptr(len(src)) + if toCopy == 0 { + return 0, nil + } + + fault, sig := memcpy(dst, unsafe.Pointer(&src[0]), toCopy) + if sig == 0 { + return len(src), nil + } + + faultN, dstN := uintptr(fault), uintptr(dst) + if faultN < dstN || faultN >= dstN+toCopy { + panic(fmt.Sprintf("CopyOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toCopy)) + } + + // memcpy might have ended the copy up to maxRegisterSize bytes before + // fault, if an instruction caused a memory access that straddled two + // pages, and the second one faulted. Try to copy up to the fault. + var done int + if faultN-dstN > maxRegisterSize { + done = int(faultN - dstN - maxRegisterSize) + } + n, err := CopyOut(unsafe.Pointer(dstN+uintptr(done)), src[done:int(faultN-dstN)]) + done += n + if err != nil { + return done, err + } + return done, errorFromFaultSignal(fault, sig) +} + +// Copy copies toCopy bytes from src to dst. It returns the number of bytes +// copied and an error if SIGSEGV or SIGBUS is received while reading from src +// or writing to dst. +// +// Data is copied in order; if [src, src+toCopy) and [dst, dst+toCopy) overlap, +// the resulting contents of dst are unspecified. +func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) { + if toCopy == 0 { + return 0, nil + } + + fault, sig := memcpy(dst, src, toCopy) + if sig == 0 { + return toCopy, nil + } + + // Did the fault occur while reading from src or writing to dst? 
+ faultN, srcN, dstN := uintptr(fault), uintptr(src), uintptr(dst) + faultAfterSrc := ^uintptr(0) + if faultN >= srcN { + faultAfterSrc = faultN - srcN + } + faultAfterDst := ^uintptr(0) + if faultN >= dstN { + faultAfterDst = faultN - dstN + } + if faultAfterSrc >= toCopy && faultAfterDst >= toCopy { + panic(fmt.Sprintf("Copy raised signal %d at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", sig, faultN, srcN, srcN+toCopy, dstN, dstN+toCopy)) + } + faultedAfter := faultAfterSrc + if faultedAfter > faultAfterDst { + faultedAfter = faultAfterDst + } + + // memcpy might have ended the copy up to maxRegisterSize bytes before + // fault, if an instruction caused a memory access that straddled two + // pages, and the second one faulted. Try to copy up to the fault. + var done uintptr + if faultedAfter > maxRegisterSize { + done = faultedAfter - maxRegisterSize + } + n, err := Copy(unsafe.Pointer(dstN+done), unsafe.Pointer(srcN+done), faultedAfter-done) + done += n + if err != nil { + return done, err + } + return done, errorFromFaultSignal(fault, sig) +} + +// ZeroOut writes toZero zero bytes to dst. It returns the number of bytes +// written and an error if SIGSEGV or SIGBUS is received while writing to dst. +func ZeroOut(dst unsafe.Pointer, toZero uintptr) (uintptr, error) { + if toZero == 0 { + return 0, nil + } + + fault, sig := memclr(dst, toZero) + if sig == 0 { + return toZero, nil + } + + faultN, dstN := uintptr(fault), uintptr(dst) + if faultN < dstN || faultN >= dstN+toZero { + panic(fmt.Sprintf("ZeroOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toZero)) + } + + // memclr might have ended the write up to maxRegisterSize bytes before + // fault, if an instruction caused a memory access that straddled two + // pages, and the second one faulted. Try to write up to the fault. 
+ var done uintptr + if faultN-dstN > maxRegisterSize { + done = faultN - dstN - maxRegisterSize + } + n, err := ZeroOut(unsafe.Pointer(dstN+done), faultN-dstN-done) + done += n + if err != nil { + return done, err + } + return done, errorFromFaultSignal(fault, sig) +} + +// SwapUint32 is equivalent to sync/atomic.SwapUint32, except that it returns +// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is +// not aligned to a 4-byte boundary. +func SwapUint32(ptr unsafe.Pointer, new uint32) (uint32, error) { + if addr := uintptr(ptr); addr&3 != 0 { + return 0, AlignmentError{addr, 4} + } + old, sig := swapUint32(ptr, new) + return old, errorFromFaultSignal(ptr, sig) +} + +// SwapUint64 is equivalent to sync/atomic.SwapUint64, except that it returns +// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is +// not aligned to an 8-byte boundary. +func SwapUint64(ptr unsafe.Pointer, new uint64) (uint64, error) { + if addr := uintptr(ptr); addr&7 != 0 { + return 0, AlignmentError{addr, 8} + } + old, sig := swapUint64(ptr, new) + return old, errorFromFaultSignal(ptr, sig) +} + +// CompareAndSwapUint32 is equivalent to atomicbitops.CompareAndSwapUint32, +// except that it returns an error if SIGSEGV or SIGBUS is received while +// accessing ptr, or if ptr is not aligned to a 4-byte boundary. +func CompareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (uint32, error) { + if addr := uintptr(ptr); addr&3 != 0 { + return 0, AlignmentError{addr, 4} + } + prev, sig := compareAndSwapUint32(ptr, old, new) + return prev, errorFromFaultSignal(ptr, sig) +} + +// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It +// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr. +// +// Preconditions: ptr must be aligned to a 4-byte boundary. 
+func LoadUint32(ptr unsafe.Pointer) (uint32, error) { + if addr := uintptr(ptr); addr&3 != 0 { + return 0, AlignmentError{addr, 4} + } + val, sig := loadUint32(ptr) + return val, errorFromFaultSignal(ptr, sig) +} + +func errorFromFaultSignal(addr unsafe.Pointer, sig int32) error { + switch sig { + case 0: + return nil + case int32(syscall.SIGSEGV): + return SegvError{uintptr(addr)} + case int32(syscall.SIGBUS): + return BusError{uintptr(addr)} + default: + panic(fmt.Sprintf("safecopy got unexpected signal %d at address %#x", sig, addr)) + } +} + +// ReplaceSignalHandler replaces the existing signal handler for the provided +// signal with the one that handles faults in safecopy-protected functions. +// +// It stores the value of the previously set handler in previous. +// +// This function will be called on initialization in order to install safecopy +// handlers for appropriate signals. These handlers will call the previous +// handler however, and if this function is being used externally then the +// same courtesy is expected. +func ReplaceSignalHandler(sig syscall.Signal, handler uintptr, previous *uintptr) error { + var sa struct { + handler uintptr + flags uint64 + restorer uintptr + mask uint64 + } + const maskLen = 8 + + // Get the existing signal handler information, and save the current + // handler. Once we replace it, we will use this pointer to fall back to + // it when we receive other signals. + if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), 0, uintptr(unsafe.Pointer(&sa)), maskLen, 0, 0); e != 0 { + return e + } + + // Fail if there isn't a previous handler. + if sa.handler == 0 { + return fmt.Errorf("previous handler for signal %x isn't set", sig) + } + + *previous = sa.handler + + // Install our own handler.
+ sa.handler = handler + if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, maskLen, 0, 0); e != 0 { + return e + } + + return nil +} diff --git a/pkg/safecopy/sighandler_amd64.s b/pkg/safecopy/sighandler_amd64.s new file mode 100644 index 000000000..475ae48e9 --- /dev/null +++ b/pkg/safecopy/sighandler_amd64.s @@ -0,0 +1,133 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// The signals handled by signalHandler. +#define SIGBUS 7 +#define SIGSEGV 11 + +// Offsets to the registers in context->uc_mcontext.gregs[]. +#define REG_RDI 0x68 +#define REG_RAX 0x90 +#define REG_IP 0xa8 + +// Offsets to the si_code and si_addr fields of siginfo. +#define SI_CODE 0x08 +#define SI_ADDR 0x10 + +// signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must +// not be set up as a handler to any other signals. +// +// If the instruction causing the signal is within a safecopy-protected +// function, the signal is handled such that execution resumes in the +// appropriate fault handling stub with AX containing the faulting address and +// DI containing the signal number. Otherwise control is transferred to the +// previously configured signal handler (savedSigSegVHandler or +// savedSigBusHandler).
+// +// This function cannot be written in go because it runs whenever a signal is +// received by the thread (preempting whatever was running), which includes when +// garbage collector has stopped or isn't expecting any interactions (like +// barriers). +// +// The arguments are the following: +// DI - The signal number. +// SI - Pointer to siginfo_t structure. +// DX - Pointer to ucontext structure. +TEXT ·signalHandler(SB),NOSPLIT,$0 + // Check if the signal is from the kernel. + MOVQ $0x0, CX + CMPL CX, SI_CODE(SI) + JGE original_handler + + // Check if RIP is within the area we care about. + MOVQ REG_IP(DX), CX + CMPQ CX, ·memcpyBegin(SB) + JB not_memcpy + CMPQ CX, ·memcpyEnd(SB) + JAE not_memcpy + + // Modify the context such that execution will resume in the fault + // handler. + LEAQ handleMemcpyFault(SB), CX + JMP handle_fault + +not_memcpy: + CMPQ CX, ·memclrBegin(SB) + JB not_memclr + CMPQ CX, ·memclrEnd(SB) + JAE not_memclr + + LEAQ handleMemclrFault(SB), CX + JMP handle_fault + +not_memclr: + CMPQ CX, ·swapUint32Begin(SB) + JB not_swapuint32 + CMPQ CX, ·swapUint32End(SB) + JAE not_swapuint32 + + LEAQ handleSwapUint32Fault(SB), CX + JMP handle_fault + +not_swapuint32: + CMPQ CX, ·swapUint64Begin(SB) + JB not_swapuint64 + CMPQ CX, ·swapUint64End(SB) + JAE not_swapuint64 + + LEAQ handleSwapUint64Fault(SB), CX + JMP handle_fault + +not_swapuint64: + CMPQ CX, ·compareAndSwapUint32Begin(SB) + JB not_casuint32 + CMPQ CX, ·compareAndSwapUint32End(SB) + JAE not_casuint32 + + LEAQ handleCompareAndSwapUint32Fault(SB), CX + JMP handle_fault + +not_casuint32: + CMPQ CX, ·loadUint32Begin(SB) + JB not_loaduint32 + CMPQ CX, ·loadUint32End(SB) + JAE not_loaduint32 + + LEAQ handleLoadUint32Fault(SB), CX + JMP handle_fault + +not_loaduint32: +original_handler: + // Jump to the previous signal handler, which is likely the golang one. 
+ XORQ CX, CX + MOVQ ·savedSigBusHandler(SB), AX + CMPL DI, $SIGSEGV + CMOVQEQ ·savedSigSegVHandler(SB), AX + JMP AX + +handle_fault: + // Entered with the address of the fault handler in RCX; store it in + // RIP. + MOVQ CX, REG_IP(DX) + + // Store the faulting address in RAX. + MOVQ SI_ADDR(SI), CX + MOVQ CX, REG_RAX(DX) + + // Store the signal number in EDI. + MOVL DI, REG_RDI(DX) + + RET diff --git a/pkg/safecopy/sighandler_arm64.s b/pkg/safecopy/sighandler_arm64.s new file mode 100644 index 000000000..53e4ac2c1 --- /dev/null +++ b/pkg/safecopy/sighandler_arm64.s @@ -0,0 +1,143 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// The signals handled by signalHandler. +#define SIGBUS 7 +#define SIGSEGV 11 + +// Offsets to the registers in context->uc_mcontext.gregs[]. +#define REG_R0 0xB8 +#define REG_R1 0xC0 +#define REG_PC 0x1B8 + +// Offsets to the si_code and si_addr fields of siginfo. +#define SI_CODE 0x08 +#define SI_ADDR 0x10 + +// signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must +// not be set up as a handler to any other signals. +// +// If the instruction causing the signal is within a safecopy-protected +// function, the signal is handled such that execution resumes in the +// appropriate fault handling stub with R0 containing the faulting address and +// R1 containing the signal number. 
Otherwise control is transferred to the +// previously configured signal handler (savedSigSegVHandler or +// savedSigBusHandler). +// +// This function cannot be written in go because it runs whenever a signal is +// received by the thread (preempting whatever was running), which includes when +// garbage collector has stopped or isn't expecting any interactions (like +// barriers). +// +// The arguments are the following: +// R0 - The signal number. +// R1 - Pointer to siginfo_t structure. +// R2 - Pointer to ucontext structure. +TEXT ·signalHandler(SB),NOSPLIT,$0 + // Check if the signal is from the kernel, si_code > 0 means a kernel signal. + MOVD SI_CODE(R1), R7 + CMPW $0x0, R7 + BLE original_handler + + // Check if PC is within the area we care about. + MOVD REG_PC(R2), R7 + MOVD ·memcpyBegin(SB), R8 + CMP R8, R7 + BLO not_memcpy + MOVD ·memcpyEnd(SB), R8 + CMP R8, R7 + BHS not_memcpy + + // Modify the context such that execution will resume in the fault handler. + MOVD $handleMemcpyFault(SB), R7 + B handle_fault + +not_memcpy: + MOVD ·memclrBegin(SB), R8 + CMP R8, R7 + BLO not_memclr + MOVD ·memclrEnd(SB), R8 + CMP R8, R7 + BHS not_memclr + + MOVD $handleMemclrFault(SB), R7 + B handle_fault + +not_memclr: + MOVD ·swapUint32Begin(SB), R8 + CMP R8, R7 + BLO not_swapuint32 + MOVD ·swapUint32End(SB), R8 + CMP R8, R7 + BHS not_swapuint32 + + MOVD $handleSwapUint32Fault(SB), R7 + B handle_fault + +not_swapuint32: + MOVD ·swapUint64Begin(SB), R8 + CMP R8, R7 + BLO not_swapuint64 + MOVD ·swapUint64End(SB), R8 + CMP R8, R7 + BHS not_swapuint64 + + MOVD $handleSwapUint64Fault(SB), R7 + B handle_fault + +not_swapuint64: + MOVD ·compareAndSwapUint32Begin(SB), R8 + CMP R8, R7 + BLO not_casuint32 + MOVD ·compareAndSwapUint32End(SB), R8 + CMP R8, R7 + BHS not_casuint32 + + MOVD $handleCompareAndSwapUint32Fault(SB), R7 + B handle_fault + +not_casuint32: + MOVD ·loadUint32Begin(SB), R8 + CMP R8, R7 + BLO not_loaduint32 + MOVD ·loadUint32End(SB), R8 + CMP R8, R7 + BHS 
not_loaduint32 + + MOVD $handleLoadUint32Fault(SB), R7 + B handle_fault + +not_loaduint32: +original_handler: + // Jump to the previous signal handler, which is likely the golang one. + MOVD ·savedSigBusHandler(SB), R7 + MOVD ·savedSigSegVHandler(SB), R8 + CMPW $SIGSEGV, R0 + CSEL EQ, R8, R7, R7 + B (R7) + +handle_fault: + // Entered with the address of the fault handler in R7; store it in PC. + MOVD R7, REG_PC(R2) + + // Store the faulting address in R0. + MOVD SI_ADDR(R1), R7 + MOVD R7, REG_R0(R2) + + // Store the signal number in R1. + MOVW R0, REG_R1(R2) + + RET diff --git a/pkg/safemem/BUILD b/pkg/safemem/BUILD new file mode 100644 index 000000000..ce30382ab --- /dev/null +++ b/pkg/safemem/BUILD @@ -0,0 +1,27 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "safemem", + srcs = [ + "block_unsafe.go", + "io.go", + "safemem.go", + "seq_unsafe.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/safecopy", + ], +) + +go_test( + name = "safemem_test", + size = "small", + srcs = [ + "io_test.go", + "seq_test.go", + ], + library = ":safemem", +) diff --git a/pkg/safemem/block_unsafe.go b/pkg/safemem/block_unsafe.go new file mode 100644 index 000000000..e7fd30743 --- /dev/null +++ b/pkg/safemem/block_unsafe.go @@ -0,0 +1,279 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package safemem + +import ( + "fmt" + "reflect" + "unsafe" + + "gvisor.dev/gvisor/pkg/safecopy" +) + +// A Block is a range of contiguous bytes, similar to []byte but with the +// following differences: +// +// - The memory represented by a Block may require the use of safecopy to +// access. +// +// - Block does not carry a capacity and cannot be expanded. +// +// Blocks are immutable and may be copied by value. The zero value of Block +// represents an empty range, analogous to a nil []byte. +type Block struct { + // [start, start+length) is the represented memory. + // + // start is an unsafe.Pointer to ensure that Block prevents the represented + // memory from being garbage-collected. + start unsafe.Pointer + length int + + // needSafecopy is true if accessing the represented memory requires the + // use of safecopy. + needSafecopy bool +} + +// BlockFromSafeSlice returns a Block equivalent to slice, which is safe to +// access without safecopy. +func BlockFromSafeSlice(slice []byte) Block { + return blockFromSlice(slice, false) +} + +// BlockFromUnsafeSlice returns a Block equivalent to slice, which is not safe +// to access without safecopy. +func BlockFromUnsafeSlice(slice []byte) Block { + return blockFromSlice(slice, true) +} + +// blockFromSlice returns a Block covering slice with the given needSafecopy +// flag. An empty slice yields the zero Block. +func blockFromSlice(slice []byte, needSafecopy bool) Block { + if len(slice) == 0 { + return Block{} + } + return Block{ + start: unsafe.Pointer(&slice[0]), + length: len(slice), + needSafecopy: needSafecopy, + } +} + +// BlockFromSafePointer returns a Block equivalent to [ptr, ptr+len), which is +// safe to access without safecopy. +// +// Preconditions: ptr+len does not overflow. +func BlockFromSafePointer(ptr unsafe.Pointer, len int) Block { + return blockFromPointer(ptr, len, false) +} + +// BlockFromUnsafePointer returns a Block equivalent to [ptr, ptr+len), which +// is not safe to access without safecopy. +// +// Preconditions: ptr+len does not overflow. 
+func BlockFromUnsafePointer(ptr unsafe.Pointer, len int) Block { + return blockFromPointer(ptr, len, true) +} + +func blockFromPointer(ptr unsafe.Pointer, len int, needSafecopy bool) Block { + if uptr := uintptr(ptr); uptr+uintptr(len) < uptr { + panic(fmt.Sprintf("ptr %#x + len %#x overflows", ptr, len)) + } + return Block{ + start: ptr, + length: len, + needSafecopy: needSafecopy, + } +} + +// DropFirst returns a Block equivalent to b, but with the first n bytes +// omitted. It is analogous to the [n:] operation on a slice, except that if n +// > b.Len(), DropFirst returns an empty Block instead of panicking. +// +// Preconditions: n >= 0. +func (b Block) DropFirst(n int) Block { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + return b.DropFirst64(uint64(n)) +} + +// DropFirst64 is equivalent to DropFirst but takes a uint64. +func (b Block) DropFirst64(n uint64) Block { + if n >= uint64(b.length) { + return Block{} + } + return Block{ + start: unsafe.Pointer(uintptr(b.start) + uintptr(n)), + length: b.length - int(n), + needSafecopy: b.needSafecopy, + } +} + +// TakeFirst returns a Block equivalent to the first n bytes of b. It is +// analogous to the [:n] operation on a slice, except that if n > b.Len(), +// TakeFirst returns a copy of b instead of panicking. +// +// Preconditions: n >= 0. +func (b Block) TakeFirst(n int) Block { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + return b.TakeFirst64(uint64(n)) +} + +// TakeFirst64 is equivalent to TakeFirst but takes a uint64. +func (b Block) TakeFirst64(n uint64) Block { + if n == 0 { + return Block{} + } + if n >= uint64(b.length) { + return b + } + return Block{ + start: b.start, + length: int(n), + needSafecopy: b.needSafecopy, + } +} + +// ToSlice returns a []byte equivalent to b. 
+func (b Block) ToSlice() []byte { + var bs []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&bs)) + hdr.Data = uintptr(b.start) + hdr.Len = b.length + hdr.Cap = b.length + return bs +} + +// Addr returns b's start address as a uintptr. It returns uintptr instead of +// unsafe.Pointer so that code using safemem cannot obtain unsafe.Pointers +// without importing the unsafe package explicitly. +// +// Note that a uintptr is not recognized as a pointer by the garbage collector, +// such that if there are no uses of b after a call to b.Addr() and the address +// is to Go-managed memory, the returned uintptr does not prevent garbage +// collection of the pointee. +func (b Block) Addr() uintptr { + return uintptr(b.start) +} + +// Len returns b's length in bytes. +func (b Block) Len() int { + return b.length +} + +// NeedSafecopy returns true if accessing b.ToSlice() requires the use of safecopy. +func (b Block) NeedSafecopy() bool { + return b.needSafecopy +} + +// String implements fmt.Stringer.String. +func (b Block) String() string { + if uintptr(b.start) == 0 && b.length == 0 { + return "" + } + var suffix string + if b.needSafecopy { + suffix = "*" + } + return fmt.Sprintf("[%#x-%#x)%s", uintptr(b.start), uintptr(b.start)+uintptr(b.length), suffix) +} + +// Copy copies src.Len() or dst.Len() bytes, whichever is less, from src +// to dst and returns the number of bytes copied. +// +// If src and dst overlap, the data stored in dst is unspecified. 
+func Copy(dst, src Block) (int, error) { + if !dst.needSafecopy && !src.needSafecopy { + return copy(dst.ToSlice(), src.ToSlice()), nil + } + + n := dst.length + if n > src.length { + n = src.length + } + if n == 0 { + return 0, nil + } + + switch { + case dst.needSafecopy && !src.needSafecopy: + return safecopy.CopyOut(dst.start, src.TakeFirst(n).ToSlice()) + case !dst.needSafecopy && src.needSafecopy: + return safecopy.CopyIn(dst.TakeFirst(n).ToSlice(), src.start) + case dst.needSafecopy && src.needSafecopy: + n64, err := safecopy.Copy(dst.start, src.start, uintptr(n)) + return int(n64), err + default: + panic("unreachable") + } +} + +// Zero sets all bytes in dst to 0 and returns the number of bytes zeroed. +func Zero(dst Block) (int, error) { + if !dst.needSafecopy { + bs := dst.ToSlice() + for i := range bs { + bs[i] = 0 + } + return len(bs), nil + } + + n64, err := safecopy.ZeroOut(dst.start, uintptr(dst.length)) + return int(n64), err +} + +// Safecopy atomics are no slower than non-safecopy atomics, so use the former +// even when !b.needSafecopy to get consistent alignment checking. + +// SwapUint32 invokes safecopy.SwapUint32 on the first 4 bytes of b. +// +// Preconditions: b.Len() >= 4. +func SwapUint32(b Block, new uint32) (uint32, error) { + if b.length < 4 { + panic(fmt.Sprintf("insufficient length: %d", b.length)) + } + return safecopy.SwapUint32(b.start, new) +} + +// SwapUint64 invokes safecopy.SwapUint64 on the first 8 bytes of b. +// +// Preconditions: b.Len() >= 8. +func SwapUint64(b Block, new uint64) (uint64, error) { + if b.length < 8 { + panic(fmt.Sprintf("insufficient length: %d", b.length)) + } + return safecopy.SwapUint64(b.start, new) +} + +// CompareAndSwapUint32 invokes safecopy.CompareAndSwapUint32 on the first 4 +// bytes of b. +// +// Preconditions: b.Len() >= 4. 
+func CompareAndSwapUint32(b Block, old, new uint32) (uint32, error) { + if b.length < 4 { + panic(fmt.Sprintf("insufficient length: %d", b.length)) + } + return safecopy.CompareAndSwapUint32(b.start, old, new) +} + +// LoadUint32 invokes safecopy.LoadUint32 on the first 4 bytes of b. +// +// Preconditions: b.Len() >= 4. +func LoadUint32(b Block) (uint32, error) { + if b.length < 4 { + panic(fmt.Sprintf("insufficient length: %d", b.length)) + } + return safecopy.LoadUint32(b.start) +} diff --git a/pkg/safemem/io.go b/pkg/safemem/io.go new file mode 100644 index 000000000..f039a5c34 --- /dev/null +++ b/pkg/safemem/io.go @@ -0,0 +1,392 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package safemem + +import ( + "errors" + "io" + "math" +) + +// ErrEndOfBlockSeq is returned by BlockSeqWriter when attempting to write +// beyond the end of the BlockSeq. +var ErrEndOfBlockSeq = errors.New("write beyond end of BlockSeq") + +// Reader represents a streaming byte source like io.Reader. +type Reader interface { + // ReadToBlocks reads up to dsts.NumBytes() bytes into dsts and returns the + // number of bytes read. It may return a partial read without an error + // (i.e. (n, nil) where 0 < n < dsts.NumBytes()). It should not return a + // full read with an error (i.e. 
(dsts.NumBytes(), err) where err != nil); + // note that this differs from io.Reader.Read (in particular, io.EOF should + // not be returned if ReadToBlocks successfully reads dsts.NumBytes() + // bytes.) + ReadToBlocks(dsts BlockSeq) (uint64, error) +} + +// Writer represents a streaming byte sink like io.Writer. +type Writer interface { + // WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns + // the number of bytes written. It may return a partial write without an + // error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not + // return a full write with an error (i.e. (srcs.NumBytes(), err) where err + // != nil). + WriteFromBlocks(srcs BlockSeq) (uint64, error) +} + +// ReadFullToBlocks repeatedly invokes r.ReadToBlocks until dsts.NumBytes() +// bytes have been read or ReadToBlocks returns an error. +func ReadFullToBlocks(r Reader, dsts BlockSeq) (uint64, error) { + var done uint64 + for !dsts.IsEmpty() { + n, err := r.ReadToBlocks(dsts) + done += n + if err != nil { + return done, err + } + dsts = dsts.DropFirst64(n) + } + return done, nil +} + +// WriteFullFromBlocks repeatedly invokes w.WriteFromBlocks until +// srcs.NumBytes() bytes have been written or WriteFromBlocks returns an error. +func WriteFullFromBlocks(w Writer, srcs BlockSeq) (uint64, error) { + var done uint64 + for !srcs.IsEmpty() { + n, err := w.WriteFromBlocks(srcs) + done += n + if err != nil { + return done, err + } + srcs = srcs.DropFirst64(n) + } + return done, nil +} + +// BlockSeqReader implements Reader by reading from a BlockSeq. +type BlockSeqReader struct { + Blocks BlockSeq +} + +// ReadToBlocks implements Reader.ReadToBlocks. +func (r *BlockSeqReader) ReadToBlocks(dsts BlockSeq) (uint64, error) { + n, err := CopySeq(dsts, r.Blocks) + r.Blocks = r.Blocks.DropFirst64(n) + if err != nil { + return n, err + } + if n < dsts.NumBytes() { + return n, io.EOF + } + return n, nil +} + +// BlockSeqWriter implements Writer by writing to a BlockSeq. 
+type BlockSeqWriter struct { + Blocks BlockSeq +} + +// WriteFromBlocks implements Writer.WriteFromBlocks. +func (w *BlockSeqWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) { + n, err := CopySeq(w.Blocks, srcs) + w.Blocks = w.Blocks.DropFirst64(n) + if err != nil { + return n, err + } + if n < srcs.NumBytes() { + return n, ErrEndOfBlockSeq + } + return n, nil +} + +// ReaderFunc implements Reader for a function with the semantics of +// Reader.ReadToBlocks. +type ReaderFunc func(dsts BlockSeq) (uint64, error) + +// ReadToBlocks implements Reader.ReadToBlocks. +func (f ReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) { + return f(dsts) +} + +// WriterFunc implements Writer for a function with the semantics of +// Writer.WriteFromBlocks. +type WriterFunc func(srcs BlockSeq) (uint64, error) + +// WriteFromBlocks implements Writer.WriteFromBlocks. +func (f WriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) { + return f(srcs) +} + +// ToIOReader implements io.Reader for a (safemem.)Reader. +// +// ToIOReader will return a successful partial read iff Reader.ReadToBlocks does +// so. +type ToIOReader struct { + Reader Reader +} + +// Read implements io.Reader.Read. +func (r ToIOReader) Read(dst []byte) (int, error) { + n, err := r.Reader.ReadToBlocks(BlockSeqOf(BlockFromSafeSlice(dst))) + return int(n), err +} + +// ToIOWriter implements io.Writer for a (safemem.)Writer. +type ToIOWriter struct { + Writer Writer +} + +// Write implements io.Writer.Write. +func (w ToIOWriter) Write(src []byte) (int, error) { + // io.Writer does not permit partial writes. + n, err := WriteFullFromBlocks(w.Writer, BlockSeqOf(BlockFromSafeSlice(src))) + return int(n), err +} + +// FromIOReader implements Reader for an io.Reader by repeatedly invoking +// io.Reader.Read until it returns an error or partial read. This is not +// thread-safe. +// +// FromIOReader will return a successful partial read iff Reader.Read does so. 
+type FromIOReader struct { + Reader io.Reader +} + +// ReadToBlocks implements Reader.ReadToBlocks. +func (r FromIOReader) ReadToBlocks(dsts BlockSeq) (uint64, error) { + var buf []byte + var done uint64 + for !dsts.IsEmpty() { + dst := dsts.Head() + var n int + var err error + n, buf, err = r.readToBlock(dst, buf) + done += uint64(n) + if n != dst.Len() { + return done, err + } + dsts = dsts.Tail() + if err != nil { + if dsts.IsEmpty() && err == io.EOF { + return done, nil + } + return done, err + } + } + return done, nil +} + +func (r FromIOReader) readToBlock(dst Block, buf []byte) (int, []byte, error) { + // io.Reader isn't safecopy-aware, so we have to buffer Blocks that require + // safecopy. + if !dst.NeedSafecopy() { + n, err := r.Reader.Read(dst.ToSlice()) + return n, buf, err + } + if len(buf) < dst.Len() { + buf = make([]byte, dst.Len()) + } + rn, rerr := r.Reader.Read(buf[:dst.Len()]) + wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn])) + if wberr != nil { + return wbn, buf, wberr + } + return wbn, buf, rerr +} + +// FromIOReaderAt implements Reader for an io.ReaderAt. Does not repeatedly +// invoke io.ReaderAt.ReadAt because ReadAt is more strict than Read. A partial +// read indicates an error. This is not thread-safe. +type FromIOReaderAt struct { + ReaderAt io.ReaderAt + Offset int64 +} + +// ReadToBlocks implements Reader.ReadToBlocks. +// +// Each Block in dsts is filled from consecutive offsets starting at r.Offset. +// Since the receiver is a value, the caller's Offset is left unchanged. +func (r FromIOReaderAt) ReadToBlocks(dsts BlockSeq) (uint64, error) { + var buf []byte + var done uint64 + for !dsts.IsEmpty() { + dst := dsts.Head() + var n int + var err error + n, buf, err = r.readToBlock(dst, buf) + done += uint64(n) + if n != dst.Len() { + return done, err + } + dsts = dsts.Tail() + if err != nil { + if dsts.IsEmpty() && err == io.EOF { + return done, nil + } + return done, err + } + } + return done, nil +} + +// readToBlock reads into a single Block at r.Offset and advances r.Offset by +// the number of bytes read. It has a pointer receiver so that the advance is +// visible to the loop in ReadToBlocks; with a value receiver every Block +// would be read from the same offset. +func (r *FromIOReaderAt) readToBlock(dst Block, buf []byte) (int, []byte, error) { + // io.ReaderAt isn't safecopy-aware, so we have to buffer Blocks that require + // safecopy. 
+ if !dst.NeedSafecopy() { + n, err := r.ReaderAt.ReadAt(dst.ToSlice(), r.Offset) + r.Offset += int64(n) + return n, buf, err + } + if len(buf) < dst.Len() { + buf = make([]byte, dst.Len()) + } + rn, rerr := r.ReaderAt.ReadAt(buf[:dst.Len()], r.Offset) + r.Offset += int64(rn) + wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn])) + if wberr != nil { + return wbn, buf, wberr + } + return wbn, buf, rerr +} + +// FromIOWriter implements Writer for an io.Writer by repeatedly invoking +// io.Writer.Write until it returns an error or partial write. +// +// FromIOWriter will tolerate implementations of io.Writer.Write that return +// partial writes with a nil error in contravention of io.Writer's +// requirements, since Writer is permitted to do so. FromIOWriter will return a +// successful partial write iff Writer.Write does so. +type FromIOWriter struct { + Writer io.Writer +} + +// WriteFromBlocks implements Writer.WriteFromBlocks. +func (w FromIOWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) { + var buf []byte + var done uint64 + for !srcs.IsEmpty() { + src := srcs.Head() + var n int + var err error + n, buf, err = w.writeFromBlock(src, buf) + done += uint64(n) + if n != src.Len() || err != nil { + return done, err + } + srcs = srcs.Tail() + } + return done, nil +} + +func (w FromIOWriter) writeFromBlock(src Block, buf []byte) (int, []byte, error) { + // io.Writer isn't safecopy-aware, so we have to buffer Blocks that require + // safecopy. + if !src.NeedSafecopy() { + n, err := w.Writer.Write(src.ToSlice()) + return n, buf, err + } + if len(buf) < src.Len() { + buf = make([]byte, src.Len()) + } + bufn, buferr := Copy(BlockFromSafeSlice(buf[:src.Len()]), src) + wn, werr := w.Writer.Write(buf[:bufn]) + if werr != nil { + return wn, buf, werr + } + return wn, buf, buferr +} + +// FromVecReaderFunc implements Reader for a function that reads data into a +// [][]byte and returns the number of bytes read as an int64. 
+type FromVecReaderFunc struct { + ReadVec func(dsts [][]byte) (int64, error) +} + +// ReadToBlocks implements Reader.ReadToBlocks. +// +// ReadToBlocks calls r.ReadVec at most once. +func (r FromVecReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) { + if dsts.IsEmpty() { + return 0, nil + } + // Ensure that we don't pass a [][]byte with a total length > MaxInt64. + dsts = dsts.TakeFirst64(uint64(math.MaxInt64)) + dstSlices := make([][]byte, 0, dsts.NumBlocks()) + // Buffer Blocks that require safecopy. + for tmp := dsts; !tmp.IsEmpty(); tmp = tmp.Tail() { + dst := tmp.Head() + if dst.NeedSafecopy() { + dstSlices = append(dstSlices, make([]byte, dst.Len())) + } else { + dstSlices = append(dstSlices, dst.ToSlice()) + } + } + rn, rerr := r.ReadVec(dstSlices) + dsts = dsts.TakeFirst64(uint64(rn)) + var done uint64 + var i int + for !dsts.IsEmpty() { + dst := dsts.Head() + if dst.NeedSafecopy() { + n, err := Copy(dst, BlockFromSafeSlice(dstSlices[i])) + done += uint64(n) + if err != nil { + return done, err + } + } else { + done += uint64(dst.Len()) + } + dsts = dsts.Tail() + i++ + } + return done, rerr +} + +// FromVecWriterFunc implements Writer for a function that writes data from a +// [][]byte and returns the number of bytes written. +type FromVecWriterFunc struct { + WriteVec func(srcs [][]byte) (int64, error) +} + +// WriteFromBlocks implements Writer.WriteFromBlocks. +// +// WriteFromBlocks calls w.WriteVec at most once. +func (w FromVecWriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) { + if srcs.IsEmpty() { + return 0, nil + } + // Ensure that we don't pass a [][]byte with a total length > MaxInt64. + srcs = srcs.TakeFirst64(uint64(math.MaxInt64)) + srcSlices := make([][]byte, 0, srcs.NumBlocks()) + // Buffer Blocks that require safecopy. 
+ var buferr error + for tmp := srcs; !tmp.IsEmpty(); tmp = tmp.Tail() { + src := tmp.Head() + if src.NeedSafecopy() { + slice := make([]byte, src.Len()) + n, err := Copy(BlockFromSafeSlice(slice), src) + srcSlices = append(srcSlices, slice[:n]) + if err != nil { + buferr = err + break + } + } else { + srcSlices = append(srcSlices, src.ToSlice()) + } + } + n, err := w.WriteVec(srcSlices) + if err != nil { + return uint64(n), err + } + return uint64(n), buferr +} diff --git a/pkg/safemem/io_test.go b/pkg/safemem/io_test.go new file mode 100644 index 000000000..629741bee --- /dev/null +++ b/pkg/safemem/io_test.go @@ -0,0 +1,199 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package safemem + +import ( + "bytes" + "io" + "testing" +) + +func makeBlocks(slices ...[]byte) []Block { + blocks := make([]Block, 0, len(slices)) + for _, s := range slices { + blocks = append(blocks, BlockFromSafeSlice(s)) + } + return blocks +} + +func TestFromIOReaderFullRead(t *testing.T) { + r := FromIOReader{bytes.NewBufferString("foobar")} + dsts := makeBlocks(make([]byte, 3), make([]byte, 3)) + n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts)) + if wantN := uint64(6); n != wantN || err != nil { + t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + for i, want := range [][]byte{[]byte("foo"), []byte("bar")} { + if got := dsts[i].ToSlice(); !bytes.Equal(got, want) { + t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want) + } + } +} + +type eofHidingReader struct { + Reader io.Reader +} + +func (r eofHidingReader) Read(dst []byte) (int, error) { + n, err := r.Reader.Read(dst) + if err == io.EOF { + return n, nil + } + return n, err +} + +func TestFromIOReaderPartialRead(t *testing.T) { + r := FromIOReader{eofHidingReader{bytes.NewBufferString("foob")}} + dsts := makeBlocks(make([]byte, 3), make([]byte, 3)) + n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts)) + // FromIOReader should stop after the eofHidingReader returns (1, nil) + // for a 3-byte read. 
+ if wantN := uint64(4); n != wantN || err != nil { + t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + for i, want := range [][]byte{[]byte("foo"), []byte("b\x00\x00")} { + if got := dsts[i].ToSlice(); !bytes.Equal(got, want) { + t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want) + } + } +} + +type singleByteReader struct { + Reader io.Reader +} + +func (r singleByteReader) Read(dst []byte) (int, error) { + if len(dst) == 0 { + return r.Reader.Read(dst) + } + return r.Reader.Read(dst[:1]) +} + +func TestSingleByteReader(t *testing.T) { + r := FromIOReader{singleByteReader{bytes.NewBufferString("foobar")}} + dsts := makeBlocks(make([]byte, 3), make([]byte, 3)) + n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts)) + // FromIOReader should stop after the singleByteReader returns (1, nil) + // for a 3-byte read. + if wantN := uint64(1); n != wantN || err != nil { + t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + for i, want := range [][]byte{[]byte("f\x00\x00"), []byte("\x00\x00\x00")} { + if got := dsts[i].ToSlice(); !bytes.Equal(got, want) { + t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want) + } + } +} + +func TestReadFullToBlocks(t *testing.T) { + r := FromIOReader{singleByteReader{bytes.NewBufferString("foobar")}} + dsts := makeBlocks(make([]byte, 3), make([]byte, 3)) + n, err := ReadFullToBlocks(r, BlockSeqFromSlice(dsts)) + // ReadFullToBlocks should call into FromIOReader => singleByteReader + // repeatedly until dsts is exhausted. 
+ if wantN := uint64(6); n != wantN || err != nil { + t.Errorf("ReadFullToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + for i, want := range [][]byte{[]byte("foo"), []byte("bar")} { + if got := dsts[i].ToSlice(); !bytes.Equal(got, want) { + t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want) + } + } +} + +func TestFromIOWriterFullWrite(t *testing.T) { + srcs := makeBlocks([]byte("foo"), []byte("bar")) + var dst bytes.Buffer + w := FromIOWriter{&dst} + n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs)) + if wantN := uint64(6); n != wantN || err != nil { + t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := dst.Bytes(), []byte("foobar"); !bytes.Equal(got, want) { + t.Errorf("dst: got %q, wanted %q", got, want) + } +} + +type limitedWriter struct { + Writer io.Writer + Done int + Limit int +} + +func (w *limitedWriter) Write(src []byte) (int, error) { + count := len(src) + if count > (w.Limit - w.Done) { + count = w.Limit - w.Done + } + n, err := w.Writer.Write(src[:count]) + w.Done += n + return n, err +} + +func TestFromIOWriterPartialWrite(t *testing.T) { + srcs := makeBlocks([]byte("foo"), []byte("bar")) + var dst bytes.Buffer + w := FromIOWriter{&limitedWriter{&dst, 0, 4}} + n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs)) + // FromIOWriter should stop after the limitedWriter returns (1, nil) for a + // 3-byte write. 
+ if wantN := uint64(4); n != wantN || err != nil { + t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := dst.Bytes(), []byte("foob"); !bytes.Equal(got, want) { + t.Errorf("dst: got %q, wanted %q", got, want) + } +} + +// singleByteWriter wraps an io.Writer and passes at most the first byte of +// each Write call to it. +type singleByteWriter struct { + Writer io.Writer +} + +func (w singleByteWriter) Write(src []byte) (int, error) { + if len(src) == 0 { + return w.Writer.Write(src) + } + return w.Writer.Write(src[:1]) +} + +func TestSingleByteWriter(t *testing.T) { + srcs := makeBlocks([]byte("foo"), []byte("bar")) + var dst bytes.Buffer + w := FromIOWriter{singleByteWriter{&dst}} + n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs)) + // FromIOWriter should stop after the singleByteWriter returns (1, nil) + // for a 3-byte write. + if wantN := uint64(1); n != wantN || err != nil { + t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := dst.Bytes(), []byte("f"); !bytes.Equal(got, want) { + t.Errorf("dst: got %q, wanted %q", got, want) + } +} + +func TestWriteFullToBlocks(t *testing.T) { + srcs := makeBlocks([]byte("foo"), []byte("bar")) + var dst bytes.Buffer + w := FromIOWriter{singleByteWriter{&dst}} + n, err := WriteFullFromBlocks(w, BlockSeqFromSlice(srcs)) + // WriteFullFromBlocks should call into FromIOWriter => singleByteWriter + // repeatedly until srcs is exhausted. + if wantN := uint64(6); n != wantN || err != nil { + t.Errorf("WriteFullFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := dst.Bytes(), []byte("foobar"); !bytes.Equal(got, want) { + t.Errorf("dst: got %q, wanted %q", got, want) + } +} diff --git a/pkg/safemem/safemem.go b/pkg/safemem/safemem.go new file mode 100644 index 000000000..3e70d33a2 --- /dev/null +++ b/pkg/safemem/safemem.go @@ -0,0 +1,16 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package safemem provides the Block and BlockSeq types. +package safemem diff --git a/pkg/safemem/seq_test.go b/pkg/safemem/seq_test.go new file mode 100644 index 000000000..eba4bb535 --- /dev/null +++ b/pkg/safemem/seq_test.go @@ -0,0 +1,196 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package safemem + +import ( + "bytes" + "reflect" + "testing" +) + +type blockSeqTest struct { + desc string + + pieces []string + haveOffset bool + offset uint64 + haveLimit bool + limit uint64 + + want string +} + +func (t blockSeqTest) NonEmptyByteSlices() [][]byte { + // t is a value, so we can mutate it freely. 
+ slices := make([][]byte, 0, len(t.pieces)) + for _, str := range t.pieces { + if t.haveOffset { + strOff := t.offset + if strOff > uint64(len(str)) { + strOff = uint64(len(str)) + } + str = str[strOff:] + t.offset -= strOff + } + if t.haveLimit { + strLim := t.limit + if strLim > uint64(len(str)) { + strLim = uint64(len(str)) + } + str = str[:strLim] + t.limit -= strLim + } + if len(str) != 0 { + slices = append(slices, []byte(str)) + } + } + return slices +} + +func (t blockSeqTest) BlockSeq() BlockSeq { + blocks := make([]Block, 0, len(t.pieces)) + for _, str := range t.pieces { + blocks = append(blocks, BlockFromSafeSlice([]byte(str))) + } + bs := BlockSeqFromSlice(blocks) + if t.haveOffset { + bs = bs.DropFirst64(t.offset) + } + if t.haveLimit { + bs = bs.TakeFirst64(t.limit) + } + return bs +} + +var blockSeqTests = []blockSeqTest{ + { + desc: "Empty sequence", + }, + { + desc: "Sequence of length 1", + pieces: []string{"foobar"}, + want: "foobar", + }, + { + desc: "Sequence of length 2", + pieces: []string{"foo", "bar"}, + want: "foobar", + }, + { + desc: "Empty Blocks", + pieces: []string{"", "foo", "", "", "bar", ""}, + want: "foobar", + }, + { + desc: "Sequence with non-zero offset", + pieces: []string{"foo", "bar"}, + haveOffset: true, + offset: 2, + want: "obar", + }, + { + desc: "Sequence with non-maximal limit", + pieces: []string{"foo", "bar"}, + haveLimit: true, + limit: 5, + want: "fooba", + }, + { + desc: "Sequence with offset and limit", + pieces: []string{"foo", "bar"}, + haveOffset: true, + offset: 2, + haveLimit: true, + limit: 3, + want: "oba", + }, +} + +func TestBlockSeqNumBytes(t *testing.T) { + for _, test := range blockSeqTests { + t.Run(test.desc, func(t *testing.T) { + if got, want := test.BlockSeq().NumBytes(), uint64(len(test.want)); got != want { + t.Errorf("NumBytes: got %d, wanted %d", got, want) + } + }) + } +} + +func TestBlockSeqIterBlocks(t *testing.T) { + // Tests BlockSeq iteration using Head/Tail. 
+ for _, test := range blockSeqTests { + t.Run(test.desc, func(t *testing.T) { + srcs := test.BlockSeq() + // "Note that a non-nil empty slice and a nil slice ... are not + // deeply equal." - reflect + slices := make([][]byte, 0, 0) + for !srcs.IsEmpty() { + src := srcs.Head() + slices = append(slices, src.ToSlice()) + nextSrcs := srcs.Tail() + if got, want := nextSrcs.NumBytes(), srcs.NumBytes()-uint64(src.Len()); got != want { + t.Fatalf("%v.Tail(): got %v (%d bytes), wanted %d bytes", srcs, nextSrcs, got, want) + } + srcs = nextSrcs + } + if wantSlices := test.NonEmptyByteSlices(); !reflect.DeepEqual(slices, wantSlices) { + t.Errorf("Accumulated slices: got %v, wanted %v", slices, wantSlices) + } + }) + } +} + +func TestBlockSeqIterBytes(t *testing.T) { + // Tests BlockSeq iteration using Head/DropFirst. + for _, test := range blockSeqTests { + t.Run(test.desc, func(t *testing.T) { + srcs := test.BlockSeq() + var dst bytes.Buffer + for !srcs.IsEmpty() { + src := srcs.Head() + var b [1]byte + n, err := Copy(BlockFromSafeSlice(b[:]), src) + if n != 1 || err != nil { + t.Fatalf("Copy: got (%v, %v), wanted (1, nil)", n, err) + } + dst.WriteByte(b[0]) + nextSrcs := srcs.DropFirst(1) + if got, want := nextSrcs.NumBytes(), srcs.NumBytes()-1; got != want { + t.Fatalf("%v.DropFirst(1): got %v (%d bytes), wanted %d bytes", srcs, nextSrcs, got, want) + } + srcs = nextSrcs + } + if got := string(dst.Bytes()); got != test.want { + t.Errorf("Copied string: got %q, wanted %q", got, test.want) + } + }) + } +} + +func TestBlockSeqDropBeyondLimit(t *testing.T) { + blocks := []Block{BlockFromSafeSlice([]byte("123")), BlockFromSafeSlice([]byte("4"))} + bs := BlockSeqFromSlice(blocks) + if got, want := bs.NumBytes(), uint64(4); got != want { + t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want) + } + bs = bs.TakeFirst(1) + if got, want := bs.NumBytes(), uint64(1); got != want { + t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want) + } + bs = bs.DropFirst(2) + if 
got, want := bs.NumBytes(), uint64(0); got != want { + t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want) + } +} diff --git a/pkg/safemem/seq_unsafe.go b/pkg/safemem/seq_unsafe.go new file mode 100644 index 000000000..354a95dde --- /dev/null +++ b/pkg/safemem/seq_unsafe.go @@ -0,0 +1,299 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package safemem + +import ( + "bytes" + "fmt" + "reflect" + "unsafe" +) + +// A BlockSeq represents a sequence of Blocks, each of which has non-zero +// length. +// +// BlockSeqs are immutable and may be copied by value. The zero value of +// BlockSeq represents an empty sequence. +type BlockSeq struct { + // If length is 0, then the BlockSeq is empty. Invariants: data == 0; + // offset == 0; limit == 0. + // + // If length is -1, then the BlockSeq represents the single Block{data, + // limit, false}. Invariants: offset == 0; limit > 0; limit does not + // overflow the range of an int. + // + // If length is -2, then the BlockSeq represents the single Block{data, + // limit, true}. Invariants: offset == 0; limit > 0; limit does not + // overflow the range of an int. + // + // Otherwise, length >= 2, and the BlockSeq represents the `length` Blocks + // in the array of Blocks starting at address `data`, starting at `offset` + // bytes into the first Block and limited to the following `limit` bytes. 
+ // Invariants: data != 0; offset < len(data[0]); limit > 0; offset+limit <= + // the combined length of all Blocks in the array; the first Block in the + // array has non-zero length. + // + // length is never 1; sequences consisting of a single Block are always + // stored inline (with length < 0). + data unsafe.Pointer + length int + offset int + limit uint64 +} + +// BlockSeqOf returns a BlockSeq representing the single Block b. +func BlockSeqOf(b Block) BlockSeq { + bs := BlockSeq{ + data: b.start, + length: -1, + limit: uint64(b.length), + } + if b.needSafecopy { + bs.length = -2 + } + return bs +} + +// BlockSeqFromSlice returns a BlockSeq representing all Blocks in slice. +// If slice contains Blocks with zero length, BlockSeq will skip them during +// iteration. +// +// Whether the returned BlockSeq shares memory with slice is unspecified; +// clients should avoid mutating slices passed to BlockSeqFromSlice. +// +// Preconditions: The combined length of all Blocks in slice <= math.MaxUint64. +func BlockSeqFromSlice(slice []Block) BlockSeq { + slice = skipEmpty(slice) + var limit uint64 + for _, b := range slice { + sum := limit + uint64(b.Len()) + if sum < limit { + panic("BlockSeq length overflows uint64") + } + limit = sum + } + return blockSeqFromSliceLimited(slice, limit) +} + +// Preconditions: limit <= the combined length of all Blocks in slice. If +// len(slice) != 0, the first Block in slice has non-zero length, and limit > +// 0. +func blockSeqFromSliceLimited(slice []Block, limit uint64) BlockSeq { + switch len(slice) { + case 0: + return BlockSeq{} + case 1: + return BlockSeqOf(slice[0].TakeFirst64(limit)) + default: + return BlockSeq{ + data: unsafe.Pointer(&slice[0]), + length: len(slice), + limit: limit, + } + } +} + +func skipEmpty(slice []Block) []Block { + for i, b := range slice { + if b.Len() != 0 { + return slice[i:] + } + } + return nil +} + +// IsEmpty returns true if bs contains no Blocks. 
+// +// Invariants: bs.IsEmpty() == (bs.NumBlocks() == 0) == (bs.NumBytes() == 0). +// (Of these, prefer to use bs.IsEmpty().) +func (bs BlockSeq) IsEmpty() bool { + return bs.length == 0 +} + +// NumBlocks returns the number of Blocks in bs. +func (bs BlockSeq) NumBlocks() int { + // In general, we have to count: if bs represents a windowed slice then the + // slice may contain Blocks with zero length, and bs.length may be larger + // than the actual number of Blocks due to bs.limit. + var n int + for !bs.IsEmpty() { + n++ + bs = bs.Tail() + } + return n +} + +// NumBytes returns the sum of Block.Len() for all Blocks in bs. +func (bs BlockSeq) NumBytes() uint64 { + return bs.limit +} + +// Head returns the first Block in bs. +// +// Preconditions: !bs.IsEmpty(). +func (bs BlockSeq) Head() Block { + if bs.length == 0 { + panic("empty BlockSeq") + } + if bs.length < 0 { + return bs.internalBlock() + } + return (*Block)(bs.data).DropFirst(bs.offset).TakeFirst64(bs.limit) +} + +// Preconditions: bs.length < 0. +func (bs BlockSeq) internalBlock() Block { + return Block{ + start: bs.data, + length: int(bs.limit), + needSafecopy: bs.length == -2, + } +} + +// Tail returns a BlockSeq consisting of all Blocks in bs after the first. +// +// Preconditions: !bs.IsEmpty(). +func (bs BlockSeq) Tail() BlockSeq { + if bs.length == 0 { + panic("empty BlockSeq") + } + if bs.length < 0 { + return BlockSeq{} + } + head := (*Block)(bs.data).DropFirst(bs.offset) + headLen := uint64(head.Len()) + if headLen >= bs.limit { + // The head Block exhausts the limit, so the tail is empty. 
+ return BlockSeq{} + } + var extSlice []Block + extSliceHdr := (*reflect.SliceHeader)(unsafe.Pointer(&extSlice)) + extSliceHdr.Data = uintptr(bs.data) + extSliceHdr.Len = bs.length + extSliceHdr.Cap = bs.length + tailSlice := skipEmpty(extSlice[1:]) + tailLimit := bs.limit - headLen + return blockSeqFromSliceLimited(tailSlice, tailLimit) +} + +// DropFirst returns a BlockSeq equivalent to bs, but with the first n bytes +// omitted. If n > bs.NumBytes(), DropFirst returns an empty BlockSeq. +// +// Preconditions: n >= 0. +func (bs BlockSeq) DropFirst(n int) BlockSeq { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + return bs.DropFirst64(uint64(n)) +} + +// DropFirst64 is equivalent to DropFirst but takes a uint64. +func (bs BlockSeq) DropFirst64(n uint64) BlockSeq { + if n >= bs.limit { + return BlockSeq{} + } + for { + // Calling bs.Head() here is surprisingly expensive, so inline getting + // the head's length. + var headLen uint64 + if bs.length < 0 { + headLen = bs.limit + } else { + headLen = uint64((*Block)(bs.data).Len() - bs.offset) + } + if n < headLen { + // Dropping ends partway through the head Block. + if bs.length < 0 { + return BlockSeqOf(bs.internalBlock().DropFirst64(n)) + } + bs.offset += int(n) + bs.limit -= n + return bs + } + n -= headLen + bs = bs.Tail() + } +} + +// TakeFirst returns a BlockSeq equivalent to the first n bytes of bs. If n > +// bs.NumBytes(), TakeFirst returns a BlockSeq equivalent to bs. +// +// Preconditions: n >= 0. +func (bs BlockSeq) TakeFirst(n int) BlockSeq { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + return bs.TakeFirst64(uint64(n)) +} + +// TakeFirst64 is equivalent to TakeFirst but takes a uint64. +func (bs BlockSeq) TakeFirst64(n uint64) BlockSeq { + if n == 0 { + return BlockSeq{} + } + if bs.limit > n { + bs.limit = n + } + return bs +} + +// String implements fmt.Stringer.String. 
+func (bs BlockSeq) String() string { + var buf bytes.Buffer + buf.WriteByte('[') + var sep string + for !bs.IsEmpty() { + buf.WriteString(sep) + sep = " " + buf.WriteString(bs.Head().String()) + bs = bs.Tail() + } + buf.WriteByte(']') + return buf.String() +} + +// CopySeq copies srcs.NumBytes() or dsts.NumBytes() bytes, whichever is less, +// from srcs to dsts and returns the number of bytes copied. +// +// If srcs and dsts overlap, the data stored in dsts is unspecified. +func CopySeq(dsts, srcs BlockSeq) (uint64, error) { + var done uint64 + for !dsts.IsEmpty() && !srcs.IsEmpty() { + dst := dsts.Head() + src := srcs.Head() + n, err := Copy(dst, src) + done += uint64(n) + if err != nil { + return done, err + } + dsts = dsts.DropFirst(n) + srcs = srcs.DropFirst(n) + } + return done, nil +} + +// ZeroSeq sets all bytes in dsts to 0 and returns the number of bytes zeroed. +func ZeroSeq(dsts BlockSeq) (uint64, error) { + var done uint64 + for !dsts.IsEmpty() { + n, err := Zero(dsts.Head()) + done += uint64(n) + if err != nil { + return done, err + } + dsts = dsts.DropFirst(n) + } + return done, nil +} diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 51ca09b24..34c0a867d 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -30,13 +30,13 @@ go_library( ":registers_go_proto", "//pkg/abi/linux", "//pkg/binary", + "//pkg/context", "//pkg/cpuid", "//pkg/log", - "//pkg/sentry/context", "//pkg/sentry/limits", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index 81ec98a77..1d11cc472 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -24,7 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // Arch describes an architecture. 
diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go index ea4dedbdf..3b6987665 100644 --- a/pkg/sentry/arch/arch_aarch64.go +++ b/pkg/sentry/arch/arch_aarch64.go @@ -25,8 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 2aa08b1a9..85d6acc0f 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -25,7 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // Host specifies the host architecture. diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go index 0d5b7d317..94f1a808f 100644 --- a/pkg/sentry/arch/arch_arm64.go +++ b/pkg/sentry/arch/arch_arm64.go @@ -21,7 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // Host specifies the host architecture. 
diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index 84f11b0d1..d388ee9cf 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -21,7 +21,7 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/cpuid" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // ErrFloatingPoint indicates a failed restore due to unusable floating point diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index 9f41e566f..a18093155 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -25,9 +25,9 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // System-related constants for x86. diff --git a/pkg/sentry/arch/auxv.go b/pkg/sentry/arch/auxv.go index 4546b2ef9..2b4c8f3fc 100644 --- a/pkg/sentry/arch/auxv.go +++ b/pkg/sentry/arch/auxv.go @@ -15,7 +15,7 @@ package arch import ( - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // An AuxEntry represents an entry in an ELF auxiliary vector. 
diff --git a/pkg/sentry/arch/signal.go b/pkg/sentry/arch/signal.go index 402e46025..8b03d0187 100644 --- a/pkg/sentry/arch/signal.go +++ b/pkg/sentry/arch/signal.go @@ -16,7 +16,7 @@ package arch import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // SignalAct represents the action that should be taken when a signal is diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index 1e4f9c3c2..81b92bb43 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -23,7 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // SignalContext64 is equivalent to struct sigcontext, the type passed as the diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go index 7d0e98935..4f4cc46a8 100644 --- a/pkg/sentry/arch/signal_arm64.go +++ b/pkg/sentry/arch/signal_arm64.go @@ -19,7 +19,7 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // SignalContext64 is equivalent to struct sigcontext, the type passed as the diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go index d324da705..1a6056171 100644 --- a/pkg/sentry/arch/signal_stack.go +++ b/pkg/sentry/arch/signal_stack.go @@ -17,7 +17,7 @@ package arch import ( - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go index 7472c3c61..09bceabc9 100644 --- a/pkg/sentry/arch/stack.go +++ b/pkg/sentry/arch/stack.go @@ -18,8 +18,8 @@ import ( "encoding/binary" "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/usermem" ) // Stack is a simple wrapper around a usermem.IO and an address. 
diff --git a/pkg/sentry/context/BUILD b/pkg/sentry/context/BUILD deleted file mode 100644 index e13a9ce20..000000000 --- a/pkg/sentry/context/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "context", - srcs = ["context.go"], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/amutex", - "//pkg/log", - ], -) diff --git a/pkg/sentry/context/context.go b/pkg/sentry/context/context.go deleted file mode 100644 index 23e009ef3..000000000 --- a/pkg/sentry/context/context.go +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package context defines an internal context type. -// -// The given Context conforms to the standard Go context, but mandates -// additional methods that are specific to the kernel internals. Note however, -// that the Context described by this package carries additional constraints -// regarding concurrent access and retaining beyond the scope of a call. -// -// See the Context type for complete details. -package context - -import ( - "context" - "time" - - "gvisor.dev/gvisor/pkg/amutex" - "gvisor.dev/gvisor/pkg/log" -) - -type contextID int - -// Globally accessible values from a context. These keys are defined in the -// context package to resolve dependency cycles by not requiring the caller to -// import packages usually required to get these information. 
-const ( - // CtxThreadGroupID is the current thread group ID when a context represents - // a task context. The value is represented as an int32. - CtxThreadGroupID contextID = iota -) - -// ThreadGroupIDFromContext returns the current thread group ID when ctx -// represents a task context. -func ThreadGroupIDFromContext(ctx Context) (tgid int32, ok bool) { - if tgid := ctx.Value(CtxThreadGroupID); tgid != nil { - return tgid.(int32), true - } - return 0, false -} - -// A Context represents a thread of execution (hereafter "goroutine" to reflect -// Go idiosyncrasy). It carries state associated with the goroutine across API -// boundaries. -// -// While Context exists for essentially the same reasons as Go's standard -// context.Context, the standard type represents the state of an operation -// rather than that of a goroutine. This is a critical distinction: -// -// - Unlike context.Context, which "may be passed to functions running in -// different goroutines", it is *not safe* to use the same Context in multiple -// concurrent goroutines. -// -// - It is *not safe* to retain a Context passed to a function beyond the scope -// of that function call. -// -// In both cases, values extracted from the Context should be used instead. -type Context interface { - log.Logger - amutex.Sleeper - context.Context - - // UninterruptibleSleepStart indicates the beginning of an uninterruptible - // sleep state (equivalent to Linux's TASK_UNINTERRUPTIBLE). If deactivate - // is true and the Context represents a Task, the Task's AddressSpace is - // deactivated. - UninterruptibleSleepStart(deactivate bool) - - // UninterruptibleSleepFinish indicates the end of an uninterruptible sleep - // state that was begun by a previous call to UninterruptibleSleepStart. If - // activate is true and the Context represents a Task, the Task's - // AddressSpace is activated. Normally activate is the same value as the - // deactivate parameter passed to UninterruptibleSleepStart. 
- UninterruptibleSleepFinish(activate bool) -} - -// NoopSleeper is a noop implementation of amutex.Sleeper and UninterruptibleSleep -// methods for anonymous embedding in other types that do not implement sleeps. -type NoopSleeper struct { - amutex.NoopSleeper -} - -// UninterruptibleSleepStart does nothing. -func (NoopSleeper) UninterruptibleSleepStart(bool) {} - -// UninterruptibleSleepFinish does nothing. -func (NoopSleeper) UninterruptibleSleepFinish(bool) {} - -// Deadline returns zero values, meaning no deadline. -func (NoopSleeper) Deadline() (time.Time, bool) { - return time.Time{}, false -} - -// Done returns nil. -func (NoopSleeper) Done() <-chan struct{} { - return nil -} - -// Err returns nil. -func (NoopSleeper) Err() error { - return nil -} - -// logContext implements basic logging. -type logContext struct { - log.Logger - NoopSleeper -} - -// Value implements Context.Value. -func (logContext) Value(key interface{}) interface{} { - return nil -} - -// bgContext is the context returned by context.Background. -var bgContext = &logContext{Logger: log.Log()} - -// Background returns an empty context using the default logger. -// -// Users should be wary of using a Background context. Please tag any use with -// FIXME(b/38173783) and a note to remove this use. -// -// Generally, one should use the Task as their context when available, or avoid -// having to use a context in places where a Task is unavailable. -// -// Using a Background context for tests is fine, as long as no values are -// needed from the context in the tested code paths. 
-func Background() Context { - return bgContext -} diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD deleted file mode 100644 index f91a6d4ed..000000000 --- a/pkg/sentry/context/contexttest/BUILD +++ /dev/null @@ -1,21 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "contexttest", - testonly = 1, - srcs = ["contexttest.go"], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/memutil", - "//pkg/sentry/context", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/time", - "//pkg/sentry/limits", - "//pkg/sentry/pgalloc", - "//pkg/sentry/platform", - "//pkg/sentry/platform/ptrace", - "//pkg/sentry/uniqueid", - ], -) diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go deleted file mode 100644 index 15cf086a9..000000000 --- a/pkg/sentry/context/contexttest/contexttest.go +++ /dev/null @@ -1,188 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package contexttest builds a test context.Context. 
-package contexttest - -import ( - "os" - "sync/atomic" - "testing" - "time" - - "gvisor.dev/gvisor/pkg/memutil" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" - "gvisor.dev/gvisor/pkg/sentry/uniqueid" -) - -// Context returns a Context that may be used in tests. Uses ptrace as the -// platform.Platform. -// -// Note that some filesystems may require a minimal kernel for testing, which -// this test context does not provide. For such tests, see kernel/contexttest. -func Context(tb testing.TB) context.Context { - const memfileName = "contexttest-memory" - memfd, err := memutil.CreateMemFD(memfileName, 0) - if err != nil { - tb.Fatalf("error creating application memory file: %v", err) - } - memfile := os.NewFile(uintptr(memfd), memfileName) - mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) - if err != nil { - memfile.Close() - tb.Fatalf("error creating pgalloc.MemoryFile: %v", err) - } - p, err := ptrace.New() - if err != nil { - tb.Fatal(err) - } - // Test usage of context.Background is fine. - return &TestContext{ - Context: context.Background(), - l: limits.NewLimitSet(), - mf: mf, - platform: p, - creds: auth.NewAnonymousCredentials(), - otherValues: make(map[interface{}]interface{}), - } -} - -// TestContext represents a context with minimal functionality suitable for -// running tests. -type TestContext struct { - context.Context - l *limits.LimitSet - mf *pgalloc.MemoryFile - platform platform.Platform - creds *auth.Credentials - otherValues map[interface{}]interface{} -} - -// globalUniqueID tracks incremental unique identifiers for tests. -var globalUniqueID uint64 - -// globalUniqueIDProvider implements unix.UniqueIDProvider. 
-type globalUniqueIDProvider struct{} - -// UniqueID implements unix.UniqueIDProvider.UniqueID. -func (*globalUniqueIDProvider) UniqueID() uint64 { - return atomic.AddUint64(&globalUniqueID, 1) -} - -// lastInotifyCookie is a monotonically increasing counter for generating unique -// inotify cookies. Must be accessed using atomic ops. -var lastInotifyCookie uint32 - -// hostClock implements ktime.Clock. -type hostClock struct { - ktime.WallRateClock - ktime.NoClockEvents -} - -// Now implements ktime.Clock.Now. -func (hostClock) Now() ktime.Time { - return ktime.FromNanoseconds(time.Now().UnixNano()) -} - -// RegisterValue registers additional values with this test context. Useful for -// providing values from external packages that contexttest can't depend on. -func (t *TestContext) RegisterValue(key, value interface{}) { - t.otherValues[key] = value -} - -// Value implements context.Context. -func (t *TestContext) Value(key interface{}) interface{} { - switch key { - case auth.CtxCredentials: - return t.creds - case limits.CtxLimits: - return t.l - case pgalloc.CtxMemoryFile: - return t.mf - case pgalloc.CtxMemoryFileProvider: - return t - case platform.CtxPlatform: - return t.platform - case uniqueid.CtxGlobalUniqueID: - return (*globalUniqueIDProvider).UniqueID(nil) - case uniqueid.CtxGlobalUniqueIDProvider: - return &globalUniqueIDProvider{} - case uniqueid.CtxInotifyCookie: - return atomic.AddUint32(&lastInotifyCookie, 1) - case ktime.CtxRealtimeClock: - return hostClock{} - default: - if val, ok := t.otherValues[key]; ok { - return val - } - return t.Context.Value(key) - } -} - -// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile. -func (t *TestContext) MemoryFile() *pgalloc.MemoryFile { - return t.mf -} - -// RootContext returns a Context that may be used in tests that need root -// credentials. Uses ptrace as the platform.Platform. 
-func RootContext(tb testing.TB) context.Context { - return WithCreds(Context(tb), auth.NewRootCredentials(auth.NewRootUserNamespace())) -} - -// WithCreds returns a copy of ctx carrying creds. -func WithCreds(ctx context.Context, creds *auth.Credentials) context.Context { - return &authContext{ctx, creds} -} - -type authContext struct { - context.Context - creds *auth.Credentials -} - -// Value implements context.Context. -func (ac *authContext) Value(key interface{}) interface{} { - switch key { - case auth.CtxCredentials: - return ac.creds - default: - return ac.Context.Value(key) - } -} - -// WithLimitSet returns a copy of ctx carrying l. -func WithLimitSet(ctx context.Context, l *limits.LimitSet) context.Context { - return limitContext{ctx, l} -} - -type limitContext struct { - context.Context - l *limits.LimitSet -} - -// Value implements context.Context. -func (lc limitContext) Value(key interface{}) interface{} { - switch key { - case limits.CtxLimits: - return lc.l - default: - return lc.Context.Value(key) - } -} diff --git a/pkg/sentry/contexttest/BUILD b/pkg/sentry/contexttest/BUILD new file mode 100644 index 000000000..6f4c86684 --- /dev/null +++ b/pkg/sentry/contexttest/BUILD @@ -0,0 +1,21 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "contexttest", + testonly = 1, + srcs = ["contexttest.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/context", + "//pkg/memutil", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/platform/ptrace", + "//pkg/sentry/uniqueid", + ], +) diff --git a/pkg/sentry/contexttest/contexttest.go b/pkg/sentry/contexttest/contexttest.go new file mode 100644 index 000000000..031fc64ec --- /dev/null +++ b/pkg/sentry/contexttest/contexttest.go @@ -0,0 +1,188 @@ +// Copyright 2018 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package contexttest builds a test context.Context. +package contexttest + +import ( + "os" + "sync/atomic" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" +) + +// Context returns a Context that may be used in tests. Uses ptrace as the +// platform.Platform. +// +// Note that some filesystems may require a minimal kernel for testing, which +// this test context does not provide. For such tests, see kernel/contexttest. +func Context(tb testing.TB) context.Context { + const memfileName = "contexttest-memory" + memfd, err := memutil.CreateMemFD(memfileName, 0) + if err != nil { + tb.Fatalf("error creating application memory file: %v", err) + } + memfile := os.NewFile(uintptr(memfd), memfileName) + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) + if err != nil { + memfile.Close() + tb.Fatalf("error creating pgalloc.MemoryFile: %v", err) + } + p, err := ptrace.New() + if err != nil { + tb.Fatal(err) + } + // Test usage of context.Background is fine. 
+ return &TestContext{ + Context: context.Background(), + l: limits.NewLimitSet(), + mf: mf, + platform: p, + creds: auth.NewAnonymousCredentials(), + otherValues: make(map[interface{}]interface{}), + } +} + +// TestContext represents a context with minimal functionality suitable for +// running tests. +type TestContext struct { + context.Context + l *limits.LimitSet + mf *pgalloc.MemoryFile + platform platform.Platform + creds *auth.Credentials + otherValues map[interface{}]interface{} +} + +// globalUniqueID tracks incremental unique identifiers for tests. +var globalUniqueID uint64 + +// globalUniqueIDProvider implements unix.UniqueIDProvider. +type globalUniqueIDProvider struct{} + +// UniqueID implements unix.UniqueIDProvider.UniqueID. +func (*globalUniqueIDProvider) UniqueID() uint64 { + return atomic.AddUint64(&globalUniqueID, 1) +} + +// lastInotifyCookie is a monotonically increasing counter for generating unique +// inotify cookies. Must be accessed using atomic ops. +var lastInotifyCookie uint32 + +// hostClock implements ktime.Clock. +type hostClock struct { + ktime.WallRateClock + ktime.NoClockEvents +} + +// Now implements ktime.Clock.Now. +func (hostClock) Now() ktime.Time { + return ktime.FromNanoseconds(time.Now().UnixNano()) +} + +// RegisterValue registers additional values with this test context. Useful for +// providing values from external packages that contexttest can't depend on. +func (t *TestContext) RegisterValue(key, value interface{}) { + t.otherValues[key] = value +} + +// Value implements context.Context. 
+func (t *TestContext) Value(key interface{}) interface{} { + switch key { + case auth.CtxCredentials: + return t.creds + case limits.CtxLimits: + return t.l + case pgalloc.CtxMemoryFile: + return t.mf + case pgalloc.CtxMemoryFileProvider: + return t + case platform.CtxPlatform: + return t.platform + case uniqueid.CtxGlobalUniqueID: + return (*globalUniqueIDProvider).UniqueID(nil) + case uniqueid.CtxGlobalUniqueIDProvider: + return &globalUniqueIDProvider{} + case uniqueid.CtxInotifyCookie: + return atomic.AddUint32(&lastInotifyCookie, 1) + case ktime.CtxRealtimeClock: + return hostClock{} + default: + if val, ok := t.otherValues[key]; ok { + return val + } + return t.Context.Value(key) + } +} + +// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile. +func (t *TestContext) MemoryFile() *pgalloc.MemoryFile { + return t.mf +} + +// RootContext returns a Context that may be used in tests that need root +// credentials. Uses ptrace as the platform.Platform. +func RootContext(tb testing.TB) context.Context { + return WithCreds(Context(tb), auth.NewRootCredentials(auth.NewRootUserNamespace())) +} + +// WithCreds returns a copy of ctx carrying creds. +func WithCreds(ctx context.Context, creds *auth.Credentials) context.Context { + return &authContext{ctx, creds} +} + +type authContext struct { + context.Context + creds *auth.Credentials +} + +// Value implements context.Context. +func (ac *authContext) Value(key interface{}) interface{} { + switch key { + case auth.CtxCredentials: + return ac.creds + default: + return ac.Context.Value(key) + } +} + +// WithLimitSet returns a copy of ctx carrying l. +func WithLimitSet(ctx context.Context, l *limits.LimitSet) context.Context { + return limitContext{ctx, l} +} + +type limitContext struct { + context.Context + l *limits.LimitSet +} + +// Value implements context.Context. 
+func (lc limitContext) Value(key interface{}) interface{} { + switch key { + case limits.CtxLimits: + return lc.l + default: + return lc.Context.Value(key) + } +} diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 605d61dbe..ea85ab33c 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -47,13 +47,13 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/amutex", + "//pkg/context", "//pkg/log", "//pkg/metric", "//pkg/p9", "//pkg/refs", "//pkg/secio", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", @@ -64,10 +64,10 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/state", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -107,14 +107,14 @@ go_test( ], deps = [ ":fs", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/ramfs", "//pkg/sentry/fs/tmpfs", "//pkg/sentry/kernel/contexttest", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) @@ -129,7 +129,7 @@ go_test( ], library = ":fs", deps = [ - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/sentry/contexttest", ], ) diff --git a/pkg/sentry/fs/anon/BUILD b/pkg/sentry/fs/anon/BUILD index c14e5405e..aedcecfa1 100644 --- a/pkg/sentry/fs/anon/BUILD +++ b/pkg/sentry/fs/anon/BUILD @@ -11,10 +11,10 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/anon/anon.go b/pkg/sentry/fs/anon/anon.go index 7323c7222..5c421f5fb 100644 --- a/pkg/sentry/fs/anon/anon.go +++ b/pkg/sentry/fs/anon/anon.go @@ -18,10 +18,10 @@ package anon import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + 
"gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // NewInode constructs an anonymous Inode that is not associated diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go index 4f3d6410e..fa9e7d517 100644 --- a/pkg/sentry/fs/attr.go +++ b/pkg/sentry/fs/attr.go @@ -20,8 +20,8 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" ) diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go index dd427de5d..0fbd60056 100644 --- a/pkg/sentry/fs/context.go +++ b/pkg/sentry/fs/context.go @@ -16,7 +16,7 @@ package fs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index e03e3e417..f6c79e51b 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -19,12 +19,12 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // copyUp copies a file in an overlay from a lower filesystem to an diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index 738580c5f..91792d9fe 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -24,8 +24,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" 
+ "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index 0c7247bd7..4c4b7d5cc 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -16,8 +16,9 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/rand", - "//pkg/sentry/context", + "//pkg/safemem", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", @@ -26,9 +27,8 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/mm", "//pkg/sentry/pgalloc", - "//pkg/sentry/safemem", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index f739c476c..35bd23991 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -18,11 +18,11 @@ package dev import ( "math" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // Memory device numbers are from Linux's drivers/char/mem.c diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go index 55f8af704..5e518fb63 100644 --- a/pkg/sentry/fs/dev/fs.go +++ b/pkg/sentry/fs/dev/fs.go @@ -15,7 +15,7 @@ package dev import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go index 07e0ea010..deb9c6ad8 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -16,11 +16,11 @@ package dev import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" 
"gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go index 4404b97ef..aec33d0d9 100644 --- a/pkg/sentry/fs/dev/null.go +++ b/pkg/sentry/fs/dev/null.go @@ -16,7 +16,7 @@ package dev import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go index 49cb92f6e..2a9bbeb18 100644 --- a/pkg/sentry/fs/dev/random.go +++ b/pkg/sentry/fs/dev/random.go @@ -16,12 +16,12 @@ package dev import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/rand" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/dev/tty.go b/pkg/sentry/fs/dev/tty.go index 87d80e292..760ca563d 100644 --- a/pkg/sentry/fs/dev/tty.go +++ b/pkg/sentry/fs/dev/tty.go @@ -16,7 +16,7 @@ package dev import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/waiter" diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 31fc4d87b..acab0411a 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -22,8 +22,8 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/uniqueid" diff --git a/pkg/sentry/fs/dirent_refs_test.go 
b/pkg/sentry/fs/dirent_refs_test.go index 47bc72a88..98d69c6f2 100644 --- a/pkg/sentry/fs/dirent_refs_test.go +++ b/pkg/sentry/fs/dirent_refs_test.go @@ -18,8 +18,8 @@ import ( "syscall" "testing" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" ) func newMockDirInode(ctx context.Context, cache *DirentCache) *Inode { diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD index 25ef96299..1d09e983c 100644 --- a/pkg/sentry/fs/fdpipe/BUILD +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -12,17 +12,17 @@ go_library( imports = ["gvisor.dev/gvisor/pkg/sentry/fs"], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/context", "//pkg/fd", "//pkg/fdnotifier", "//pkg/log", + "//pkg/safemem", "//pkg/secio", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/safemem", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -36,13 +36,13 @@ go_test( ], library = ":fdpipe", deps = [ + "//pkg/context", "//pkg/fd", "//pkg/fdnotifier", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", "@com_github_google_uuid//:go_default_library", ], ) diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 5b6cfeb0a..9fce177ad 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -19,17 +19,17 @@ import ( "os" "syscall" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/secio" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" 
"gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/fdpipe/pipe_opener.go b/pkg/sentry/fs/fdpipe/pipe_opener.go index 64b558975..0c3595998 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener.go @@ -20,8 +20,8 @@ import ( "syscall" "time" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go index 577445148..e556da48a 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go @@ -26,12 +26,12 @@ import ( "github.com/google/uuid" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) type hostOpener struct { diff --git a/pkg/sentry/fs/fdpipe/pipe_state.go b/pkg/sentry/fs/fdpipe/pipe_state.go index cee87f726..af8230a7d 100644 --- a/pkg/sentry/fs/fdpipe/pipe_state.go +++ b/pkg/sentry/fs/fdpipe/pipe_state.go @@ -18,7 +18,7 @@ import ( "fmt" "io/ioutil" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sync" ) diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go index 69abc1e71..5aff0cc95 100644 --- a/pkg/sentry/fs/fdpipe/pipe_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -23,10 +23,10 @@ import ( "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdnotifier" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" 
"gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) func singlePipeFD() (int, error) { diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 7c4586296..ca3466f4f 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -20,16 +20,16 @@ import ( "time" "gvisor.dev/gvisor/pkg/amutex" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/uniqueid" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index b88303f17..beba0f771 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -17,10 +17,10 @@ package fs import ( "io" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 8991207b4..dcc1df38f 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -17,13 +17,13 @@ package fs import ( "io" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/file_overlay_test.go 
b/pkg/sentry/fs/file_overlay_test.go index 2fb824d5c..02538bb4f 100644 --- a/pkg/sentry/fs/file_overlay_test.go +++ b/pkg/sentry/fs/file_overlay_test.go @@ -18,7 +18,7 @@ import ( "reflect" "testing" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index c5b51620a..084da2a8d 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -19,7 +19,7 @@ import ( "sort" "strings" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) diff --git a/pkg/sentry/fs/filetest/BUILD b/pkg/sentry/fs/filetest/BUILD index 9a7608cae..a8000e010 100644 --- a/pkg/sentry/fs/filetest/BUILD +++ b/pkg/sentry/fs/filetest/BUILD @@ -8,12 +8,12 @@ go_library( srcs = ["filetest.go"], visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/usermem", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/filetest/filetest.go b/pkg/sentry/fs/filetest/filetest.go index 22270a494..8049538f2 100644 --- a/pkg/sentry/fs/filetest/filetest.go +++ b/pkg/sentry/fs/filetest/filetest.go @@ -19,12 +19,12 @@ import ( "fmt" "testing" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index 26abf49e2..bdba6efe5 100644 --- 
a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -54,8 +54,8 @@ package fs import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sync" ) diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 9142f5bdf..4ab2a384f 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -77,22 +77,22 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/log", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", - "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/state", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -106,13 +106,13 @@ go_test( ], library = ":fsutil", deps = [ - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/safemem", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", - "//pkg/sentry/safemem", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go index 12132680b..c6cd45087 100644 --- a/pkg/sentry/fs/fsutil/dirty_set.go +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -17,11 +17,11 @@ package fsutil import ( "math" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // DirtySet maps offsets into a memmap.Mappable to DirtyInfo. 
It is used to diff --git a/pkg/sentry/fs/fsutil/dirty_set_test.go b/pkg/sentry/fs/fsutil/dirty_set_test.go index 75575d994..e3579c23c 100644 --- a/pkg/sentry/fs/fsutil/dirty_set_test.go +++ b/pkg/sentry/fs/fsutil/dirty_set_test.go @@ -19,7 +19,7 @@ import ( "testing" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) func TestDirtySet(t *testing.T) { diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index fc5b3b1a1..08695391c 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -17,12 +17,12 @@ package fsutil import ( "io" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go index f52d712e3..5643cdac9 100644 --- a/pkg/sentry/fs/fsutil/file_range_set.go +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -19,13 +19,13 @@ import ( "io" "math" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // FileRangeSet maps offsets into a memmap.Mappable to offsets into a diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index 837fc70b5..67278aa86 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -19,11 +19,11 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/log" + 
"gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // HostFileMapper caches mappings of an arbitrary host file descriptor. It is diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go index ad11a0573..2d4778d64 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go @@ -17,7 +17,7 @@ package fsutil import ( "unsafe" - "gvisor.dev/gvisor/pkg/sentry/safemem" + "gvisor.dev/gvisor/pkg/safemem" ) func (*HostFileMapper) unsafeBlockFromChunkMapping(addr uintptr) safemem.Block { diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go index a625f0e26..78fec553e 100644 --- a/pkg/sentry/fs/fsutil/host_mappable.go +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -17,13 +17,13 @@ package fsutil import ( "math" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // HostMappable implements memmap.Mappable and platform.File over a diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index df7b74855..252830572 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -16,7 +16,7 @@ package fsutil import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" diff --git a/pkg/sentry/fs/fsutil/inode_cached.go 
b/pkg/sentry/fs/fsutil/inode_cached.go index 20a014402..573b8586e 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -18,18 +18,18 @@ import ( "fmt" "io" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/time" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // Lock order (compare the lock order model in mm/mm.go): diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index 129f314c8..1547584c5 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -19,14 +19,14 @@ import ( "io" "testing" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) type noopBackingFile struct{} diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index cf48e7c03..971d3718e 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -24,13 +24,14 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fd", "//pkg/log", "//pkg/metric", "//pkg/p9", "//pkg/refs", + "//pkg/safemem", "//pkg/secio", - 
"//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fdpipe", @@ -39,13 +40,12 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", - "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/unet", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -56,10 +56,10 @@ go_test( srcs = ["gofer_test.go"], library = ":gofer", deps = [ + "//pkg/context", "//pkg/p9", "//pkg/p9/p9test", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", ], ) diff --git a/pkg/sentry/fs/gofer/attr.go b/pkg/sentry/fs/gofer/attr.go index 4848e2374..71cccdc34 100644 --- a/pkg/sentry/fs/gofer/attr.go +++ b/pkg/sentry/fs/gofer/attr.go @@ -17,12 +17,12 @@ package gofer import ( "syscall" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // getattr returns the 9p attributes of the p9.File. 
On success, Mode, Size, and RDev diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index cc11c6339..ebea03c42 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -17,7 +17,7 @@ package gofer import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go index 2125dafef..3da818aed 100644 --- a/pkg/sentry/fs/gofer/context_file.go +++ b/pkg/sentry/fs/gofer/context_file.go @@ -15,9 +15,9 @@ package gofer import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" ) // contextFile is a wrapper around p9.File that notifies the context that diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 7960b9c7b..23296f246 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -19,16 +19,16 @@ import ( "syscall" "time" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go index bb8312849..ff96b28ba 100644 --- a/pkg/sentry/fs/gofer/file_state.go +++ b/pkg/sentry/fs/gofer/file_state.go @@ -17,7 +17,7 @@ package gofer import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index cf96dd9fa..9d41fcbdb 100644 --- 
a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -20,8 +20,8 @@ import ( "fmt" "strconv" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 7fc3c32ae..0c2f89ae8 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -20,10 +20,10 @@ import ( "testing" "time" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/p9/p9test" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index b86c49b39..9f7c3e89f 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -17,14 +17,14 @@ package gofer import ( "io" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/secio" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/safemem" ) // handles are the open handles of a gofer file. 
They are reference counted to diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 98d1a8a48..ac28174d2 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -19,17 +19,17 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fdpipe" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index 0b2eedb7c..238f7804c 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -20,8 +20,8 @@ import ( "path/filepath" "strings" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/time" diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index c09f3b71c..0c1be05ef 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -18,9 +18,9 @@ import ( "fmt" "syscall" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index edc796ce0..498c4645a 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -17,9 +17,9 @@ package gofer import ( "fmt" + 
"gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index d045e04ff..0285c5361 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -17,8 +17,8 @@ package gofer import ( "fmt" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/unet" ) diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index a45a8f36c..376cfce2c 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -16,9 +16,9 @@ package gofer import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" diff --git a/pkg/sentry/fs/gofer/util.go b/pkg/sentry/fs/gofer/util.go index 848e6812b..2d8d3a2ea 100644 --- a/pkg/sentry/fs/gofer/util.go +++ b/pkg/sentry/fs/gofer/util.go @@ -17,8 +17,8 @@ package gofer import ( "syscall" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index f586f47c1..21003ea45 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -27,13 +27,14 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fd", "//pkg/fdnotifier", "//pkg/log", "//pkg/refs", + "//pkg/safemem", "//pkg/secio", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", 
"//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", @@ -41,18 +42,17 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", - "//pkg/sentry/safemem", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", "//pkg/sentry/uniqueid", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", "//pkg/unet", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -69,17 +69,17 @@ go_test( ], library = ":host", deps = [ + "//pkg/context", "//pkg/fd", "//pkg/fdnotifier", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/kernel/time", "//pkg/sentry/socket", "//pkg/sentry/socket/unix/transport", - "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/tcpip", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index 5532ff5a0..1658979fc 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -17,7 +17,7 @@ package host import ( "syscall" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index f6c626f2c..e08f56d04 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -18,17 +18,17 @@ import ( "fmt" "syscall" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/secio" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + 
"gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index 68d2697c0..d3e8e3a36 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -23,8 +23,8 @@ import ( "strconv" "strings" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go index c6852ee30..3111d2df9 100644 --- a/pkg/sentry/fs/host/fs_test.go +++ b/pkg/sentry/fs/host/fs_test.go @@ -23,8 +23,8 @@ import ( "sort" "testing" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 873a1c52d..6fa39caab 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -18,14 +18,14 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/secio" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go index b267ec305..299e0e0b0 100644 --- a/pkg/sentry/fs/host/inode_state.go +++ b/pkg/sentry/fs/host/inode_state.go @@ -18,7 +18,7 @@ import ( "fmt" "syscall" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/host/inode_test.go 
b/pkg/sentry/fs/host/inode_test.go index 2d959f10d..7221bc825 100644 --- a/pkg/sentry/fs/host/inode_test.go +++ b/pkg/sentry/fs/host/inode_test.go @@ -21,7 +21,7 @@ import ( "syscall" "testing" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index c076d5bdd..06fc2d80a 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -19,11 +19,11 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/socket/control" unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix" diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index 68b38fd1c..eb4afe520 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -21,13 +21,13 @@ import ( "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdnotifier" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index 753ef8cd6..3f218b4a7 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -16,14 +16,14 @@ package host import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" 
"gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/unimpl" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // TTYFileOperations implements fs.FileOperations for a host file descriptor diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go index 88d24d693..d49c3a635 100644 --- a/pkg/sentry/fs/host/wait_test.go +++ b/pkg/sentry/fs/host/wait_test.go @@ -19,7 +19,7 @@ import ( "testing" "time" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index e4cf5a570..b66c091ab 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -16,10 +16,10 @@ package fs import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index 13261cb81..70f2eae96 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -17,7 +17,7 @@ package fs import ( "errors" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index c477de837..4729b4aac 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -19,8 +19,8 @@ import ( "strings" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - 
"gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index 493d98c36..389c219d6 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -17,7 +17,7 @@ package fs_test import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index cc7dd1c92..928c90aa0 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -19,13 +19,13 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/uniqueid" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/inotify_event.go b/pkg/sentry/fs/inotify_event.go index 9f70a3e82..686e1b1cd 100644 --- a/pkg/sentry/fs/inotify_event.go +++ b/pkg/sentry/fs/inotify_event.go @@ -18,8 +18,8 @@ import ( "bytes" "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/usermem" ) // inotifyEventBaseSize is the base size of linux's struct inotify_event. 
This diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index 7a24c6f1b..1d6ea5736 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -15,7 +15,7 @@ package fs import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 7a9692800..37bae6810 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -19,8 +19,8 @@ import ( "fmt" "sync/atomic" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" ) // DirentOperations provide file systems greater control over how long a Dirent diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index 299712cd7..78e35b1e6 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -15,7 +15,7 @@ package fs import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // overlayMountSourceOperations implements MountSourceOperations for an overlay diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index 0b84732aa..e672a438c 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -18,7 +18,7 @@ import ( "fmt" "testing" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" ) // cacheReallyContains iterates through the dirent cache to determine whether diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index a9627a9d1..574a2cc91 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -22,9 +22,9 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go 
index c4c771f2c..a69b41468 100644 --- a/pkg/sentry/fs/mounts_test.go +++ b/pkg/sentry/fs/mounts_test.go @@ -17,7 +17,7 @@ package fs_test import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" diff --git a/pkg/sentry/fs/offset.go b/pkg/sentry/fs/offset.go index f7d844ce7..53b5df175 100644 --- a/pkg/sentry/fs/offset.go +++ b/pkg/sentry/fs/offset.go @@ -17,7 +17,7 @@ package fs import ( "math" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // OffsetPageEnd returns the file offset rounded up to the nearest diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index f7702f8f4..a8ae7d81d 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -18,12 +18,12 @@ import ( "fmt" "strings" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // The virtual filesystem implements an overlay configuration. 
For a high-level diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index b06bead41..280093c5e 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -29,8 +29,8 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/log", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/proc/device", @@ -46,10 +46,10 @@ go_library( "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/header", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -64,8 +64,8 @@ go_test( library = ":proc", deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/inet", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/proc/cgroup.go b/pkg/sentry/fs/proc/cgroup.go index c4abe319d..7c1d9e7e9 100644 --- a/pkg/sentry/fs/proc/cgroup.go +++ b/pkg/sentry/fs/proc/cgroup.go @@ -17,7 +17,7 @@ package proc import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go index df0c4e3a7..c96533401 100644 --- a/pkg/sentry/fs/proc/cpuinfo.go +++ b/pkg/sentry/fs/proc/cpuinfo.go @@ -17,7 +17,7 @@ package proc import ( "bytes" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" ) diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index 9aaeb780b..8fe626e1c 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -20,12 +20,12 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - 
"gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index 2fa3cfa7d..35972e23c 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -19,7 +19,7 @@ import ( "sort" "strconv" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" diff --git a/pkg/sentry/fs/proc/filesystems.go b/pkg/sentry/fs/proc/filesystems.go index 7b3b974ab..0a58ac34c 100644 --- a/pkg/sentry/fs/proc/filesystems.go +++ b/pkg/sentry/fs/proc/filesystems.go @@ -18,7 +18,7 @@ import ( "bytes" "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" ) diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go index 761d24462..daf1ba781 100644 --- a/pkg/sentry/fs/proc/fs.go +++ b/pkg/sentry/fs/proc/fs.go @@ -17,7 +17,7 @@ package proc import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/proc/inode.go b/pkg/sentry/fs/proc/inode.go index 723f6b661..d2859a4c2 100644 --- a/pkg/sentry/fs/proc/inode.go +++ b/pkg/sentry/fs/proc/inode.go @@ -16,14 +16,14 @@ package proc import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // LINT.IfChange diff --git a/pkg/sentry/fs/proc/loadavg.go b/pkg/sentry/fs/proc/loadavg.go index 
d7d2afcb7..139d49c34 100644 --- a/pkg/sentry/fs/proc/loadavg.go +++ b/pkg/sentry/fs/proc/loadavg.go @@ -18,7 +18,7 @@ import ( "bytes" "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" ) diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index 313c6a32b..465b47da9 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -18,11 +18,11 @@ import ( "bytes" "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // LINT.IfChange diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index d4efc86e0..c10888100 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -20,7 +20,7 @@ import ( "sort" "strings" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/kernel" diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index bad445f3f..6f2775344 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -22,8 +22,8 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" @@ -33,9 +33,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/usermem" ) // LINT.IfChange diff --git 
a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 29867dc3a..c8abb5052 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -20,7 +20,7 @@ import ( "sort" "strconv" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD index 310d8dd52..21338d912 100644 --- a/pkg/sentry/fs/proc/seqfile/BUILD +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -8,14 +8,14 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/proc/device", "//pkg/sentry/kernel/time", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -26,10 +26,10 @@ go_test( srcs = ["seqfile_test.go"], library = ":seqfile", deps = [ - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/fs/ramfs", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index f9af191d5..6121f0e95 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -19,14 +19,14 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/proc/seqfile/seqfile_test.go 
b/pkg/sentry/fs/proc/seqfile/seqfile_test.go index ebfeee835..98e394569 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile_test.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile_test.go @@ -20,11 +20,11 @@ import ( "io" "testing" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) type seqTest struct { diff --git a/pkg/sentry/fs/proc/stat.go b/pkg/sentry/fs/proc/stat.go index bc5b2bc7b..d4fbd76ac 100644 --- a/pkg/sentry/fs/proc/stat.go +++ b/pkg/sentry/fs/proc/stat.go @@ -19,7 +19,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/kernel" ) diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index 2bdcf5f70..f8aad2dbd 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -20,13 +20,13 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index b9e8ef35f..0772d4ae4 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -19,14 +19,14 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" 
"gvisor.dev/gvisor/pkg/sentry/fs/proc/device" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/inet" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/proc/sys_net_test.go b/pkg/sentry/fs/proc/sys_net_test.go index 6abae7a60..355e83d47 100644 --- a/pkg/sentry/fs/proc/sys_net_test.go +++ b/pkg/sentry/fs/proc/sys_net_test.go @@ -17,9 +17,9 @@ package proc import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/inet" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) func TestQuerySendBufferSize(t *testing.T) { diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 7358d6ef9..ca020e11e 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -22,7 +22,7 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" @@ -32,8 +32,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index 3eacc9265..8d9517b95 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -20,13 +20,13 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" 
"gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go index adfe58adb..c0f6fb802 100644 --- a/pkg/sentry/fs/proc/uptime.go +++ b/pkg/sentry/fs/proc/uptime.go @@ -19,12 +19,12 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/proc/version.go b/pkg/sentry/fs/proc/version.go index 27fd5b1cb..35e258ff6 100644 --- a/pkg/sentry/fs/proc/version.go +++ b/pkg/sentry/fs/proc/version.go @@ -17,7 +17,7 @@ package proc import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/kernel" ) diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index 39c4b84f8..8ca823fb3 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -13,14 +13,14 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/socket/unix/transport", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -31,7 +31,7 @@ go_test( srcs = ["tree_test.go"], library = ":ramfs", deps = [ - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", ], ) diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index dcbb8eb2e..bfa304552 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -20,7 +20,7 @@ import ( "syscall" 
"gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index a24fe2ea2..29ff004f2 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -16,7 +16,7 @@ package ramfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go index fcfaa29aa..d988349aa 100644 --- a/pkg/sentry/fs/ramfs/symlink.go +++ b/pkg/sentry/fs/ramfs/symlink.go @@ -16,7 +16,7 @@ package ramfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/waiter" diff --git a/pkg/sentry/fs/ramfs/tree.go b/pkg/sentry/fs/ramfs/tree.go index 702cc4a1e..dfc9d3453 100644 --- a/pkg/sentry/fs/ramfs/tree.go +++ b/pkg/sentry/fs/ramfs/tree.go @@ -19,10 +19,10 @@ import ( "path" "strings" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // MakeDirectoryTree constructs a ramfs tree of all directories containing diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go index 61a7e2900..a6ed8b2c5 100644 --- a/pkg/sentry/fs/ramfs/tree_test.go +++ b/pkg/sentry/fs/ramfs/tree_test.go @@ -17,7 +17,7 @@ package ramfs import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" 
"gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go index 389c330a0..791d1526c 100644 --- a/pkg/sentry/fs/splice.go +++ b/pkg/sentry/fs/splice.go @@ -18,7 +18,7 @@ import ( "io" "sync/atomic" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD index cc6b3bfbf..f2e8b9932 100644 --- a/pkg/sentry/fs/sys/BUILD +++ b/pkg/sentry/fs/sys/BUILD @@ -13,12 +13,12 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/ramfs", "//pkg/sentry/kernel", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/sys/devices.go b/pkg/sentry/fs/sys/devices.go index 4f78ca8d2..b67065956 100644 --- a/pkg/sentry/fs/sys/devices.go +++ b/pkg/sentry/fs/sys/devices.go @@ -18,7 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go index e60b63e75..fd03a4e38 100644 --- a/pkg/sentry/fs/sys/fs.go +++ b/pkg/sentry/fs/sys/fs.go @@ -15,7 +15,7 @@ package sys import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/sys/sys.go b/pkg/sentry/fs/sys/sys.go index b14bf3f55..0891645e4 100644 --- a/pkg/sentry/fs/sys/sys.go +++ b/pkg/sentry/fs/sys/sys.go @@ -16,10 +16,10 @@ package sys import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) func 
newFile(ctx context.Context, node fs.InodeOperations, msrc *fs.MountSource) *fs.Inode { diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD index 092668e8d..d16cdb4df 100644 --- a/pkg/sentry/fs/timerfd/BUILD +++ b/pkg/sentry/fs/timerfd/BUILD @@ -7,13 +7,13 @@ go_library( srcs = ["timerfd.go"], visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel/time", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index f8bf663bb..88c344089 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -19,13 +19,13 @@ package timerfd import ( "sync/atomic" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index 04776555f..aa7199014 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -14,8 +14,9 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/metric", - "//pkg/sentry/context", + "//pkg/safemem", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", @@ -25,12 +26,11 @@ go_library( "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", - "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -41,10 +41,10 @@ go_test( srcs = ["file_test.go"], library = ":tmpfs", 
deps = [ - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/fs", "//pkg/sentry/kernel/contexttest", "//pkg/sentry/usage", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/tmpfs/file_regular.go b/pkg/sentry/fs/tmpfs/file_regular.go index 9a6943fe4..614f8f8a1 100644 --- a/pkg/sentry/fs/tmpfs/file_regular.go +++ b/pkg/sentry/fs/tmpfs/file_regular.go @@ -15,11 +15,11 @@ package tmpfs import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go index 0075ef023..aaba35502 100644 --- a/pkg/sentry/fs/tmpfs/file_test.go +++ b/pkg/sentry/fs/tmpfs/file_test.go @@ -18,11 +18,11 @@ import ( "bytes" "testing" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) func newFileInode(ctx context.Context) *fs.Inode { diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index be98ad751..d5be56c3f 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -19,7 +19,7 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index f1c87fe41..dabc10662 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -20,18 +20,18 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" 
"gvisor.dev/gvisor/pkg/metric" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) var ( diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 0f718e236..c00cef0a5 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -17,7 +17,7 @@ package tmpfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" @@ -25,8 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) var fsInfo = fs.Info{ diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 29f804c6c..5cb0e0417 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -16,20 +16,20 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/refs", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -41,7 +41,7 @@ go_test( library = ":tty", deps = [ "//pkg/abi/linux", - 
"//pkg/sentry/context/contexttest", - "//pkg/sentry/usermem", + "//pkg/sentry/contexttest", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 88aa66b24..108654827 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -21,14 +21,14 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index edee56c12..8fe05ebe5 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -15,7 +15,7 @@ package tty import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/syserror" diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index 9fe02657e..12b1c6097 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -19,11 +19,11 @@ import ( "unicode/utf8" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index 6b07f6bf2..f62da49bd 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -16,13 +16,13 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" 
"gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/unimpl" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index 21ccc6f32..1ca79c0b2 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -16,12 +16,12 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 2a51e6bab..db55cdc48 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -16,12 +16,12 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index 917f90cc0..5883f26db 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -16,11 +16,11 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" + 
"gvisor.dev/gvisor/pkg/usermem" ) // Terminal is a pseudoterminal. diff --git a/pkg/sentry/fs/tty/tty_test.go b/pkg/sentry/fs/tty/tty_test.go index 59f07ff8e..2cbc05678 100644 --- a/pkg/sentry/fs/tty/tty_test.go +++ b/pkg/sentry/fs/tty/tty_test.go @@ -18,8 +18,8 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/usermem" ) func TestSimpleMasterToSlave(t *testing.T) { diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index a718920d5..6f78f478f 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -35,21 +35,21 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/context", "//pkg/fd", "//pkg/fspath", "//pkg/log", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fsimpl/ext/disklayout", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", - "//pkg/sentry/safemem", "//pkg/sentry/syscalls/linux", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -73,14 +73,14 @@ go_test( deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/context", "//pkg/fspath", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/ext/disklayout", "//pkg/sentry/kernel/auth", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", + "//pkg/usermem", "//runsc/testutil", "@com_github_google_go-cmp//cmp:go_default_library", "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD index 12f3990c1..6c5a559fd 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/BUILD +++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD @@ -7,9 +7,9 @@ go_test( size = "small", srcs = ["benchmark_test.go"], deps = [ + "//pkg/context", 
"//pkg/fspath", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/ext", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index a56b03711..d1436b943 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -24,9 +24,9 @@ import ( "strings" "testing" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index 8944171c8..ebb72b75e 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -17,8 +17,8 @@ package ext import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/memmap" diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go index 4b7d17dc6..373d23b74 100644 --- a/pkg/sentry/fsimpl/ext/ext.go +++ b/pkg/sentry/fsimpl/ext/ext.go @@ -21,9 +21,9 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 6c14a1e2d..05f992826 100644 --- 
a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -25,14 +25,14 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/runsc/testutil" ) diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go index 841274daf..92f7da40d 100644 --- a/pkg/sentry/fsimpl/ext/file_description.go +++ b/pkg/sentry/fsimpl/ext/file_description.go @@ -16,7 +16,7 @@ package ext import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 9afb1a84c..07bf58953 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -19,8 +19,8 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go index d11153c90..30135ddb0 100644 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -18,13 +18,13 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" + 
"gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // regularFile represents a regular file's inode. This too follows the diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go index bdf8705c1..1447a4dc1 100644 --- a/pkg/sentry/fsimpl/ext/symlink.go +++ b/pkg/sentry/fsimpl/ext/symlink.go @@ -15,11 +15,11 @@ package ext import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // symlink represents a symlink inode. diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index 7bf83ccba..e73f1f857 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -29,16 +29,16 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fspath", "//pkg/log", "//pkg/refs", - "//pkg/sentry/context", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) @@ -49,13 +49,13 @@ go_test( deps = [ ":kernfs", "//pkg/abi/linux", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/kernel/auth", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", + "//pkg/usermem", "@com_github_google_go-cmp//cmp:go_default_library", ], ) diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 75624e0b1..373f801ff 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ 
b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -18,11 +18,11 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // DynamicBytesFile implements kernfs.Inode and represents a read-only diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 5fa1fa67b..6104751c8 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -16,11 +16,11 @@ package kernfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index a4600ad47..9d65d0179 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -20,8 +20,8 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 1700fffd9..adca2313f 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -19,8 +19,8 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" 
"gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 85bcdcc57..79ebea8a5 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -56,8 +56,8 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index fade59491..ee65cf491 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -21,14 +21,14 @@ import ( "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const defaultMode linux.FileMode = 01777 diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index f19f12854..0ee7eb9b7 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -16,7 +16,7 @@ package kernfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 3768f55b2..12aac2e6a 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -16,8 +16,9 @@ 
go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/log", - "//pkg/sentry/context", + "//pkg/safemem", "//pkg/sentry/fs", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/inet", @@ -26,15 +27,14 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/mm", - "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/tcpip/header", + "//pkg/usermem", ], ) @@ -48,15 +48,15 @@ go_test( library = ":proc", deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fspath", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index f49819187..11477b6a9 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -19,7 +19,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 91eded415..353e37195 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -19,7 +19,7 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index a0580f20d..eb5bc62c0 100644 --- 
a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -19,7 +19,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 7bc352ae9..efd3b3453 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -20,17 +20,17 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // mm gets the kernel task's MemoryManager. 
No additional reference is taken on diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 51f634716..e0cb9c47b 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -20,7 +20,7 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index ad3760e39..434998910 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -20,14 +20,14 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) type selfSymlink struct { diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go index 4aaf23e97..608fec017 100644 --- a/pkg/sentry/fsimpl/proc/tasks_net.go +++ b/pkg/sentry/fsimpl/proc/tasks_net.go @@ -22,8 +22,8 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -32,9 +32,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/usermem" ) 
func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index aabf2bf0c..ad963870b 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -19,7 +19,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go index 0a1d3f34b..be54897bb 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go @@ -20,7 +20,7 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/inet" ) diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 2c1635f33..6fc3524db 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -22,14 +22,14 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) var ( diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index beda141f1..66c0d8bc8 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -9,7 +9,7 @@ go_library( ], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel", 
"//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 1305ad01d..e35d52d17 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -20,7 +20,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD index 12053a5b6..efd5974c4 100644 --- a/pkg/sentry/fsimpl/testutil/BUILD +++ b/pkg/sentry/fsimpl/testutil/BUILD @@ -12,10 +12,10 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/cpuid", "//pkg/fspath", "//pkg/memutil", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", @@ -27,9 +27,9 @@ go_library( "//pkg/sentry/platform/kvm", "//pkg/sentry/platform/ptrace", "//pkg/sentry/time", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/sync", + "//pkg/usermem", "@com_github_google_go-cmp//cmp:go_default_library", ], ) diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index 295da2d52..89f8c4915 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -21,9 +21,9 @@ import ( "runtime" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/memutil" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go index 2a723a89f..1c98335c1 100644 --- a/pkg/sentry/fsimpl/testutil/testutil.go +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -24,12 +24,12 @@ import ( "github.com/google/go-cmp/cmp" 
"gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // System represents the context for a single test. diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 857e98bc5..fb436860c 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -30,10 +30,11 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/amutex", + "//pkg/context", "//pkg/fspath", "//pkg/log", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", @@ -43,12 +44,11 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", - "//pkg/sentry/safemem", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) @@ -59,10 +59,10 @@ go_test( deps = [ ":tmpfs", "//pkg/abi/linux", + "//pkg/context", "//pkg/fspath", "//pkg/refs", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/fs/tmpfs", "//pkg/sentry/kernel/auth", @@ -82,13 +82,13 @@ go_test( library = ":tmpfs", deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fspath", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/contexttest", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index d88c83499..54241c8e8 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -21,10 +21,10 @@ import ( "testing" 
"gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 887ca2619..dc0d27cf9 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -16,7 +16,7 @@ package tmpfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index d726f03c5..5ee9cf1e9 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -19,8 +19,8 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go index 482aabd52..0c57fdca3 100644 --- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -16,11 +16,11 @@ package tmpfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" ) type namedPipe struct { diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go 
index 70b42a6ec..5ee7f2a72 100644 --- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -19,13 +19,13 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const fileName = "mypipe" diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 7c633c1b0..e9e6faf67 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -20,17 +20,17 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) type regularFile struct { diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 034a29fdb..32552e261 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -22,12 +22,12 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" - 
"gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" ) // nextFileID is used to generate unique file names. diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 515f033f2..88dbd6e35 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -29,7 +29,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/pgalloc" diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD index a145a5ca3..61c78569d 100644 --- a/pkg/sentry/hostmm/BUILD +++ b/pkg/sentry/hostmm/BUILD @@ -12,6 +12,6 @@ go_library( deps = [ "//pkg/fd", "//pkg/log", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/hostmm/hostmm.go b/pkg/sentry/hostmm/hostmm.go index 19335ca73..506c7864a 100644 --- a/pkg/sentry/hostmm/hostmm.go +++ b/pkg/sentry/hostmm/hostmm.go @@ -24,7 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // NotifyCurrentMemcgPressureCallback requests that f is called whenever the diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD index aa621b724..334432abf 100644 --- a/pkg/sentry/inet/BUILD +++ b/pkg/sentry/inet/BUILD @@ -13,7 +13,7 @@ go_library( "test_stack.go", ], deps = [ - "//pkg/sentry/context", + "//pkg/context", "//pkg/tcpip/stack", ], ) diff --git a/pkg/sentry/inet/context.go b/pkg/sentry/inet/context.go index 4eda7dd1f..e8cc1bffd 100644 --- a/pkg/sentry/inet/context.go +++ b/pkg/sentry/inet/context.go @@ -15,7 +15,7 @@ package inet import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // contextID is the inet package's type for context.Context.Value keys. 
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index cebaccd92..0738946d9 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -153,14 +153,15 @@ go_library( "//pkg/binary", "//pkg/bits", "//pkg/bpf", + "//pkg/context", "//pkg/cpuid", "//pkg/eventchannel", "//pkg/log", "//pkg/metric", "//pkg/refs", + "//pkg/safemem", "//pkg/secio", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/lock", @@ -180,7 +181,6 @@ go_library( "//pkg/sentry/mm", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", - "//pkg/sentry/safemem", "//pkg/sentry/socket/netlink/port", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/time", @@ -188,7 +188,6 @@ go_library( "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/state", "//pkg/state/statefile", "//pkg/sync", @@ -196,6 +195,7 @@ go_library( "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/stack", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -212,9 +212,9 @@ go_test( library = ":kernel", deps = [ "//pkg/abi", + "//pkg/context", "//pkg/sentry/arch", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/fs/filetest", "//pkg/sentry/kernel/sched", @@ -222,8 +222,8 @@ go_test( "//pkg/sentry/pgalloc", "//pkg/sentry/time", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 64537c9be..2bc49483a 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -61,8 +61,8 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/bits", + "//pkg/context", "//pkg/log", - "//pkg/sentry/context", "//pkg/sync", "//pkg/syserror", ], diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go index 5c0e7d6b6..ef5723127 100644 --- a/pkg/sentry/kernel/auth/context.go 
+++ b/pkg/sentry/kernel/auth/context.go @@ -15,7 +15,7 @@ package auth import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // contextID is the auth package's type for context.Context.Value keys. diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go index 3d74bc610..28cbe159d 100644 --- a/pkg/sentry/kernel/auth/id_map.go +++ b/pkg/sentry/kernel/auth/id_map.go @@ -16,7 +16,7 @@ package auth import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index 3c9dceaba..0c40bf315 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -17,8 +17,8 @@ package kernel import ( "time" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" ) // contextID is the kernel package's type for context.Context.Value keys. 
diff --git a/pkg/sentry/kernel/contexttest/BUILD b/pkg/sentry/kernel/contexttest/BUILD index daff608d7..9d26392c0 100644 --- a/pkg/sentry/kernel/contexttest/BUILD +++ b/pkg/sentry/kernel/contexttest/BUILD @@ -8,8 +8,8 @@ go_library( srcs = ["contexttest.go"], visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/sentry/contexttest", "//pkg/sentry/kernel", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", diff --git a/pkg/sentry/kernel/contexttest/contexttest.go b/pkg/sentry/kernel/contexttest/contexttest.go index 82f9d8922..22c340e56 100644 --- a/pkg/sentry/kernel/contexttest/contexttest.go +++ b/pkg/sentry/kernel/contexttest/contexttest.go @@ -19,8 +19,8 @@ package contexttest import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index 19e16ab3a..dedf0fa15 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -24,13 +24,13 @@ go_library( ], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/context", "//pkg/refs", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/usermem", "//pkg/sync", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -43,7 +43,7 @@ go_test( ], library = ":epoll", deps = [ - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fs/filetest", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index e84742993..8bffb78fc 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -20,13 +20,13 @@ import ( "fmt" "syscall" + 
"gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go index 4a20d4c82..22630e9c5 100644 --- a/pkg/sentry/kernel/epoll/epoll_test.go +++ b/pkg/sentry/kernel/epoll/epoll_test.go @@ -17,7 +17,7 @@ package epoll import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs/filetest" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index ee2d74864..9983a32e5 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -8,14 +8,14 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fdnotifier", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -26,8 +26,8 @@ go_test( srcs = ["eventfd_test.go"], library = ":eventfd", deps = [ - "//pkg/sentry/context/contexttest", - "//pkg/sentry/usermem", + "//pkg/sentry/contexttest", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 687690679..87951adeb 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -21,14 +21,14 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" 
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/eventfd/eventfd_test.go b/pkg/sentry/kernel/eventfd/eventfd_test.go index 018c7f3ef..9b4892f74 100644 --- a/pkg/sentry/kernel/eventfd/eventfd_test.go +++ b/pkg/sentry/kernel/eventfd/eventfd_test.go @@ -17,8 +17,8 @@ package eventfd import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 0ad4135b3..9460bb235 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -22,8 +22,8 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/limits" diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go index 86164df49..261b815f2 100644 --- a/pkg/sentry/kernel/fd_table_test.go +++ b/pkg/sentry/kernel/fd_table_test.go @@ -18,8 +18,8 @@ import ( "runtime" "testing" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/filetest" "gvisor.dev/gvisor/pkg/sentry/limits" diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index f413d8ae2..c5021f2db 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -36,12 +36,12 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + 
"//pkg/context", "//pkg/log", - "//pkg/sentry/context", "//pkg/sentry/memmap", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) @@ -51,7 +51,7 @@ go_test( srcs = ["futex_test.go"], library = ":futex", deps = [ - "//pkg/sentry/usermem", "//pkg/sync", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index d1931c8f4..732e66da4 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -20,9 +20,9 @@ package futex import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // KeyKind indicates the type of a Key. diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index c23126ca5..7c5c7665b 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -22,8 +22,8 @@ import ( "testing" "unsafe" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // testData implements the Target interface, and allows us to diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index c85e97fef..7b90fac5a 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -40,12 +40,12 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/hostcpu" diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 2c7b6206f..4c049d5b4 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -33,16 
+33,16 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/amutex", + "//pkg/context", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/safemem", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -57,11 +57,11 @@ go_test( ], library = ":pipe", deps = [ - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go index 1c0f34269..fe3be5dbd 100644 --- a/pkg/sentry/kernel/pipe/buffer.go +++ b/pkg/sentry/kernel/pipe/buffer.go @@ -17,7 +17,7 @@ package pipe import ( "io" - "gvisor.dev/gvisor/pkg/sentry/safemem" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sync" ) diff --git a/pkg/sentry/kernel/pipe/buffer_test.go b/pkg/sentry/kernel/pipe/buffer_test.go index ee1b90115..4d54b8b8f 100644 --- a/pkg/sentry/kernel/pipe/buffer_test.go +++ b/pkg/sentry/kernel/pipe/buffer_test.go @@ -18,7 +18,7 @@ import ( "testing" "unsafe" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) func TestBufferSize(t *testing.T) { diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 716f589af..4b688c627 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -16,7 +16,7 @@ package pipe import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sync" diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index 16fa80abe..ab75a87ff 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go 
@@ -18,11 +18,11 @@ import ( "testing" "time" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) type sleeper struct { diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index e4fd7d420..08410283f 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -20,7 +20,7 @@ import ( "sync/atomic" "syscall" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go index e3a14b665..bda739dbe 100644 --- a/pkg/sentry/kernel/pipe/pipe_test.go +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -18,9 +18,9 @@ import ( "bytes" "testing" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index 8394eb78b..80158239e 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -21,10 +21,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/amutex" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index b4d29fc77..b2b5691ee 100644 --- 
a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -17,11 +17,11 @@ package pipe import ( "io" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // ReaderWriter satisfies the FileOperations interface and services both diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 6f83e3cee..a5675bd70 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -16,12 +16,12 @@ package pipe import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 3be171cdc..35ad97d5d 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // ptraceOptions are the subset of options controlling a task's ptrace behavior diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go index 5514cf432..cef1276ec 100644 --- a/pkg/sentry/kernel/ptrace_amd64.go +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -18,8 +18,8 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // ptraceArch implements arch-specific ptrace commands. 
diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go index 61e412911..d971b96b3 100644 --- a/pkg/sentry/kernel/ptrace_arm64.go +++ b/pkg/sentry/kernel/ptrace_arm64.go @@ -17,8 +17,8 @@ package kernel import ( - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // ptraceArch implements arch-specific ptrace commands. diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index b14429854..efebfd872 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -19,8 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/hostcpu" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Restartable sequences. diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index 2347dcf36..c38c5a40c 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -21,8 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const maxSyscallFilterInstructions = 1 << 15 diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index 76e19b551..65e5427c1 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -24,8 +24,8 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/log", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", @@ -41,8 +41,8 @@ go_test( library = ":semaphore", deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/sentry/contexttest", "//pkg/sentry/kernel/auth", "//pkg/syserror", ], diff --git a/pkg/sentry/kernel/semaphore/semaphore.go 
b/pkg/sentry/kernel/semaphore/semaphore.go index 18299814e..1000f3287 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -19,8 +19,8 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go index c235f6ca4..e47acefdf 100644 --- a/pkg/sentry/kernel/semaphore/semaphore_test.go +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -18,8 +18,8 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index 5547c5abf..bfd779837 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -11,9 +11,9 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/log", "//pkg/refs", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", @@ -22,8 +22,8 @@ go_library( "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 8ddef7eb8..208569057 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -37,9 +37,9 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" - 
"gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -47,9 +47,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Key represents a shm segment key. Analogous to a file name. diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD index 5d44773d4..3eb78e91b 100644 --- a/pkg/sentry/kernel/signalfd/BUILD +++ b/pkg/sentry/kernel/signalfd/BUILD @@ -9,14 +9,14 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go index 28be4a939..8243bb93e 100644 --- a/pkg/sentry/kernel/signalfd/signalfd.go +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -18,14 +18,14 @@ package signalfd import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index d2d01add4..93c4fe969 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -21,8 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/abi" 
"gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // maxSyscallNum is the highest supported syscall number. diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 978d66da8..95adf2778 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -21,8 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -35,8 +35,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 247bd4aba..53d4d211b 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -17,8 +17,8 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // SharingOptions controls what resources are shared by a new task created by diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index bb5560acf..2d6e7733c 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -18,13 +18,13 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/mm" - 
"gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/usermem" ) var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC) diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index c211b5b74..a53e77c9f 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -16,7 +16,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/sentry/kernel/futex" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // Futex returns t's futex manager. diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index 0fb3661de..41259210c 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -20,7 +20,7 @@ import ( "sort" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 6357273d3..5568c91bc 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -26,7 +26,7 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // A taskRunState is a reified state in the task state machine. 
See README.md diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 39cd1340d..8802db142 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -26,8 +26,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 58af16ee2..de838beef 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -21,8 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // TaskConfig defines the configuration of a new Task (see below). 
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 3180f5560..d555d69a8 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -25,8 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index 518bfe1bd..2bf3ce8a8 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -18,8 +18,8 @@ import ( "math" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // MAX_RW_COUNT is the maximum size in bytes of a single read or write. diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index d49594d9f..7ba7dc50c 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -11,7 +11,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sync", "//pkg/syserror", "//pkg/waiter", diff --git a/pkg/sentry/kernel/time/context.go b/pkg/sentry/kernel/time/context.go index 8ef483dd3..00b729d88 100644 --- a/pkg/sentry/kernel/time/context.go +++ b/pkg/sentry/kernel/time/context.go @@ -15,7 +15,7 @@ package time import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // contextID is the time package's type for context.Context.Value keys. 
diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go index 849c5b646..cf2f7ca72 100644 --- a/pkg/sentry/kernel/timekeeper_test.go +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -17,12 +17,12 @@ package kernel import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/pgalloc" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // mockClocks is a sentrytime.Clocks that simply returns the times in the diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index fdd10c56c..f1b3c212c 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -18,10 +18,10 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // vdsoParams are the parameters exposed to the VDSO. diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD index 67869757f..cf591c4c1 100644 --- a/pkg/sentry/limits/BUILD +++ b/pkg/sentry/limits/BUILD @@ -12,7 +12,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sync", ], ) diff --git a/pkg/sentry/limits/context.go b/pkg/sentry/limits/context.go index 6972749ed..77e1fe217 100644 --- a/pkg/sentry/limits/context.go +++ b/pkg/sentry/limits/context.go @@ -15,7 +15,7 @@ package limits import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // contextID is the limit package's type for context.Context.Value keys. 
diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index d4ad2bd6c..23790378a 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -24,11 +24,12 @@ go_library( "//pkg/abi", "//pkg/abi/linux", "//pkg/binary", + "//pkg/context", "//pkg/cpuid", "//pkg/log", "//pkg/rand", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", @@ -37,12 +38,11 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/mm", "//pkg/sentry/pgalloc", - "//pkg/sentry/safemem", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 6299a3e2f..122ed05c2 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -23,16 +23,16 @@ import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go index ccf909cac..098a45d36 100644 --- a/pkg/sentry/loader/interpreter.go +++ b/pkg/sentry/loader/interpreter.go @@ -18,10 +18,10 @@ import ( "bytes" "io" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index b03eeb005..9a613d6b7 100644 --- 
a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -24,16 +24,16 @@ import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // LoadArgs holds specifications for an executable file to be loaded. diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index df8a81907..52f446ed7 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -20,20 +20,20 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index f9a65f086..a98b66de1 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -38,11 +38,11 @@ go_library( ], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/context", "//pkg/log", - "//pkg/sentry/context", "//pkg/sentry/platform", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", ], ) @@ -51,5 +51,5 @@ go_test( size = "small", srcs = ["mapping_set_test.go"], 
library = ":memmap", - deps = ["//pkg/sentry/usermem"], + deps = ["//pkg/usermem"], ) diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go index 0a5b7ce45..d609c1ae0 100644 --- a/pkg/sentry/memmap/mapping_set.go +++ b/pkg/sentry/memmap/mapping_set.go @@ -18,7 +18,7 @@ import ( "fmt" "math" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // MappingSet maps offsets into a Mappable to mappings of those offsets. It is diff --git a/pkg/sentry/memmap/mapping_set_test.go b/pkg/sentry/memmap/mapping_set_test.go index f9b11a59c..d39efe38f 100644 --- a/pkg/sentry/memmap/mapping_set_test.go +++ b/pkg/sentry/memmap/mapping_set_test.go @@ -18,7 +18,7 @@ import ( "reflect" "testing" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) type testMappingSpace struct { diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 16a722a13..c6db9fc8f 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -18,9 +18,9 @@ package memmap import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // Mappable represents a memory-mappable object, a mutable mapping from uint64 diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index bd6399fa2..e5729ced5 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -27,7 +27,7 @@ go_template_instance( "minDegree": "8", }, imports = { - "usermem": "gvisor.dev/gvisor/pkg/sentry/usermem", + "usermem": "gvisor.dev/gvisor/pkg/usermem", }, package = "mm", prefix = "vma", @@ -47,7 +47,7 @@ go_template_instance( "minDegree": "8", }, imports = { - "usermem": "gvisor.dev/gvisor/pkg/sentry/usermem", + "usermem": "gvisor.dev/gvisor/pkg/usermem", }, package = "mm", prefix = "pma", @@ -99,10 +99,12 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/atomicbitops", + "//pkg/context", 
"//pkg/log", "//pkg/refs", + "//pkg/safecopy", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/proc/seqfile", "//pkg/sentry/kernel/auth", @@ -112,13 +114,11 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", - "//pkg/sentry/platform/safecopy", - "//pkg/sentry/safemem", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/buffer", + "//pkg/usermem", ], ) @@ -128,14 +128,14 @@ go_test( srcs = ["mm_test.go"], library = ":mm", deps = [ + "//pkg/context", "//pkg/sentry/arch", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/limits", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index cfebcfd42..e58a63deb 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -20,7 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // AddressSpace returns the platform.AddressSpace bound to mm. 
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 4b48866ad..cb29d94b0 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -16,15 +16,15 @@ package mm import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // aioManager creates and manages asynchronous I/O contexts. diff --git a/pkg/sentry/mm/debug.go b/pkg/sentry/mm/debug.go index df9adf708..c273c982e 100644 --- a/pkg/sentry/mm/debug.go +++ b/pkg/sentry/mm/debug.go @@ -18,7 +18,7 @@ import ( "bytes" "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) const ( diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go index b03e7d020..fa776f9c6 100644 --- a/pkg/sentry/mm/io.go +++ b/pkg/sentry/mm/io.go @@ -15,11 +15,11 @@ package mm import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // There are two supported ways to copy data to/from application virtual diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index 4e9ca1de6..47b8fbf43 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -19,13 +19,13 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/atomicbitops" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" 
"gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // NewMemoryManager returns a new MemoryManager with no mappings and 1 user. diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go index d2a01d48a..f550acae0 100644 --- a/pkg/sentry/mm/metadata.go +++ b/pkg/sentry/mm/metadata.go @@ -17,7 +17,7 @@ package mm import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // Dumpability describes if and how core dumps should be created. diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 78cc9e6e4..09e582dd3 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -35,14 +35,14 @@ package mm import ( + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // MemoryManager implements a virtual address space. 
diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go index 4d2bfaaed..edacca741 100644 --- a/pkg/sentry/mm/mm_test.go +++ b/pkg/sentry/mm/mm_test.go @@ -17,15 +17,15 @@ package mm import ( "testing" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) func testMemoryManager(ctx context.Context) *MemoryManager { diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index c976c6f45..62e4c20af 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -17,14 +17,14 @@ package mm import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safecopy" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/platform/safecopy" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // existingPMAsLocked checks that pmas exist for all addresses in ar, and diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go index 79610acb7..1ab92f046 100644 --- a/pkg/sentry/mm/procfs.go +++ b/pkg/sentry/mm/procfs.go @@ -19,10 +19,10 @@ import ( "fmt" "strings" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/mm/save_restore.go b/pkg/sentry/mm/save_restore.go index 
93259c5a3..f56215d9a 100644 --- a/pkg/sentry/mm/save_restore.go +++ b/pkg/sentry/mm/save_restore.go @@ -17,7 +17,7 @@ package mm import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // InvalidateUnsavable invokes memmap.Mappable.InvalidateUnsavable on all diff --git a/pkg/sentry/mm/shm.go b/pkg/sentry/mm/shm.go index b9f2d23e5..6432731d4 100644 --- a/pkg/sentry/mm/shm.go +++ b/pkg/sentry/mm/shm.go @@ -15,10 +15,10 @@ package mm import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // DetachShm unmaps a sysv shared memory segment. diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index ea2d7af74..9ad52082d 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -15,14 +15,14 @@ package mm import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // SpecialMappable implements memmap.MappingIdentity and memmap.Mappable with diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index c2466c988..c5dfa5972 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -19,14 +19,14 @@ import ( mrand "math/rand" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - 
"gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // HandleUserFault handles an application page fault. sp is the faulting diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index f2fd70799..9a14e69e6 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -18,13 +18,13 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Preconditions: mm.mappingMu must be locked for writing. opts must be valid diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index 02385a3ce..1eeb9f317 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -61,18 +61,18 @@ go_library( ], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/context", "//pkg/log", "//pkg/memutil", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/hostmm", "//pkg/sentry/platform", - "//pkg/sentry/safemem", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/state", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) @@ -81,5 +81,5 @@ go_test( size = "small", srcs = ["pgalloc_test.go"], library = ":pgalloc", - deps = ["//pkg/sentry/usermem"], + deps = ["//pkg/usermem"], ) diff --git a/pkg/sentry/pgalloc/context.go b/pkg/sentry/pgalloc/context.go index 11ccf897b..d25215418 100644 --- a/pkg/sentry/pgalloc/context.go +++ b/pkg/sentry/pgalloc/context.go @@ -15,7 +15,7 @@ package pgalloc import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // contextID is this package's type for context.Context.Value keys. 
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index c99e023d9..577e9306a 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -29,15 +29,15 @@ import ( "syscall" "time" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/hostmm" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // MemoryFile is a platform.File whose pages may be allocated to arbitrary diff --git a/pkg/sentry/pgalloc/pgalloc_test.go b/pkg/sentry/pgalloc/pgalloc_test.go index 428e6a859..293f22c6b 100644 --- a/pkg/sentry/pgalloc/pgalloc_test.go +++ b/pkg/sentry/pgalloc/pgalloc_test.go @@ -17,7 +17,7 @@ package pgalloc import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go index aafce1d00..f8385c146 100644 --- a/pkg/sentry/pgalloc/save_restore.go +++ b/pkg/sentry/pgalloc/save_restore.go @@ -25,8 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/usermem" ) // SaveTo writes f's state to the given stream. 
diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index 006450b2d..453241eca 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -26,14 +26,14 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/atomicbitops", + "//pkg/context", "//pkg/log", + "//pkg/safecopy", + "//pkg/safemem", "//pkg/seccomp", "//pkg/sentry/arch", - "//pkg/sentry/context", - "//pkg/sentry/platform/safecopy", - "//pkg/sentry/safemem", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/platform/context.go b/pkg/sentry/platform/context.go index e29bc4485..6759cda65 100644 --- a/pkg/sentry/platform/context.go +++ b/pkg/sentry/platform/context.go @@ -15,7 +15,7 @@ package platform import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // contextID is the auth package's type for context.Context.Value keys. diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index a4532a766..159f7eafd 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -44,16 +44,16 @@ go_library( "//pkg/cpuid", "//pkg/log", "//pkg/procid", + "//pkg/safecopy", "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", "//pkg/sentry/platform/ring0", "//pkg/sentry/platform/ring0/pagetables", - "//pkg/sentry/platform/safecopy", "//pkg/sentry/time", - "//pkg/sentry/usermem", "//pkg/sync", + "//pkg/usermem", ], ) @@ -75,6 +75,6 @@ go_test( "//pkg/sentry/platform/kvm/testutil", "//pkg/sentry/platform/ring0", "//pkg/sentry/platform/ring0/pagetables", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index a25f3c449..be213bfe8 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sentry/platform" 
"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // dirtySet tracks vCPUs for invalidation. diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go index 30dbb74d6..35cd55fef 100644 --- a/pkg/sentry/platform/kvm/bluepill.go +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -19,9 +19,9 @@ import ( "reflect" "syscall" + "gvisor.dev/gvisor/pkg/safecopy" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" - "gvisor.dev/gvisor/pkg/sentry/platform/safecopy" ) // bluepill enters guest mode. diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go index f6459cda9..e34f46aeb 100644 --- a/pkg/sentry/platform/kvm/bluepill_fault.go +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -18,7 +18,7 @@ import ( "sync/atomic" "syscall" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index 99450d22d..c769ac7b4 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -19,7 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // context is an implementation of the platform context. 
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index d337c5c7c..972ba85c3 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // KVM represents a lightweight VM context. diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 30df725d4..c42752d50 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -27,7 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) var dummyFPState = (*byte)(arch.NewFloatingPointData()) diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index e6d912168..8076c7529 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -25,8 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/procid" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // machine contains state associated with the VM as a whole. 
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 873e39dc7..923ce3909 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -26,7 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // initArchState initializes architecture-specific state. diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index 3b1f20219..09552837a 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -20,7 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) type vCPUArchState struct { diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index 3f2f97a6b..1c8384e6b 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -26,7 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // setMemoryRegion initializes a region. 
diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go index 91de5dab1..f7fa2f98d 100644 --- a/pkg/sentry/platform/kvm/physical_map.go +++ b/pkg/sentry/platform/kvm/physical_map.go @@ -21,7 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) type region struct { diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go index 2d68855ef..c8897d34f 100644 --- a/pkg/sentry/platform/kvm/virtual_map.go +++ b/pkg/sentry/platform/kvm/virtual_map.go @@ -22,7 +22,7 @@ import ( "regexp" "strconv" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) type virtualRegion struct { diff --git a/pkg/sentry/platform/kvm/virtual_map_test.go b/pkg/sentry/platform/kvm/virtual_map_test.go index 6a2f145be..327e2be4f 100644 --- a/pkg/sentry/platform/kvm/virtual_map_test.go +++ b/pkg/sentry/platform/kvm/virtual_map_test.go @@ -18,7 +18,7 @@ import ( "syscall" "testing" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) type checker struct { diff --git a/pkg/sentry/platform/mmap_min_addr.go b/pkg/sentry/platform/mmap_min_addr.go index 999787462..091c2e365 100644 --- a/pkg/sentry/platform/mmap_min_addr.go +++ b/pkg/sentry/platform/mmap_min_addr.go @@ -20,7 +20,7 @@ import ( "strconv" "strings" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // systemMMapMinAddrSource is the source file. 
diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index ec22dbf87..2ca696382 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -22,10 +22,10 @@ import ( "os" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // Platform provides abstractions for execution contexts (Context, diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index 3bcc5e040..95abd321e 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -25,14 +25,14 @@ go_library( "//pkg/abi/linux", "//pkg/log", "//pkg/procid", + "//pkg/safecopy", "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/hostcpu", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", - "//pkg/sentry/platform/safecopy", - "//pkg/sentry/usermem", "//pkg/sync", + "//pkg/usermem", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index bb0e03880..03adb624b 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -51,8 +51,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) var ( diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go index 72c7ec564..6c0ed7b3e 100644 --- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -20,7 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // 
getRegs gets the general purpose register set. diff --git a/pkg/sentry/platform/ptrace/stub_unsafe.go b/pkg/sentry/platform/ptrace/stub_unsafe.go index aa1b87237..341dde143 100644 --- a/pkg/sentry/platform/ptrace/stub_unsafe.go +++ b/pkg/sentry/platform/ptrace/stub_unsafe.go @@ -19,8 +19,8 @@ import ( "syscall" "unsafe" - "gvisor.dev/gvisor/pkg/sentry/platform/safecopy" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/safecopy" + "gvisor.dev/gvisor/pkg/usermem" ) // stub is defined in arch-specific assembly. diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 15dc46a5b..31b7cec53 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -25,8 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/procid" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // Linux kernel errnos which "should never be seen by user programs", but will diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD index 6dee8fcc5..934b6fbcd 100644 --- a/pkg/sentry/platform/ring0/BUILD +++ b/pkg/sentry/platform/ring0/BUILD @@ -78,6 +78,6 @@ go_library( deps = [ "//pkg/cpuid", "//pkg/sentry/platform/ring0/pagetables", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go index 9dae0dccb..9c6c2cf5c 100644 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -18,7 +18,7 @@ package ring0 import ( "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) var ( diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go index a850ce6cf..1583dda12 100644 --- a/pkg/sentry/platform/ring0/defs_arm64.go +++ 
b/pkg/sentry/platform/ring0/defs_arm64.go @@ -18,7 +18,7 @@ package ring0 import ( "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) var ( diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD index 147311ed3..4cae10459 100644 --- a/pkg/sentry/platform/ring0/gen_offsets/BUILD +++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD @@ -28,6 +28,6 @@ go_binary( deps = [ "//pkg/cpuid", "//pkg/sentry/platform/ring0/pagetables", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index 8b5cdd6c1..971eed7fa 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -93,8 +93,8 @@ go_library( "//pkg/sentry/platform/ring0:__subpackages__", ], deps = [ - "//pkg/sentry/usermem", "//pkg/sync", + "//pkg/usermem", ], ) @@ -108,5 +108,5 @@ go_test( "walker_check.go", ], library = ":pagetables", - deps = ["//pkg/sentry/usermem"], + deps = ["//pkg/usermem"], ) diff --git a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go index a90394a33..d08bfdeb3 100644 --- a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go +++ b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go @@ -17,7 +17,7 @@ package pagetables import ( "unsafe" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // newAlignedPTEs returns a set of aligned PTEs. 
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index 30c64a372..87e88e97d 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -21,7 +21,7 @@ package pagetables import ( - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // PageTables is a set of page tables. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go index e78424766..78510ebed 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go @@ -19,7 +19,7 @@ package pagetables import ( "sync/atomic" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // archPageTables is architecture-specific data. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go index 35e917526..54e8e554f 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go @@ -19,7 +19,7 @@ package pagetables import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) func Test2MAnd4K(t *testing.T) { diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go index 254116233..2f73d424f 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go @@ -19,7 +19,7 @@ package pagetables import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) func Test2MAnd4K(t *testing.T) { diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go index 
6e95ad2b9..5c88d087d 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go @@ -17,7 +17,7 @@ package pagetables import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) type mapping struct { diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go index 3e2383c5e..dcf061df9 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go @@ -19,7 +19,7 @@ package pagetables import ( "sync/atomic" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // archPageTables is architecture-specific data. diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD deleted file mode 100644 index b8747585b..000000000 --- a/pkg/sentry/platform/safecopy/BUILD +++ /dev/null @@ -1,29 +0,0 @@ -load("//tools:defs.bzl", "go_library", "go_test") - -package(licenses = ["notice"]) - -go_library( - name = "safecopy", - srcs = [ - "atomic_amd64.s", - "atomic_arm64.s", - "memclr_amd64.s", - "memclr_arm64.s", - "memcpy_amd64.s", - "memcpy_arm64.s", - "safecopy.go", - "safecopy_unsafe.go", - "sighandler_amd64.s", - "sighandler_arm64.s", - ], - visibility = ["//pkg/sentry:internal"], - deps = ["//pkg/syserror"], -) - -go_test( - name = "safecopy_test", - srcs = [ - "safecopy_test.go", - ], - library = ":safecopy", -) diff --git a/pkg/sentry/platform/safecopy/LICENSE b/pkg/sentry/platform/safecopy/LICENSE deleted file mode 100644 index 6a66aea5e..000000000 --- a/pkg/sentry/platform/safecopy/LICENSE +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. 
- -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pkg/sentry/platform/safecopy/atomic_amd64.s b/pkg/sentry/platform/safecopy/atomic_amd64.s deleted file mode 100644 index a0cd78f33..000000000 --- a/pkg/sentry/platform/safecopy/atomic_amd64.s +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "textflag.h" - -// handleSwapUint32Fault returns the value stored in DI. Control is transferred -// to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal -// number stored in DI. -// -// It must have the same frame configuration as swapUint32 so that it can undo -// any potential call frame set up by the assembler. -TEXT handleSwapUint32Fault(SB), NOSPLIT, $0-24 - MOVL DI, sig+20(FP) - RET - -// swapUint32 atomically stores new into *addr and returns (the previous *addr -// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the -// value of old is unspecified, and sig is the number of the signal that was -// received. -// -// Preconditions: addr must be aligned to a 4-byte boundary. -// -//func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) -TEXT ·swapUint32(SB), NOSPLIT, $0-24 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleSwapUint32Fault will store a different value in this address. - MOVL $0, sig+20(FP) - - MOVQ addr+0(FP), DI - MOVL new+8(FP), AX - XCHGL AX, 0(DI) - MOVL AX, old+16(FP) - RET - -// handleSwapUint64Fault returns the value stored in DI. Control is transferred -// to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal -// number stored in DI. -// -// It must have the same frame configuration as swapUint64 so that it can undo -// any potential call frame set up by the assembler. 
-TEXT handleSwapUint64Fault(SB), NOSPLIT, $0-28 - MOVL DI, sig+24(FP) - RET - -// swapUint64 atomically stores new into *addr and returns (the previous *addr -// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the -// value of old is unspecified, and sig is the number of the signal that was -// received. -// -// Preconditions: addr must be aligned to a 8-byte boundary. -// -//func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) -TEXT ·swapUint64(SB), NOSPLIT, $0-28 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleSwapUint64Fault will store a different value in this address. - MOVL $0, sig+24(FP) - - MOVQ addr+0(FP), DI - MOVQ new+8(FP), AX - XCHGQ AX, 0(DI) - MOVQ AX, old+16(FP) - RET - -// handleCompareAndSwapUint32Fault returns the value stored in DI. Control is -// transferred to it when swapUint64 below receives SIGSEGV or SIGBUS, with the -// signal number stored in DI. -// -// It must have the same frame configuration as compareAndSwapUint32 so that it -// can undo any potential call frame set up by the assembler. -TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT, $0-24 - MOVL DI, sig+20(FP) - RET - -// compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns -// (the value previously stored at addr, 0). If a SIGSEGV or SIGBUS signal is -// received during the operation, the value of prev is unspecified, and sig is -// the number of the signal that was received. -// -// Preconditions: addr must be aligned to a 4-byte boundary. -// -//func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) -TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24 - // Store 0 as the returned signal number. If we run to completion, this is - // the value the caller will see; if a signal is received, - // handleCompareAndSwapUint32Fault will store a different value in this - // address. 
- MOVL $0, sig+20(FP) - - MOVQ addr+0(FP), DI - MOVL old+8(FP), AX - MOVL new+12(FP), DX - LOCK - CMPXCHGL DX, 0(DI) - MOVL AX, prev+16(FP) - RET - -// handleLoadUint32Fault returns the value stored in DI. Control is transferred -// to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal -// number stored in DI. -// -// It must have the same frame configuration as loadUint32 so that it can undo -// any potential call frame set up by the assembler. -TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16 - MOVL DI, sig+12(FP) - RET - -// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS -// signal is received, the value returned is unspecified, and sig is the number -// of the signal that was received. -// -// Preconditions: addr must be aligned to a 4-byte boundary. -// -//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) -TEXT ·loadUint32(SB), NOSPLIT, $0-16 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleLoadUint32Fault will store a different value in this address. - MOVL $0, sig+12(FP) - - MOVQ addr+0(FP), AX - MOVL (AX), BX - MOVL BX, val+8(FP) - RET diff --git a/pkg/sentry/platform/safecopy/atomic_arm64.s b/pkg/sentry/platform/safecopy/atomic_arm64.s deleted file mode 100644 index d58ed71f7..000000000 --- a/pkg/sentry/platform/safecopy/atomic_arm64.s +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "textflag.h" - -// handleSwapUint32Fault returns the value stored in R1. Control is transferred -// to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal -// number stored in R1. -// -// It must have the same frame configuration as swapUint32 so that it can undo -// any potential call frame set up by the assembler. 
-TEXT handleSwapUint32Fault(SB), NOSPLIT, $0-24 - MOVW R1, sig+20(FP) - RET - -// See the corresponding doc in safecopy_unsafe.go -// -// The code is derived from Go source runtime/internal/atomic.Xchg. -// -//func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) -TEXT ·swapUint32(SB), NOSPLIT, $0-24 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleSwapUint32Fault will store a different value in this address. - MOVW $0, sig+20(FP) -again: - MOVD addr+0(FP), R0 - MOVW new+8(FP), R1 - LDAXRW (R0), R2 - STLXRW R1, (R0), R3 - CBNZ R3, again - MOVW R2, old+16(FP) - RET - -// handleSwapUint64Fault returns the value stored in R1. Control is transferred -// to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal -// number stored in R1. -// -// It must have the same frame configuration as swapUint64 so that it can undo -// any potential call frame set up by the assembler. -TEXT handleSwapUint64Fault(SB), NOSPLIT, $0-28 - MOVW R1, sig+24(FP) - RET - -// See the corresponding doc in safecopy_unsafe.go -// -// The code is derived from Go source runtime/internal/atomic.Xchg64. -// -//func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) -TEXT ·swapUint64(SB), NOSPLIT, $0-28 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleSwapUint64Fault will store a different value in this address. - MOVW $0, sig+24(FP) -again: - MOVD addr+0(FP), R0 - MOVD new+8(FP), R1 - LDAXR (R0), R2 - STLXR R1, (R0), R3 - CBNZ R3, again - MOVD R2, old+16(FP) - RET - -// handleCompareAndSwapUint32Fault returns the value stored in R1. Control is -// transferred to it when compareAndSwapUint32 below receives SIGSEGV or SIGBUS, -// with the signal number stored in R1. 
-// -// It must have the same frame configuration as compareAndSwapUint32 so that it -// can undo any potential call frame set up by the assembler. -TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT, $0-24 - MOVW R1, sig+20(FP) - RET - -// See the corresponding doc in safecopy_unsafe.go -// -// The code is derived from Go source runtime/internal/atomic.Cas. -// -//func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) -TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24 - // Store 0 as the returned signal number. If we run to completion, this is - // the value the caller will see; if a signal is received, - // handleCompareAndSwapUint32Fault will store a different value in this - // address. - MOVW $0, sig+20(FP) - - MOVD addr+0(FP), R0 - MOVW old+8(FP), R1 - MOVW new+12(FP), R2 -again: - LDAXRW (R0), R3 - CMPW R1, R3 - BNE done - STLXRW R2, (R0), R4 - CBNZ R4, again -done: - MOVW R3, prev+16(FP) - RET - -// handleLoadUint32Fault returns the value stored in DI. Control is transferred -// to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal -// number stored in DI. -// -// It must have the same frame configuration as loadUint32 so that it can undo -// any potential call frame set up by the assembler. -TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16 - MOVW R1, sig+12(FP) - RET - -// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS -// signal is received, the value returned is unspecified, and sig is the number -// of the signal that was received. -// -// Preconditions: addr must be aligned to a 4-byte boundary. -// -//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) -TEXT ·loadUint32(SB), NOSPLIT, $0-16 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleLoadUint32Fault will store a different value in this address. 
- MOVW $0, sig+12(FP) - - MOVD addr+0(FP), R0 - LDARW (R0), R1 - MOVW R1, val+8(FP) - RET diff --git a/pkg/sentry/platform/safecopy/memclr_amd64.s b/pkg/sentry/platform/safecopy/memclr_amd64.s deleted file mode 100644 index 64cf32f05..000000000 --- a/pkg/sentry/platform/safecopy/memclr_amd64.s +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "textflag.h" - -// handleMemclrFault returns (the value stored in AX, the value stored in DI). -// Control is transferred to it when memclr below receives SIGSEGV or SIGBUS, -// with the faulting address stored in AX and the signal number stored in DI. -// -// It must have the same frame configuration as memclr so that it can undo any -// potential call frame set up by the assembler. -TEXT handleMemclrFault(SB), NOSPLIT, $0-28 - MOVQ AX, addr+16(FP) - MOVL DI, sig+24(FP) - RET - -// memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS -// signal is received during the write, it returns the address that caused the -// fault and the number of the signal that was received. Otherwise, it returns -// an unspecified address and a signal number of 0. -// -// Data is written in order, such that if a fault happens at address p, it is -// safe to assume that all data before p-maxRegisterSize has already been -// successfully written. -// -// The code is derived from runtime.memclrNoHeapPointers. -// -// func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) -TEXT ·memclr(SB), NOSPLIT, $0-28 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleMemclrFault will store a different value in this address. - MOVL $0, sig+24(FP) - - MOVQ ptr+0(FP), DI - MOVQ n+8(FP), BX - XORQ AX, AX - - // MOVOU seems always faster than REP STOSQ. 
-tail: - TESTQ BX, BX - JEQ _0 - CMPQ BX, $2 - JBE _1or2 - CMPQ BX, $4 - JBE _3or4 - CMPQ BX, $8 - JB _5through7 - JE _8 - CMPQ BX, $16 - JBE _9through16 - PXOR X0, X0 - CMPQ BX, $32 - JBE _17through32 - CMPQ BX, $64 - JBE _33through64 - CMPQ BX, $128 - JBE _65through128 - CMPQ BX, $256 - JBE _129through256 - // TODO: use branch table and BSR to make this just a single dispatch - // TODO: for really big clears, use MOVNTDQ, even without AVX2. - -loop: - MOVOU X0, 0(DI) - MOVOU X0, 16(DI) - MOVOU X0, 32(DI) - MOVOU X0, 48(DI) - MOVOU X0, 64(DI) - MOVOU X0, 80(DI) - MOVOU X0, 96(DI) - MOVOU X0, 112(DI) - MOVOU X0, 128(DI) - MOVOU X0, 144(DI) - MOVOU X0, 160(DI) - MOVOU X0, 176(DI) - MOVOU X0, 192(DI) - MOVOU X0, 208(DI) - MOVOU X0, 224(DI) - MOVOU X0, 240(DI) - SUBQ $256, BX - ADDQ $256, DI - CMPQ BX, $256 - JAE loop - JMP tail - -_1or2: - MOVB AX, (DI) - MOVB AX, -1(DI)(BX*1) - RET -_0: - RET -_3or4: - MOVW AX, (DI) - MOVW AX, -2(DI)(BX*1) - RET -_5through7: - MOVL AX, (DI) - MOVL AX, -4(DI)(BX*1) - RET -_8: - // We need a separate case for 8 to make sure we clear pointers atomically. 
- MOVQ AX, (DI) - RET -_9through16: - MOVQ AX, (DI) - MOVQ AX, -8(DI)(BX*1) - RET -_17through32: - MOVOU X0, (DI) - MOVOU X0, -16(DI)(BX*1) - RET -_33through64: - MOVOU X0, (DI) - MOVOU X0, 16(DI) - MOVOU X0, -32(DI)(BX*1) - MOVOU X0, -16(DI)(BX*1) - RET -_65through128: - MOVOU X0, (DI) - MOVOU X0, 16(DI) - MOVOU X0, 32(DI) - MOVOU X0, 48(DI) - MOVOU X0, -64(DI)(BX*1) - MOVOU X0, -48(DI)(BX*1) - MOVOU X0, -32(DI)(BX*1) - MOVOU X0, -16(DI)(BX*1) - RET -_129through256: - MOVOU X0, (DI) - MOVOU X0, 16(DI) - MOVOU X0, 32(DI) - MOVOU X0, 48(DI) - MOVOU X0, 64(DI) - MOVOU X0, 80(DI) - MOVOU X0, 96(DI) - MOVOU X0, 112(DI) - MOVOU X0, -128(DI)(BX*1) - MOVOU X0, -112(DI)(BX*1) - MOVOU X0, -96(DI)(BX*1) - MOVOU X0, -80(DI)(BX*1) - MOVOU X0, -64(DI)(BX*1) - MOVOU X0, -48(DI)(BX*1) - MOVOU X0, -32(DI)(BX*1) - MOVOU X0, -16(DI)(BX*1) - RET diff --git a/pkg/sentry/platform/safecopy/memclr_arm64.s b/pkg/sentry/platform/safecopy/memclr_arm64.s deleted file mode 100644 index 7361b9067..000000000 --- a/pkg/sentry/platform/safecopy/memclr_arm64.s +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "textflag.h" - -// handleMemclrFault returns (the value stored in R0, the value stored in R1). -// Control is transferred to it when memclr below receives SIGSEGV or SIGBUS, -// with the faulting address stored in R0 and the signal number stored in R1. -// -// It must have the same frame configuration as memclr so that it can undo any -// potential call frame set up by the assembler. -TEXT handleMemclrFault(SB), NOSPLIT, $0-28 - MOVD R0, addr+16(FP) - MOVW R1, sig+24(FP) - RET - -// See the corresponding doc in safecopy_unsafe.go -// -// The code is derived from runtime.memclrNoHeapPointers. 
-// -// func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) -TEXT ·memclr(SB), NOSPLIT, $0-28 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleMemclrFault will store a different value in this address. - MOVW $0, sig+24(FP) - MOVD ptr+0(FP), R0 - MOVD n+8(FP), R1 - - // If size is less than 16 bytes, use tail_zero to zero what remains - CMP $16, R1 - BLT tail_zero - // Get buffer offset into 16 byte aligned address for better performance - ANDS $15, R0, ZR - BNE unaligned_to_16 -aligned_to_16: - LSR $4, R1, R2 -zero_by_16: - STP.P (ZR, ZR), 16(R0) // Store pair with post index. - SUBS $1, R2, R2 - BNE zero_by_16 - ANDS $15, R1, R1 - BEQ end - - // Zero buffer with size=R1 < 16 -tail_zero: - TBZ $3, R1, tail_zero_4 - MOVD.P ZR, 8(R0) -tail_zero_4: - TBZ $2, R1, tail_zero_2 - MOVW.P ZR, 4(R0) -tail_zero_2: - TBZ $1, R1, tail_zero_1 - MOVH.P ZR, 2(R0) -tail_zero_1: - TBZ $0, R1, end - MOVB ZR, (R0) -end: - RET - -unaligned_to_16: - MOVD R0, R2 -head_loop: - MOVBU.P ZR, 1(R0) - ANDS $15, R0, ZR - BNE head_loop - // Adjust length for what remains - SUB R2, R0, R3 - SUB R3, R1 - // If size is less than 16 bytes, use tail_zero to zero what remains - CMP $16, R1 - BLT tail_zero - B aligned_to_16 diff --git a/pkg/sentry/platform/safecopy/memcpy_amd64.s b/pkg/sentry/platform/safecopy/memcpy_amd64.s deleted file mode 100644 index 129691d68..000000000 --- a/pkg/sentry/platform/safecopy/memcpy_amd64.s +++ /dev/null @@ -1,250 +0,0 @@ -// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. -// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. -// Portions Copyright 2009 The Go Authors. All rights reserved. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#include "textflag.h" - -// handleMemcpyFault returns (the value stored in AX, the value stored in DI). -// Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS, -// with the faulting address stored in AX and the signal number stored in DI. -// -// It must have the same frame configuration as memcpy so that it can undo any -// potential call frame set up by the assembler. -TEXT handleMemcpyFault(SB), NOSPLIT, $0-36 - MOVQ AX, addr+24(FP) - MOVL DI, sig+32(FP) - RET - -// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received -// during the copy, it returns the address that caused the fault and the number -// of the signal that was received. Otherwise, it returns an unspecified address -// and a signal number of 0. 
-// -// Data is copied in order, such that if a fault happens at address p, it is -// safe to assume that all data before p-maxRegisterSize has already been -// successfully copied. -// -// The code is derived from the forward copying part of runtime.memmove. -// -// func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) -TEXT ·memcpy(SB), NOSPLIT, $0-36 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleMemcpyFault will store a different value in this address. - MOVL $0, sig+32(FP) - - MOVQ to+0(FP), DI - MOVQ from+8(FP), SI - MOVQ n+16(FP), BX - - // REP instructions have a high startup cost, so we handle small sizes - // with some straightline code. The REP MOVSQ instruction is really fast - // for large sizes. The cutover is approximately 2K. -tail: - // move_129through256 or smaller work whether or not the source and the - // destination memory regions overlap because they load all data into - // registers before writing it back. move_256through2048 on the other - // hand can be used only when the memory regions don't overlap or the copy - // direction is forward. 
- TESTQ BX, BX - JEQ move_0 - CMPQ BX, $2 - JBE move_1or2 - CMPQ BX, $4 - JBE move_3or4 - CMPQ BX, $8 - JB move_5through7 - JE move_8 - CMPQ BX, $16 - JBE move_9through16 - CMPQ BX, $32 - JBE move_17through32 - CMPQ BX, $64 - JBE move_33through64 - CMPQ BX, $128 - JBE move_65through128 - CMPQ BX, $256 - JBE move_129through256 - // TODO: use branch table and BSR to make this just a single dispatch - -/* - * forward copy loop - */ - CMPQ BX, $2048 - JLS move_256through2048 - - // Check alignment - MOVL SI, AX - ORL DI, AX - TESTL $7, AX - JEQ fwdBy8 - - // Do 1 byte at a time - MOVQ BX, CX - REP; MOVSB - RET - -fwdBy8: - // Do 8 bytes at a time - MOVQ BX, CX - SHRQ $3, CX - ANDQ $7, BX - REP; MOVSQ - JMP tail - -move_1or2: - MOVB (SI), AX - MOVB AX, (DI) - MOVB -1(SI)(BX*1), CX - MOVB CX, -1(DI)(BX*1) - RET -move_0: - RET -move_3or4: - MOVW (SI), AX - MOVW AX, (DI) - MOVW -2(SI)(BX*1), CX - MOVW CX, -2(DI)(BX*1) - RET -move_5through7: - MOVL (SI), AX - MOVL AX, (DI) - MOVL -4(SI)(BX*1), CX - MOVL CX, -4(DI)(BX*1) - RET -move_8: - // We need a separate case for 8 to make sure we write pointers atomically. 
- MOVQ (SI), AX - MOVQ AX, (DI) - RET -move_9through16: - MOVQ (SI), AX - MOVQ AX, (DI) - MOVQ -8(SI)(BX*1), CX - MOVQ CX, -8(DI)(BX*1) - RET -move_17through32: - MOVOU (SI), X0 - MOVOU X0, (DI) - MOVOU -16(SI)(BX*1), X1 - MOVOU X1, -16(DI)(BX*1) - RET -move_33through64: - MOVOU (SI), X0 - MOVOU X0, (DI) - MOVOU 16(SI), X1 - MOVOU X1, 16(DI) - MOVOU -32(SI)(BX*1), X2 - MOVOU X2, -32(DI)(BX*1) - MOVOU -16(SI)(BX*1), X3 - MOVOU X3, -16(DI)(BX*1) - RET -move_65through128: - MOVOU (SI), X0 - MOVOU X0, (DI) - MOVOU 16(SI), X1 - MOVOU X1, 16(DI) - MOVOU 32(SI), X2 - MOVOU X2, 32(DI) - MOVOU 48(SI), X3 - MOVOU X3, 48(DI) - MOVOU -64(SI)(BX*1), X4 - MOVOU X4, -64(DI)(BX*1) - MOVOU -48(SI)(BX*1), X5 - MOVOU X5, -48(DI)(BX*1) - MOVOU -32(SI)(BX*1), X6 - MOVOU X6, -32(DI)(BX*1) - MOVOU -16(SI)(BX*1), X7 - MOVOU X7, -16(DI)(BX*1) - RET -move_129through256: - MOVOU (SI), X0 - MOVOU X0, (DI) - MOVOU 16(SI), X1 - MOVOU X1, 16(DI) - MOVOU 32(SI), X2 - MOVOU X2, 32(DI) - MOVOU 48(SI), X3 - MOVOU X3, 48(DI) - MOVOU 64(SI), X4 - MOVOU X4, 64(DI) - MOVOU 80(SI), X5 - MOVOU X5, 80(DI) - MOVOU 96(SI), X6 - MOVOU X6, 96(DI) - MOVOU 112(SI), X7 - MOVOU X7, 112(DI) - MOVOU -128(SI)(BX*1), X8 - MOVOU X8, -128(DI)(BX*1) - MOVOU -112(SI)(BX*1), X9 - MOVOU X9, -112(DI)(BX*1) - MOVOU -96(SI)(BX*1), X10 - MOVOU X10, -96(DI)(BX*1) - MOVOU -80(SI)(BX*1), X11 - MOVOU X11, -80(DI)(BX*1) - MOVOU -64(SI)(BX*1), X12 - MOVOU X12, -64(DI)(BX*1) - MOVOU -48(SI)(BX*1), X13 - MOVOU X13, -48(DI)(BX*1) - MOVOU -32(SI)(BX*1), X14 - MOVOU X14, -32(DI)(BX*1) - MOVOU -16(SI)(BX*1), X15 - MOVOU X15, -16(DI)(BX*1) - RET -move_256through2048: - SUBQ $256, BX - MOVOU (SI), X0 - MOVOU X0, (DI) - MOVOU 16(SI), X1 - MOVOU X1, 16(DI) - MOVOU 32(SI), X2 - MOVOU X2, 32(DI) - MOVOU 48(SI), X3 - MOVOU X3, 48(DI) - MOVOU 64(SI), X4 - MOVOU X4, 64(DI) - MOVOU 80(SI), X5 - MOVOU X5, 80(DI) - MOVOU 96(SI), X6 - MOVOU X6, 96(DI) - MOVOU 112(SI), X7 - MOVOU X7, 112(DI) - MOVOU 128(SI), X8 - MOVOU X8, 128(DI) - MOVOU 144(SI), X9 - 
MOVOU X9, 144(DI) - MOVOU 160(SI), X10 - MOVOU X10, 160(DI) - MOVOU 176(SI), X11 - MOVOU X11, 176(DI) - MOVOU 192(SI), X12 - MOVOU X12, 192(DI) - MOVOU 208(SI), X13 - MOVOU X13, 208(DI) - MOVOU 224(SI), X14 - MOVOU X14, 224(DI) - MOVOU 240(SI), X15 - MOVOU X15, 240(DI) - CMPQ BX, $256 - LEAQ 256(SI), SI - LEAQ 256(DI), DI - JGE move_256through2048 - JMP tail diff --git a/pkg/sentry/platform/safecopy/memcpy_arm64.s b/pkg/sentry/platform/safecopy/memcpy_arm64.s deleted file mode 100644 index e7e541565..000000000 --- a/pkg/sentry/platform/safecopy/memcpy_arm64.s +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "textflag.h" - -// handleMemcpyFault returns (the value stored in R0, the value stored in R1). -// Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS, -// with the faulting address stored in R0 and the signal number stored in R1. -// -// It must have the same frame configuration as memcpy so that it can undo any -// potential call frame set up by the assembler. -TEXT handleMemcpyFault(SB), NOSPLIT, $0-36 - MOVD R0, addr+24(FP) - MOVW R1, sig+32(FP) - RET - -// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received -// during the copy, it returns the address that caused the fault and the number -// of the signal that was received. Otherwise, it returns an unspecified address -// and a signal number of 0. -// -// Data is copied in order, such that if a fault happens at address p, it is -// safe to assume that all data before p-maxRegisterSize has already been -// successfully copied. -// -// The code is derived from the Go source runtime.memmove. -// -// func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) -TEXT ·memcpy(SB), NOSPLIT, $-8-36 - // Store 0 as the returned signal number. 
If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleMemcpyFault will store a different value in this address. - MOVW $0, sig+32(FP) - - MOVD to+0(FP), R3 - MOVD from+8(FP), R4 - MOVD n+16(FP), R5 - CMP $0, R5 - BNE check - RET - -check: - AND $~7, R5, R7 // R7 is N&~7. - SUB R7, R5, R6 // R6 is N&7. - - // Copying forward proceeds by copying R7/8 words then copying R6 bytes. - // R3 and R4 are advanced as we copy. - - // (There may be implementations of armv8 where copying by bytes until - // at least one of source or dest is word aligned is a worthwhile - // optimization, but the on the one tested so far (xgene) it did not - // make a significance difference.) - - CMP $0, R7 // Do we need to do any word-by-word copying? - BEQ noforwardlarge - ADD R3, R7, R9 // R9 points just past where we copy by word. - -forwardlargeloop: - MOVD.P 8(R4), R8 // R8 is just a scratch register. - MOVD.P R8, 8(R3) - CMP R3, R9 - BNE forwardlargeloop - -noforwardlarge: - CMP $0, R6 // Do we need to do any byte-by-byte copying? - BNE forwardtail - RET - -forwardtail: - ADD R3, R6, R9 // R9 points just past the destination memory. - -forwardtailloop: - MOVBU.P 1(R4), R8 - MOVBU.P R8, 1(R3) - CMP R3, R9 - BNE forwardtailloop - RET diff --git a/pkg/sentry/platform/safecopy/safecopy.go b/pkg/sentry/platform/safecopy/safecopy.go deleted file mode 100644 index 2fb7e5809..000000000 --- a/pkg/sentry/platform/safecopy/safecopy.go +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package safecopy provides an efficient implementation of functions to access -// memory that may result in SIGSEGV or SIGBUS being sent to the accessor. -package safecopy - -import ( - "fmt" - "reflect" - "runtime" - "syscall" - - "gvisor.dev/gvisor/pkg/syserror" -) - -// SegvError is returned when a safecopy function receives SIGSEGV. -type SegvError struct { - // Addr is the address at which the SIGSEGV occurred. - Addr uintptr -} - -// Error implements error.Error. -func (e SegvError) Error() string { - return fmt.Sprintf("SIGSEGV at %#x", e.Addr) -} - -// BusError is returned when a safecopy function receives SIGBUS. -type BusError struct { - // Addr is the address at which the SIGBUS occurred. - Addr uintptr -} - -// Error implements error.Error. -func (e BusError) Error() string { - return fmt.Sprintf("SIGBUS at %#x", e.Addr) -} - -// AlignmentError is returned when a safecopy function is passed an address -// that does not meet alignment requirements. -type AlignmentError struct { - // Addr is the invalid address. - Addr uintptr - - // Alignment is the required alignment. - Alignment uintptr -} - -// Error implements error.Error. -func (e AlignmentError) Error() string { - return fmt.Sprintf("address %#x is not aligned to a %d-byte boundary", e.Addr, e.Alignment) -} - -var ( - // The begin and end addresses below are for the functions that are - // checked by the signal handler. 
- memcpyBegin uintptr - memcpyEnd uintptr - memclrBegin uintptr - memclrEnd uintptr - swapUint32Begin uintptr - swapUint32End uintptr - swapUint64Begin uintptr - swapUint64End uintptr - compareAndSwapUint32Begin uintptr - compareAndSwapUint32End uintptr - loadUint32Begin uintptr - loadUint32End uintptr - - // savedSigSegVHandler is a pointer to the SIGSEGV handler that was - // configured before we replaced it with our own. We still call into it - // when we get a SIGSEGV that is not interesting to us. - savedSigSegVHandler uintptr - - // same a above, but for SIGBUS signals. - savedSigBusHandler uintptr -) - -// signalHandler is our replacement signal handler for SIGSEGV and SIGBUS -// signals. -func signalHandler() - -// FindEndAddress returns the end address (one byte beyond the last) of the -// function that contains the specified address (begin). -func FindEndAddress(begin uintptr) uintptr { - f := runtime.FuncForPC(begin) - if f != nil { - for p := begin; ; p++ { - g := runtime.FuncForPC(p) - if f != g { - return p - } - } - } - return begin -} - -// initializeAddresses initializes the addresses used by the signal handler. -func initializeAddresses() { - // The following functions are written in assembly language, so they won't - // be inlined by the existing compiler/linker. Tests will fail if this - // assumption is violated. 
- memcpyBegin = reflect.ValueOf(memcpy).Pointer() - memcpyEnd = FindEndAddress(memcpyBegin) - memclrBegin = reflect.ValueOf(memclr).Pointer() - memclrEnd = FindEndAddress(memclrBegin) - swapUint32Begin = reflect.ValueOf(swapUint32).Pointer() - swapUint32End = FindEndAddress(swapUint32Begin) - swapUint64Begin = reflect.ValueOf(swapUint64).Pointer() - swapUint64End = FindEndAddress(swapUint64Begin) - compareAndSwapUint32Begin = reflect.ValueOf(compareAndSwapUint32).Pointer() - compareAndSwapUint32End = FindEndAddress(compareAndSwapUint32Begin) - loadUint32Begin = reflect.ValueOf(loadUint32).Pointer() - loadUint32End = FindEndAddress(loadUint32Begin) -} - -func init() { - initializeAddresses() - if err := ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(signalHandler).Pointer(), &savedSigSegVHandler); err != nil { - panic(fmt.Sprintf("Unable to set handler for SIGSEGV: %v", err)) - } - if err := ReplaceSignalHandler(syscall.SIGBUS, reflect.ValueOf(signalHandler).Pointer(), &savedSigBusHandler); err != nil { - panic(fmt.Sprintf("Unable to set handler for SIGBUS: %v", err)) - } - syserror.AddErrorUnwrapper(func(e error) (syscall.Errno, bool) { - switch e.(type) { - case SegvError, BusError, AlignmentError: - return syscall.EFAULT, true - default: - return 0, false - } - }) -} diff --git a/pkg/sentry/platform/safecopy/safecopy_test.go b/pkg/sentry/platform/safecopy/safecopy_test.go deleted file mode 100644 index 5818f7f9b..000000000 --- a/pkg/sentry/platform/safecopy/safecopy_test.go +++ /dev/null @@ -1,617 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package safecopy - -import ( - "bytes" - "fmt" - "io/ioutil" - "math/rand" - "os" - "runtime/debug" - "syscall" - "testing" - "unsafe" -) - -// Size of a page in bytes. Cloned from usermem.PageSize to avoid a circular -// dependency. -const pageSize = 4096 - -func initRandom(b []byte) { - for i := range b { - b[i] = byte(rand.Intn(256)) - } -} - -func randBuf(size int) []byte { - b := make([]byte, size) - initRandom(b) - return b -} - -func TestCopyInSuccess(t *testing.T) { - // Test that CopyIn does not return an error when all pages are accessible. - const bufLen = 8192 - a := randBuf(bufLen) - b := make([]byte, bufLen) - - n, err := CopyIn(b, unsafe.Pointer(&a[0])) - if n != bufLen { - t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) - } - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if !bytes.Equal(a, b) { - t.Errorf("Buffers are not equal when they should be: %v %v", a, b) - } -} - -func TestCopyOutSuccess(t *testing.T) { - // Test that CopyOut does not return an error when all pages are - // accessible. - const bufLen = 8192 - a := randBuf(bufLen) - b := make([]byte, bufLen) - - n, err := CopyOut(unsafe.Pointer(&b[0]), a) - if n != bufLen { - t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) - } - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if !bytes.Equal(a, b) { - t.Errorf("Buffers are not equal when they should be: %v %v", a, b) - } -} - -func TestCopySuccess(t *testing.T) { - // Test that Copy does not return an error when all pages are accessible. 
- const bufLen = 8192 - a := randBuf(bufLen) - b := make([]byte, bufLen) - - n, err := Copy(unsafe.Pointer(&b[0]), unsafe.Pointer(&a[0]), bufLen) - if n != bufLen { - t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) - } - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if !bytes.Equal(a, b) { - t.Errorf("Buffers are not equal when they should be: %v %v", a, b) - } -} - -func TestZeroOutSuccess(t *testing.T) { - // Test that ZeroOut does not return an error when all pages are - // accessible. - const bufLen = 8192 - a := make([]byte, bufLen) - b := randBuf(bufLen) - - n, err := ZeroOut(unsafe.Pointer(&b[0]), bufLen) - if n != bufLen { - t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) - } - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if !bytes.Equal(a, b) { - t.Errorf("Buffers are not equal when they should be: %v %v", a, b) - } -} - -func TestSwapUint32Success(t *testing.T) { - // Test that SwapUint32 does not return an error when the page is - // accessible. - before := uint32(rand.Int31()) - after := uint32(rand.Int31()) - val := before - - old, err := SwapUint32(unsafe.Pointer(&val), after) - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if old != before { - t.Errorf("Unexpected old value: got %v, want %v", old, before) - } - if val != after { - t.Errorf("Unexpected new value: got %v, want %v", val, after) - } -} - -func TestSwapUint32AlignmentError(t *testing.T) { - // Test that SwapUint32 returns an AlignmentError when passed an unaligned - // address. - data := new(struct{ val uint64 }) - addr := uintptr(unsafe.Pointer(&data.val)) + 1 - want := AlignmentError{Addr: addr, Alignment: 4} - if _, err := SwapUint32(unsafe.Pointer(addr), 1); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } -} - -func TestSwapUint64Success(t *testing.T) { - // Test that SwapUint64 does not return an error when the page is - // accessible. 
- before := uint64(rand.Int63()) - after := uint64(rand.Int63()) - // "The first word in ... an allocated struct or slice can be relied upon - // to be 64-bit aligned." - sync/atomic docs - data := new(struct{ val uint64 }) - data.val = before - - old, err := SwapUint64(unsafe.Pointer(&data.val), after) - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if old != before { - t.Errorf("Unexpected old value: got %v, want %v", old, before) - } - if data.val != after { - t.Errorf("Unexpected new value: got %v, want %v", data.val, after) - } -} - -func TestSwapUint64AlignmentError(t *testing.T) { - // Test that SwapUint64 returns an AlignmentError when passed an unaligned - // address. - data := new(struct{ val1, val2 uint64 }) - addr := uintptr(unsafe.Pointer(&data.val1)) + 1 - want := AlignmentError{Addr: addr, Alignment: 8} - if _, err := SwapUint64(unsafe.Pointer(addr), 1); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } -} - -func TestCompareAndSwapUint32Success(t *testing.T) { - // Test that CompareAndSwapUint32 does not return an error when the page is - // accessible. - before := uint32(rand.Int31()) - after := uint32(rand.Int31()) - val := before - - old, err := CompareAndSwapUint32(unsafe.Pointer(&val), before, after) - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if old != before { - t.Errorf("Unexpected old value: got %v, want %v", old, before) - } - if val != after { - t.Errorf("Unexpected new value: got %v, want %v", val, after) - } -} - -func TestCompareAndSwapUint32AlignmentError(t *testing.T) { - // Test that CompareAndSwapUint32 returns an AlignmentError when passed an - // unaligned address. 
- data := new(struct{ val uint64 }) - addr := uintptr(unsafe.Pointer(&data.val)) + 1 - want := AlignmentError{Addr: addr, Alignment: 4} - if _, err := CompareAndSwapUint32(unsafe.Pointer(addr), 0, 1); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } -} - -// withSegvErrorTestMapping calls fn with a two-page mapping. The first page -// contains random data, and the second page generates SIGSEGV when accessed. -func withSegvErrorTestMapping(t *testing.T, fn func(m []byte)) { - mapping, err := syscall.Mmap(-1, 0, 2*pageSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE) - if err != nil { - t.Fatalf("Mmap failed: %v", err) - } - defer syscall.Munmap(mapping) - if err := syscall.Mprotect(mapping[pageSize:], syscall.PROT_NONE); err != nil { - t.Fatalf("Mprotect failed: %v", err) - } - initRandom(mapping[:pageSize]) - - fn(mapping) -} - -// withBusErrorTestMapping calls fn with a two-page mapping. The first page -// contains random data, and the second page generates SIGBUS when accessed. -func withBusErrorTestMapping(t *testing.T, fn func(m []byte)) { - f, err := ioutil.TempFile("", "sigbus_test") - if err != nil { - t.Fatalf("TempFile failed: %v", err) - } - defer f.Close() - if err := f.Truncate(pageSize); err != nil { - t.Fatalf("Truncate failed: %v", err) - } - mapping, err := syscall.Mmap(int(f.Fd()), 0, 2*pageSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) - if err != nil { - t.Fatalf("Mmap failed: %v", err) - } - defer syscall.Munmap(mapping) - initRandom(mapping[:pageSize]) - - fn(mapping) -} - -func TestCopyInSegvError(t *testing.T) { - // Test that CopyIn returns a SegvError when reaching a page that signals - // SIGSEGV. 
- for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { - withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - dst := randBuf(pageSize) - n, err := CopyIn(dst, src) - if n != bytesBeforeFault { - t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) - } - if want := (SegvError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { - t.Errorf("Buffers are not equal when they should be: %v %v", got, want) - } - }) - }) - } -} - -func TestCopyInBusError(t *testing.T) { - // Test that CopyIn returns a BusError when reaching a page that signals - // SIGBUS. - for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { - withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - dst := randBuf(pageSize) - n, err := CopyIn(dst, src) - if n != bytesBeforeFault { - t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) - } - if want := (BusError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { - t.Errorf("Buffers are not equal when they should be: %v %v", got, want) - } - }) - }) - } -} - -func TestCopyOutSegvError(t *testing.T) { - // Test that CopyOut returns a SegvError when reaching a page that signals - // SIGSEGV. 
- for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { - withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - src := randBuf(pageSize) - n, err := CopyOut(dst, src) - if n != bytesBeforeFault { - t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) - } - if want := (SegvError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { - t.Errorf("Buffers are not equal when they should be: %v %v", got, want) - } - }) - }) - } -} - -func TestCopyOutBusError(t *testing.T) { - // Test that CopyOut returns a BusError when reaching a page that signals - // SIGBUS. - for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { - withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - src := randBuf(pageSize) - n, err := CopyOut(dst, src) - if n != bytesBeforeFault { - t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) - } - if want := (BusError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { - t.Errorf("Buffers are not equal when they should be: %v %v", got, want) - } - }) - }) - } -} - -func TestCopySourceSegvError(t *testing.T) { - // Test that Copy returns a SegvError when copying from a page that signals - // SIGSEGV. 
- for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { - withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - dst := randBuf(pageSize) - n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize) - if n != uintptr(bytesBeforeFault) { - t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) - } - if want := (SegvError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { - t.Errorf("Buffers are not equal when they should be: %v %v", got, want) - } - }) - }) - } -} - -func TestCopySourceBusError(t *testing.T) { - // Test that Copy returns a BusError when copying from a page that signals - // SIGBUS. 
- for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { - withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - dst := randBuf(pageSize) - n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize) - if n != uintptr(bytesBeforeFault) { - t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) - } - if want := (BusError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { - t.Errorf("Buffers are not equal when they should be: %v %v", got, want) - } - }) - }) - } -} - -func TestCopyDestinationSegvError(t *testing.T) { - // Test that Copy returns a SegvError when copying to a page that signals - // SIGSEGV. 
- for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { - withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - src := randBuf(pageSize) - n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize) - if n != uintptr(bytesBeforeFault) { - t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) - } - if want := (SegvError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { - t.Errorf("Buffers are not equal when they should be: %v %v", got, want) - } - }) - }) - } -} - -func TestCopyDestinationBusError(t *testing.T) { - // Test that Copy returns a BusError when copying to a page that signals - // SIGBUS. 
- for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { - withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - src := randBuf(pageSize) - n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize) - if n != uintptr(bytesBeforeFault) { - t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) - } - if want := (BusError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { - t.Errorf("Buffers are not equal when they should be: %v %v", got, want) - } - }) - }) - } -} - -func TestZeroOutSegvError(t *testing.T) { - // Test that ZeroOut returns a SegvError when reaching a page that signals - // SIGSEGV. 
- for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting write %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { - withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - n, err := ZeroOut(dst, pageSize) - if n != uintptr(bytesBeforeFault) { - t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault) - } - if want := (SegvError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := mapping[pageSize-bytesBeforeFault:pageSize], make([]byte, bytesBeforeFault); !bytes.Equal(got, want) { - t.Errorf("Non-zero bytes in written part of mapping: %v", got) - } - }) - }) - } -} - -func TestZeroOutBusError(t *testing.T) { - // Test that ZeroOut returns a BusError when reaching a page that signals - // SIGBUS. - for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting write %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { - withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - n, err := ZeroOut(dst, pageSize) - if n != uintptr(bytesBeforeFault) { - t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault) - } - if want := (BusError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := mapping[pageSize-bytesBeforeFault:pageSize], make([]byte, bytesBeforeFault); !bytes.Equal(got, want) { - t.Errorf("Non-zero bytes in written part of mapping: %v", got) - } - }) - }) - } -} - -func TestSwapUint32SegvError(t *testing.T) { - // Test that SwapUint32 returns a SegvError when reaching a page that - // signals SIGSEGV. 
- withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - _, err := SwapUint32(unsafe.Pointer(secondPage), 1) - if want := (SegvError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - }) -} - -func TestSwapUint32BusError(t *testing.T) { - // Test that SwapUint32 returns a BusError when reaching a page that - // signals SIGBUS. - withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - _, err := SwapUint32(unsafe.Pointer(secondPage), 1) - if want := (BusError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - }) -} - -func TestSwapUint64SegvError(t *testing.T) { - // Test that SwapUint64 returns a SegvError when reaching a page that - // signals SIGSEGV. - withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - _, err := SwapUint64(unsafe.Pointer(secondPage), 1) - if want := (SegvError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - }) -} - -func TestSwapUint64BusError(t *testing.T) { - // Test that SwapUint64 returns a BusError when reaching a page that - // signals SIGBUS. - withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - _, err := SwapUint64(unsafe.Pointer(secondPage), 1) - if want := (BusError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - }) -} - -func TestCompareAndSwapUint32SegvError(t *testing.T) { - // Test that CompareAndSwapUint32 returns a SegvError when reaching a page - // that signals SIGSEGV. 
- withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - _, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1) - if want := (SegvError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - }) -} - -func TestCompareAndSwapUint32BusError(t *testing.T) { - // Test that CompareAndSwapUint32 returns a BusError when reaching a page - // that signals SIGBUS. - withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - _, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1) - if want := (BusError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - }) -} - -func testCopy(dst, src []byte) (panicked bool) { - defer func() { - if r := recover(); r != nil { - panicked = true - } - }() - debug.SetPanicOnFault(true) - copy(dst, src) - return -} - -func TestSegVOnMemmove(t *testing.T) { - // Test that SIGSEGVs received by runtime.memmove when *not* doing - // CopyIn or CopyOut work gets propagated to the runtime. - const bufLen = pageSize - a, err := syscall.Mmap(-1, 0, bufLen, syscall.PROT_NONE, syscall.MAP_ANON|syscall.MAP_PRIVATE) - if err != nil { - t.Fatalf("Mmap failed: %v", err) - - } - defer syscall.Munmap(a) - b := randBuf(bufLen) - - if !testCopy(b, a) { - t.Fatalf("testCopy didn't panic when it should have") - } - - if !testCopy(a, b) { - t.Fatalf("testCopy didn't panic when it should have") - } -} - -func TestSigbusOnMemmove(t *testing.T) { - // Test that SIGBUS received by runtime.memmove when *not* doing - // CopyIn or CopyOut work gets propagated to the runtime. 
- const bufLen = pageSize - f, err := ioutil.TempFile("", "sigbus_test") - if err != nil { - t.Fatalf("TempFile failed: %v", err) - } - os.Remove(f.Name()) - defer f.Close() - - a, err := syscall.Mmap(int(f.Fd()), 0, bufLen, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) - if err != nil { - t.Fatalf("Mmap failed: %v", err) - - } - defer syscall.Munmap(a) - b := randBuf(bufLen) - - if !testCopy(b, a) { - t.Fatalf("testCopy didn't panic when it should have") - } - - if !testCopy(a, b) { - t.Fatalf("testCopy didn't panic when it should have") - } -} diff --git a/pkg/sentry/platform/safecopy/safecopy_unsafe.go b/pkg/sentry/platform/safecopy/safecopy_unsafe.go deleted file mode 100644 index eef028e68..000000000 --- a/pkg/sentry/platform/safecopy/safecopy_unsafe.go +++ /dev/null @@ -1,335 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package safecopy - -import ( - "fmt" - "syscall" - "unsafe" -) - -// maxRegisterSize is the maximum register size used in memcpy and memclr. It -// is used to decide by how much to rewind the copy (for memcpy) or zeroing -// (for memclr) before proceeding. -const maxRegisterSize = 16 - -// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received -// during the copy, it returns the address that caused the fault and the number -// of the signal that was received. Otherwise, it returns an unspecified address -// and a signal number of 0. 
-// -// Data is copied in order, such that if a fault happens at address p, it is -// safe to assume that all data before p-maxRegisterSize has already been -// successfully copied. -// -//go:noescape -func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) - -// memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS -// signal is received during the write, it returns the address that caused the -// fault and the number of the signal that was received. Otherwise, it returns -// an unspecified address and a signal number of 0. -// -// Data is written in order, such that if a fault happens at address p, it is -// safe to assume that all data before p-maxRegisterSize has already been -// successfully written. -// -//go:noescape -func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) - -// swapUint32 atomically stores new into *ptr and returns (the previous *ptr -// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the -// value of old is unspecified, and sig is the number of the signal that was -// received. -// -// Preconditions: ptr must be aligned to a 4-byte boundary. -// -//go:noescape -func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) - -// swapUint64 atomically stores new into *ptr and returns (the previous *ptr -// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the -// value of old is unspecified, and sig is the number of the signal that was -// received. -// -// Preconditions: ptr must be aligned to a 8-byte boundary. -// -//go:noescape -func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) - -// compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns -// (the value previously stored at ptr, 0). If a SIGSEGV or SIGBUS signal is -// received during the operation, the value of prev is unspecified, and sig is -// the number of the signal that was received. 
-// -// Preconditions: ptr must be aligned to a 4-byte boundary. -// -//go:noescape -func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) - -// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It -// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr. -// -// Preconditions: ptr must be aligned to a 4-byte boundary. -// -//go:noescape -func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) - -// CopyIn copies len(dst) bytes from src to dst. It returns the number of bytes -// copied and an error if SIGSEGV or SIGBUS is received while reading from src. -func CopyIn(dst []byte, src unsafe.Pointer) (int, error) { - toCopy := uintptr(len(dst)) - if len(dst) == 0 { - return 0, nil - } - - fault, sig := memcpy(unsafe.Pointer(&dst[0]), src, toCopy) - if sig == 0 { - return len(dst), nil - } - - faultN, srcN := uintptr(fault), uintptr(src) - if faultN < srcN || faultN >= srcN+toCopy { - panic(fmt.Sprintf("CopyIn raised signal %d at %#x, which is outside source [%#x, %#x)", sig, faultN, srcN, srcN+toCopy)) - } - - // memcpy might have ended the copy up to maxRegisterSize bytes before - // fault, if an instruction caused a memory access that straddled two - // pages, and the second one faulted. Try to copy up to the fault. - var done int - if faultN-srcN > maxRegisterSize { - done = int(faultN - srcN - maxRegisterSize) - } - n, err := CopyIn(dst[done:int(faultN-srcN)], unsafe.Pointer(srcN+uintptr(done))) - done += n - if err != nil { - return done, err - } - return done, errorFromFaultSignal(fault, sig) -} - -// CopyOut copies len(src) bytes from src to dst. If returns the number of -// bytes done and an error if SIGSEGV or SIGBUS is received while writing to -// dst. 
-func CopyOut(dst unsafe.Pointer, src []byte) (int, error) { - toCopy := uintptr(len(src)) - if toCopy == 0 { - return 0, nil - } - - fault, sig := memcpy(dst, unsafe.Pointer(&src[0]), toCopy) - if sig == 0 { - return len(src), nil - } - - faultN, dstN := uintptr(fault), uintptr(dst) - if faultN < dstN || faultN >= dstN+toCopy { - panic(fmt.Sprintf("CopyOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toCopy)) - } - - // memcpy might have ended the copy up to maxRegisterSize bytes before - // fault, if an instruction caused a memory access that straddled two - // pages, and the second one faulted. Try to copy up to the fault. - var done int - if faultN-dstN > maxRegisterSize { - done = int(faultN - dstN - maxRegisterSize) - } - n, err := CopyOut(unsafe.Pointer(dstN+uintptr(done)), src[done:int(faultN-dstN)]) - done += n - if err != nil { - return done, err - } - return done, errorFromFaultSignal(fault, sig) -} - -// Copy copies toCopy bytes from src to dst. It returns the number of bytes -// copied and an error if SIGSEGV or SIGBUS is received while reading from src -// or writing to dst. -// -// Data is copied in order; if [src, src+toCopy) and [dst, dst+toCopy) overlap, -// the resulting contents of dst are unspecified. -func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) { - if toCopy == 0 { - return 0, nil - } - - fault, sig := memcpy(dst, src, toCopy) - if sig == 0 { - return toCopy, nil - } - - // Did the fault occur while reading from src or writing to dst? 
- faultN, srcN, dstN := uintptr(fault), uintptr(src), uintptr(dst) - faultAfterSrc := ^uintptr(0) - if faultN >= srcN { - faultAfterSrc = faultN - srcN - } - faultAfterDst := ^uintptr(0) - if faultN >= dstN { - faultAfterDst = faultN - dstN - } - if faultAfterSrc >= toCopy && faultAfterDst >= toCopy { - panic(fmt.Sprintf("Copy raised signal %d at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", sig, faultN, srcN, srcN+toCopy, dstN, dstN+toCopy)) - } - faultedAfter := faultAfterSrc - if faultedAfter > faultAfterDst { - faultedAfter = faultAfterDst - } - - // memcpy might have ended the copy up to maxRegisterSize bytes before - // fault, if an instruction caused a memory access that straddled two - // pages, and the second one faulted. Try to copy up to the fault. - var done uintptr - if faultedAfter > maxRegisterSize { - done = faultedAfter - maxRegisterSize - } - n, err := Copy(unsafe.Pointer(dstN+done), unsafe.Pointer(srcN+done), faultedAfter-done) - done += n - if err != nil { - return done, err - } - return done, errorFromFaultSignal(fault, sig) -} - -// ZeroOut writes toZero zero bytes to dst. It returns the number of bytes -// written and an error if SIGSEGV or SIGBUS is received while writing to dst. -func ZeroOut(dst unsafe.Pointer, toZero uintptr) (uintptr, error) { - if toZero == 0 { - return 0, nil - } - - fault, sig := memclr(dst, toZero) - if sig == 0 { - return toZero, nil - } - - faultN, dstN := uintptr(fault), uintptr(dst) - if faultN < dstN || faultN >= dstN+toZero { - panic(fmt.Sprintf("ZeroOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toZero)) - } - - // memclr might have ended the write up to maxRegisterSize bytes before - // fault, if an instruction caused a memory access that straddled two - // pages, and the second one faulted. Try to write up to the fault. 
- var done uintptr - if faultN-dstN > maxRegisterSize { - done = faultN - dstN - maxRegisterSize - } - n, err := ZeroOut(unsafe.Pointer(dstN+done), faultN-dstN-done) - done += n - if err != nil { - return done, err - } - return done, errorFromFaultSignal(fault, sig) -} - -// SwapUint32 is equivalent to sync/atomic.SwapUint32, except that it returns -// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is -// not aligned to a 4-byte boundary. -func SwapUint32(ptr unsafe.Pointer, new uint32) (uint32, error) { - if addr := uintptr(ptr); addr&3 != 0 { - return 0, AlignmentError{addr, 4} - } - old, sig := swapUint32(ptr, new) - return old, errorFromFaultSignal(ptr, sig) -} - -// SwapUint64 is equivalent to sync/atomic.SwapUint64, except that it returns -// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is -// not aligned to an 8-byte boundary. -func SwapUint64(ptr unsafe.Pointer, new uint64) (uint64, error) { - if addr := uintptr(ptr); addr&7 != 0 { - return 0, AlignmentError{addr, 8} - } - old, sig := swapUint64(ptr, new) - return old, errorFromFaultSignal(ptr, sig) -} - -// CompareAndSwapUint32 is equivalent to atomicbitops.CompareAndSwapUint32, -// except that it returns an error if SIGSEGV or SIGBUS is received while -// accessing ptr, or if ptr is not aligned to a 4-byte boundary. -func CompareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (uint32, error) { - if addr := uintptr(ptr); addr&3 != 0 { - return 0, AlignmentError{addr, 4} - } - prev, sig := compareAndSwapUint32(ptr, old, new) - return prev, errorFromFaultSignal(ptr, sig) -} - -// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It -// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr. -// -// Preconditions: ptr must be aligned to a 4-byte boundary. 
-func LoadUint32(ptr unsafe.Pointer) (uint32, error) { - if addr := uintptr(ptr); addr&3 != 0 { - return 0, AlignmentError{addr, 4} - } - val, sig := loadUint32(ptr) - return val, errorFromFaultSignal(ptr, sig) -} - -func errorFromFaultSignal(addr unsafe.Pointer, sig int32) error { - switch sig { - case 0: - return nil - case int32(syscall.SIGSEGV): - return SegvError{uintptr(addr)} - case int32(syscall.SIGBUS): - return BusError{uintptr(addr)} - default: - panic(fmt.Sprintf("safecopy got unexpected signal %d at address %#x", sig, addr)) - } -} - -// ReplaceSignalHandler replaces the existing signal handler for the provided -// signal with the one that handles faults in safecopy-protected functions. -// -// It stores the value of the previously set handler in previous. -// -// This function will be called on initialization in order to install safecopy -// handlers for appropriate signals. These handlers will call the previous -// handler however, and if this is function is being used externally then the -// same courtesy is expected. -func ReplaceSignalHandler(sig syscall.Signal, handler uintptr, previous *uintptr) error { - var sa struct { - handler uintptr - flags uint64 - restorer uintptr - mask uint64 - } - const maskLen = 8 - - // Get the existing signal handler information, and save the current - // handler. Once we replace it, we will use this pointer to fall back to - // it when we receive other signals. - if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), 0, uintptr(unsafe.Pointer(&sa)), maskLen, 0, 0); e != 0 { - return e - } - - // Fail if there isn't a previous handler. - if sa.handler == 0 { - return fmt.Errorf("previous handler for signal %x isn't set", sig) - } - - *previous = sa.handler - - // Install our own handler. 
- sa.handler = handler - if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, maskLen, 0, 0); e != 0 { - return e - } - - return nil -} diff --git a/pkg/sentry/platform/safecopy/sighandler_amd64.s b/pkg/sentry/platform/safecopy/sighandler_amd64.s deleted file mode 100644 index 475ae48e9..000000000 --- a/pkg/sentry/platform/safecopy/sighandler_amd64.s +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "textflag.h" - -// The signals handled by sigHandler. -#define SIGBUS 7 -#define SIGSEGV 11 - -// Offsets to the registers in context->uc_mcontext.gregs[]. -#define REG_RDI 0x68 -#define REG_RAX 0x90 -#define REG_IP 0xa8 - -// Offset to the si_addr field of siginfo. -#define SI_CODE 0x08 -#define SI_ADDR 0x10 - -// signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must -// not be set up as a handler to any other signals. -// -// If the instruction causing the signal is within a safecopy-protected -// function, the signal is handled such that execution resumes in the -// appropriate fault handling stub with AX containing the faulting address and -// DI containing the signal number. Otherwise control is transferred to the -// previously configured signal handler (savedSigSegvHandler or -// savedSigBusHandler). 
-// -// This function cannot be written in go because it runs whenever a signal is -// received by the thread (preempting whatever was running), which includes when -// garbage collector has stopped or isn't expecting any interactions (like -// barriers). -// -// The arguments are the following: -// DI - The signal number. -// SI - Pointer to siginfo_t structure. -// DX - Pointer to ucontext structure. -TEXT ·signalHandler(SB),NOSPLIT,$0 - // Check if the signal is from the kernel. - MOVQ $0x0, CX - CMPL CX, SI_CODE(SI) - JGE original_handler - - // Check if RIP is within the area we care about. - MOVQ REG_IP(DX), CX - CMPQ CX, ·memcpyBegin(SB) - JB not_memcpy - CMPQ CX, ·memcpyEnd(SB) - JAE not_memcpy - - // Modify the context such that execution will resume in the fault - // handler. - LEAQ handleMemcpyFault(SB), CX - JMP handle_fault - -not_memcpy: - CMPQ CX, ·memclrBegin(SB) - JB not_memclr - CMPQ CX, ·memclrEnd(SB) - JAE not_memclr - - LEAQ handleMemclrFault(SB), CX - JMP handle_fault - -not_memclr: - CMPQ CX, ·swapUint32Begin(SB) - JB not_swapuint32 - CMPQ CX, ·swapUint32End(SB) - JAE not_swapuint32 - - LEAQ handleSwapUint32Fault(SB), CX - JMP handle_fault - -not_swapuint32: - CMPQ CX, ·swapUint64Begin(SB) - JB not_swapuint64 - CMPQ CX, ·swapUint64End(SB) - JAE not_swapuint64 - - LEAQ handleSwapUint64Fault(SB), CX - JMP handle_fault - -not_swapuint64: - CMPQ CX, ·compareAndSwapUint32Begin(SB) - JB not_casuint32 - CMPQ CX, ·compareAndSwapUint32End(SB) - JAE not_casuint32 - - LEAQ handleCompareAndSwapUint32Fault(SB), CX - JMP handle_fault - -not_casuint32: - CMPQ CX, ·loadUint32Begin(SB) - JB not_loaduint32 - CMPQ CX, ·loadUint32End(SB) - JAE not_loaduint32 - - LEAQ handleLoadUint32Fault(SB), CX - JMP handle_fault - -not_loaduint32: -original_handler: - // Jump to the previous signal handler, which is likely the golang one. 
- XORQ CX, CX - MOVQ ·savedSigBusHandler(SB), AX - CMPL DI, $SIGSEGV - CMOVQEQ ·savedSigSegVHandler(SB), AX - JMP AX - -handle_fault: - // Entered with the address of the fault handler in RCX; store it in - // RIP. - MOVQ CX, REG_IP(DX) - - // Store the faulting address in RAX. - MOVQ SI_ADDR(SI), CX - MOVQ CX, REG_RAX(DX) - - // Store the signal number in EDI. - MOVL DI, REG_RDI(DX) - - RET diff --git a/pkg/sentry/platform/safecopy/sighandler_arm64.s b/pkg/sentry/platform/safecopy/sighandler_arm64.s deleted file mode 100644 index 53e4ac2c1..000000000 --- a/pkg/sentry/platform/safecopy/sighandler_arm64.s +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "textflag.h" - -// The signals handled by sigHandler. -#define SIGBUS 7 -#define SIGSEGV 11 - -// Offsets to the registers in context->uc_mcontext.gregs[]. -#define REG_R0 0xB8 -#define REG_R1 0xC0 -#define REG_PC 0x1B8 - -// Offset to the si_addr field of siginfo. -#define SI_CODE 0x08 -#define SI_ADDR 0x10 - -// signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must -// not be set up as a handler to any other signals. -// -// If the instruction causing the signal is within a safecopy-protected -// function, the signal is handled such that execution resumes in the -// appropriate fault handling stub with R0 containing the faulting address and -// R1 containing the signal number. 
Otherwise control is transferred to the -// previously configured signal handler (savedSigSegvHandler or -// savedSigBusHandler). -// -// This function cannot be written in go because it runs whenever a signal is -// received by the thread (preempting whatever was running), which includes when -// garbage collector has stopped or isn't expecting any interactions (like -// barriers). -// -// The arguments are the following: -// R0 - The signal number. -// R1 - Pointer to siginfo_t structure. -// R2 - Pointer to ucontext structure. -TEXT ·signalHandler(SB),NOSPLIT,$0 - // Check if the signal is from the kernel, si_code > 0 means a kernel signal. - MOVD SI_CODE(R1), R7 - CMPW $0x0, R7 - BLE original_handler - - // Check if PC is within the area we care about. - MOVD REG_PC(R2), R7 - MOVD ·memcpyBegin(SB), R8 - CMP R8, R7 - BLO not_memcpy - MOVD ·memcpyEnd(SB), R8 - CMP R8, R7 - BHS not_memcpy - - // Modify the context such that execution will resume in the fault handler. - MOVD $handleMemcpyFault(SB), R7 - B handle_fault - -not_memcpy: - MOVD ·memclrBegin(SB), R8 - CMP R8, R7 - BLO not_memclr - MOVD ·memclrEnd(SB), R8 - CMP R8, R7 - BHS not_memclr - - MOVD $handleMemclrFault(SB), R7 - B handle_fault - -not_memclr: - MOVD ·swapUint32Begin(SB), R8 - CMP R8, R7 - BLO not_swapuint32 - MOVD ·swapUint32End(SB), R8 - CMP R8, R7 - BHS not_swapuint32 - - MOVD $handleSwapUint32Fault(SB), R7 - B handle_fault - -not_swapuint32: - MOVD ·swapUint64Begin(SB), R8 - CMP R8, R7 - BLO not_swapuint64 - MOVD ·swapUint64End(SB), R8 - CMP R8, R7 - BHS not_swapuint64 - - MOVD $handleSwapUint64Fault(SB), R7 - B handle_fault - -not_swapuint64: - MOVD ·compareAndSwapUint32Begin(SB), R8 - CMP R8, R7 - BLO not_casuint32 - MOVD ·compareAndSwapUint32End(SB), R8 - CMP R8, R7 - BHS not_casuint32 - - MOVD $handleCompareAndSwapUint32Fault(SB), R7 - B handle_fault - -not_casuint32: - MOVD ·loadUint32Begin(SB), R8 - CMP R8, R7 - BLO not_loaduint32 - MOVD ·loadUint32End(SB), R8 - CMP R8, R7 - BHS 
not_loaduint32 - - MOVD $handleLoadUint32Fault(SB), R7 - B handle_fault - -not_loaduint32: -original_handler: - // Jump to the previous signal handler, which is likely the golang one. - MOVD ·savedSigBusHandler(SB), R7 - MOVD ·savedSigSegVHandler(SB), R8 - CMPW $SIGSEGV, R0 - CSEL EQ, R8, R7, R7 - B (R7) - -handle_fault: - // Entered with the address of the fault handler in R7; store it in PC. - MOVD R7, REG_PC(R2) - - // Store the faulting address in R0. - MOVD SI_ADDR(R1), R7 - MOVD R7, REG_R0(R2) - - // Store the signal number in R1. - MOVW R0, REG_R1(R2) - - RET diff --git a/pkg/sentry/safemem/BUILD b/pkg/sentry/safemem/BUILD deleted file mode 100644 index 3ab76da97..000000000 --- a/pkg/sentry/safemem/BUILD +++ /dev/null @@ -1,27 +0,0 @@ -load("//tools:defs.bzl", "go_library", "go_test") - -package(licenses = ["notice"]) - -go_library( - name = "safemem", - srcs = [ - "block_unsafe.go", - "io.go", - "safemem.go", - "seq_unsafe.go", - ], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/sentry/platform/safecopy", - ], -) - -go_test( - name = "safemem_test", - size = "small", - srcs = [ - "io_test.go", - "seq_test.go", - ], - library = ":safemem", -) diff --git a/pkg/sentry/safemem/block_unsafe.go b/pkg/sentry/safemem/block_unsafe.go deleted file mode 100644 index 6f03c94bf..000000000 --- a/pkg/sentry/safemem/block_unsafe.go +++ /dev/null @@ -1,279 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package safemem - -import ( - "fmt" - "reflect" - "unsafe" - - "gvisor.dev/gvisor/pkg/sentry/platform/safecopy" -) - -// A Block is a range of contiguous bytes, similar to []byte but with the -// following differences: -// -// - The memory represented by a Block may require the use of safecopy to -// access. -// -// - Block does not carry a capacity and cannot be expanded. -// -// Blocks are immutable and may be copied by value. The zero value of Block -// represents an empty range, analogous to a nil []byte. -type Block struct { - // [start, start+length) is the represented memory. - // - // start is an unsafe.Pointer to ensure that Block prevents the represented - // memory from being garbage-collected. - start unsafe.Pointer - length int - - // needSafecopy is true if accessing the represented memory requires the - // use of safecopy. - needSafecopy bool -} - -// BlockFromSafeSlice returns a Block equivalent to slice, which is safe to -// access without safecopy. -func BlockFromSafeSlice(slice []byte) Block { - return blockFromSlice(slice, false) -} - -// BlockFromUnsafeSlice returns a Block equivalent to bs, which is not safe to -// access without safecopy. -func BlockFromUnsafeSlice(slice []byte) Block { - return blockFromSlice(slice, true) -} - -func blockFromSlice(slice []byte, needSafecopy bool) Block { - if len(slice) == 0 { - return Block{} - } - return Block{ - start: unsafe.Pointer(&slice[0]), - length: len(slice), - needSafecopy: needSafecopy, - } -} - -// BlockFromSafePointer returns a Block equivalent to [ptr, ptr+len), which is -// safe to access without safecopy. -// -// Preconditions: ptr+len does not overflow. -func BlockFromSafePointer(ptr unsafe.Pointer, len int) Block { - return blockFromPointer(ptr, len, false) -} - -// BlockFromUnsafePointer returns a Block equivalent to [ptr, ptr+len), which -// is not safe to access without safecopy. -// -// Preconditions: ptr+len does not overflow. 
-func BlockFromUnsafePointer(ptr unsafe.Pointer, len int) Block { - return blockFromPointer(ptr, len, true) -} - -func blockFromPointer(ptr unsafe.Pointer, len int, needSafecopy bool) Block { - if uptr := uintptr(ptr); uptr+uintptr(len) < uptr { - panic(fmt.Sprintf("ptr %#x + len %#x overflows", ptr, len)) - } - return Block{ - start: ptr, - length: len, - needSafecopy: needSafecopy, - } -} - -// DropFirst returns a Block equivalent to b, but with the first n bytes -// omitted. It is analogous to the [n:] operation on a slice, except that if n -// > b.Len(), DropFirst returns an empty Block instead of panicking. -// -// Preconditions: n >= 0. -func (b Block) DropFirst(n int) Block { - if n < 0 { - panic(fmt.Sprintf("invalid n: %d", n)) - } - return b.DropFirst64(uint64(n)) -} - -// DropFirst64 is equivalent to DropFirst but takes a uint64. -func (b Block) DropFirst64(n uint64) Block { - if n >= uint64(b.length) { - return Block{} - } - return Block{ - start: unsafe.Pointer(uintptr(b.start) + uintptr(n)), - length: b.length - int(n), - needSafecopy: b.needSafecopy, - } -} - -// TakeFirst returns a Block equivalent to the first n bytes of b. It is -// analogous to the [:n] operation on a slice, except that if n > b.Len(), -// TakeFirst returns a copy of b instead of panicking. -// -// Preconditions: n >= 0. -func (b Block) TakeFirst(n int) Block { - if n < 0 { - panic(fmt.Sprintf("invalid n: %d", n)) - } - return b.TakeFirst64(uint64(n)) -} - -// TakeFirst64 is equivalent to TakeFirst but takes a uint64. -func (b Block) TakeFirst64(n uint64) Block { - if n == 0 { - return Block{} - } - if n >= uint64(b.length) { - return b - } - return Block{ - start: b.start, - length: int(n), - needSafecopy: b.needSafecopy, - } -} - -// ToSlice returns a []byte equivalent to b. 
-func (b Block) ToSlice() []byte { - var bs []byte - hdr := (*reflect.SliceHeader)(unsafe.Pointer(&bs)) - hdr.Data = uintptr(b.start) - hdr.Len = b.length - hdr.Cap = b.length - return bs -} - -// Addr returns b's start address as a uintptr. It returns uintptr instead of -// unsafe.Pointer so that code using safemem cannot obtain unsafe.Pointers -// without importing the unsafe package explicitly. -// -// Note that a uintptr is not recognized as a pointer by the garbage collector, -// such that if there are no uses of b after a call to b.Addr() and the address -// is to Go-managed memory, the returned uintptr does not prevent garbage -// collection of the pointee. -func (b Block) Addr() uintptr { - return uintptr(b.start) -} - -// Len returns b's length in bytes. -func (b Block) Len() int { - return b.length -} - -// NeedSafecopy returns true if accessing b.ToSlice() requires the use of safecopy. -func (b Block) NeedSafecopy() bool { - return b.needSafecopy -} - -// String implements fmt.Stringer.String. -func (b Block) String() string { - if uintptr(b.start) == 0 && b.length == 0 { - return "" - } - var suffix string - if b.needSafecopy { - suffix = "*" - } - return fmt.Sprintf("[%#x-%#x)%s", uintptr(b.start), uintptr(b.start)+uintptr(b.length), suffix) -} - -// Copy copies src.Len() or dst.Len() bytes, whichever is less, from src -// to dst and returns the number of bytes copied. -// -// If src and dst overlap, the data stored in dst is unspecified. 
-func Copy(dst, src Block) (int, error) { - if !dst.needSafecopy && !src.needSafecopy { - return copy(dst.ToSlice(), src.ToSlice()), nil - } - - n := dst.length - if n > src.length { - n = src.length - } - if n == 0 { - return 0, nil - } - - switch { - case dst.needSafecopy && !src.needSafecopy: - return safecopy.CopyOut(dst.start, src.TakeFirst(n).ToSlice()) - case !dst.needSafecopy && src.needSafecopy: - return safecopy.CopyIn(dst.TakeFirst(n).ToSlice(), src.start) - case dst.needSafecopy && src.needSafecopy: - n64, err := safecopy.Copy(dst.start, src.start, uintptr(n)) - return int(n64), err - default: - panic("unreachable") - } -} - -// Zero sets all bytes in dst to 0 and returns the number of bytes zeroed. -func Zero(dst Block) (int, error) { - if !dst.needSafecopy { - bs := dst.ToSlice() - for i := range bs { - bs[i] = 0 - } - return len(bs), nil - } - - n64, err := safecopy.ZeroOut(dst.start, uintptr(dst.length)) - return int(n64), err -} - -// Safecopy atomics are no slower than non-safecopy atomics, so use the former -// even when !b.needSafecopy to get consistent alignment checking. - -// SwapUint32 invokes safecopy.SwapUint32 on the first 4 bytes of b. -// -// Preconditions: b.Len() >= 4. -func SwapUint32(b Block, new uint32) (uint32, error) { - if b.length < 4 { - panic(fmt.Sprintf("insufficient length: %d", b.length)) - } - return safecopy.SwapUint32(b.start, new) -} - -// SwapUint64 invokes safecopy.SwapUint64 on the first 8 bytes of b. -// -// Preconditions: b.Len() >= 8. -func SwapUint64(b Block, new uint64) (uint64, error) { - if b.length < 8 { - panic(fmt.Sprintf("insufficient length: %d", b.length)) - } - return safecopy.SwapUint64(b.start, new) -} - -// CompareAndSwapUint32 invokes safecopy.CompareAndSwapUint32 on the first 4 -// bytes of b. -// -// Preconditions: b.Len() >= 4. 
-func CompareAndSwapUint32(b Block, old, new uint32) (uint32, error) { - if b.length < 4 { - panic(fmt.Sprintf("insufficient length: %d", b.length)) - } - return safecopy.CompareAndSwapUint32(b.start, old, new) -} - -// LoadUint32 invokes safecopy.LoadUint32 on the first 4 bytes of b. -// -// Preconditions: b.Len() >= 4. -func LoadUint32(b Block) (uint32, error) { - if b.length < 4 { - panic(fmt.Sprintf("insufficient length: %d", b.length)) - } - return safecopy.LoadUint32(b.start) -} diff --git a/pkg/sentry/safemem/io.go b/pkg/sentry/safemem/io.go deleted file mode 100644 index f039a5c34..000000000 --- a/pkg/sentry/safemem/io.go +++ /dev/null @@ -1,392 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package safemem - -import ( - "errors" - "io" - "math" -) - -// ErrEndOfBlockSeq is returned by BlockSeqWriter when attempting to write -// beyond the end of the BlockSeq. -var ErrEndOfBlockSeq = errors.New("write beyond end of BlockSeq") - -// Reader represents a streaming byte source like io.Reader. -type Reader interface { - // ReadToBlocks reads up to dsts.NumBytes() bytes into dsts and returns the - // number of bytes read. It may return a partial read without an error - // (i.e. (n, nil) where 0 < n < dsts.NumBytes()). It should not return a - // full read with an error (i.e. 
(dsts.NumBytes(), err) where err != nil); - // note that this differs from io.Reader.Read (in particular, io.EOF should - // not be returned if ReadToBlocks successfully reads dsts.NumBytes() - // bytes.) - ReadToBlocks(dsts BlockSeq) (uint64, error) -} - -// Writer represents a streaming byte sink like io.Writer. -type Writer interface { - // WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns - // the number of bytes written. It may return a partial write without an - // error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not - // return a full write with an error (i.e. srcs.NumBytes(), err) where err - // != nil). - WriteFromBlocks(srcs BlockSeq) (uint64, error) -} - -// ReadFullToBlocks repeatedly invokes r.ReadToBlocks until dsts.NumBytes() -// bytes have been read or ReadToBlocks returns an error. -func ReadFullToBlocks(r Reader, dsts BlockSeq) (uint64, error) { - var done uint64 - for !dsts.IsEmpty() { - n, err := r.ReadToBlocks(dsts) - done += n - if err != nil { - return done, err - } - dsts = dsts.DropFirst64(n) - } - return done, nil -} - -// WriteFullFromBlocks repeatedly invokes w.WriteFromBlocks until -// srcs.NumBytes() bytes have been written or WriteFromBlocks returns an error. -func WriteFullFromBlocks(w Writer, srcs BlockSeq) (uint64, error) { - var done uint64 - for !srcs.IsEmpty() { - n, err := w.WriteFromBlocks(srcs) - done += n - if err != nil { - return done, err - } - srcs = srcs.DropFirst64(n) - } - return done, nil -} - -// BlockSeqReader implements Reader by reading from a BlockSeq. -type BlockSeqReader struct { - Blocks BlockSeq -} - -// ReadToBlocks implements Reader.ReadToBlocks. -func (r *BlockSeqReader) ReadToBlocks(dsts BlockSeq) (uint64, error) { - n, err := CopySeq(dsts, r.Blocks) - r.Blocks = r.Blocks.DropFirst64(n) - if err != nil { - return n, err - } - if n < dsts.NumBytes() { - return n, io.EOF - } - return n, nil -} - -// BlockSeqWriter implements Writer by writing to a BlockSeq. 
-type BlockSeqWriter struct { - Blocks BlockSeq -} - -// WriteFromBlocks implements Writer.WriteFromBlocks. -func (w *BlockSeqWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) { - n, err := CopySeq(w.Blocks, srcs) - w.Blocks = w.Blocks.DropFirst64(n) - if err != nil { - return n, err - } - if n < srcs.NumBytes() { - return n, ErrEndOfBlockSeq - } - return n, nil -} - -// ReaderFunc implements Reader for a function with the semantics of -// Reader.ReadToBlocks. -type ReaderFunc func(dsts BlockSeq) (uint64, error) - -// ReadToBlocks implements Reader.ReadToBlocks. -func (f ReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) { - return f(dsts) -} - -// WriterFunc implements Writer for a function with the semantics of -// Writer.WriteFromBlocks. -type WriterFunc func(srcs BlockSeq) (uint64, error) - -// WriteFromBlocks implements Writer.WriteFromBlocks. -func (f WriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) { - return f(srcs) -} - -// ToIOReader implements io.Reader for a (safemem.)Reader. -// -// ToIOReader will return a successful partial read iff Reader.ReadToBlocks does -// so. -type ToIOReader struct { - Reader Reader -} - -// Read implements io.Reader.Read. -func (r ToIOReader) Read(dst []byte) (int, error) { - n, err := r.Reader.ReadToBlocks(BlockSeqOf(BlockFromSafeSlice(dst))) - return int(n), err -} - -// ToIOWriter implements io.Writer for a (safemem.)Writer. -type ToIOWriter struct { - Writer Writer -} - -// Write implements io.Writer.Write. -func (w ToIOWriter) Write(src []byte) (int, error) { - // io.Writer does not permit partial writes. - n, err := WriteFullFromBlocks(w.Writer, BlockSeqOf(BlockFromSafeSlice(src))) - return int(n), err -} - -// FromIOReader implements Reader for an io.Reader by repeatedly invoking -// io.Reader.Read until it returns an error or partial read. This is not -// thread-safe. -// -// FromIOReader will return a successful partial read iff Reader.Read does so. 
-type FromIOReader struct { - Reader io.Reader -} - -// ReadToBlocks implements Reader.ReadToBlocks. -func (r FromIOReader) ReadToBlocks(dsts BlockSeq) (uint64, error) { - var buf []byte - var done uint64 - for !dsts.IsEmpty() { - dst := dsts.Head() - var n int - var err error - n, buf, err = r.readToBlock(dst, buf) - done += uint64(n) - if n != dst.Len() { - return done, err - } - dsts = dsts.Tail() - if err != nil { - if dsts.IsEmpty() && err == io.EOF { - return done, nil - } - return done, err - } - } - return done, nil -} - -func (r FromIOReader) readToBlock(dst Block, buf []byte) (int, []byte, error) { - // io.Reader isn't safecopy-aware, so we have to buffer Blocks that require - // safecopy. - if !dst.NeedSafecopy() { - n, err := r.Reader.Read(dst.ToSlice()) - return n, buf, err - } - if len(buf) < dst.Len() { - buf = make([]byte, dst.Len()) - } - rn, rerr := r.Reader.Read(buf[:dst.Len()]) - wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn])) - if wberr != nil { - return wbn, buf, wberr - } - return wbn, buf, rerr -} - -// FromIOReaderAt implements Reader for an io.ReaderAt. Does not repeatedly -// invoke io.ReaderAt.ReadAt because ReadAt is more strict than Read. A partial -// read indicates an error. This is not thread-safe. -type FromIOReaderAt struct { - ReaderAt io.ReaderAt - Offset int64 -} - -// ReadToBlocks implements Reader.ReadToBlocks. -func (r FromIOReaderAt) ReadToBlocks(dsts BlockSeq) (uint64, error) { - var buf []byte - var done uint64 - for !dsts.IsEmpty() { - dst := dsts.Head() - var n int - var err error - n, buf, err = r.readToBlock(dst, buf) - done += uint64(n) - if n != dst.Len() { - return done, err - } - dsts = dsts.Tail() - if err != nil { - if dsts.IsEmpty() && err == io.EOF { - return done, nil - } - return done, err - } - } - return done, nil -} - -func (r FromIOReaderAt) readToBlock(dst Block, buf []byte) (int, []byte, error) { - // io.Reader isn't safecopy-aware, so we have to buffer Blocks that require - // safecopy. 
- if !dst.NeedSafecopy() { - n, err := r.ReaderAt.ReadAt(dst.ToSlice(), r.Offset) - r.Offset += int64(n) - return n, buf, err - } - if len(buf) < dst.Len() { - buf = make([]byte, dst.Len()) - } - rn, rerr := r.ReaderAt.ReadAt(buf[:dst.Len()], r.Offset) - r.Offset += int64(rn) - wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn])) - if wberr != nil { - return wbn, buf, wberr - } - return wbn, buf, rerr -} - -// FromIOWriter implements Writer for an io.Writer by repeatedly invoking -// io.Writer.Write until it returns an error or partial write. -// -// FromIOWriter will tolerate implementations of io.Writer.Write that return -// partial writes with a nil error in contravention of io.Writer's -// requirements, since Writer is permitted to do so. FromIOWriter will return a -// successful partial write iff Writer.Write does so. -type FromIOWriter struct { - Writer io.Writer -} - -// WriteFromBlocks implements Writer.WriteFromBlocks. -func (w FromIOWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) { - var buf []byte - var done uint64 - for !srcs.IsEmpty() { - src := srcs.Head() - var n int - var err error - n, buf, err = w.writeFromBlock(src, buf) - done += uint64(n) - if n != src.Len() || err != nil { - return done, err - } - srcs = srcs.Tail() - } - return done, nil -} - -func (w FromIOWriter) writeFromBlock(src Block, buf []byte) (int, []byte, error) { - // io.Writer isn't safecopy-aware, so we have to buffer Blocks that require - // safecopy. - if !src.NeedSafecopy() { - n, err := w.Writer.Write(src.ToSlice()) - return n, buf, err - } - if len(buf) < src.Len() { - buf = make([]byte, src.Len()) - } - bufn, buferr := Copy(BlockFromSafeSlice(buf[:src.Len()]), src) - wn, werr := w.Writer.Write(buf[:bufn]) - if werr != nil { - return wn, buf, werr - } - return wn, buf, buferr -} - -// FromVecReaderFunc implements Reader for a function that reads data into a -// [][]byte and returns the number of bytes read as an int64. 
-type FromVecReaderFunc struct { - ReadVec func(dsts [][]byte) (int64, error) -} - -// ReadToBlocks implements Reader.ReadToBlocks. -// -// ReadToBlocks calls r.ReadVec at most once. -func (r FromVecReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) { - if dsts.IsEmpty() { - return 0, nil - } - // Ensure that we don't pass a [][]byte with a total length > MaxInt64. - dsts = dsts.TakeFirst64(uint64(math.MaxInt64)) - dstSlices := make([][]byte, 0, dsts.NumBlocks()) - // Buffer Blocks that require safecopy. - for tmp := dsts; !tmp.IsEmpty(); tmp = tmp.Tail() { - dst := tmp.Head() - if dst.NeedSafecopy() { - dstSlices = append(dstSlices, make([]byte, dst.Len())) - } else { - dstSlices = append(dstSlices, dst.ToSlice()) - } - } - rn, rerr := r.ReadVec(dstSlices) - dsts = dsts.TakeFirst64(uint64(rn)) - var done uint64 - var i int - for !dsts.IsEmpty() { - dst := dsts.Head() - if dst.NeedSafecopy() { - n, err := Copy(dst, BlockFromSafeSlice(dstSlices[i])) - done += uint64(n) - if err != nil { - return done, err - } - } else { - done += uint64(dst.Len()) - } - dsts = dsts.Tail() - i++ - } - return done, rerr -} - -// FromVecWriterFunc implements Writer for a function that writes data from a -// [][]byte and returns the number of bytes written. -type FromVecWriterFunc struct { - WriteVec func(srcs [][]byte) (int64, error) -} - -// WriteFromBlocks implements Writer.WriteFromBlocks. -// -// WriteFromBlocks calls w.WriteVec at most once. -func (w FromVecWriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) { - if srcs.IsEmpty() { - return 0, nil - } - // Ensure that we don't pass a [][]byte with a total length > MaxInt64. - srcs = srcs.TakeFirst64(uint64(math.MaxInt64)) - srcSlices := make([][]byte, 0, srcs.NumBlocks()) - // Buffer Blocks that require safecopy. 
- var buferr error - for tmp := srcs; !tmp.IsEmpty(); tmp = tmp.Tail() { - src := tmp.Head() - if src.NeedSafecopy() { - slice := make([]byte, src.Len()) - n, err := Copy(BlockFromSafeSlice(slice), src) - srcSlices = append(srcSlices, slice[:n]) - if err != nil { - buferr = err - break - } - } else { - srcSlices = append(srcSlices, src.ToSlice()) - } - } - n, err := w.WriteVec(srcSlices) - if err != nil { - return uint64(n), err - } - return uint64(n), buferr -} diff --git a/pkg/sentry/safemem/io_test.go b/pkg/sentry/safemem/io_test.go deleted file mode 100644 index 629741bee..000000000 --- a/pkg/sentry/safemem/io_test.go +++ /dev/null @@ -1,199 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package safemem - -import ( - "bytes" - "io" - "testing" -) - -func makeBlocks(slices ...[]byte) []Block { - blocks := make([]Block, 0, len(slices)) - for _, s := range slices { - blocks = append(blocks, BlockFromSafeSlice(s)) - } - return blocks -} - -func TestFromIOReaderFullRead(t *testing.T) { - r := FromIOReader{bytes.NewBufferString("foobar")} - dsts := makeBlocks(make([]byte, 3), make([]byte, 3)) - n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts)) - if wantN := uint64(6); n != wantN || err != nil { - t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - for i, want := range [][]byte{[]byte("foo"), []byte("bar")} { - if got := dsts[i].ToSlice(); !bytes.Equal(got, want) { - t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want) - } - } -} - -type eofHidingReader struct { - Reader io.Reader -} - -func (r eofHidingReader) Read(dst []byte) (int, error) { - n, err := r.Reader.Read(dst) - if err == io.EOF { - return n, nil - } - return n, err -} - -func TestFromIOReaderPartialRead(t *testing.T) { - r := FromIOReader{eofHidingReader{bytes.NewBufferString("foob")}} - dsts := makeBlocks(make([]byte, 3), make([]byte, 3)) - n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts)) - // FromIOReader should stop after the eofHidingReader returns (1, nil) - // for a 3-byte read. 
- if wantN := uint64(4); n != wantN || err != nil { - t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - for i, want := range [][]byte{[]byte("foo"), []byte("b\x00\x00")} { - if got := dsts[i].ToSlice(); !bytes.Equal(got, want) { - t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want) - } - } -} - -type singleByteReader struct { - Reader io.Reader -} - -func (r singleByteReader) Read(dst []byte) (int, error) { - if len(dst) == 0 { - return r.Reader.Read(dst) - } - return r.Reader.Read(dst[:1]) -} - -func TestSingleByteReader(t *testing.T) { - r := FromIOReader{singleByteReader{bytes.NewBufferString("foobar")}} - dsts := makeBlocks(make([]byte, 3), make([]byte, 3)) - n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts)) - // FromIOReader should stop after the singleByteReader returns (1, nil) - // for a 3-byte read. - if wantN := uint64(1); n != wantN || err != nil { - t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - for i, want := range [][]byte{[]byte("f\x00\x00"), []byte("\x00\x00\x00")} { - if got := dsts[i].ToSlice(); !bytes.Equal(got, want) { - t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want) - } - } -} - -func TestReadFullToBlocks(t *testing.T) { - r := FromIOReader{singleByteReader{bytes.NewBufferString("foobar")}} - dsts := makeBlocks(make([]byte, 3), make([]byte, 3)) - n, err := ReadFullToBlocks(r, BlockSeqFromSlice(dsts)) - // ReadFullToBlocks should call into FromIOReader => singleByteReader - // repeatedly until dsts is exhausted. 
- if wantN := uint64(6); n != wantN || err != nil { - t.Errorf("ReadFullToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - for i, want := range [][]byte{[]byte("foo"), []byte("bar")} { - if got := dsts[i].ToSlice(); !bytes.Equal(got, want) { - t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want) - } - } -} - -func TestFromIOWriterFullWrite(t *testing.T) { - srcs := makeBlocks([]byte("foo"), []byte("bar")) - var dst bytes.Buffer - w := FromIOWriter{&dst} - n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs)) - if wantN := uint64(6); n != wantN || err != nil { - t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := dst.Bytes(), []byte("foobar"); !bytes.Equal(got, want) { - t.Errorf("dst: got %q, wanted %q", got, want) - } -} - -type limitedWriter struct { - Writer io.Writer - Done int - Limit int -} - -func (w *limitedWriter) Write(src []byte) (int, error) { - count := len(src) - if count > (w.Limit - w.Done) { - count = w.Limit - w.Done - } - n, err := w.Writer.Write(src[:count]) - w.Done += n - return n, err -} - -func TestFromIOWriterPartialWrite(t *testing.T) { - srcs := makeBlocks([]byte("foo"), []byte("bar")) - var dst bytes.Buffer - w := FromIOWriter{&limitedWriter{&dst, 0, 4}} - n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs)) - // FromIOWriter should stop after the limitedWriter returns (1, nil) for a - // 3-byte write. 
- if wantN := uint64(4); n != wantN || err != nil { - t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := dst.Bytes(), []byte("foob"); !bytes.Equal(got, want) { - t.Errorf("dst: got %q, wanted %q", got, want) - } -} - -type singleByteWriter struct { - Writer io.Writer -} - -func (w singleByteWriter) Write(src []byte) (int, error) { - if len(src) == 0 { - return w.Writer.Write(src) - } - return w.Writer.Write(src[:1]) -} - -func TestSingleByteWriter(t *testing.T) { - srcs := makeBlocks([]byte("foo"), []byte("bar")) - var dst bytes.Buffer - w := FromIOWriter{singleByteWriter{&dst}} - n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs)) - // FromIOWriter should stop after the singleByteWriter returns (1, nil) - // for a 3-byte write. - if wantN := uint64(1); n != wantN || err != nil { - t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := dst.Bytes(), []byte("f"); !bytes.Equal(got, want) { - t.Errorf("dst: got %q, wanted %q", got, want) - } -} - -func TestWriteFullToBlocks(t *testing.T) { - srcs := makeBlocks([]byte("foo"), []byte("bar")) - var dst bytes.Buffer - w := FromIOWriter{singleByteWriter{&dst}} - n, err := WriteFullFromBlocks(w, BlockSeqFromSlice(srcs)) - // WriteFullToBlocks should call into FromIOWriter => singleByteWriter - // repeatedly until srcs is exhausted. - if wantN := uint64(6); n != wantN || err != nil { - t.Errorf("WriteFullFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := dst.Bytes(), []byte("foobar"); !bytes.Equal(got, want) { - t.Errorf("dst: got %q, wanted %q", got, want) - } -} diff --git a/pkg/sentry/safemem/safemem.go b/pkg/sentry/safemem/safemem.go deleted file mode 100644 index 3e70d33a2..000000000 --- a/pkg/sentry/safemem/safemem.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2018 The gVisor Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package safemem provides the Block and BlockSeq types. -package safemem diff --git a/pkg/sentry/safemem/seq_test.go b/pkg/sentry/safemem/seq_test.go deleted file mode 100644 index eba4bb535..000000000 --- a/pkg/sentry/safemem/seq_test.go +++ /dev/null @@ -1,196 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package safemem - -import ( - "bytes" - "reflect" - "testing" -) - -type blockSeqTest struct { - desc string - - pieces []string - haveOffset bool - offset uint64 - haveLimit bool - limit uint64 - - want string -} - -func (t blockSeqTest) NonEmptyByteSlices() [][]byte { - // t is a value, so we can mutate it freely. 
- slices := make([][]byte, 0, len(t.pieces)) - for _, str := range t.pieces { - if t.haveOffset { - strOff := t.offset - if strOff > uint64(len(str)) { - strOff = uint64(len(str)) - } - str = str[strOff:] - t.offset -= strOff - } - if t.haveLimit { - strLim := t.limit - if strLim > uint64(len(str)) { - strLim = uint64(len(str)) - } - str = str[:strLim] - t.limit -= strLim - } - if len(str) != 0 { - slices = append(slices, []byte(str)) - } - } - return slices -} - -func (t blockSeqTest) BlockSeq() BlockSeq { - blocks := make([]Block, 0, len(t.pieces)) - for _, str := range t.pieces { - blocks = append(blocks, BlockFromSafeSlice([]byte(str))) - } - bs := BlockSeqFromSlice(blocks) - if t.haveOffset { - bs = bs.DropFirst64(t.offset) - } - if t.haveLimit { - bs = bs.TakeFirst64(t.limit) - } - return bs -} - -var blockSeqTests = []blockSeqTest{ - { - desc: "Empty sequence", - }, - { - desc: "Sequence of length 1", - pieces: []string{"foobar"}, - want: "foobar", - }, - { - desc: "Sequence of length 2", - pieces: []string{"foo", "bar"}, - want: "foobar", - }, - { - desc: "Empty Blocks", - pieces: []string{"", "foo", "", "", "bar", ""}, - want: "foobar", - }, - { - desc: "Sequence with non-zero offset", - pieces: []string{"foo", "bar"}, - haveOffset: true, - offset: 2, - want: "obar", - }, - { - desc: "Sequence with non-maximal limit", - pieces: []string{"foo", "bar"}, - haveLimit: true, - limit: 5, - want: "fooba", - }, - { - desc: "Sequence with offset and limit", - pieces: []string{"foo", "bar"}, - haveOffset: true, - offset: 2, - haveLimit: true, - limit: 3, - want: "oba", - }, -} - -func TestBlockSeqNumBytes(t *testing.T) { - for _, test := range blockSeqTests { - t.Run(test.desc, func(t *testing.T) { - if got, want := test.BlockSeq().NumBytes(), uint64(len(test.want)); got != want { - t.Errorf("NumBytes: got %d, wanted %d", got, want) - } - }) - } -} - -func TestBlockSeqIterBlocks(t *testing.T) { - // Tests BlockSeq iteration using Head/Tail. 
- for _, test := range blockSeqTests { - t.Run(test.desc, func(t *testing.T) { - srcs := test.BlockSeq() - // "Note that a non-nil empty slice and a nil slice ... are not - // deeply equal." - reflect - slices := make([][]byte, 0, 0) - for !srcs.IsEmpty() { - src := srcs.Head() - slices = append(slices, src.ToSlice()) - nextSrcs := srcs.Tail() - if got, want := nextSrcs.NumBytes(), srcs.NumBytes()-uint64(src.Len()); got != want { - t.Fatalf("%v.Tail(): got %v (%d bytes), wanted %d bytes", srcs, nextSrcs, got, want) - } - srcs = nextSrcs - } - if wantSlices := test.NonEmptyByteSlices(); !reflect.DeepEqual(slices, wantSlices) { - t.Errorf("Accumulated slices: got %v, wanted %v", slices, wantSlices) - } - }) - } -} - -func TestBlockSeqIterBytes(t *testing.T) { - // Tests BlockSeq iteration using Head/DropFirst. - for _, test := range blockSeqTests { - t.Run(test.desc, func(t *testing.T) { - srcs := test.BlockSeq() - var dst bytes.Buffer - for !srcs.IsEmpty() { - src := srcs.Head() - var b [1]byte - n, err := Copy(BlockFromSafeSlice(b[:]), src) - if n != 1 || err != nil { - t.Fatalf("Copy: got (%v, %v), wanted (1, nil)", n, err) - } - dst.WriteByte(b[0]) - nextSrcs := srcs.DropFirst(1) - if got, want := nextSrcs.NumBytes(), srcs.NumBytes()-1; got != want { - t.Fatalf("%v.DropFirst(1): got %v (%d bytes), wanted %d bytes", srcs, nextSrcs, got, want) - } - srcs = nextSrcs - } - if got := string(dst.Bytes()); got != test.want { - t.Errorf("Copied string: got %q, wanted %q", got, test.want) - } - }) - } -} - -func TestBlockSeqDropBeyondLimit(t *testing.T) { - blocks := []Block{BlockFromSafeSlice([]byte("123")), BlockFromSafeSlice([]byte("4"))} - bs := BlockSeqFromSlice(blocks) - if got, want := bs.NumBytes(), uint64(4); got != want { - t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want) - } - bs = bs.TakeFirst(1) - if got, want := bs.NumBytes(), uint64(1); got != want { - t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want) - } - bs = bs.DropFirst(2) - if 
got, want := bs.NumBytes(), uint64(0); got != want { - t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want) - } -} diff --git a/pkg/sentry/safemem/seq_unsafe.go b/pkg/sentry/safemem/seq_unsafe.go deleted file mode 100644 index 354a95dde..000000000 --- a/pkg/sentry/safemem/seq_unsafe.go +++ /dev/null @@ -1,299 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package safemem - -import ( - "bytes" - "fmt" - "reflect" - "unsafe" -) - -// A BlockSeq represents a sequence of Blocks, each of which has non-zero -// length. -// -// BlockSeqs are immutable and may be copied by value. The zero value of -// BlockSeq represents an empty sequence. -type BlockSeq struct { - // If length is 0, then the BlockSeq is empty. Invariants: data == 0; - // offset == 0; limit == 0. - // - // If length is -1, then the BlockSeq represents the single Block{data, - // limit, false}. Invariants: offset == 0; limit > 0; limit does not - // overflow the range of an int. - // - // If length is -2, then the BlockSeq represents the single Block{data, - // limit, true}. Invariants: offset == 0; limit > 0; limit does not - // overflow the range of an int. - // - // Otherwise, length >= 2, and the BlockSeq represents the `length` Blocks - // in the array of Blocks starting at address `data`, starting at `offset` - // bytes into the first Block and limited to the following `limit` bytes. 
- // Invariants: data != 0; offset < len(data[0]); limit > 0; offset+limit <= - // the combined length of all Blocks in the array; the first Block in the - // array has non-zero length. - // - // length is never 1; sequences consisting of a single Block are always - // stored inline (with length < 0). - data unsafe.Pointer - length int - offset int - limit uint64 -} - -// BlockSeqOf returns a BlockSeq representing the single Block b. -func BlockSeqOf(b Block) BlockSeq { - bs := BlockSeq{ - data: b.start, - length: -1, - limit: uint64(b.length), - } - if b.needSafecopy { - bs.length = -2 - } - return bs -} - -// BlockSeqFromSlice returns a BlockSeq representing all Blocks in slice. -// If slice contains Blocks with zero length, BlockSeq will skip them during -// iteration. -// -// Whether the returned BlockSeq shares memory with slice is unspecified; -// clients should avoid mutating slices passed to BlockSeqFromSlice. -// -// Preconditions: The combined length of all Blocks in slice <= math.MaxUint64. -func BlockSeqFromSlice(slice []Block) BlockSeq { - slice = skipEmpty(slice) - var limit uint64 - for _, b := range slice { - sum := limit + uint64(b.Len()) - if sum < limit { - panic("BlockSeq length overflows uint64") - } - limit = sum - } - return blockSeqFromSliceLimited(slice, limit) -} - -// Preconditions: The combined length of all Blocks in slice <= limit. If -// len(slice) != 0, the first Block in slice has non-zero length, and limit > -// 0. -func blockSeqFromSliceLimited(slice []Block, limit uint64) BlockSeq { - switch len(slice) { - case 0: - return BlockSeq{} - case 1: - return BlockSeqOf(slice[0].TakeFirst64(limit)) - default: - return BlockSeq{ - data: unsafe.Pointer(&slice[0]), - length: len(slice), - limit: limit, - } - } -} - -func skipEmpty(slice []Block) []Block { - for i, b := range slice { - if b.Len() != 0 { - return slice[i:] - } - } - return nil -} - -// IsEmpty returns true if bs contains no Blocks. 
-// -// Invariants: bs.IsEmpty() == (bs.NumBlocks() == 0) == (bs.NumBytes() == 0). -// (Of these, prefer to use bs.IsEmpty().) -func (bs BlockSeq) IsEmpty() bool { - return bs.length == 0 -} - -// NumBlocks returns the number of Blocks in bs. -func (bs BlockSeq) NumBlocks() int { - // In general, we have to count: if bs represents a windowed slice then the - // slice may contain Blocks with zero length, and bs.length may be larger - // than the actual number of Blocks due to bs.limit. - var n int - for !bs.IsEmpty() { - n++ - bs = bs.Tail() - } - return n -} - -// NumBytes returns the sum of Block.Len() for all Blocks in bs. -func (bs BlockSeq) NumBytes() uint64 { - return bs.limit -} - -// Head returns the first Block in bs. -// -// Preconditions: !bs.IsEmpty(). -func (bs BlockSeq) Head() Block { - if bs.length == 0 { - panic("empty BlockSeq") - } - if bs.length < 0 { - return bs.internalBlock() - } - return (*Block)(bs.data).DropFirst(bs.offset).TakeFirst64(bs.limit) -} - -// Preconditions: bs.length < 0. -func (bs BlockSeq) internalBlock() Block { - return Block{ - start: bs.data, - length: int(bs.limit), - needSafecopy: bs.length == -2, - } -} - -// Tail returns a BlockSeq consisting of all Blocks in bs after the first. -// -// Preconditions: !bs.IsEmpty(). -func (bs BlockSeq) Tail() BlockSeq { - if bs.length == 0 { - panic("empty BlockSeq") - } - if bs.length < 0 { - return BlockSeq{} - } - head := (*Block)(bs.data).DropFirst(bs.offset) - headLen := uint64(head.Len()) - if headLen >= bs.limit { - // The head Block exhausts the limit, so the tail is empty. 
- return BlockSeq{} - } - var extSlice []Block - extSliceHdr := (*reflect.SliceHeader)(unsafe.Pointer(&extSlice)) - extSliceHdr.Data = uintptr(bs.data) - extSliceHdr.Len = bs.length - extSliceHdr.Cap = bs.length - tailSlice := skipEmpty(extSlice[1:]) - tailLimit := bs.limit - headLen - return blockSeqFromSliceLimited(tailSlice, tailLimit) -} - -// DropFirst returns a BlockSeq equivalent to bs, but with the first n bytes -// omitted. If n > bs.NumBytes(), DropFirst returns an empty BlockSeq. -// -// Preconditions: n >= 0. -func (bs BlockSeq) DropFirst(n int) BlockSeq { - if n < 0 { - panic(fmt.Sprintf("invalid n: %d", n)) - } - return bs.DropFirst64(uint64(n)) -} - -// DropFirst64 is equivalent to DropFirst but takes an uint64. -func (bs BlockSeq) DropFirst64(n uint64) BlockSeq { - if n >= bs.limit { - return BlockSeq{} - } - for { - // Calling bs.Head() here is surprisingly expensive, so inline getting - // the head's length. - var headLen uint64 - if bs.length < 0 { - headLen = bs.limit - } else { - headLen = uint64((*Block)(bs.data).Len() - bs.offset) - } - if n < headLen { - // Dropping ends partway through the head Block. - if bs.length < 0 { - return BlockSeqOf(bs.internalBlock().DropFirst64(n)) - } - bs.offset += int(n) - bs.limit -= n - return bs - } - n -= headLen - bs = bs.Tail() - } -} - -// TakeFirst returns a BlockSeq equivalent to the first n bytes of bs. If n > -// bs.NumBytes(), TakeFirst returns a BlockSeq equivalent to bs. -// -// Preconditions: n >= 0. -func (bs BlockSeq) TakeFirst(n int) BlockSeq { - if n < 0 { - panic(fmt.Sprintf("invalid n: %d", n)) - } - return bs.TakeFirst64(uint64(n)) -} - -// TakeFirst64 is equivalent to TakeFirst but takes a uint64. -func (bs BlockSeq) TakeFirst64(n uint64) BlockSeq { - if n == 0 { - return BlockSeq{} - } - if bs.limit > n { - bs.limit = n - } - return bs -} - -// String implements fmt.Stringer.String. 
-func (bs BlockSeq) String() string { - var buf bytes.Buffer - buf.WriteByte('[') - var sep string - for !bs.IsEmpty() { - buf.WriteString(sep) - sep = " " - buf.WriteString(bs.Head().String()) - bs = bs.Tail() - } - buf.WriteByte(']') - return buf.String() -} - -// CopySeq copies srcs.NumBytes() or dsts.NumBytes() bytes, whichever is less, -// from srcs to dsts and returns the number of bytes copied. -// -// If srcs and dsts overlap, the data stored in dsts is unspecified. -func CopySeq(dsts, srcs BlockSeq) (uint64, error) { - var done uint64 - for !dsts.IsEmpty() && !srcs.IsEmpty() { - dst := dsts.Head() - src := srcs.Head() - n, err := Copy(dst, src) - done += uint64(n) - if err != nil { - return done, err - } - dsts = dsts.DropFirst(n) - srcs = srcs.DropFirst(n) - } - return done, nil -} - -// ZeroSeq sets all bytes in dsts to 0 and returns the number of bytes zeroed. -func ZeroSeq(dsts BlockSeq) (uint64, error) { - var done uint64 - for !dsts.IsEmpty() { - n, err := Zero(dsts.Head()) - done += uint64(n) - if err != nil { - return done, err - } - dsts = dsts.DropFirst(n) - } - return done, nil -} diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 8e2b97afb..611fa22c3 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -9,15 +9,15 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", "//pkg/sentry/socket/unix/transport", - "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/tcpip", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index 3850f6345..79e16d6e8 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -12,13 +12,13 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/fs", "//pkg/sentry/kernel", 
"//pkg/sentry/kernel/auth", "//pkg/sentry/socket", "//pkg/sentry/socket/unix/transport", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 1684dfc24..00265f15b 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -19,14 +19,14 @@ package control import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const maxInt = int(^uint(0) >> 1) diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 42bf7be6a..5a07d5d0e 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -16,23 +16,23 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/context", "//pkg/fdnotifier", "//pkg/log", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", - "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/socket/control", - "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip/stack", + "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index c957b0f1d..bde4c7a1e 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -21,19 +21,19 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" 
+ "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go index e69ec38c2..cd67234d2 100644 --- a/pkg/sentry/socket/hostinet/socket_unsafe.go +++ b/pkg/sentry/socket/hostinet/socket_unsafe.go @@ -19,14 +19,14 @@ import ( "unsafe" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) func firstBytePtr(bs []byte) unsafe.Pointer { diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index e67b46c9e..034eca676 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -25,13 +25,13 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/inet" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/usermem" ) var defaultRecvBufSize = inet.TCPBufferSize{ diff --git 
a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD index ed34a8308..fa2a2cb66 100644 --- a/pkg/sentry/socket/netfilter/BUILD +++ b/pkg/sentry/socket/netfilter/BUILD @@ -15,10 +15,10 @@ go_library( "//pkg/binary", "//pkg/log", "//pkg/sentry/kernel", - "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/tcpip", "//pkg/tcpip/iptables", "//pkg/tcpip/stack", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index c65c36081..6ef740463 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -23,11 +23,11 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/iptables" "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/usermem" ) // errorTargetName is used to mark targets as error targets. 
Error targets diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index baaac13c6..f8b8e467d 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -13,8 +13,8 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/context", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", @@ -25,11 +25,11 @@ go_library( "//pkg/sentry/socket/netlink/port", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go index ce0a1afd0..b21e0ca4b 100644 --- a/pkg/sentry/socket/netlink/message.go +++ b/pkg/sentry/socket/netlink/message.go @@ -20,7 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // alignUp rounds a length up to an alignment. 
diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index be005df24..07f860a49 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -18,7 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD index 2137c7aeb..0234aadde 100644 --- a/pkg/sentry/socket/netlink/route/BUILD +++ b/pkg/sentry/socket/netlink/route/BUILD @@ -8,7 +8,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index 6b4a0ecf4..80a15d6cb 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -19,7 +19,7 @@ import ( "bytes" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index cea56f4ed..c4b95debb 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -32,11 +32,11 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port" 
"gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/socket/netlink/uevent/BUILD b/pkg/sentry/socket/netlink/uevent/BUILD index 73fbdf1eb..b6434923c 100644 --- a/pkg/sentry/socket/netlink/uevent/BUILD +++ b/pkg/sentry/socket/netlink/uevent/BUILD @@ -8,7 +8,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/kernel", "//pkg/sentry/socket/netlink", "//pkg/syserr", diff --git a/pkg/sentry/socket/netlink/uevent/protocol.go b/pkg/sentry/socket/netlink/uevent/protocol.go index b5d7808d7..1ee4296bc 100644 --- a/pkg/sentry/socket/netlink/uevent/protocol.go +++ b/pkg/sentry/socket/netlink/uevent/protocol.go @@ -20,7 +20,7 @@ package uevent import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/netlink" "gvisor.dev/gvisor/pkg/syserr" diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index e3d1f90cb..ab01cb4fa 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -17,10 +17,11 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/context", "//pkg/log", "//pkg/metric", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", @@ -28,11 +29,9 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", - "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/socket/netfilter", "//pkg/sentry/unimpl", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserr", 
"//pkg/syserror", @@ -45,6 +44,7 @@ go_library( "//pkg/tcpip/stack", "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 318acbeff..8619cc506 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -34,20 +34,19 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/unimpl" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" @@ -57,6 +56,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go index 2d2c1ba2a..5afff2564 100644 --- a/pkg/sentry/socket/netstack/provider.go +++ b/pkg/sentry/socket/netstack/provider.go @@ -18,7 +18,7 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 2389a9cdb..50d9744e6 100644 --- 
a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -24,16 +24,16 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" ) // ControlMessages represents the union of unix control messages and tcpip diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index bade18686..08743deba 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -12,23 +12,23 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/refs", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", - "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/socket/netstack", "//pkg/sentry/socket/unix/transport", - "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go index 2447f24ef..129949990 100644 --- a/pkg/sentry/socket/unix/io.go +++ b/pkg/sentry/socket/unix/io.go @@ -15,8 +15,8 @@ package unix import ( - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/safemem" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/tcpip" ) diff --git a/pkg/sentry/socket/unix/transport/BUILD 
b/pkg/sentry/socket/unix/transport/BUILD index 4bdfc9208..74bcd6300 100644 --- a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -28,9 +28,9 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/ilist", "//pkg/refs", - "//pkg/sentry/context", "//pkg/sync", "//pkg/syserr", "//pkg/tcpip", diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index 9e6fbc111..ce5b94ee7 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -16,7 +16,7 @@ package transport import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 0322dec0b..4b06d63ac 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -16,7 +16,7 @@ package transport import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/waiter" diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index fcc0da332..dcbafe0e5 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -19,7 +19,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 7f49ba864..4d30aa714 100644 --- a/pkg/sentry/socket/unix/unix.go +++ 
b/pkg/sentry/socket/unix/unix.go @@ -22,9 +22,9 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -33,10 +33,10 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index ff6fafa63..762a946fe 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -34,7 +34,7 @@ go_library( "//pkg/sentry/socket/netlink", "//pkg/sentry/socket/netstack", "//pkg/sentry/syscalls/linux", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/strace/poll.go b/pkg/sentry/strace/poll.go index 5187594a7..074e80f9b 100644 --- a/pkg/sentry/strace/poll.go +++ b/pkg/sentry/strace/poll.go @@ -22,7 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // PollEventSet is the set of poll(2) event flags. 
diff --git a/pkg/sentry/strace/select.go b/pkg/sentry/strace/select.go index c77d418e6..3a4c32aa0 100644 --- a/pkg/sentry/strace/select.go +++ b/pkg/sentry/strace/select.go @@ -19,7 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) func fdsFromSet(t *kernel.Task, set []byte) []int { diff --git a/pkg/sentry/strace/signal.go b/pkg/sentry/strace/signal.go index 5656d53eb..c41f36e3f 100644 --- a/pkg/sentry/strace/signal.go +++ b/pkg/sentry/strace/signal.go @@ -21,7 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // signalNames contains the names of all named signals. diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index b6d7177f4..d2079c85f 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -26,7 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/netlink" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // SocketFamily are the possible socket(2) families. diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 629c1f308..3fc4a47fc 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -33,7 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" pb "gvisor.dev/gvisor/pkg/sentry/strace/strace_go_proto" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // DefaultLogMaximumSize is the default LogMaximumSize. 
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 7d74e0f70..8d6c52850 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -63,11 +63,12 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/bpf", + "//pkg/context", "//pkg/log", "//pkg/metric", "//pkg/rand", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/lock", @@ -87,16 +88,15 @@ go_library( "//pkg/sentry/loader", "//pkg/sentry/memmap", "//pkg/sentry/mm", - "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/syscalls", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go index c76771a54..7435b50bf 100644 --- a/pkg/sentry/syscalls/linux/linux64_amd64.go +++ b/pkg/sentry/syscalls/linux/linux64_amd64.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/syscalls" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // AMD64 is a table of Linux amd64 syscall API with the corresponding syscall diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go index d3587fda6..03a39fe65 100644 --- a/pkg/sentry/syscalls/linux/linux64_arm64.go +++ b/pkg/sentry/syscalls/linux/linux64_arm64.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/syscalls" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // ARM64 is a table of Linux arm64 syscall API with the corresponding syscall diff --git 
a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go index 333013d8c..2ddb2b146 100644 --- a/pkg/sentry/syscalls/linux/sigset.go +++ b/pkg/sentry/syscalls/linux/sigset.go @@ -17,8 +17,8 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // copyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index f56411bfe..b401978db 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // I/O commands. diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index 65b4a227b..5f11b496c 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" "gvisor.dev/gvisor/pkg/sentry/syscalls" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 9bc2445a5..c54735148 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -18,8 +18,8 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" 
"gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" @@ -28,8 +28,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/fasync" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // fileOpAt performs an operation on the second last component in the path. diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index bde17a767..b68261f72 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -21,8 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // futexWaitRestartBlock encapsulates the state required to restart futex(2) diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index 912cbe4ff..f66f4ffde 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Getdents implements linux syscall getdents(2) for 64bit systems. 
diff --git a/pkg/sentry/syscalls/linux/sys_mempolicy.go b/pkg/sentry/syscalls/linux/sys_mempolicy.go index f5a519d8a..ac934dc6f 100644 --- a/pkg/sentry/syscalls/linux/sys_mempolicy.go +++ b/pkg/sentry/syscalls/linux/sys_mempolicy.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // We unconditionally report a single NUMA node. This also means that our diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 58a05b5bb..9959f6e61 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -22,8 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Brk implements linux syscall brk(2). diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go index 8c13e2d82..eb5ff48f5 100644 --- a/pkg/sentry/syscalls/linux/sys_mount.go +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -19,8 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Mount implements Linux syscall mount(2). 
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 418d7fa5f..798344042 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // pipe2 implements the actual system call with flags. diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index 2b2df989a..4f8762d7d 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/syscalls/linux/sys_random.go b/pkg/sentry/syscalls/linux/sys_random.go index bc4c588bf..c0aa0fd60 100644 --- a/pkg/sentry/syscalls/linux/sys_random.go +++ b/pkg/sentry/syscalls/linux/sys_random.go @@ -19,11 +19,11 @@ import ( "math" "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index cd31e0649..f9f594190 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" - "gvisor.dev/gvisor/pkg/sentry/usermem" 
"gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index 51e3f836b..e08c333d6 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -19,8 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // rlimit describes an implementation of 'struct rlimit', which may vary from diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go index 18510ead8..5b7a66f4d 100644 --- a/pkg/sentry/syscalls/linux/sys_seccomp.go +++ b/pkg/sentry/syscalls/linux/sys_seccomp.go @@ -19,8 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // userSockFprog is equivalent to Linux's struct sock_fprog on amd64. 
diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go index cde3b54e7..5f54f2456 100644 --- a/pkg/sentry/syscalls/linux/sys_sem.go +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -22,8 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const opsMax = 500 // SEMOPM diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index fb6efd5d8..209be2990 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/signalfd" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // "For a process to have permission to send a signal it must diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index cda517a81..2919228d0 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -26,9 +26,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // minListenBacklog is the minimum reasonable backlog for listening sockets. 
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 69b17b799..c841abccb 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -19,8 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Stat implements linux syscall stat(2). diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go index 58afb4a9a..75a567bd4 100644 --- a/pkg/sentry/syscalls/linux/sys_stat_amd64.go +++ b/pkg/sentry/syscalls/linux/sys_stat_amd64.go @@ -21,7 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // copyOutStat copies the attributes (sattr, uattr) to the struct stat at diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go index 3e1251e0b..80c98d05c 100644 --- a/pkg/sentry/syscalls/linux/sys_stat_arm64.go +++ b/pkg/sentry/syscalls/linux/sys_stat_arm64.go @@ -21,7 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // copyOutStat copies the attributes (sattr, uattr) to the struct stat at diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index b47c3b5c4..0c9e2255d 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -24,8 +24,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/loader" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + 
"gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go index b887fa9d7..2d2aa0819 100644 --- a/pkg/sentry/syscalls/linux/sys_time.go +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -22,8 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // The most significant 29 bits hold either a pid or a file descriptor. diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index d4134207b..432351917 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const nsecPerSec = int64(time.Second) diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index ad4b67806..aba892939 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go index 77deb8980..efb95555c 100644 --- a/pkg/sentry/syscalls/linux/sys_xattr.go +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -21,8 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" 
"gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // GetXattr implements linux syscall getxattr(2). diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go index 4ff8f9234..ddc3ee26e 100644 --- a/pkg/sentry/syscalls/linux/timespec.go +++ b/pkg/sentry/syscalls/linux/timespec.go @@ -19,8 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // copyTimespecIn copies a Timespec from the untrusted app range to the kernel. diff --git a/pkg/sentry/unimpl/BUILD b/pkg/sentry/unimpl/BUILD index 370fa6ec5..5d4aa3a63 100644 --- a/pkg/sentry/unimpl/BUILD +++ b/pkg/sentry/unimpl/BUILD @@ -14,7 +14,7 @@ go_library( srcs = ["events.go"], visibility = ["//:sandbox"], deps = [ + "//pkg/context", "//pkg/log", - "//pkg/sentry/context", ], ) diff --git a/pkg/sentry/unimpl/events.go b/pkg/sentry/unimpl/events.go index 79b5de9e4..73ed9372f 100644 --- a/pkg/sentry/unimpl/events.go +++ b/pkg/sentry/unimpl/events.go @@ -17,8 +17,8 @@ package unimpl import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" ) // contextID is the events package's type for context.Context.Value keys. 
diff --git a/pkg/sentry/uniqueid/BUILD b/pkg/sentry/uniqueid/BUILD index e9c18f170..7467e6398 100644 --- a/pkg/sentry/uniqueid/BUILD +++ b/pkg/sentry/uniqueid/BUILD @@ -7,7 +7,7 @@ go_library( srcs = ["context.go"], visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/socket/unix/transport", ], ) diff --git a/pkg/sentry/uniqueid/context.go b/pkg/sentry/uniqueid/context.go index 4e466d66d..1fb884a90 100644 --- a/pkg/sentry/uniqueid/context.go +++ b/pkg/sentry/uniqueid/context.go @@ -17,7 +17,7 @@ package uniqueid import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" ) diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD deleted file mode 100644 index c8322e29e..000000000 --- a/pkg/sentry/usermem/BUILD +++ /dev/null @@ -1,55 +0,0 @@ -load("//tools:defs.bzl", "go_library", "go_test") -load("//tools/go_generics:defs.bzl", "go_template_instance") - -package(licenses = ["notice"]) - -go_template_instance( - name = "addr_range", - out = "addr_range.go", - package = "usermem", - prefix = "Addr", - template = "//pkg/segment:generic_range", - types = { - "T": "Addr", - }, -) - -go_library( - name = "usermem", - srcs = [ - "access_type.go", - "addr.go", - "addr_range.go", - "addr_range_seq_unsafe.go", - "bytes_io.go", - "bytes_io_unsafe.go", - "usermem.go", - "usermem_arm64.go", - "usermem_unsafe.go", - "usermem_x86.go", - ], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/atomicbitops", - "//pkg/binary", - "//pkg/log", - "//pkg/sentry/context", - "//pkg/sentry/safemem", - "//pkg/syserror", - ], -) - -go_test( - name = "usermem_test", - size = "small", - srcs = [ - "addr_range_seq_test.go", - "usermem_test.go", - ], - library = ":usermem", - deps = [ - "//pkg/sentry/context", - "//pkg/sentry/safemem", - "//pkg/syserror", - ], -) diff --git a/pkg/sentry/usermem/README.md b/pkg/sentry/usermem/README.md deleted file 
mode 100644 index f6d2137eb..000000000 --- a/pkg/sentry/usermem/README.md +++ /dev/null @@ -1,31 +0,0 @@ -This package defines primitives for sentry access to application memory. - -Major types: - -- The `IO` interface represents a virtual address space and provides I/O - methods on that address space. `IO` is the lowest-level primitive. The - primary implementation of the `IO` interface is `mm.MemoryManager`. - -- `IOSequence` represents a collection of individually-contiguous address - ranges in a `IO` that is operated on sequentially, analogous to Linux's - `struct iov_iter`. - -Major usage patterns: - -- Access to a task's virtual memory, subject to the application's memory - protections and while running on that task's goroutine, from a context that - is at or above the level of the `kernel` package (e.g. most syscall - implementations in `syscalls/linux`); use the `kernel.Task.Copy*` wrappers - defined in `kernel/task_usermem.go`. - -- Access to a task's virtual memory, from a context that is at or above the - level of the `kernel` package, but where any of the above constraints does - not hold (e.g. `PTRACE_POKEDATA`, which ignores application memory - protections); obtain the task's `mm.MemoryManager` by calling - `kernel.Task.MemoryManager`, and call its `IO` methods directly. - -- Access to a task's virtual memory, from a context that is below the level of - the `kernel` package (e.g. filesystem I/O); clients must pass I/O arguments - from higher layers, usually in the form of an `IOSequence`. The - `kernel.Task.SingleIOSequence` and `kernel.Task.IovecsIOSequence` functions - in `kernel/task_usermem.go` are convenience functions for doing so. diff --git a/pkg/sentry/usermem/access_type.go b/pkg/sentry/usermem/access_type.go deleted file mode 100644 index 9c1742a59..000000000 --- a/pkg/sentry/usermem/access_type.go +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright 2018 The gVisor Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "syscall" -) - -// AccessType specifies memory access types. This is used for -// setting mapping permissions, as well as communicating faults. -// -// +stateify savable -type AccessType struct { - // Read is read access. - Read bool - - // Write is write access. - Write bool - - // Execute is executable access. - Execute bool -} - -// String returns a pretty representation of access. This looks like the -// familiar r-x, rw-, etc. and can be relied on as such. -func (a AccessType) String() string { - bits := [3]byte{'-', '-', '-'} - if a.Read { - bits[0] = 'r' - } - if a.Write { - bits[1] = 'w' - } - if a.Execute { - bits[2] = 'x' - } - return string(bits[:]) -} - -// Any returns true iff at least one of Read, Write or Execute is true. -func (a AccessType) Any() bool { - return a.Read || a.Write || a.Execute -} - -// Prot returns the system prot (syscall.PROT_READ, etc.) for this access. -func (a AccessType) Prot() int { - var prot int - if a.Read { - prot |= syscall.PROT_READ - } - if a.Write { - prot |= syscall.PROT_WRITE - } - if a.Execute { - prot |= syscall.PROT_EXEC - } - return prot -} - -// SupersetOf returns true iff the access types in a are a superset of the -// access types in other. 
-func (a AccessType) SupersetOf(other AccessType) bool { - if !a.Read && other.Read { - return false - } - if !a.Write && other.Write { - return false - } - if !a.Execute && other.Execute { - return false - } - return true -} - -// Intersect returns the access types set in both a and other. -func (a AccessType) Intersect(other AccessType) AccessType { - return AccessType{ - Read: a.Read && other.Read, - Write: a.Write && other.Write, - Execute: a.Execute && other.Execute, - } -} - -// Union returns the access types set in either a or other. -func (a AccessType) Union(other AccessType) AccessType { - return AccessType{ - Read: a.Read || other.Read, - Write: a.Write || other.Write, - Execute: a.Execute || other.Execute, - } -} - -// Effective returns the set of effective access types allowed by a, even if -// some types are not explicitly allowed. -func (a AccessType) Effective() AccessType { - // In Linux, Write and Execute access generally imply Read access. See - // mm/mmap.c:protection_map. - // - // The notable exception is get_user_pages, which only checks against - // the original vma flags. That said, most user memory accesses do not - // use GUP. - if a.Write || a.Execute { - a.Read = true - } - return a -} - -// Convenient access types. -var ( - NoAccess = AccessType{} - Read = AccessType{Read: true} - Write = AccessType{Write: true} - Execute = AccessType{Execute: true} - ReadWrite = AccessType{Read: true, Write: true} - AnyAccess = AccessType{Read: true, Write: true, Execute: true} -) diff --git a/pkg/sentry/usermem/addr.go b/pkg/sentry/usermem/addr.go deleted file mode 100644 index e79210804..000000000 --- a/pkg/sentry/usermem/addr.go +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "fmt" -) - -// Addr represents a generic virtual address. -// -// +stateify savable -type Addr uintptr - -// AddLength adds the given length to start and returns the result. ok is true -// iff adding the length did not overflow the range of Addr. -// -// Note: This function is usually used to get the end of an address range -// defined by its start address and length. Since the resulting end is -// exclusive, end == 0 is technically valid, and corresponds to a range that -// extends to the end of the address space, but ok will be false. This isn't -// expected to ever come up in practice. -func (v Addr) AddLength(length uint64) (end Addr, ok bool) { - end = v + Addr(length) - // The second half of the following check is needed in case uintptr is - // smaller than 64 bits. - ok = end >= v && length <= uint64(^Addr(0)) - return -} - -// RoundDown returns the address rounded down to the nearest page boundary. -func (v Addr) RoundDown() Addr { - return v & ^Addr(PageSize-1) -} - -// RoundUp returns the address rounded up to the nearest page boundary. ok is -// true iff rounding up did not wrap around. -func (v Addr) RoundUp() (addr Addr, ok bool) { - addr = Addr(v + PageSize - 1).RoundDown() - ok = addr >= v - return -} - -// MustRoundUp is equivalent to RoundUp, but panics if rounding up wraps -// around. 
-func (v Addr) MustRoundUp() Addr { - addr, ok := v.RoundUp() - if !ok { - panic(fmt.Sprintf("usermem.Addr(%d).RoundUp() wraps", v)) - } - return addr -} - -// HugeRoundDown returns the address rounded down to the nearest huge page -// boundary. -func (v Addr) HugeRoundDown() Addr { - return v & ^Addr(HugePageSize-1) -} - -// HugeRoundUp returns the address rounded up to the nearest huge page boundary. -// ok is true iff rounding up did not wrap around. -func (v Addr) HugeRoundUp() (addr Addr, ok bool) { - addr = Addr(v + HugePageSize - 1).HugeRoundDown() - ok = addr >= v - return -} - -// PageOffset returns the offset of v into the current page. -func (v Addr) PageOffset() uint64 { - return uint64(v & Addr(PageSize-1)) -} - -// IsPageAligned returns true if v.PageOffset() == 0. -func (v Addr) IsPageAligned() bool { - return v.PageOffset() == 0 -} - -// AddrRange is a range of Addrs. -// -// type AddrRange - -// ToRange returns [v, v+length). -func (v Addr) ToRange(length uint64) (AddrRange, bool) { - end, ok := v.AddLength(length) - return AddrRange{v, end}, ok -} - -// IsPageAligned returns true if ar.Start.IsPageAligned() and -// ar.End.IsPageAligned(). -func (ar AddrRange) IsPageAligned() bool { - return ar.Start.IsPageAligned() && ar.End.IsPageAligned() -} - -// String implements fmt.Stringer.String. -func (ar AddrRange) String() string { - return fmt.Sprintf("[%#x, %#x)", ar.Start, ar.End) -} diff --git a/pkg/sentry/usermem/addr_range_seq_test.go b/pkg/sentry/usermem/addr_range_seq_test.go deleted file mode 100644 index 82f735026..000000000 --- a/pkg/sentry/usermem/addr_range_seq_test.go +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "testing" -) - -var addrRangeSeqTests = []struct { - desc string - ranges []AddrRange -}{ - { - desc: "Empty sequence", - }, - { - desc: "Single empty AddrRange", - ranges: []AddrRange{ - {0x10, 0x10}, - }, - }, - { - desc: "Single non-empty AddrRange of length 1", - ranges: []AddrRange{ - {0x10, 0x11}, - }, - }, - { - desc: "Single non-empty AddrRange of length 2", - ranges: []AddrRange{ - {0x10, 0x12}, - }, - }, - { - desc: "Multiple non-empty AddrRanges", - ranges: []AddrRange{ - {0x10, 0x11}, - {0x20, 0x22}, - }, - }, - { - desc: "Multiple AddrRanges including empty AddrRanges", - ranges: []AddrRange{ - {0x10, 0x10}, - {0x20, 0x20}, - {0x30, 0x33}, - {0x40, 0x44}, - {0x50, 0x50}, - {0x60, 0x60}, - {0x70, 0x77}, - {0x80, 0x88}, - {0x90, 0x90}, - {0xa0, 0xa0}, - }, - }, -} - -func testAddrRangeSeqEqualityWithTailIteration(t *testing.T, ars AddrRangeSeq, wantRanges []AddrRange) { - var wantLen int64 - for _, ar := range wantRanges { - wantLen += int64(ar.Length()) - } - - var i int - for !ars.IsEmpty() { - if gotLen := ars.NumBytes(); gotLen != wantLen { - t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d", i, ars, gotLen, wantLen) - } - if gotN, wantN := ars.NumRanges(), len(wantRanges)-i; gotN != wantN { - t.Errorf("Iteration %d: %v.NumRanges(): got %d, wanted %d", i, ars, gotN, wantN) - } - got := ars.Head() - if i >= len(wantRanges) { - t.Errorf("Iteration %d: %v.Head(): got %s, wanted ", i, ars, got) - } else if want := wantRanges[i]; got != want { - t.Errorf("Iteration %d: %v.Head(): got %s, wanted 
%s", i, ars, got, want) - } - ars = ars.Tail() - wantLen -= int64(got.Length()) - i++ - } - if gotLen := ars.NumBytes(); gotLen != 0 || wantLen != 0 { - t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d (which should be 0)", i, ars, gotLen, wantLen) - } - if gotN := ars.NumRanges(); gotN != 0 { - t.Errorf("Iteration %d: %v.NumRanges(): got %d, wanted 0", i, ars, gotN) - } -} - -func TestAddrRangeSeqTailIteration(t *testing.T) { - for _, test := range addrRangeSeqTests { - t.Run(test.desc, func(t *testing.T) { - testAddrRangeSeqEqualityWithTailIteration(t, AddrRangeSeqFromSlice(test.ranges), test.ranges) - }) - } -} - -func TestAddrRangeSeqDropFirstEmpty(t *testing.T) { - var ars AddrRangeSeq - if got, want := ars.DropFirst(1), ars; got != want { - t.Errorf("%v.DropFirst(1): got %v, wanted %v", ars, got, want) - } -} - -func TestAddrRangeSeqDropSingleByteIteration(t *testing.T) { - // Tests AddrRangeSeq iteration using Head/DropFirst, simulating - // I/O-per-AddrRange. - for _, test := range addrRangeSeqTests { - t.Run(test.desc, func(t *testing.T) { - // Figure out what AddrRanges we expect to see. - var wantLen int64 - var wantRanges []AddrRange - for _, ar := range test.ranges { - wantLen += int64(ar.Length()) - wantRanges = append(wantRanges, ar) - if ar.Length() == 0 { - // We "do" 0 bytes of I/O and then call DropFirst(0), - // advancing to the next AddrRange. - continue - } - // Otherwise we "do" 1 byte of I/O and then call DropFirst(1), - // advancing the AddrRange by 1 byte, or to the next AddrRange - // if this one is exhausted. 
- for ar.Start++; ar.Length() != 0; ar.Start++ { - wantRanges = append(wantRanges, ar) - } - } - t.Logf("Expected AddrRanges: %s (%d bytes)", wantRanges, wantLen) - - ars := AddrRangeSeqFromSlice(test.ranges) - var i int - for !ars.IsEmpty() { - if gotLen := ars.NumBytes(); gotLen != wantLen { - t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d", i, ars, gotLen, wantLen) - } - got := ars.Head() - if i >= len(wantRanges) { - t.Errorf("Iteration %d: %v.Head(): got %s, wanted ", i, ars, got) - } else if want := wantRanges[i]; got != want { - t.Errorf("Iteration %d: %v.Head(): got %s, wanted %s", i, ars, got, want) - } - if got.Length() == 0 { - ars = ars.DropFirst(0) - } else { - ars = ars.DropFirst(1) - wantLen-- - } - i++ - } - if gotLen := ars.NumBytes(); gotLen != 0 || wantLen != 0 { - t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d (which should be 0)", i, ars, gotLen, wantLen) - } - }) - } -} - -func TestAddrRangeSeqTakeFirstEmpty(t *testing.T) { - var ars AddrRangeSeq - if got, want := ars.TakeFirst(1), ars; got != want { - t.Errorf("%v.TakeFirst(1): got %v, wanted %v", ars, got, want) - } -} - -func TestAddrRangeSeqTakeFirst(t *testing.T) { - ranges := []AddrRange{ - {0x10, 0x11}, - {0x20, 0x22}, - {0x30, 0x30}, - {0x40, 0x44}, - {0x50, 0x55}, - {0x60, 0x60}, - {0x70, 0x77}, - } - ars := AddrRangeSeqFromSlice(ranges).TakeFirst(5) - want := []AddrRange{ - {0x10, 0x11}, // +1 byte (total 1 byte), not truncated - {0x20, 0x22}, // +2 bytes (total 3 bytes), not truncated - {0x30, 0x30}, // +0 bytes (total 3 bytes), no change - {0x40, 0x42}, // +2 bytes (total 5 bytes), partially truncated - {0x50, 0x50}, // +0 bytes (total 5 bytes), fully truncated - {0x60, 0x60}, // +0 bytes (total 5 bytes), "fully truncated" (no change) - {0x70, 0x70}, // +0 bytes (total 5 bytes), fully truncated - } - testAddrRangeSeqEqualityWithTailIteration(t, ars, want) -} diff --git a/pkg/sentry/usermem/addr_range_seq_unsafe.go b/pkg/sentry/usermem/addr_range_seq_unsafe.go 
deleted file mode 100644 index c09337c15..000000000 --- a/pkg/sentry/usermem/addr_range_seq_unsafe.go +++ /dev/null @@ -1,277 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "bytes" - "fmt" - "reflect" - "unsafe" -) - -// An AddrRangeSeq represents a sequence of AddrRanges. -// -// AddrRangeSeqs are immutable and may be copied by value. The zero value of -// AddrRangeSeq represents an empty sequence. -// -// An AddrRangeSeq may contain AddrRanges with a length of 0. This is necessary -// since zero-length AddrRanges are significant to MM bounds checks. -type AddrRangeSeq struct { - // If length is 0, then the AddrRangeSeq represents no AddrRanges. - // Invariants: data == 0; offset == 0; limit == 0. - // - // If length is 1, then the AddrRangeSeq represents the single - // AddrRange{offset, offset+limit}. Invariants: data == 0. - // - // Otherwise, length >= 2, and the AddrRangeSeq represents the `length` - // AddrRanges in the array of AddrRanges starting at address `data`, - // starting at `offset` bytes into the first AddrRange and limited to the - // following `limit` bytes. (AddrRanges after `limit` are still iterated, - // but are truncated to a length of 0.) Invariants: data != 0; offset <= - // data[0].Length(); limit > 0; offset+limit <= the combined length of all - // AddrRanges in the array. 
- data unsafe.Pointer - length int - offset Addr - limit Addr -} - -// AddrRangeSeqOf returns an AddrRangeSeq representing the single AddrRange ar. -func AddrRangeSeqOf(ar AddrRange) AddrRangeSeq { - return AddrRangeSeq{ - length: 1, - offset: ar.Start, - limit: ar.Length(), - } -} - -// AddrRangeSeqFromSlice returns an AddrRangeSeq representing all AddrRanges in -// slice. -// -// Whether the returned AddrRangeSeq shares memory with slice is unspecified; -// clients should avoid mutating slices passed to AddrRangeSeqFromSlice. -// -// Preconditions: The combined length of all AddrRanges in slice <= -// math.MaxInt64. -func AddrRangeSeqFromSlice(slice []AddrRange) AddrRangeSeq { - var limit int64 - for _, ar := range slice { - len64 := int64(ar.Length()) - if len64 < 0 { - panic(fmt.Sprintf("Length of AddrRange %v overflows int64", ar)) - } - sum := limit + len64 - if sum < limit { - panic(fmt.Sprintf("Total length of AddrRanges %v overflows int64", slice)) - } - limit = sum - } - return addrRangeSeqFromSliceLimited(slice, limit) -} - -// Preconditions: The combined length of all AddrRanges in slice <= limit. -// limit >= 0. If len(slice) != 0, then limit > 0. -func addrRangeSeqFromSliceLimited(slice []AddrRange, limit int64) AddrRangeSeq { - switch len(slice) { - case 0: - return AddrRangeSeq{} - case 1: - return AddrRangeSeq{ - length: 1, - offset: slice[0].Start, - limit: Addr(limit), - } - default: - return AddrRangeSeq{ - data: unsafe.Pointer(&slice[0]), - length: len(slice), - limit: Addr(limit), - } - } -} - -// IsEmpty returns true if ars.NumRanges() == 0. -// -// Note that since AddrRangeSeq may contain AddrRanges with a length of zero, -// an AddrRange representing 0 bytes (AddrRangeSeq.NumBytes() == 0) is not -// necessarily empty. -func (ars AddrRangeSeq) IsEmpty() bool { - return ars.length == 0 -} - -// NumRanges returns the number of AddrRanges in ars. 
-func (ars AddrRangeSeq) NumRanges() int { - return ars.length -} - -// NumBytes returns the number of bytes represented by ars. -func (ars AddrRangeSeq) NumBytes() int64 { - return int64(ars.limit) -} - -// Head returns the first AddrRange in ars. -// -// Preconditions: !ars.IsEmpty(). -func (ars AddrRangeSeq) Head() AddrRange { - if ars.length == 0 { - panic("empty AddrRangeSeq") - } - if ars.length == 1 { - return AddrRange{ars.offset, ars.offset + ars.limit} - } - ar := *(*AddrRange)(ars.data) - ar.Start += ars.offset - if ar.Length() > ars.limit { - ar.End = ar.Start + ars.limit - } - return ar -} - -// Tail returns an AddrRangeSeq consisting of all AddrRanges in ars after the -// first. -// -// Preconditions: !ars.IsEmpty(). -func (ars AddrRangeSeq) Tail() AddrRangeSeq { - if ars.length == 0 { - panic("empty AddrRangeSeq") - } - if ars.length == 1 { - return AddrRangeSeq{} - } - return ars.externalTail() -} - -// Preconditions: ars.length >= 2. -func (ars AddrRangeSeq) externalTail() AddrRangeSeq { - headLen := (*AddrRange)(ars.data).Length() - ars.offset - var tailLimit int64 - if ars.limit > headLen { - tailLimit = int64(ars.limit - headLen) - } - var extSlice []AddrRange - extSliceHdr := (*reflect.SliceHeader)(unsafe.Pointer(&extSlice)) - extSliceHdr.Data = uintptr(ars.data) - extSliceHdr.Len = ars.length - extSliceHdr.Cap = ars.length - return addrRangeSeqFromSliceLimited(extSlice[1:], tailLimit) -} - -// DropFirst returns an AddrRangeSeq equivalent to ars, but with the first n -// bytes omitted. If n > ars.NumBytes(), DropFirst returns an empty -// AddrRangeSeq. -// -// If !ars.IsEmpty() and ars.Head().Length() == 0, DropFirst will always omit -// at least ars.Head(), even if n == 0. This guarantees that the basic pattern -// of: -// -// for !ars.IsEmpty() { -// n, err = doIOWith(ars.Head()) -// if err != nil { -// return err -// } -// ars = ars.DropFirst(n) -// } -// -// works even in the presence of zero-length AddrRanges. 
-// -// Preconditions: n >= 0. -func (ars AddrRangeSeq) DropFirst(n int) AddrRangeSeq { - if n < 0 { - panic(fmt.Sprintf("invalid n: %d", n)) - } - return ars.DropFirst64(int64(n)) -} - -// DropFirst64 is equivalent to DropFirst but takes an int64. -func (ars AddrRangeSeq) DropFirst64(n int64) AddrRangeSeq { - if n < 0 { - panic(fmt.Sprintf("invalid n: %d", n)) - } - if Addr(n) > ars.limit { - return AddrRangeSeq{} - } - // Handle initial empty AddrRange. - switch ars.length { - case 0: - return AddrRangeSeq{} - case 1: - if ars.limit == 0 { - return AddrRangeSeq{} - } - default: - if rawHeadLen := (*AddrRange)(ars.data).Length(); ars.offset == rawHeadLen { - ars = ars.externalTail() - } - } - for n != 0 { - // Calling ars.Head() here is surprisingly expensive, so inline getting - // the head's length. - var headLen Addr - if ars.length == 1 { - headLen = ars.limit - } else { - headLen = (*AddrRange)(ars.data).Length() - ars.offset - } - if Addr(n) < headLen { - // Dropping ends partway through the head AddrRange. - ars.offset += Addr(n) - ars.limit -= Addr(n) - return ars - } - n -= int64(headLen) - ars = ars.Tail() - } - return ars -} - -// TakeFirst returns an AddrRangeSeq equivalent to ars, but iterating at most n -// bytes. TakeFirst never removes AddrRanges from ars; AddrRanges beyond the -// first n bytes are reduced to a length of zero, but will still be iterated. -// -// Preconditions: n >= 0. -func (ars AddrRangeSeq) TakeFirst(n int) AddrRangeSeq { - if n < 0 { - panic(fmt.Sprintf("invalid n: %d", n)) - } - return ars.TakeFirst64(int64(n)) -} - -// TakeFirst64 is equivalent to TakeFirst but takes an int64. -func (ars AddrRangeSeq) TakeFirst64(n int64) AddrRangeSeq { - if n < 0 { - panic(fmt.Sprintf("invalid n: %d", n)) - } - if ars.limit > Addr(n) { - ars.limit = Addr(n) - } - return ars -} - -// String implements fmt.Stringer.String. 
-func (ars AddrRangeSeq) String() string { - // This is deliberately chosen to be the same as fmt's automatic stringer - // for []AddrRange. - var buf bytes.Buffer - buf.WriteByte('[') - var sep string - for !ars.IsEmpty() { - buf.WriteString(sep) - sep = " " - buf.WriteString(ars.Head().String()) - ars = ars.Tail() - } - buf.WriteByte(']') - return buf.String() -} diff --git a/pkg/sentry/usermem/bytes_io.go b/pkg/sentry/usermem/bytes_io.go deleted file mode 100644 index 7898851b3..000000000 --- a/pkg/sentry/usermem/bytes_io.go +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/syserror" -) - -const maxInt = int(^uint(0) >> 1) - -// BytesIO implements IO using a byte slice. Addresses are interpreted as -// offsets into the slice. Reads and writes beyond the end of the slice return -// EFAULT. -type BytesIO struct { - Bytes []byte -} - -// CopyOut implements IO.CopyOut. -func (b *BytesIO) CopyOut(ctx context.Context, addr Addr, src []byte, opts IOOpts) (int, error) { - rngN, rngErr := b.rangeCheck(addr, len(src)) - if rngN == 0 { - return 0, rngErr - } - return copy(b.Bytes[int(addr):], src[:rngN]), rngErr -} - -// CopyIn implements IO.CopyIn. 
-func (b *BytesIO) CopyIn(ctx context.Context, addr Addr, dst []byte, opts IOOpts) (int, error) { - rngN, rngErr := b.rangeCheck(addr, len(dst)) - if rngN == 0 { - return 0, rngErr - } - return copy(dst[:rngN], b.Bytes[int(addr):]), rngErr -} - -// ZeroOut implements IO.ZeroOut. -func (b *BytesIO) ZeroOut(ctx context.Context, addr Addr, toZero int64, opts IOOpts) (int64, error) { - if toZero > int64(maxInt) { - return 0, syserror.EINVAL - } - rngN, rngErr := b.rangeCheck(addr, int(toZero)) - if rngN == 0 { - return 0, rngErr - } - zeroSlice := b.Bytes[int(addr) : int(addr)+rngN] - for i := range zeroSlice { - zeroSlice[i] = 0 - } - return int64(rngN), rngErr -} - -// CopyOutFrom implements IO.CopyOutFrom. -func (b *BytesIO) CopyOutFrom(ctx context.Context, ars AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error) { - dsts, rngErr := b.blocksFromAddrRanges(ars) - n, err := src.ReadToBlocks(dsts) - if err != nil { - return int64(n), err - } - return int64(n), rngErr -} - -// CopyInTo implements IO.CopyInTo. 
-func (b *BytesIO) CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) { - srcs, rngErr := b.blocksFromAddrRanges(ars) - n, err := dst.WriteFromBlocks(srcs) - if err != nil { - return int64(n), err - } - return int64(n), rngErr -} - -func (b *BytesIO) rangeCheck(addr Addr, length int) (int, error) { - if length == 0 { - return 0, nil - } - if length < 0 { - return 0, syserror.EINVAL - } - max := Addr(len(b.Bytes)) - if addr >= max { - return 0, syserror.EFAULT - } - end, ok := addr.AddLength(uint64(length)) - if !ok || end > max { - return int(max - addr), syserror.EFAULT - } - return length, nil -} - -func (b *BytesIO) blocksFromAddrRanges(ars AddrRangeSeq) (safemem.BlockSeq, error) { - switch ars.NumRanges() { - case 0: - return safemem.BlockSeq{}, nil - case 1: - block, err := b.blockFromAddrRange(ars.Head()) - return safemem.BlockSeqOf(block), err - default: - blocks := make([]safemem.Block, 0, ars.NumRanges()) - for !ars.IsEmpty() { - block, err := b.blockFromAddrRange(ars.Head()) - if block.Len() != 0 { - blocks = append(blocks, block) - } - if err != nil { - return safemem.BlockSeqFromSlice(blocks), err - } - ars = ars.Tail() - } - return safemem.BlockSeqFromSlice(blocks), nil - } -} - -func (b *BytesIO) blockFromAddrRange(ar AddrRange) (safemem.Block, error) { - n, err := b.rangeCheck(ar.Start, int(ar.Length())) - if n == 0 { - return safemem.Block{}, err - } - return safemem.BlockFromSafeSlice(b.Bytes[int(ar.Start) : int(ar.Start)+n]), err -} - -// BytesIOSequence returns an IOSequence representing the given byte slice. 
-func BytesIOSequence(buf []byte) IOSequence { - return IOSequence{ - IO: &BytesIO{buf}, - Addrs: AddrRangeSeqOf(AddrRange{0, Addr(len(buf))}), - } -} diff --git a/pkg/sentry/usermem/bytes_io_unsafe.go b/pkg/sentry/usermem/bytes_io_unsafe.go deleted file mode 100644 index fca5952f4..000000000 --- a/pkg/sentry/usermem/bytes_io_unsafe.go +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "sync/atomic" - "unsafe" - - "gvisor.dev/gvisor/pkg/atomicbitops" - "gvisor.dev/gvisor/pkg/sentry/context" -) - -// SwapUint32 implements IO.SwapUint32. -func (b *BytesIO) SwapUint32(ctx context.Context, addr Addr, new uint32, opts IOOpts) (uint32, error) { - if _, rngErr := b.rangeCheck(addr, 4); rngErr != nil { - return 0, rngErr - } - return atomic.SwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), new), nil -} - -// CompareAndSwapUint32 implements IO.CompareAndSwapUint32. -func (b *BytesIO) CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error) { - if _, rngErr := b.rangeCheck(addr, 4); rngErr != nil { - return 0, rngErr - } - return atomicbitops.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), old, new), nil -} - -// LoadUint32 implements IO.LoadUint32. 
-func (b *BytesIO) LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error) { - if _, err := b.rangeCheck(addr, 4); err != nil { - return 0, err - } - return atomic.LoadUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)]))), nil -} diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go deleted file mode 100644 index 7b1f312b1..000000000 --- a/pkg/sentry/usermem/usermem.go +++ /dev/null @@ -1,597 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package usermem governs access to user memory. -package usermem - -import ( - "bytes" - "errors" - "io" - "strconv" - - "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/syserror" -) - -// IO provides access to the contents of a virtual memory space. -// -// FIXME(b/38173783): Implementations of IO cannot expect ctx to contain any -// meaningful data. -type IO interface { - // CopyOut copies len(src) bytes from src to the memory mapped at addr. It - // returns the number of bytes copied. If the number of bytes copied is < - // len(src), it returns a non-nil error explaining why. - // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. - // - // Postconditions: CopyOut does not retain src. 
- CopyOut(ctx context.Context, addr Addr, src []byte, opts IOOpts) (int, error) - - // CopyIn copies len(dst) bytes from the memory mapped at addr to dst. - // It returns the number of bytes copied. If the number of bytes copied is - // < len(dst), it returns a non-nil error explaining why. - // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. - // - // Postconditions: CopyIn does not retain dst. - CopyIn(ctx context.Context, addr Addr, dst []byte, opts IOOpts) (int, error) - - // ZeroOut sets toZero bytes to 0, starting at addr. It returns the number - // of bytes zeroed. If the number of bytes zeroed is < toZero, it returns a - // non-nil error explaining why. - // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. toZero >= 0. - ZeroOut(ctx context.Context, addr Addr, toZero int64, opts IOOpts) (int64, error) - - // CopyOutFrom copies ars.NumBytes() bytes from src to the memory mapped at - // ars. It returns the number of bytes copied, which may be less than the - // number of bytes read from src if copying fails. CopyOutFrom may return a - // partial copy without an error iff src.ReadToBlocks returns a partial - // read without an error. - // - // CopyOutFrom calls src.ReadToBlocks at most once. - // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. src.ReadToBlocks must not block - // on mm.MemoryManager.activeMu or any preceding locks in the lock order. - CopyOutFrom(ctx context.Context, ars AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error) - - // CopyInTo copies ars.NumBytes() bytes from the memory mapped at ars to - // dst. It returns the number of bytes copied. CopyInTo may return a - // partial copy without an error iff dst.WriteFromBlocks returns a partial - // write without an error. 
- // - // CopyInTo calls dst.WriteFromBlocks at most once. - // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. dst.WriteFromBlocks must not - // block on mm.MemoryManager.activeMu or any preceding locks in the lock - // order. - CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) - - // TODO(jamieliu): The requirement that CopyOutFrom/CopyInTo call src/dst - // at most once, which is unnecessary in most cases, forces implementations - // to gather safemem.Blocks into a single slice to pass to src/dst. Add - // CopyOutFromIter/CopyInToIter, which relaxes this restriction, to avoid - // this allocation. - - // SwapUint32 atomically sets the uint32 value at addr to new and - // returns the previous value. - // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. addr must be aligned to a 4-byte - // boundary. - SwapUint32(ctx context.Context, addr Addr, new uint32, opts IOOpts) (uint32, error) - - // CompareAndSwapUint32 atomically compares the uint32 value at addr to - // old; if they are equal, the value in memory is replaced by new. In - // either case, the previous value stored in memory is returned. - // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. addr must be aligned to a 4-byte - // boundary. - CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error) - - // LoadUint32 atomically loads the uint32 value at addr and returns it. - // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. addr must be aligned to a 4-byte - // boundary. - LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error) -} - -// IOOpts contains options applicable to all IO methods. 
-type IOOpts struct { - // If IgnorePermissions is true, application-defined memory protections set - // by mmap(2) or mprotect(2) will be ignored. (Memory protections required - // by the target of the mapping are never ignored.) - IgnorePermissions bool - - // If AddressSpaceActive is true, the IO implementation may assume that it - // has an active AddressSpace and can therefore use AddressSpace copying - // without performing activation. See mm/io.go for details. - AddressSpaceActive bool -} - -// IOReadWriter is an io.ReadWriter that reads from / writes to addresses -// starting at addr in IO. The preconditions that apply to IO.CopyIn and -// IO.CopyOut also apply to IOReadWriter.Read and IOReadWriter.Write -// respectively. -type IOReadWriter struct { - Ctx context.Context - IO IO - Addr Addr - Opts IOOpts -} - -// Read implements io.Reader.Read. -// -// Note that an address space does not have an "end of file", so Read can only -// return io.EOF if IO.CopyIn returns io.EOF. Attempts to read unmapped or -// unreadable memory, or beyond the end of the address space, should return -// EFAULT. -func (rw *IOReadWriter) Read(dst []byte) (int, error) { - n, err := rw.IO.CopyIn(rw.Ctx, rw.Addr, dst, rw.Opts) - end, ok := rw.Addr.AddLength(uint64(n)) - if ok { - rw.Addr = end - } else { - // Disallow wraparound. - rw.Addr = ^Addr(0) - if err != nil { - err = syserror.EFAULT - } - } - return n, err -} - -// Writer implements io.Writer.Write. -func (rw *IOReadWriter) Write(src []byte) (int, error) { - n, err := rw.IO.CopyOut(rw.Ctx, rw.Addr, src, rw.Opts) - end, ok := rw.Addr.AddLength(uint64(n)) - if ok { - rw.Addr = end - } else { - // Disallow wraparound. - rw.Addr = ^Addr(0) - if err != nil { - err = syserror.EFAULT - } - } - return n, err -} - -// CopyObjectOut copies a fixed-size value or slice of fixed-size values from -// src to the memory mapped at addr in uio. It returns the number of bytes -// copied. 
-// -// CopyObjectOut must use reflection to encode src; performance-sensitive -// clients should do encoding manually and use uio.CopyOut directly. -// -// Preconditions: As for IO.CopyOut. -func CopyObjectOut(ctx context.Context, uio IO, addr Addr, src interface{}, opts IOOpts) (int, error) { - w := &IOReadWriter{ - Ctx: ctx, - IO: uio, - Addr: addr, - Opts: opts, - } - // Allocate a byte slice the size of the object being marshaled. This - // adds an extra reflection call, but avoids needing to grow the slice - // during encoding, which can result in many heap-allocated slices. - b := make([]byte, 0, binary.Size(src)) - return w.Write(binary.Marshal(b, ByteOrder, src)) -} - -// CopyObjectIn copies a fixed-size value or slice of fixed-size values from -// the memory mapped at addr in uio to dst. It returns the number of bytes -// copied. -// -// CopyObjectIn must use reflection to decode dst; performance-sensitive -// clients should use uio.CopyIn directly and do decoding manually. -// -// Preconditions: As for IO.CopyIn. -func CopyObjectIn(ctx context.Context, uio IO, addr Addr, dst interface{}, opts IOOpts) (int, error) { - r := &IOReadWriter{ - Ctx: ctx, - IO: uio, - Addr: addr, - Opts: opts, - } - buf := make([]byte, binary.Size(dst)) - if _, err := io.ReadFull(r, buf); err != nil { - return 0, err - } - binary.Unmarshal(buf, ByteOrder, dst) - return int(r.Addr - addr), nil -} - -// CopyStringIn tuning parameters, defined outside that function for tests. -const ( - copyStringIncrement = 64 - copyStringMaxInitBufLen = 256 -) - -// CopyStringIn copies a NUL-terminated string of unknown length from the -// memory mapped at addr in uio and returns it as a string (not including the -// trailing NUL). If the length of the string, including the terminating NUL, -// would exceed maxlen, CopyStringIn returns the string truncated to maxlen and -// ENAMETOOLONG. -// -// Preconditions: As for IO.CopyFromUser. maxlen >= 0. 
-func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpts) (string, error) { - initLen := maxlen - if initLen > copyStringMaxInitBufLen { - initLen = copyStringMaxInitBufLen - } - buf := make([]byte, initLen) - var done int - for done < maxlen { - // Read up to copyStringIncrement bytes at a time. - readlen := copyStringIncrement - if readlen > maxlen-done { - readlen = maxlen - done - } - end, ok := addr.AddLength(uint64(readlen)) - if !ok { - return stringFromImmutableBytes(buf[:done]), syserror.EFAULT - } - // Shorten the read to avoid crossing page boundaries, since faulting - // in a page unnecessarily is expensive. This also ensures that partial - // copies up to the end of application-mappable memory succeed. - if addr.RoundDown() != end.RoundDown() { - end = end.RoundDown() - readlen = int(end - addr) - } - // Ensure that our buffer is large enough to accommodate the read. - if done+readlen > len(buf) { - newBufLen := len(buf) * 2 - if newBufLen > maxlen { - newBufLen = maxlen - } - buf = append(buf, make([]byte, newBufLen-len(buf))...) - } - n, err := uio.CopyIn(ctx, addr, buf[done:done+readlen], opts) - // Look for the terminating zero byte, which may have occurred before - // hitting err. - if i := bytes.IndexByte(buf[done:done+n], byte(0)); i >= 0 { - return stringFromImmutableBytes(buf[:done+i]), nil - } - - done += n - if err != nil { - return stringFromImmutableBytes(buf[:done]), err - } - addr = end - } - return stringFromImmutableBytes(buf), syserror.ENAMETOOLONG -} - -// CopyOutVec copies bytes from src to the memory mapped at ars in uio. The -// maximum number of bytes copied is ars.NumBytes() or len(src), whichever is -// less. CopyOutVec returns the number of bytes copied; if this is less than -// the maximum, it returns a non-nil error explaining why. -// -// Preconditions: As for IO.CopyOut. 
-func CopyOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, src []byte, opts IOOpts) (int, error) { - var done int - for !ars.IsEmpty() && done < len(src) { - ar := ars.Head() - cplen := len(src) - done - if Addr(cplen) >= ar.Length() { - cplen = int(ar.Length()) - } - n, err := uio.CopyOut(ctx, ar.Start, src[done:done+cplen], opts) - done += n - if err != nil { - return done, err - } - ars = ars.DropFirst(n) - } - return done, nil -} - -// CopyInVec copies bytes from the memory mapped at ars in uio to dst. The -// maximum number of bytes copied is ars.NumBytes() or len(dst), whichever is -// less. CopyInVec returns the number of bytes copied; if this is less than the -// maximum, it returns a non-nil error explaining why. -// -// Preconditions: As for IO.CopyIn. -func CopyInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst []byte, opts IOOpts) (int, error) { - var done int - for !ars.IsEmpty() && done < len(dst) { - ar := ars.Head() - cplen := len(dst) - done - if Addr(cplen) >= ar.Length() { - cplen = int(ar.Length()) - } - n, err := uio.CopyIn(ctx, ar.Start, dst[done:done+cplen], opts) - done += n - if err != nil { - return done, err - } - ars = ars.DropFirst(n) - } - return done, nil -} - -// ZeroOutVec writes zeroes to the memory mapped at ars in uio. The maximum -// number of bytes written is ars.NumBytes() or toZero, whichever is less. -// ZeroOutVec returns the number of bytes written; if this is less than the -// maximum, it returns a non-nil error explaining why. -// -// Preconditions: As for IO.ZeroOut. 
-func ZeroOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, toZero int64, opts IOOpts) (int64, error) { - var done int64 - for !ars.IsEmpty() && done < toZero { - ar := ars.Head() - cplen := toZero - done - if Addr(cplen) >= ar.Length() { - cplen = int64(ar.Length()) - } - n, err := uio.ZeroOut(ctx, ar.Start, cplen, opts) - done += n - if err != nil { - return done, err - } - ars = ars.DropFirst64(n) - } - return done, nil -} - -func isASCIIWhitespace(b byte) bool { - // Compare Linux include/linux/ctype.h, lib/ctype.c. - // 9 => horizontal tab '\t' - // 10 => line feed '\n' - // 11 => vertical tab '\v' - // 12 => form feed '\c' - // 13 => carriage return '\r' - return b == ' ' || (b >= 9 && b <= 13) -} - -// CopyInt32StringsInVec copies up to len(dsts) whitespace-separated decimal -// strings from the memory mapped at ars in uio and converts them to int32 -// values in dsts. It returns the number of bytes read. -// -// CopyInt32StringsInVec shares the following properties with Linux's -// kernel/sysctl.c:proc_dointvec(write=1): -// -// - If any read value overflows the range of int32, or any invalid characters -// are encountered during the read, CopyInt32StringsInVec returns EINVAL. -// -// - If, upon reaching the end of ars, fewer than len(dsts) values have been -// read, CopyInt32StringsInVec returns no error if at least 1 value was read -// and EINVAL otherwise. -// -// - Trailing whitespace after the last successfully read value is counted in -// the number of bytes read. -// -// Unlike proc_dointvec(): -// -// - CopyInt32StringsInVec does not implicitly limit ars.NumBytes() to -// PageSize-1; callers that require this must do so explicitly. -// -// - CopyInt32StringsInVec returns EINVAL if ars.NumBytes() == 0. -// -// Preconditions: As for CopyInVec. 
-func CopyInt32StringsInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dsts []int32, opts IOOpts) (int64, error) { - if len(dsts) == 0 { - return 0, nil - } - - buf := make([]byte, ars.NumBytes()) - n, cperr := CopyInVec(ctx, uio, ars, buf, opts) - buf = buf[:n] - - var i, j int - for ; j < len(dsts); j++ { - // Skip leading whitespace. - for i < len(buf) && isASCIIWhitespace(buf[i]) { - i++ - } - if i == len(buf) { - break - } - - // Find the end of the value to be parsed (next whitespace or end of string). - nextI := i + 1 - for nextI < len(buf) && !isASCIIWhitespace(buf[nextI]) { - nextI++ - } - - // Parse a single value. - val, err := strconv.ParseInt(string(buf[i:nextI]), 10, 32) - if err != nil { - return int64(i), syserror.EINVAL - } - dsts[j] = int32(val) - - i = nextI - } - - // Skip trailing whitespace. - for i < len(buf) && isASCIIWhitespace(buf[i]) { - i++ - } - - if cperr != nil { - return int64(i), cperr - } - if j == 0 { - return int64(i), syserror.EINVAL - } - return int64(i), nil -} - -// CopyInt32StringInVec is equivalent to CopyInt32StringsInVec, but copies at -// most one int32. -func CopyInt32StringInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst *int32, opts IOOpts) (int64, error) { - dsts := [1]int32{*dst} - n, err := CopyInt32StringsInVec(ctx, uio, ars, dsts[:], opts) - *dst = dsts[0] - return n, err -} - -// IOSequence holds arguments to IO methods. -type IOSequence struct { - IO IO - Addrs AddrRangeSeq - Opts IOOpts -} - -// NumBytes returns s.Addrs.NumBytes(). -// -// Note that NumBytes() may return 0 even if !s.Addrs.IsEmpty(), since -// s.Addrs may contain a non-zero number of zero-length AddrRanges. 
-// Many clients of -// IOSequence currently do something like: -// -// if ioseq.NumBytes() == 0 { -// return 0, nil -// } -// if f.availableBytes == 0 { -// return 0, syserror.ErrWouldBlock -// } -// return ioseq.CopyOutFrom(..., reader) -// -// In such cases, using s.Addrs.IsEmpty() will cause them to have the wrong -// behavior for zero-length I/O. However, using s.NumBytes() == 0 instead means -// that we will return success for zero-length I/O in cases where Linux would -// return EFAULT due to a failed access_ok() check, so in the long term we -// should move checks for ErrWouldBlock etc. into the body of -// reader.ReadToBlocks and use s.Addrs.IsEmpty() instead. -func (s IOSequence) NumBytes() int64 { - return s.Addrs.NumBytes() -} - -// DropFirst returns a copy of s with s.Addrs.DropFirst(n). -// -// Preconditions: As for AddrRangeSeq.DropFirst. -func (s IOSequence) DropFirst(n int) IOSequence { - return IOSequence{s.IO, s.Addrs.DropFirst(n), s.Opts} -} - -// DropFirst64 returns a copy of s with s.Addrs.DropFirst64(n). -// -// Preconditions: As for AddrRangeSeq.DropFirst64. -func (s IOSequence) DropFirst64(n int64) IOSequence { - return IOSequence{s.IO, s.Addrs.DropFirst64(n), s.Opts} -} - -// TakeFirst returns a copy of s with s.Addrs.TakeFirst(n). -// -// Preconditions: As for AddrRangeSeq.TakeFirst. -func (s IOSequence) TakeFirst(n int) IOSequence { - return IOSequence{s.IO, s.Addrs.TakeFirst(n), s.Opts} -} - -// TakeFirst64 returns a copy of s with s.Addrs.TakeFirst64(n). -// -// Preconditions: As for AddrRangeSeq.TakeFirst64. -func (s IOSequence) TakeFirst64(n int64) IOSequence { - return IOSequence{s.IO, s.Addrs.TakeFirst64(n), s.Opts} -} - -// CopyOut invokes CopyOutVec over s.Addrs. -// -// As with CopyOutVec, if s.NumBytes() < len(src), the copy will be truncated -// to s.NumBytes(), and a nil error will be returned. -// -// Preconditions: As for CopyOutVec. 
-func (s IOSequence) CopyOut(ctx context.Context, src []byte) (int, error) { - return CopyOutVec(ctx, s.IO, s.Addrs, src, s.Opts) -} - -// CopyIn invokes CopyInVec over s.Addrs. -// -// As with CopyInVec, if s.NumBytes() < len(dst), the copy will be truncated to -// s.NumBytes(), and a nil error will be returned. -// -// Preconditions: As for CopyInVec. -func (s IOSequence) CopyIn(ctx context.Context, dst []byte) (int, error) { - return CopyInVec(ctx, s.IO, s.Addrs, dst, s.Opts) -} - -// ZeroOut invokes ZeroOutVec over s.Addrs. -// -// As with ZeroOutVec, if s.NumBytes() < toZero, the write will be truncated -// to s.NumBytes(), and a nil error will be returned. -// -// Preconditions: As for ZeroOutVec. -func (s IOSequence) ZeroOut(ctx context.Context, toZero int64) (int64, error) { - return ZeroOutVec(ctx, s.IO, s.Addrs, toZero, s.Opts) -} - -// CopyOutFrom invokes s.CopyOutFrom over s.Addrs. -// -// Preconditions: As for IO.CopyOutFrom. -func (s IOSequence) CopyOutFrom(ctx context.Context, src safemem.Reader) (int64, error) { - return s.IO.CopyOutFrom(ctx, s.Addrs, src, s.Opts) -} - -// CopyInTo invokes s.CopyInTo over s.Addrs. -// -// Preconditions: As for IO.CopyInTo. -func (s IOSequence) CopyInTo(ctx context.Context, dst safemem.Writer) (int64, error) { - return s.IO.CopyInTo(ctx, s.Addrs, dst, s.Opts) -} - -// Reader returns an io.Reader that reads from s. Reads beyond the end of s -// return io.EOF. The preconditions that apply to s.CopyIn also apply to the -// returned io.Reader.Read. -func (s IOSequence) Reader(ctx context.Context) io.Reader { - return &ioSequenceReadWriter{ctx, s} -} - -// Writer returns an io.Writer that writes to s. Writes beyond the end of s -// return ErrEndOfIOSequence. The preconditions that apply to s.CopyOut also -// apply to the returned io.Writer.Write. 
-func (s IOSequence) Writer(ctx context.Context) io.Writer { - return &ioSequenceReadWriter{ctx, s} -} - -// ErrEndOfIOSequence is returned by IOSequence.Writer().Write() when -// attempting to write beyond the end of the IOSequence. -var ErrEndOfIOSequence = errors.New("write beyond end of IOSequence") - -type ioSequenceReadWriter struct { - ctx context.Context - s IOSequence -} - -// Read implements io.Reader.Read. -func (rw *ioSequenceReadWriter) Read(dst []byte) (int, error) { - n, err := rw.s.CopyIn(rw.ctx, dst) - rw.s = rw.s.DropFirst(n) - if err == nil && rw.s.NumBytes() == 0 { - err = io.EOF - } - return n, err -} - -// Write implements io.Writer.Write. -func (rw *ioSequenceReadWriter) Write(src []byte) (int, error) { - n, err := rw.s.CopyOut(rw.ctx, src) - rw.s = rw.s.DropFirst(n) - if err == nil && n < len(src) { - err = ErrEndOfIOSequence - } - return n, err -} diff --git a/pkg/sentry/usermem/usermem_arm64.go b/pkg/sentry/usermem/usermem_arm64.go deleted file mode 100644 index fdfc30a66..000000000 --- a/pkg/sentry/usermem/usermem_arm64.go +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package usermem - -import ( - "encoding/binary" - "syscall" -) - -const ( - // PageSize is the system page size. - // arm64 support 4K/16K/64K page size, - // which can be get by syscall.Getpagesize(). - // Currently, only 4K page size is supported. 
- PageSize = 1 << PageShift - - // HugePageSize is the system huge page size. - HugePageSize = 1 << HugePageShift - - // PageShift is the binary log of the system page size. - PageShift = 12 - - // HugePageShift is the binary log of the system huge page size. - // Should be calculated by "PageShift + (PageShift - 3)" - // when multiple page size support is ready. - HugePageShift = 21 -) - -var ( - // ByteOrder is the native byte order (little endian). - ByteOrder = binary.LittleEndian -) - -func init() { - // Make sure the page size is 4K on arm64 platform. - if size := syscall.Getpagesize(); size != PageSize { - panic("Only 4K page size is supported on arm64!") - } -} diff --git a/pkg/sentry/usermem/usermem_test.go b/pkg/sentry/usermem/usermem_test.go deleted file mode 100644 index 299f64754..000000000 --- a/pkg/sentry/usermem/usermem_test.go +++ /dev/null @@ -1,424 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "bytes" - "encoding/binary" - "fmt" - "reflect" - "strings" - "testing" - - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/syserror" -) - -// newContext returns a context.Context that we can use in these tests (we -// can't use contexttest because it depends on usermem). 
-func newContext() context.Context { - return context.Background() -} - -func newBytesIOString(s string) *BytesIO { - return &BytesIO{[]byte(s)} -} - -func TestBytesIOCopyOutSuccess(t *testing.T) { - b := newBytesIOString("ABCDE") - n, err := b.CopyOut(newContext(), 1, []byte("foo"), IOOpts{}) - if wantN := 3; n != wantN || err != nil { - t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := b.Bytes, []byte("AfooE"); !bytes.Equal(got, want) { - t.Errorf("Bytes: got %q, wanted %q", got, want) - } -} - -func TestBytesIOCopyOutFailure(t *testing.T) { - b := newBytesIOString("ABC") - n, err := b.CopyOut(newContext(), 1, []byte("foo"), IOOpts{}) - if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr { - t.Errorf("CopyOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) - } - if got, want := b.Bytes, []byte("Afo"); !bytes.Equal(got, want) { - t.Errorf("Bytes: got %q, wanted %q", got, want) - } -} - -func TestBytesIOCopyInSuccess(t *testing.T) { - b := newBytesIOString("AfooE") - var dst [3]byte - n, err := b.CopyIn(newContext(), 1, dst[:], IOOpts{}) - if wantN := 3; n != wantN || err != nil { - t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := dst[:], []byte("foo"); !bytes.Equal(got, want) { - t.Errorf("dst: got %q, wanted %q", got, want) - } -} - -func TestBytesIOCopyInFailure(t *testing.T) { - b := newBytesIOString("Afo") - var dst [3]byte - n, err := b.CopyIn(newContext(), 1, dst[:], IOOpts{}) - if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr { - t.Errorf("CopyIn: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) - } - if got, want := dst[:], []byte("fo\x00"); !bytes.Equal(got, want) { - t.Errorf("dst: got %q, wanted %q", got, want) - } -} - -func TestBytesIOZeroOutSuccess(t *testing.T) { - b := newBytesIOString("ABCD") - n, err := b.ZeroOut(newContext(), 1, 2, IOOpts{}) - if wantN := int64(2); n != wantN || err != nil { - t.Errorf("ZeroOut: 
got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := b.Bytes, []byte("A\x00\x00D"); !bytes.Equal(got, want) { - t.Errorf("Bytes: got %q, wanted %q", got, want) - } -} - -func TestBytesIOZeroOutFailure(t *testing.T) { - b := newBytesIOString("ABC") - n, err := b.ZeroOut(newContext(), 1, 3, IOOpts{}) - if wantN, wantErr := int64(2), syserror.EFAULT; n != wantN || err != wantErr { - t.Errorf("ZeroOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) - } - if got, want := b.Bytes, []byte("A\x00\x00"); !bytes.Equal(got, want) { - t.Errorf("Bytes: got %q, wanted %q", got, want) - } -} - -func TestBytesIOCopyOutFromSuccess(t *testing.T) { - b := newBytesIOString("ABCDEFGH") - n, err := b.CopyOutFrom(newContext(), AddrRangeSeqFromSlice([]AddrRange{ - {Start: 4, End: 7}, - {Start: 1, End: 4}, - }), safemem.FromIOReader{bytes.NewBufferString("barfoo")}, IOOpts{}) - if wantN := int64(6); n != wantN || err != nil { - t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := b.Bytes, []byte("AfoobarH"); !bytes.Equal(got, want) { - t.Errorf("Bytes: got %q, wanted %q", got, want) - } -} - -func TestBytesIOCopyOutFromFailure(t *testing.T) { - b := newBytesIOString("ABCDE") - n, err := b.CopyOutFrom(newContext(), AddrRangeSeqFromSlice([]AddrRange{ - {Start: 1, End: 4}, - {Start: 4, End: 7}, - }), safemem.FromIOReader{bytes.NewBufferString("foobar")}, IOOpts{}) - if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr { - t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) - } - if got, want := b.Bytes, []byte("Afoob"); !bytes.Equal(got, want) { - t.Errorf("Bytes: got %q, wanted %q", got, want) - } -} - -func TestBytesIOCopyInToSuccess(t *testing.T) { - b := newBytesIOString("AfoobarH") - var dst bytes.Buffer - n, err := b.CopyInTo(newContext(), AddrRangeSeqFromSlice([]AddrRange{ - {Start: 4, End: 7}, - {Start: 1, End: 4}, - }), safemem.FromIOWriter{&dst}, IOOpts{}) - if wantN 
:= int64(6); n != wantN || err != nil { - t.Errorf("CopyInTo: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := dst.Bytes(), []byte("barfoo"); !bytes.Equal(got, want) { - t.Errorf("dst.Bytes(): got %q, wanted %q", got, want) - } -} - -func TestBytesIOCopyInToFailure(t *testing.T) { - b := newBytesIOString("Afoob") - var dst bytes.Buffer - n, err := b.CopyInTo(newContext(), AddrRangeSeqFromSlice([]AddrRange{ - {Start: 1, End: 4}, - {Start: 4, End: 7}, - }), safemem.FromIOWriter{&dst}, IOOpts{}) - if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr { - t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) - } - if got, want := dst.Bytes(), []byte("foob"); !bytes.Equal(got, want) { - t.Errorf("dst.Bytes(): got %q, wanted %q", got, want) - } -} - -type testStruct struct { - Int8 int8 - Uint8 uint8 - Int16 int16 - Uint16 uint16 - Int32 int32 - Uint32 uint32 - Int64 int64 - Uint64 uint64 -} - -func TestCopyObject(t *testing.T) { - wantObj := testStruct{1, 2, 3, 4, 5, 6, 7, 8} - wantN := binary.Size(wantObj) - b := &BytesIO{make([]byte, wantN)} - ctx := newContext() - if n, err := CopyObjectOut(ctx, b, 0, &wantObj, IOOpts{}); n != wantN || err != nil { - t.Fatalf("CopyObjectOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - var gotObj testStruct - if n, err := CopyObjectIn(ctx, b, 0, &gotObj, IOOpts{}); n != wantN || err != nil { - t.Errorf("CopyObjectIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if gotObj != wantObj { - t.Errorf("CopyObject round trip: got %+v, wanted %+v", gotObj, wantObj) - } -} - -func TestCopyStringInShort(t *testing.T) { - // Tests for string length <= copyStringIncrement. 
- want := strings.Repeat("A", copyStringIncrement-2) - mem := want + "\x00" - if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil { - t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want) - } -} - -func TestCopyStringInLong(t *testing.T) { - // Tests for copyStringIncrement < string length <= copyStringMaxInitBufLen - // (requiring multiple calls to IO.CopyIn()). - want := strings.Repeat("A", copyStringIncrement*3/4) + strings.Repeat("B", copyStringIncrement*3/4) - mem := want + "\x00" - if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil { - t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want) - } -} - -func TestCopyStringInVeryLong(t *testing.T) { - // Tests for string length > copyStringMaxInitBufLen (requiring buffer - // reallocation). - want := strings.Repeat("A", copyStringMaxInitBufLen*3/4) + strings.Repeat("B", copyStringMaxInitBufLen*3/4) - mem := want + "\x00" - if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringMaxInitBufLen, IOOpts{}); got != want || err != nil { - t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want) - } -} - -func TestCopyStringInNoTerminatingZeroByte(t *testing.T) { - want := strings.Repeat("A", copyStringIncrement-1) - got, err := CopyStringIn(newContext(), newBytesIOString(want), 0, 2*copyStringIncrement, IOOpts{}) - if wantErr := syserror.EFAULT; got != want || err != wantErr { - t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, wantErr) - } -} - -func TestCopyStringInTruncatedByMaxlen(t *testing.T) { - got, err := CopyStringIn(newContext(), newBytesIOString(strings.Repeat("A", 10)), 0, 5, IOOpts{}) - if want, wantErr := strings.Repeat("A", 5), syserror.ENAMETOOLONG; got != want || err != wantErr { - t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, 
wantErr) - } -} - -func TestCopyInt32StringsInVec(t *testing.T) { - for _, test := range []struct { - str string - n int - initial []int32 - final []int32 - }{ - { - str: "100 200", - n: len("100 200"), - initial: []int32{1, 2}, - final: []int32{100, 200}, - }, - { - // Fewer values ok - str: "100", - n: len("100"), - initial: []int32{1, 2}, - final: []int32{100, 2}, - }, - { - // Extra values ok - str: "100 200 300", - n: len("100 200 "), - initial: []int32{1, 2}, - final: []int32{100, 200}, - }, - { - // Leading and trailing whitespace ok - str: " 100\t200\n", - n: len(" 100\t200\n"), - initial: []int32{1, 2}, - final: []int32{100, 200}, - }, - } { - t.Run(fmt.Sprintf("%q", test.str), func(t *testing.T) { - src := BytesIOSequence([]byte(test.str)) - dsts := append([]int32(nil), test.initial...) - if n, err := CopyInt32StringsInVec(newContext(), src.IO, src.Addrs, dsts, src.Opts); n != int64(test.n) || err != nil { - t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (%d, nil)", n, err, test.n) - } - if !reflect.DeepEqual(dsts, test.final) { - t.Errorf("dsts: got %v, wanted %v", dsts, test.final) - } - }) - } -} - -func TestCopyInt32StringsInVecRequiresOneValidValue(t *testing.T) { - for _, s := range []string{"", "\n", "a123"} { - t.Run(fmt.Sprintf("%q", s), func(t *testing.T) { - src := BytesIOSequence([]byte(s)) - initial := []int32{1, 2} - dsts := append([]int32(nil), initial...) - if n, err := CopyInt32StringsInVec(newContext(), src.IO, src.Addrs, dsts, src.Opts); err != syserror.EINVAL { - t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (_, %v)", n, err, syserror.EINVAL) - } - if !reflect.DeepEqual(dsts, initial) { - t.Errorf("dsts: got %v, wanted %v", dsts, initial) - } - }) - } -} - -func TestIOSequenceCopyOut(t *testing.T) { - buf := []byte("ABCD") - s := BytesIOSequence(buf) - - // CopyOut limited by len(src). 
- n, err := s.CopyOut(newContext(), []byte("fo")) - if wantN := 2; n != wantN || err != nil { - t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if want := []byte("foCD"); !bytes.Equal(buf, want) { - t.Errorf("buf: got %q, wanted %q", buf, want) - } - s = s.DropFirst(2) - if got, want := s.NumBytes(), int64(2); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } - - // CopyOut limited by s.NumBytes(). - n, err = s.CopyOut(newContext(), []byte("obar")) - if wantN := 2; n != wantN || err != nil { - t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if want := []byte("foob"); !bytes.Equal(buf, want) { - t.Errorf("buf: got %q, wanted %q", buf, want) - } - s = s.DropFirst(2) - if got, want := s.NumBytes(), int64(0); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } -} - -func TestIOSequenceCopyIn(t *testing.T) { - s := BytesIOSequence([]byte("foob")) - dst := []byte("ABCDEF") - - // CopyIn limited by len(dst). - n, err := s.CopyIn(newContext(), dst[:2]) - if wantN := 2; n != wantN || err != nil { - t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if want := []byte("foCDEF"); !bytes.Equal(dst, want) { - t.Errorf("dst: got %q, wanted %q", dst, want) - } - s = s.DropFirst(2) - if got, want := s.NumBytes(), int64(2); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } - - // CopyIn limited by s.Remaining(). - n, err = s.CopyIn(newContext(), dst[2:]) - if wantN := 2; n != wantN || err != nil { - t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if want := []byte("foobEF"); !bytes.Equal(dst, want) { - t.Errorf("dst: got %q, wanted %q", dst, want) - } - s = s.DropFirst(2) - if got, want := s.NumBytes(), int64(0); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } -} - -func TestIOSequenceZeroOut(t *testing.T) { - buf := []byte("ABCD") - s := BytesIOSequence(buf) - - // ZeroOut limited by toZero. 
- n, err := s.ZeroOut(newContext(), 2) - if wantN := int64(2); n != wantN || err != nil { - t.Errorf("ZeroOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if want := []byte("\x00\x00CD"); !bytes.Equal(buf, want) { - t.Errorf("buf: got %q, wanted %q", buf, want) - } - s = s.DropFirst(2) - if got, want := s.NumBytes(), int64(2); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } - - // ZeroOut limited by s.NumBytes(). - n, err = s.ZeroOut(newContext(), 4) - if wantN := int64(2); n != wantN || err != nil { - t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if want := []byte("\x00\x00\x00\x00"); !bytes.Equal(buf, want) { - t.Errorf("buf: got %q, wanted %q", buf, want) - } - s = s.DropFirst(2) - if got, want := s.NumBytes(), int64(0); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } -} - -func TestIOSequenceTakeFirst(t *testing.T) { - s := BytesIOSequence([]byte("foobar")) - if got, want := s.NumBytes(), int64(6); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } - - s = s.TakeFirst(3) - if got, want := s.NumBytes(), int64(3); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } - - // TakeFirst(n) where n > s.NumBytes() is a no-op. 
- s = s.TakeFirst(9) - if got, want := s.NumBytes(), int64(3); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } - - var dst [3]byte - n, err := s.CopyIn(newContext(), dst[:]) - if wantN := 3; n != wantN || err != nil { - t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := dst[:], []byte("foo"); !bytes.Equal(got, want) { - t.Errorf("dst: got %q, wanted %q", got, want) - } - s = s.DropFirst(3) - if got, want := s.NumBytes(), int64(0); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } -} diff --git a/pkg/sentry/usermem/usermem_unsafe.go b/pkg/sentry/usermem/usermem_unsafe.go deleted file mode 100644 index 876783e78..000000000 --- a/pkg/sentry/usermem/usermem_unsafe.go +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "unsafe" -) - -// stringFromImmutableBytes is equivalent to string(bs), except that it never -// copies even if escape analysis can't prove that bs does not escape. This is -// only valid if bs is never mutated after stringFromImmutableBytes returns. -func stringFromImmutableBytes(bs []byte) string { - // Compare strings.Builder.String(). 
- return *(*string)(unsafe.Pointer(&bs)) -} diff --git a/pkg/sentry/usermem/usermem_x86.go b/pkg/sentry/usermem/usermem_x86.go deleted file mode 100644 index 8059b72d2..000000000 --- a/pkg/sentry/usermem/usermem_x86.go +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 i386 - -package usermem - -import "encoding/binary" - -const ( - // PageSize is the system page size. - PageSize = 1 << PageShift - - // HugePageSize is the system huge page size. - HugePageSize = 1 << HugePageShift - - // PageShift is the binary log of the system page size. - PageShift = 12 - - // HugePageShift is the binary log of the system huge page size. - HugePageShift = 21 -) - -var ( - // ByteOrder is the native byte order (little endian). 
- ByteOrder = binary.LittleEndian -) diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 51acdc4e9..6b1009328 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -26,14 +26,14 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fspath", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -48,11 +48,11 @@ go_test( library = ":vfs", deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/sentry/contexttest", "//pkg/sentry/kernel/auth", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go index 705194ebc..d97362b9a 100644 --- a/pkg/sentry/vfs/context.go +++ b/pkg/sentry/vfs/context.go @@ -15,7 +15,7 @@ package vfs import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // contextID is this package's type for context.Context.Value keys. 
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go index 9f9d6e783..3af2aa58d 100644 --- a/pkg/sentry/vfs/device.go +++ b/pkg/sentry/vfs/device.go @@ -17,7 +17,7 @@ package vfs import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 51c95c2d9..225024463 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -18,12 +18,12 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index c00b3c84b..fb9b87fdc 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -19,12 +19,12 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 9ed58512f..1720d325d 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -22,11 +22,11 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + 
"gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // fileDescription is the common fd struct which a filesystem implementation diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index ea78f555b..a06a6caf3 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -18,8 +18,8 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" ) // A Filesystem is a tree of nodes represented by Dentries, which forms part of diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go index 023301780..c58b70728 100644 --- a/pkg/sentry/vfs/filesystem_type.go +++ b/pkg/sentry/vfs/filesystem_type.go @@ -18,7 +18,7 @@ import ( "bytes" "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 00177b371..d39528051 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -19,7 +19,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go index cf80df90e..b318c681a 100644 --- a/pkg/sentry/vfs/pathname.go +++ b/pkg/sentry/vfs/pathname.go @@ -15,8 +15,8 @@ package vfs import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go index ee5c8b9e2..392c7611e 100644 --- 
a/pkg/sentry/vfs/testutil.go +++ b/pkg/sentry/vfs/testutil.go @@ -18,8 +18,8 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 1f6f56293..b2bf48853 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -31,8 +31,8 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" diff --git a/pkg/usermem/BUILD b/pkg/usermem/BUILD new file mode 100644 index 000000000..ff8b9e91a --- /dev/null +++ b/pkg/usermem/BUILD @@ -0,0 +1,55 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "addr_range", + out = "addr_range.go", + package = "usermem", + prefix = "Addr", + template = "//pkg/segment:generic_range", + types = { + "T": "Addr", + }, +) + +go_library( + name = "usermem", + srcs = [ + "access_type.go", + "addr.go", + "addr_range.go", + "addr_range_seq_unsafe.go", + "bytes_io.go", + "bytes_io_unsafe.go", + "usermem.go", + "usermem_arm64.go", + "usermem_unsafe.go", + "usermem_x86.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/atomicbitops", + "//pkg/binary", + "//pkg/context", + "//pkg/log", + "//pkg/safemem", + "//pkg/syserror", + ], +) + +go_test( + name = "usermem_test", + size = "small", + srcs = [ + "addr_range_seq_test.go", + "usermem_test.go", + ], + library = ":usermem", + deps = [ + "//pkg/context", + "//pkg/safemem", + "//pkg/syserror", + ], +) diff --git a/pkg/usermem/README.md b/pkg/usermem/README.md new file mode 100644 index 000000000..f6d2137eb --- /dev/null 
+++ b/pkg/usermem/README.md @@ -0,0 +1,31 @@ +This package defines primitives for sentry access to application memory. + +Major types: + +- The `IO` interface represents a virtual address space and provides I/O + methods on that address space. `IO` is the lowest-level primitive. The + primary implementation of the `IO` interface is `mm.MemoryManager`. + +- `IOSequence` represents a collection of individually-contiguous address + ranges in an `IO` that is operated on sequentially, analogous to Linux's + `struct iov_iter`. + +Major usage patterns: + +- Access to a task's virtual memory, subject to the application's memory + protections and while running on that task's goroutine, from a context that + is at or above the level of the `kernel` package (e.g. most syscall + implementations in `syscalls/linux`); use the `kernel.Task.Copy*` wrappers + defined in `kernel/task_usermem.go`. + +- Access to a task's virtual memory, from a context that is at or above the + level of the `kernel` package, but where any of the above constraints does + not hold (e.g. `PTRACE_POKEDATA`, which ignores application memory + protections); obtain the task's `mm.MemoryManager` by calling + `kernel.Task.MemoryManager`, and call its `IO` methods directly. + +- Access to a task's virtual memory, from a context that is below the level of + the `kernel` package (e.g. filesystem I/O); clients must pass I/O arguments + from higher layers, usually in the form of an `IOSequence`. The + `kernel.Task.SingleIOSequence` and `kernel.Task.IovecsIOSequence` functions + in `kernel/task_usermem.go` are convenience functions for doing so. diff --git a/pkg/usermem/access_type.go b/pkg/usermem/access_type.go new file mode 100644 index 000000000..9c1742a59 --- /dev/null +++ b/pkg/usermem/access_type.go @@ -0,0 +1,128 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "syscall" +) + +// AccessType specifies memory access types. This is used for +// setting mapping permissions, as well as communicating faults. +// +// +stateify savable +type AccessType struct { + // Read is read access. + Read bool + + // Write is write access. + Write bool + + // Execute is executable access. + Execute bool +} + +// String returns a pretty representation of access. This looks like the +// familiar r-x, rw-, etc. and can be relied on as such. +func (a AccessType) String() string { + bits := [3]byte{'-', '-', '-'} + if a.Read { + bits[0] = 'r' + } + if a.Write { + bits[1] = 'w' + } + if a.Execute { + bits[2] = 'x' + } + return string(bits[:]) +} + +// Any returns true iff at least one of Read, Write or Execute is true. +func (a AccessType) Any() bool { + return a.Read || a.Write || a.Execute +} + +// Prot returns the system prot (syscall.PROT_READ, etc.) for this access. +func (a AccessType) Prot() int { + var prot int + if a.Read { + prot |= syscall.PROT_READ + } + if a.Write { + prot |= syscall.PROT_WRITE + } + if a.Execute { + prot |= syscall.PROT_EXEC + } + return prot +} + +// SupersetOf returns true iff the access types in a are a superset of the +// access types in other. +func (a AccessType) SupersetOf(other AccessType) bool { + if !a.Read && other.Read { + return false + } + if !a.Write && other.Write { + return false + } + if !a.Execute && other.Execute { + return false + } + return true +} + +// Intersect returns the access types set in both a and other. 
+func (a AccessType) Intersect(other AccessType) AccessType { + return AccessType{ + Read: a.Read && other.Read, + Write: a.Write && other.Write, + Execute: a.Execute && other.Execute, + } +} + +// Union returns the access types set in either a or other. +func (a AccessType) Union(other AccessType) AccessType { + return AccessType{ + Read: a.Read || other.Read, + Write: a.Write || other.Write, + Execute: a.Execute || other.Execute, + } +} + +// Effective returns the set of effective access types allowed by a, even if +// some types are not explicitly allowed. +func (a AccessType) Effective() AccessType { + // In Linux, Write and Execute access generally imply Read access. See + // mm/mmap.c:protection_map. + // + // The notable exception is get_user_pages, which only checks against + // the original vma flags. That said, most user memory accesses do not + // use GUP. + if a.Write || a.Execute { + a.Read = true + } + return a +} + +// Convenient access types. +var ( + NoAccess = AccessType{} + Read = AccessType{Read: true} + Write = AccessType{Write: true} + Execute = AccessType{Execute: true} + ReadWrite = AccessType{Read: true, Write: true} + AnyAccess = AccessType{Read: true, Write: true, Execute: true} +) diff --git a/pkg/usermem/addr.go b/pkg/usermem/addr.go new file mode 100644 index 000000000..e79210804 --- /dev/null +++ b/pkg/usermem/addr.go @@ -0,0 +1,108 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package usermem + +import ( + "fmt" +) + +// Addr represents a generic virtual address. +// +// +stateify savable +type Addr uintptr + +// AddLength adds the given length to start and returns the result. ok is true +// iff adding the length did not overflow the range of Addr. +// +// Note: This function is usually used to get the end of an address range +// defined by its start address and length. Since the resulting end is +// exclusive, end == 0 is technically valid, and corresponds to a range that +// extends to the end of the address space, but ok will be false. This isn't +// expected to ever come up in practice. +func (v Addr) AddLength(length uint64) (end Addr, ok bool) { + end = v + Addr(length) + // The second half of the following check is needed in case uintptr is + // smaller than 64 bits. + ok = end >= v && length <= uint64(^Addr(0)) + return +} + +// RoundDown returns the address rounded down to the nearest page boundary. +func (v Addr) RoundDown() Addr { + return v & ^Addr(PageSize-1) +} + +// RoundUp returns the address rounded up to the nearest page boundary. ok is +// true iff rounding up did not wrap around. +func (v Addr) RoundUp() (addr Addr, ok bool) { + addr = Addr(v + PageSize - 1).RoundDown() + ok = addr >= v + return +} + +// MustRoundUp is equivalent to RoundUp, but panics if rounding up wraps +// around. +func (v Addr) MustRoundUp() Addr { + addr, ok := v.RoundUp() + if !ok { + panic(fmt.Sprintf("usermem.Addr(%d).RoundUp() wraps", v)) + } + return addr +} + +// HugeRoundDown returns the address rounded down to the nearest huge page +// boundary. +func (v Addr) HugeRoundDown() Addr { + return v & ^Addr(HugePageSize-1) +} + +// HugeRoundUp returns the address rounded up to the nearest huge page boundary. +// ok is true iff rounding up did not wrap around. 
+func (v Addr) HugeRoundUp() (addr Addr, ok bool) { + addr = Addr(v + HugePageSize - 1).HugeRoundDown() + ok = addr >= v + return +} + +// PageOffset returns the offset of v into the current page. +func (v Addr) PageOffset() uint64 { + return uint64(v & Addr(PageSize-1)) +} + +// IsPageAligned returns true if v.PageOffset() == 0. +func (v Addr) IsPageAligned() bool { + return v.PageOffset() == 0 +} + +// AddrRange is a range of Addrs. +// +// type AddrRange + +// ToRange returns [v, v+length). +func (v Addr) ToRange(length uint64) (AddrRange, bool) { + end, ok := v.AddLength(length) + return AddrRange{v, end}, ok +} + +// IsPageAligned returns true if ar.Start.IsPageAligned() and +// ar.End.IsPageAligned(). +func (ar AddrRange) IsPageAligned() bool { + return ar.Start.IsPageAligned() && ar.End.IsPageAligned() +} + +// String implements fmt.Stringer.String. +func (ar AddrRange) String() string { + return fmt.Sprintf("[%#x, %#x)", ar.Start, ar.End) +} diff --git a/pkg/usermem/addr_range_seq_test.go b/pkg/usermem/addr_range_seq_test.go new file mode 100644 index 000000000..82f735026 --- /dev/null +++ b/pkg/usermem/addr_range_seq_test.go @@ -0,0 +1,197 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package usermem + +import ( + "testing" +) + +var addrRangeSeqTests = []struct { + desc string + ranges []AddrRange +}{ + { + desc: "Empty sequence", + }, + { + desc: "Single empty AddrRange", + ranges: []AddrRange{ + {0x10, 0x10}, + }, + }, + { + desc: "Single non-empty AddrRange of length 1", + ranges: []AddrRange{ + {0x10, 0x11}, + }, + }, + { + desc: "Single non-empty AddrRange of length 2", + ranges: []AddrRange{ + {0x10, 0x12}, + }, + }, + { + desc: "Multiple non-empty AddrRanges", + ranges: []AddrRange{ + {0x10, 0x11}, + {0x20, 0x22}, + }, + }, + { + desc: "Multiple AddrRanges including empty AddrRanges", + ranges: []AddrRange{ + {0x10, 0x10}, + {0x20, 0x20}, + {0x30, 0x33}, + {0x40, 0x44}, + {0x50, 0x50}, + {0x60, 0x60}, + {0x70, 0x77}, + {0x80, 0x88}, + {0x90, 0x90}, + {0xa0, 0xa0}, + }, + }, +} + +func testAddrRangeSeqEqualityWithTailIteration(t *testing.T, ars AddrRangeSeq, wantRanges []AddrRange) { + var wantLen int64 + for _, ar := range wantRanges { + wantLen += int64(ar.Length()) + } + + var i int + for !ars.IsEmpty() { + if gotLen := ars.NumBytes(); gotLen != wantLen { + t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d", i, ars, gotLen, wantLen) + } + if gotN, wantN := ars.NumRanges(), len(wantRanges)-i; gotN != wantN { + t.Errorf("Iteration %d: %v.NumRanges(): got %d, wanted %d", i, ars, gotN, wantN) + } + got := ars.Head() + if i >= len(wantRanges) { + t.Errorf("Iteration %d: %v.Head(): got %s, wanted ", i, ars, got) + } else if want := wantRanges[i]; got != want { + t.Errorf("Iteration %d: %v.Head(): got %s, wanted %s", i, ars, got, want) + } + ars = ars.Tail() + wantLen -= int64(got.Length()) + i++ + } + if gotLen := ars.NumBytes(); gotLen != 0 || wantLen != 0 { + t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d (which should be 0)", i, ars, gotLen, wantLen) + } + if gotN := ars.NumRanges(); gotN != 0 { + t.Errorf("Iteration %d: %v.NumRanges(): got %d, wanted 0", i, ars, gotN) + } +} + +func TestAddrRangeSeqTailIteration(t 
*testing.T) { + for _, test := range addrRangeSeqTests { + t.Run(test.desc, func(t *testing.T) { + testAddrRangeSeqEqualityWithTailIteration(t, AddrRangeSeqFromSlice(test.ranges), test.ranges) + }) + } +} + +func TestAddrRangeSeqDropFirstEmpty(t *testing.T) { + var ars AddrRangeSeq + if got, want := ars.DropFirst(1), ars; got != want { + t.Errorf("%v.DropFirst(1): got %v, wanted %v", ars, got, want) + } +} + +func TestAddrRangeSeqDropSingleByteIteration(t *testing.T) { + // Tests AddrRangeSeq iteration using Head/DropFirst, simulating + // I/O-per-AddrRange. + for _, test := range addrRangeSeqTests { + t.Run(test.desc, func(t *testing.T) { + // Figure out what AddrRanges we expect to see. + var wantLen int64 + var wantRanges []AddrRange + for _, ar := range test.ranges { + wantLen += int64(ar.Length()) + wantRanges = append(wantRanges, ar) + if ar.Length() == 0 { + // We "do" 0 bytes of I/O and then call DropFirst(0), + // advancing to the next AddrRange. + continue + } + // Otherwise we "do" 1 byte of I/O and then call DropFirst(1), + // advancing the AddrRange by 1 byte, or to the next AddrRange + // if this one is exhausted. 
+ for ar.Start++; ar.Length() != 0; ar.Start++ { + wantRanges = append(wantRanges, ar) + } + } + t.Logf("Expected AddrRanges: %s (%d bytes)", wantRanges, wantLen) + + ars := AddrRangeSeqFromSlice(test.ranges) + var i int + for !ars.IsEmpty() { + if gotLen := ars.NumBytes(); gotLen != wantLen { + t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d", i, ars, gotLen, wantLen) + } + got := ars.Head() + if i >= len(wantRanges) { + t.Errorf("Iteration %d: %v.Head(): got %s, wanted ", i, ars, got) + } else if want := wantRanges[i]; got != want { + t.Errorf("Iteration %d: %v.Head(): got %s, wanted %s", i, ars, got, want) + } + if got.Length() == 0 { + ars = ars.DropFirst(0) + } else { + ars = ars.DropFirst(1) + wantLen-- + } + i++ + } + if gotLen := ars.NumBytes(); gotLen != 0 || wantLen != 0 { + t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d (which should be 0)", i, ars, gotLen, wantLen) + } + }) + } +} + +func TestAddrRangeSeqTakeFirstEmpty(t *testing.T) { + var ars AddrRangeSeq + if got, want := ars.TakeFirst(1), ars; got != want { + t.Errorf("%v.TakeFirst(1): got %v, wanted %v", ars, got, want) + } +} + +func TestAddrRangeSeqTakeFirst(t *testing.T) { + ranges := []AddrRange{ + {0x10, 0x11}, + {0x20, 0x22}, + {0x30, 0x30}, + {0x40, 0x44}, + {0x50, 0x55}, + {0x60, 0x60}, + {0x70, 0x77}, + } + ars := AddrRangeSeqFromSlice(ranges).TakeFirst(5) + want := []AddrRange{ + {0x10, 0x11}, // +1 byte (total 1 byte), not truncated + {0x20, 0x22}, // +2 bytes (total 3 bytes), not truncated + {0x30, 0x30}, // +0 bytes (total 3 bytes), no change + {0x40, 0x42}, // +2 bytes (total 5 bytes), partially truncated + {0x50, 0x50}, // +0 bytes (total 5 bytes), fully truncated + {0x60, 0x60}, // +0 bytes (total 5 bytes), "fully truncated" (no change) + {0x70, 0x70}, // +0 bytes (total 5 bytes), fully truncated + } + testAddrRangeSeqEqualityWithTailIteration(t, ars, want) +} diff --git a/pkg/usermem/addr_range_seq_unsafe.go b/pkg/usermem/addr_range_seq_unsafe.go new file mode 
100644 index 000000000..c09337c15 --- /dev/null +++ b/pkg/usermem/addr_range_seq_unsafe.go @@ -0,0 +1,277 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "bytes" + "fmt" + "reflect" + "unsafe" +) + +// An AddrRangeSeq represents a sequence of AddrRanges. +// +// AddrRangeSeqs are immutable and may be copied by value. The zero value of +// AddrRangeSeq represents an empty sequence. +// +// An AddrRangeSeq may contain AddrRanges with a length of 0. This is necessary +// since zero-length AddrRanges are significant to MM bounds checks. +type AddrRangeSeq struct { + // If length is 0, then the AddrRangeSeq represents no AddrRanges. + // Invariants: data == 0; offset == 0; limit == 0. + // + // If length is 1, then the AddrRangeSeq represents the single + // AddrRange{offset, offset+limit}. Invariants: data == 0. + // + // Otherwise, length >= 2, and the AddrRangeSeq represents the `length` + // AddrRanges in the array of AddrRanges starting at address `data`, + // starting at `offset` bytes into the first AddrRange and limited to the + // following `limit` bytes. (AddrRanges after `limit` are still iterated, + // but are truncated to a length of 0.) Invariants: data != 0; offset <= + // data[0].Length(); limit > 0; offset+limit <= the combined length of all + // AddrRanges in the array. 
+ data unsafe.Pointer + length int + offset Addr + limit Addr +} + +// AddrRangeSeqOf returns an AddrRangeSeq representing the single AddrRange ar. +func AddrRangeSeqOf(ar AddrRange) AddrRangeSeq { + return AddrRangeSeq{ + length: 1, + offset: ar.Start, + limit: ar.Length(), + } +} + +// AddrRangeSeqFromSlice returns an AddrRangeSeq representing all AddrRanges in +// slice. +// +// Whether the returned AddrRangeSeq shares memory with slice is unspecified; +// clients should avoid mutating slices passed to AddrRangeSeqFromSlice. +// +// Preconditions: The combined length of all AddrRanges in slice <= +// math.MaxInt64. +func AddrRangeSeqFromSlice(slice []AddrRange) AddrRangeSeq { + var limit int64 + for _, ar := range slice { + len64 := int64(ar.Length()) + if len64 < 0 { + panic(fmt.Sprintf("Length of AddrRange %v overflows int64", ar)) + } + sum := limit + len64 + if sum < limit { + panic(fmt.Sprintf("Total length of AddrRanges %v overflows int64", slice)) + } + limit = sum + } + return addrRangeSeqFromSliceLimited(slice, limit) +} + +// Preconditions: The combined length of all AddrRanges in slice <= limit. +// limit >= 0. If len(slice) != 0, then limit > 0. +func addrRangeSeqFromSliceLimited(slice []AddrRange, limit int64) AddrRangeSeq { + switch len(slice) { + case 0: + return AddrRangeSeq{} + case 1: + return AddrRangeSeq{ + length: 1, + offset: slice[0].Start, + limit: Addr(limit), + } + default: + return AddrRangeSeq{ + data: unsafe.Pointer(&slice[0]), + length: len(slice), + limit: Addr(limit), + } + } +} + +// IsEmpty returns true if ars.NumRanges() == 0. +// +// Note that since AddrRangeSeq may contain AddrRanges with a length of zero, +// an AddrRangeSeq representing 0 bytes (AddrRangeSeq.NumBytes() == 0) is not +// necessarily empty. +func (ars AddrRangeSeq) IsEmpty() bool { + return ars.length == 0 +} + +// NumRanges returns the number of AddrRanges in ars.
+func (ars AddrRangeSeq) NumRanges() int { + return ars.length +} + +// NumBytes returns the number of bytes represented by ars. +func (ars AddrRangeSeq) NumBytes() int64 { + return int64(ars.limit) +} + +// Head returns the first AddrRange in ars. +// +// Preconditions: !ars.IsEmpty(). +func (ars AddrRangeSeq) Head() AddrRange { + if ars.length == 0 { + panic("empty AddrRangeSeq") + } + if ars.length == 1 { + return AddrRange{ars.offset, ars.offset + ars.limit} + } + ar := *(*AddrRange)(ars.data) + ar.Start += ars.offset + if ar.Length() > ars.limit { + ar.End = ar.Start + ars.limit + } + return ar +} + +// Tail returns an AddrRangeSeq consisting of all AddrRanges in ars after the +// first. +// +// Preconditions: !ars.IsEmpty(). +func (ars AddrRangeSeq) Tail() AddrRangeSeq { + if ars.length == 0 { + panic("empty AddrRangeSeq") + } + if ars.length == 1 { + return AddrRangeSeq{} + } + return ars.externalTail() +} + +// Preconditions: ars.length >= 2. +func (ars AddrRangeSeq) externalTail() AddrRangeSeq { + headLen := (*AddrRange)(ars.data).Length() - ars.offset + var tailLimit int64 + if ars.limit > headLen { + tailLimit = int64(ars.limit - headLen) + } + var extSlice []AddrRange + extSliceHdr := (*reflect.SliceHeader)(unsafe.Pointer(&extSlice)) + extSliceHdr.Data = uintptr(ars.data) + extSliceHdr.Len = ars.length + extSliceHdr.Cap = ars.length + return addrRangeSeqFromSliceLimited(extSlice[1:], tailLimit) +} + +// DropFirst returns an AddrRangeSeq equivalent to ars, but with the first n +// bytes omitted. If n > ars.NumBytes(), DropFirst returns an empty +// AddrRangeSeq. +// +// If !ars.IsEmpty() and ars.Head().Length() == 0, DropFirst will always omit +// at least ars.Head(), even if n == 0. This guarantees that the basic pattern +// of: +// +// for !ars.IsEmpty() { +// n, err = doIOWith(ars.Head()) +// if err != nil { +// return err +// } +// ars = ars.DropFirst(n) +// } +// +// works even in the presence of zero-length AddrRanges. 
+// +// Preconditions: n >= 0. +func (ars AddrRangeSeq) DropFirst(n int) AddrRangeSeq { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + return ars.DropFirst64(int64(n)) +} + +// DropFirst64 is equivalent to DropFirst but takes an int64. +func (ars AddrRangeSeq) DropFirst64(n int64) AddrRangeSeq { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + if Addr(n) > ars.limit { + return AddrRangeSeq{} + } + // Handle initial empty AddrRange. + switch ars.length { + case 0: + return AddrRangeSeq{} + case 1: + if ars.limit == 0 { + return AddrRangeSeq{} + } + default: + if rawHeadLen := (*AddrRange)(ars.data).Length(); ars.offset == rawHeadLen { + ars = ars.externalTail() + } + } + for n != 0 { + // Calling ars.Head() here is surprisingly expensive, so inline getting + // the head's length. + var headLen Addr + if ars.length == 1 { + headLen = ars.limit + } else { + headLen = (*AddrRange)(ars.data).Length() - ars.offset + } + if Addr(n) < headLen { + // Dropping ends partway through the head AddrRange. + ars.offset += Addr(n) + ars.limit -= Addr(n) + return ars + } + n -= int64(headLen) + ars = ars.Tail() + } + return ars +} + +// TakeFirst returns an AddrRangeSeq equivalent to ars, but iterating at most n +// bytes. TakeFirst never removes AddrRanges from ars; AddrRanges beyond the +// first n bytes are reduced to a length of zero, but will still be iterated. +// +// Preconditions: n >= 0. +func (ars AddrRangeSeq) TakeFirst(n int) AddrRangeSeq { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + return ars.TakeFirst64(int64(n)) +} + +// TakeFirst64 is equivalent to TakeFirst but takes an int64. +func (ars AddrRangeSeq) TakeFirst64(n int64) AddrRangeSeq { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + if ars.limit > Addr(n) { + ars.limit = Addr(n) + } + return ars +} + +// String implements fmt.Stringer.String. 
+func (ars AddrRangeSeq) String() string { + // This is deliberately chosen to be the same as fmt's automatic stringer + // for []AddrRange. + var buf bytes.Buffer + buf.WriteByte('[') + var sep string + for !ars.IsEmpty() { + buf.WriteString(sep) + sep = " " + buf.WriteString(ars.Head().String()) + ars = ars.Tail() + } + buf.WriteByte(']') + return buf.String() +} diff --git a/pkg/usermem/bytes_io.go b/pkg/usermem/bytes_io.go new file mode 100644 index 000000000..e177d30eb --- /dev/null +++ b/pkg/usermem/bytes_io.go @@ -0,0 +1,141 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/syserror" +) + +const maxInt = int(^uint(0) >> 1) + +// BytesIO implements IO using a byte slice. Addresses are interpreted as +// offsets into the slice. Reads and writes beyond the end of the slice return +// EFAULT. +type BytesIO struct { + Bytes []byte +} + +// CopyOut implements IO.CopyOut. +func (b *BytesIO) CopyOut(ctx context.Context, addr Addr, src []byte, opts IOOpts) (int, error) { + rngN, rngErr := b.rangeCheck(addr, len(src)) + if rngN == 0 { + return 0, rngErr + } + return copy(b.Bytes[int(addr):], src[:rngN]), rngErr +} + +// CopyIn implements IO.CopyIn. 
+func (b *BytesIO) CopyIn(ctx context.Context, addr Addr, dst []byte, opts IOOpts) (int, error) { + rngN, rngErr := b.rangeCheck(addr, len(dst)) + if rngN == 0 { + return 0, rngErr + } + return copy(dst[:rngN], b.Bytes[int(addr):]), rngErr +} + +// ZeroOut implements IO.ZeroOut. +func (b *BytesIO) ZeroOut(ctx context.Context, addr Addr, toZero int64, opts IOOpts) (int64, error) { + if toZero > int64(maxInt) { + return 0, syserror.EINVAL + } + rngN, rngErr := b.rangeCheck(addr, int(toZero)) + if rngN == 0 { + return 0, rngErr + } + zeroSlice := b.Bytes[int(addr) : int(addr)+rngN] + for i := range zeroSlice { + zeroSlice[i] = 0 + } + return int64(rngN), rngErr +} + +// CopyOutFrom implements IO.CopyOutFrom. +func (b *BytesIO) CopyOutFrom(ctx context.Context, ars AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error) { + dsts, rngErr := b.blocksFromAddrRanges(ars) + n, err := src.ReadToBlocks(dsts) + if err != nil { + return int64(n), err + } + return int64(n), rngErr +} + +// CopyInTo implements IO.CopyInTo. 
+func (b *BytesIO) CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) { + srcs, rngErr := b.blocksFromAddrRanges(ars) + n, err := dst.WriteFromBlocks(srcs) + if err != nil { + return int64(n), err + } + return int64(n), rngErr +} + +func (b *BytesIO) rangeCheck(addr Addr, length int) (int, error) { + if length == 0 { + return 0, nil + } + if length < 0 { + return 0, syserror.EINVAL + } + max := Addr(len(b.Bytes)) + if addr >= max { + return 0, syserror.EFAULT + } + end, ok := addr.AddLength(uint64(length)) + if !ok || end > max { + return int(max - addr), syserror.EFAULT + } + return length, nil +} + +func (b *BytesIO) blocksFromAddrRanges(ars AddrRangeSeq) (safemem.BlockSeq, error) { + switch ars.NumRanges() { + case 0: + return safemem.BlockSeq{}, nil + case 1: + block, err := b.blockFromAddrRange(ars.Head()) + return safemem.BlockSeqOf(block), err + default: + blocks := make([]safemem.Block, 0, ars.NumRanges()) + for !ars.IsEmpty() { + block, err := b.blockFromAddrRange(ars.Head()) + if block.Len() != 0 { + blocks = append(blocks, block) + } + if err != nil { + return safemem.BlockSeqFromSlice(blocks), err + } + ars = ars.Tail() + } + return safemem.BlockSeqFromSlice(blocks), nil + } +} + +func (b *BytesIO) blockFromAddrRange(ar AddrRange) (safemem.Block, error) { + n, err := b.rangeCheck(ar.Start, int(ar.Length())) + if n == 0 { + return safemem.Block{}, err + } + return safemem.BlockFromSafeSlice(b.Bytes[int(ar.Start) : int(ar.Start)+n]), err +} + +// BytesIOSequence returns an IOSequence representing the given byte slice. +func BytesIOSequence(buf []byte) IOSequence { + return IOSequence{ + IO: &BytesIO{buf}, + Addrs: AddrRangeSeqOf(AddrRange{0, Addr(len(buf))}), + } +} diff --git a/pkg/usermem/bytes_io_unsafe.go b/pkg/usermem/bytes_io_unsafe.go new file mode 100644 index 000000000..20de5037d --- /dev/null +++ b/pkg/usermem/bytes_io_unsafe.go @@ -0,0 +1,47 @@ +// Copyright 2018 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "sync/atomic" + "unsafe" + + "gvisor.dev/gvisor/pkg/atomicbitops" + "gvisor.dev/gvisor/pkg/context" +) + +// SwapUint32 implements IO.SwapUint32. +func (b *BytesIO) SwapUint32(ctx context.Context, addr Addr, new uint32, opts IOOpts) (uint32, error) { + if _, rngErr := b.rangeCheck(addr, 4); rngErr != nil { + return 0, rngErr + } + return atomic.SwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), new), nil +} + +// CompareAndSwapUint32 implements IO.CompareAndSwapUint32. +func (b *BytesIO) CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error) { + if _, rngErr := b.rangeCheck(addr, 4); rngErr != nil { + return 0, rngErr + } + return atomicbitops.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), old, new), nil +} + +// LoadUint32 implements IO.LoadUint32. +func (b *BytesIO) LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error) { + if _, err := b.rangeCheck(addr, 4); err != nil { + return 0, err + } + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)]))), nil +} diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go new file mode 100644 index 000000000..71fd4e155 --- /dev/null +++ b/pkg/usermem/usermem.go @@ -0,0 +1,597 @@ +// Copyright 2018 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package usermem governs access to user memory. +package usermem + +import ( + "bytes" + "errors" + "io" + "strconv" + + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/syserror" +) + +// IO provides access to the contents of a virtual memory space. +// +// FIXME(b/38173783): Implementations of IO cannot expect ctx to contain any +// meaningful data. +type IO interface { + // CopyOut copies len(src) bytes from src to the memory mapped at addr. It + // returns the number of bytes copied. If the number of bytes copied is < + // len(src), it returns a non-nil error explaining why. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. + // + // Postconditions: CopyOut does not retain src. + CopyOut(ctx context.Context, addr Addr, src []byte, opts IOOpts) (int, error) + + // CopyIn copies len(dst) bytes from the memory mapped at addr to dst. + // It returns the number of bytes copied. If the number of bytes copied is + // < len(dst), it returns a non-nil error explaining why. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. + // + // Postconditions: CopyIn does not retain dst. 
+ CopyIn(ctx context.Context, addr Addr, dst []byte, opts IOOpts) (int, error) + + // ZeroOut sets toZero bytes to 0, starting at addr. It returns the number + // of bytes zeroed. If the number of bytes zeroed is < toZero, it returns a + // non-nil error explaining why. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. toZero >= 0. + ZeroOut(ctx context.Context, addr Addr, toZero int64, opts IOOpts) (int64, error) + + // CopyOutFrom copies ars.NumBytes() bytes from src to the memory mapped at + // ars. It returns the number of bytes copied, which may be less than the + // number of bytes read from src if copying fails. CopyOutFrom may return a + // partial copy without an error iff src.ReadToBlocks returns a partial + // read without an error. + // + // CopyOutFrom calls src.ReadToBlocks at most once. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. src.ReadToBlocks must not block + // on mm.MemoryManager.activeMu or any preceding locks in the lock order. + CopyOutFrom(ctx context.Context, ars AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error) + + // CopyInTo copies ars.NumBytes() bytes from the memory mapped at ars to + // dst. It returns the number of bytes copied. CopyInTo may return a + // partial copy without an error iff dst.WriteFromBlocks returns a partial + // write without an error. + // + // CopyInTo calls dst.WriteFromBlocks at most once. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. dst.WriteFromBlocks must not + // block on mm.MemoryManager.activeMu or any preceding locks in the lock + // order. 
+ CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) + + // TODO(jamieliu): The requirement that CopyOutFrom/CopyInTo call src/dst + // at most once, which is unnecessary in most cases, forces implementations + // to gather safemem.Blocks into a single slice to pass to src/dst. Add + // CopyOutFromIter/CopyInToIter, which relaxes this restriction, to avoid + // this allocation. + + // SwapUint32 atomically sets the uint32 value at addr to new and + // returns the previous value. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. addr must be aligned to a 4-byte + // boundary. + SwapUint32(ctx context.Context, addr Addr, new uint32, opts IOOpts) (uint32, error) + + // CompareAndSwapUint32 atomically compares the uint32 value at addr to + // old; if they are equal, the value in memory is replaced by new. In + // either case, the previous value stored in memory is returned. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. addr must be aligned to a 4-byte + // boundary. + CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error) + + // LoadUint32 atomically loads the uint32 value at addr and returns it. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. addr must be aligned to a 4-byte + // boundary. + LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error) +} + +// IOOpts contains options applicable to all IO methods. +type IOOpts struct { + // If IgnorePermissions is true, application-defined memory protections set + // by mmap(2) or mprotect(2) will be ignored. (Memory protections required + // by the target of the mapping are never ignored.) 
+ IgnorePermissions bool + + // If AddressSpaceActive is true, the IO implementation may assume that it + // has an active AddressSpace and can therefore use AddressSpace copying + // without performing activation. See mm/io.go for details. + AddressSpaceActive bool +} + +// IOReadWriter is an io.ReadWriter that reads from / writes to addresses +// starting at addr in IO. The preconditions that apply to IO.CopyIn and +// IO.CopyOut also apply to IOReadWriter.Read and IOReadWriter.Write +// respectively. +type IOReadWriter struct { + Ctx context.Context + IO IO + Addr Addr + Opts IOOpts +} + +// Read implements io.Reader.Read. +// +// Note that an address space does not have an "end of file", so Read can only +// return io.EOF if IO.CopyIn returns io.EOF. Attempts to read unmapped or +// unreadable memory, or beyond the end of the address space, should return +// EFAULT. +func (rw *IOReadWriter) Read(dst []byte) (int, error) { + n, err := rw.IO.CopyIn(rw.Ctx, rw.Addr, dst, rw.Opts) + end, ok := rw.Addr.AddLength(uint64(n)) + if ok { + rw.Addr = end + } else { + // Disallow wraparound. + rw.Addr = ^Addr(0) + if err != nil { + err = syserror.EFAULT + } + } + return n, err +} + +// Write implements io.Writer.Write. +func (rw *IOReadWriter) Write(src []byte) (int, error) { + n, err := rw.IO.CopyOut(rw.Ctx, rw.Addr, src, rw.Opts) + end, ok := rw.Addr.AddLength(uint64(n)) + if ok { + rw.Addr = end + } else { + // Disallow wraparound. + rw.Addr = ^Addr(0) + if err != nil { + err = syserror.EFAULT + } + } + return n, err +} + +// CopyObjectOut copies a fixed-size value or slice of fixed-size values from +// src to the memory mapped at addr in uio. It returns the number of bytes +// copied. +// +// CopyObjectOut must use reflection to encode src; performance-sensitive +// clients should do encoding manually and use uio.CopyOut directly. +// +// Preconditions: As for IO.CopyOut. 
+func CopyObjectOut(ctx context.Context, uio IO, addr Addr, src interface{}, opts IOOpts) (int, error) { + w := &IOReadWriter{ + Ctx: ctx, + IO: uio, + Addr: addr, + Opts: opts, + } + // Allocate a byte slice the size of the object being marshaled. This + // adds an extra reflection call, but avoids needing to grow the slice + // during encoding, which can result in many heap-allocated slices. + b := make([]byte, 0, binary.Size(src)) + return w.Write(binary.Marshal(b, ByteOrder, src)) +} + +// CopyObjectIn copies a fixed-size value or slice of fixed-size values from +// the memory mapped at addr in uio to dst. It returns the number of bytes +// copied. +// +// CopyObjectIn must use reflection to decode dst; performance-sensitive +// clients should use uio.CopyIn directly and do decoding manually. +// +// Preconditions: As for IO.CopyIn. +func CopyObjectIn(ctx context.Context, uio IO, addr Addr, dst interface{}, opts IOOpts) (int, error) { + r := &IOReadWriter{ + Ctx: ctx, + IO: uio, + Addr: addr, + Opts: opts, + } + buf := make([]byte, binary.Size(dst)) + if _, err := io.ReadFull(r, buf); err != nil { + return 0, err + } + binary.Unmarshal(buf, ByteOrder, dst) + return int(r.Addr - addr), nil +} + +// CopyStringIn tuning parameters, defined outside that function for tests. +const ( + copyStringIncrement = 64 + copyStringMaxInitBufLen = 256 +) + +// CopyStringIn copies a NUL-terminated string of unknown length from the +// memory mapped at addr in uio and returns it as a string (not including the +// trailing NUL). If the length of the string, including the terminating NUL, +// would exceed maxlen, CopyStringIn returns the string truncated to maxlen and +// ENAMETOOLONG. +// +// Preconditions: As for IO.CopyIn. maxlen >= 0. 
+func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpts) (string, error) { + initLen := maxlen + if initLen > copyStringMaxInitBufLen { + initLen = copyStringMaxInitBufLen + } + buf := make([]byte, initLen) + var done int + for done < maxlen { + // Read up to copyStringIncrement bytes at a time. + readlen := copyStringIncrement + if readlen > maxlen-done { + readlen = maxlen - done + } + end, ok := addr.AddLength(uint64(readlen)) + if !ok { + return stringFromImmutableBytes(buf[:done]), syserror.EFAULT + } + // Shorten the read to avoid crossing page boundaries, since faulting + // in a page unnecessarily is expensive. This also ensures that partial + // copies up to the end of application-mappable memory succeed. + if addr.RoundDown() != end.RoundDown() { + end = end.RoundDown() + readlen = int(end - addr) + } + // Ensure that our buffer is large enough to accommodate the read. + if done+readlen > len(buf) { + newBufLen := len(buf) * 2 + if newBufLen > maxlen { + newBufLen = maxlen + } + buf = append(buf, make([]byte, newBufLen-len(buf))...) + } + n, err := uio.CopyIn(ctx, addr, buf[done:done+readlen], opts) + // Look for the terminating zero byte, which may have occurred before + // hitting err. + if i := bytes.IndexByte(buf[done:done+n], byte(0)); i >= 0 { + return stringFromImmutableBytes(buf[:done+i]), nil + } + + done += n + if err != nil { + return stringFromImmutableBytes(buf[:done]), err + } + addr = end + } + return stringFromImmutableBytes(buf), syserror.ENAMETOOLONG +} + +// CopyOutVec copies bytes from src to the memory mapped at ars in uio. The +// maximum number of bytes copied is ars.NumBytes() or len(src), whichever is +// less. CopyOutVec returns the number of bytes copied; if this is less than +// the maximum, it returns a non-nil error explaining why. +// +// Preconditions: As for IO.CopyOut. 
+func CopyOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, src []byte, opts IOOpts) (int, error) { + var done int + for !ars.IsEmpty() && done < len(src) { + ar := ars.Head() + cplen := len(src) - done + if Addr(cplen) >= ar.Length() { + cplen = int(ar.Length()) + } + n, err := uio.CopyOut(ctx, ar.Start, src[done:done+cplen], opts) + done += n + if err != nil { + return done, err + } + ars = ars.DropFirst(n) + } + return done, nil +} + +// CopyInVec copies bytes from the memory mapped at ars in uio to dst. The +// maximum number of bytes copied is ars.NumBytes() or len(dst), whichever is +// less. CopyInVec returns the number of bytes copied; if this is less than the +// maximum, it returns a non-nil error explaining why. +// +// Preconditions: As for IO.CopyIn. +func CopyInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst []byte, opts IOOpts) (int, error) { + var done int + for !ars.IsEmpty() && done < len(dst) { + ar := ars.Head() + cplen := len(dst) - done + if Addr(cplen) >= ar.Length() { + cplen = int(ar.Length()) + } + n, err := uio.CopyIn(ctx, ar.Start, dst[done:done+cplen], opts) + done += n + if err != nil { + return done, err + } + ars = ars.DropFirst(n) + } + return done, nil +} + +// ZeroOutVec writes zeroes to the memory mapped at ars in uio. The maximum +// number of bytes written is ars.NumBytes() or toZero, whichever is less. +// ZeroOutVec returns the number of bytes written; if this is less than the +// maximum, it returns a non-nil error explaining why. +// +// Preconditions: As for IO.ZeroOut. 
+func ZeroOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, toZero int64, opts IOOpts) (int64, error) { + var done int64 + for !ars.IsEmpty() && done < toZero { + ar := ars.Head() + cplen := toZero - done + if Addr(cplen) >= ar.Length() { + cplen = int64(ar.Length()) + } + n, err := uio.ZeroOut(ctx, ar.Start, cplen, opts) + done += n + if err != nil { + return done, err + } + ars = ars.DropFirst64(n) + } + return done, nil +} + +func isASCIIWhitespace(b byte) bool { + // Compare Linux include/linux/ctype.h, lib/ctype.c. + // 9 => horizontal tab '\t' + // 10 => line feed '\n' + // 11 => vertical tab '\v' + // 12 => form feed '\f' + // 13 => carriage return '\r' + return b == ' ' || (b >= 9 && b <= 13) +} + +// CopyInt32StringsInVec copies up to len(dsts) whitespace-separated decimal +// strings from the memory mapped at ars in uio and converts them to int32 +// values in dsts. It returns the number of bytes read. +// +// CopyInt32StringsInVec shares the following properties with Linux's +// kernel/sysctl.c:proc_dointvec(write=1): +// +// - If any read value overflows the range of int32, or any invalid characters +// are encountered during the read, CopyInt32StringsInVec returns EINVAL. +// +// - If, upon reaching the end of ars, fewer than len(dsts) values have been +// read, CopyInt32StringsInVec returns no error if at least 1 value was read +// and EINVAL otherwise. +// +// - Trailing whitespace after the last successfully read value is counted in +// the number of bytes read. +// +// Unlike proc_dointvec(): +// +// - CopyInt32StringsInVec does not implicitly limit ars.NumBytes() to +// PageSize-1; callers that require this must do so explicitly. +// +// - CopyInt32StringsInVec returns EINVAL if ars.NumBytes() == 0. +// +// Preconditions: As for CopyInVec. 
+func CopyInt32StringsInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dsts []int32, opts IOOpts) (int64, error) { + if len(dsts) == 0 { + return 0, nil + } + + buf := make([]byte, ars.NumBytes()) + n, cperr := CopyInVec(ctx, uio, ars, buf, opts) + buf = buf[:n] + + var i, j int + for ; j < len(dsts); j++ { + // Skip leading whitespace. + for i < len(buf) && isASCIIWhitespace(buf[i]) { + i++ + } + if i == len(buf) { + break + } + + // Find the end of the value to be parsed (next whitespace or end of string). + nextI := i + 1 + for nextI < len(buf) && !isASCIIWhitespace(buf[nextI]) { + nextI++ + } + + // Parse a single value. + val, err := strconv.ParseInt(string(buf[i:nextI]), 10, 32) + if err != nil { + return int64(i), syserror.EINVAL + } + dsts[j] = int32(val) + + i = nextI + } + + // Skip trailing whitespace. + for i < len(buf) && isASCIIWhitespace(buf[i]) { + i++ + } + + if cperr != nil { + return int64(i), cperr + } + if j == 0 { + return int64(i), syserror.EINVAL + } + return int64(i), nil +} + +// CopyInt32StringInVec is equivalent to CopyInt32StringsInVec, but copies at +// most one int32. +func CopyInt32StringInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst *int32, opts IOOpts) (int64, error) { + dsts := [1]int32{*dst} + n, err := CopyInt32StringsInVec(ctx, uio, ars, dsts[:], opts) + *dst = dsts[0] + return n, err +} + +// IOSequence holds arguments to IO methods. +type IOSequence struct { + IO IO + Addrs AddrRangeSeq + Opts IOOpts +} + +// NumBytes returns s.Addrs.NumBytes(). +// +// Note that NumBytes() may return 0 even if !s.Addrs.IsEmpty(), since +// s.Addrs may contain a non-zero number of zero-length AddrRanges. 
+// Many clients of +// IOSequence currently do something like: +// +// if ioseq.NumBytes() == 0 { +// return 0, nil +// } +// if f.availableBytes == 0 { +// return 0, syserror.ErrWouldBlock +// } +// return ioseq.CopyOutFrom(..., reader) +// +// In such cases, using s.Addrs.IsEmpty() will cause them to have the wrong +// behavior for zero-length I/O. However, using s.NumBytes() == 0 instead means +// that we will return success for zero-length I/O in cases where Linux would +// return EFAULT due to a failed access_ok() check, so in the long term we +// should move checks for ErrWouldBlock etc. into the body of +// reader.ReadToBlocks and use s.Addrs.IsEmpty() instead. +func (s IOSequence) NumBytes() int64 { + return s.Addrs.NumBytes() +} + +// DropFirst returns a copy of s with s.Addrs.DropFirst(n). +// +// Preconditions: As for AddrRangeSeq.DropFirst. +func (s IOSequence) DropFirst(n int) IOSequence { + return IOSequence{s.IO, s.Addrs.DropFirst(n), s.Opts} +} + +// DropFirst64 returns a copy of s with s.Addrs.DropFirst64(n). +// +// Preconditions: As for AddrRangeSeq.DropFirst64. +func (s IOSequence) DropFirst64(n int64) IOSequence { + return IOSequence{s.IO, s.Addrs.DropFirst64(n), s.Opts} +} + +// TakeFirst returns a copy of s with s.Addrs.TakeFirst(n). +// +// Preconditions: As for AddrRangeSeq.TakeFirst. +func (s IOSequence) TakeFirst(n int) IOSequence { + return IOSequence{s.IO, s.Addrs.TakeFirst(n), s.Opts} +} + +// TakeFirst64 returns a copy of s with s.Addrs.TakeFirst64(n). +// +// Preconditions: As for AddrRangeSeq.TakeFirst64. +func (s IOSequence) TakeFirst64(n int64) IOSequence { + return IOSequence{s.IO, s.Addrs.TakeFirst64(n), s.Opts} +} + +// CopyOut invokes CopyOutVec over s.Addrs. +// +// As with CopyOutVec, if s.NumBytes() < len(src), the copy will be truncated +// to s.NumBytes(), and a nil error will be returned. +// +// Preconditions: As for CopyOutVec. 
+func (s IOSequence) CopyOut(ctx context.Context, src []byte) (int, error) { + return CopyOutVec(ctx, s.IO, s.Addrs, src, s.Opts) +} + +// CopyIn invokes CopyInVec over s.Addrs. +// +// As with CopyInVec, if s.NumBytes() < len(dst), the copy will be truncated to +// s.NumBytes(), and a nil error will be returned. +// +// Preconditions: As for CopyInVec. +func (s IOSequence) CopyIn(ctx context.Context, dst []byte) (int, error) { + return CopyInVec(ctx, s.IO, s.Addrs, dst, s.Opts) +} + +// ZeroOut invokes ZeroOutVec over s.Addrs. +// +// As with ZeroOutVec, if s.NumBytes() < toZero, the write will be truncated +// to s.NumBytes(), and a nil error will be returned. +// +// Preconditions: As for ZeroOutVec. +func (s IOSequence) ZeroOut(ctx context.Context, toZero int64) (int64, error) { + return ZeroOutVec(ctx, s.IO, s.Addrs, toZero, s.Opts) +} + +// CopyOutFrom invokes s.IO.CopyOutFrom over s.Addrs. +// +// Preconditions: As for IO.CopyOutFrom. +func (s IOSequence) CopyOutFrom(ctx context.Context, src safemem.Reader) (int64, error) { + return s.IO.CopyOutFrom(ctx, s.Addrs, src, s.Opts) +} + +// CopyInTo invokes s.IO.CopyInTo over s.Addrs. +// +// Preconditions: As for IO.CopyInTo. +func (s IOSequence) CopyInTo(ctx context.Context, dst safemem.Writer) (int64, error) { + return s.IO.CopyInTo(ctx, s.Addrs, dst, s.Opts) +} + +// Reader returns an io.Reader that reads from s. Reads beyond the end of s +// return io.EOF. The preconditions that apply to s.CopyIn also apply to the +// returned io.Reader.Read. +func (s IOSequence) Reader(ctx context.Context) io.Reader { + return &ioSequenceReadWriter{ctx, s} +} + +// Writer returns an io.Writer that writes to s. Writes beyond the end of s +// return ErrEndOfIOSequence. The preconditions that apply to s.CopyOut also +// apply to the returned io.Writer.Write. 
+func (s IOSequence) Writer(ctx context.Context) io.Writer { + return &ioSequenceReadWriter{ctx, s} +} + +// ErrEndOfIOSequence is returned by IOSequence.Writer().Write() when +// attempting to write beyond the end of the IOSequence. +var ErrEndOfIOSequence = errors.New("write beyond end of IOSequence") + +type ioSequenceReadWriter struct { + ctx context.Context + s IOSequence +} + +// Read implements io.Reader.Read. +func (rw *ioSequenceReadWriter) Read(dst []byte) (int, error) { + n, err := rw.s.CopyIn(rw.ctx, dst) + rw.s = rw.s.DropFirst(n) + if err == nil && rw.s.NumBytes() == 0 { + err = io.EOF + } + return n, err +} + +// Write implements io.Writer.Write. +func (rw *ioSequenceReadWriter) Write(src []byte) (int, error) { + n, err := rw.s.CopyOut(rw.ctx, src) + rw.s = rw.s.DropFirst(n) + if err == nil && n < len(src) { + err = ErrEndOfIOSequence + } + return n, err +} diff --git a/pkg/usermem/usermem_arm64.go b/pkg/usermem/usermem_arm64.go new file mode 100644 index 000000000..fdfc30a66 --- /dev/null +++ b/pkg/usermem/usermem_arm64.go @@ -0,0 +1,53 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package usermem + +import ( + "encoding/binary" + "syscall" +) + +const ( + // PageSize is the system page size. + // arm64 supports 4K/16K/64K page sizes, + // which can be obtained via syscall.Getpagesize(). + // Currently, only 4K page size is supported. 
+ PageSize = 1 << PageShift + + // HugePageSize is the system huge page size. + HugePageSize = 1 << HugePageShift + + // PageShift is the binary log of the system page size. + PageShift = 12 + + // HugePageShift is the binary log of the system huge page size. + // Should be calculated by "PageShift + (PageShift - 3)" + // when multiple page size support is ready. + HugePageShift = 21 +) + +var ( + // ByteOrder is the native byte order (little endian). + ByteOrder = binary.LittleEndian +) + +func init() { + // Make sure the page size is 4K on arm64 platform. + if size := syscall.Getpagesize(); size != PageSize { + panic("Only 4K page size is supported on arm64!") + } +} diff --git a/pkg/usermem/usermem_test.go b/pkg/usermem/usermem_test.go new file mode 100644 index 000000000..bf3c5df2b --- /dev/null +++ b/pkg/usermem/usermem_test.go @@ -0,0 +1,424 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "bytes" + "encoding/binary" + "fmt" + "reflect" + "strings" + "testing" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/syserror" +) + +// newContext returns a context.Context that we can use in these tests (we +// can't use contexttest because it depends on usermem). 
+func newContext() context.Context { + return context.Background() +} + +func newBytesIOString(s string) *BytesIO { + return &BytesIO{[]byte(s)} +} + +func TestBytesIOCopyOutSuccess(t *testing.T) { + b := newBytesIOString("ABCDE") + n, err := b.CopyOut(newContext(), 1, []byte("foo"), IOOpts{}) + if wantN := 3; n != wantN || err != nil { + t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := b.Bytes, []byte("AfooE"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyOutFailure(t *testing.T) { + b := newBytesIOString("ABC") + n, err := b.CopyOut(newContext(), 1, []byte("foo"), IOOpts{}) + if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr { + t.Errorf("CopyOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) + } + if got, want := b.Bytes, []byte("Afo"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyInSuccess(t *testing.T) { + b := newBytesIOString("AfooE") + var dst [3]byte + n, err := b.CopyIn(newContext(), 1, dst[:], IOOpts{}) + if wantN := 3; n != wantN || err != nil { + t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := dst[:], []byte("foo"); !bytes.Equal(got, want) { + t.Errorf("dst: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyInFailure(t *testing.T) { + b := newBytesIOString("Afo") + var dst [3]byte + n, err := b.CopyIn(newContext(), 1, dst[:], IOOpts{}) + if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr { + t.Errorf("CopyIn: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) + } + if got, want := dst[:], []byte("fo\x00"); !bytes.Equal(got, want) { + t.Errorf("dst: got %q, wanted %q", got, want) + } +} + +func TestBytesIOZeroOutSuccess(t *testing.T) { + b := newBytesIOString("ABCD") + n, err := b.ZeroOut(newContext(), 1, 2, IOOpts{}) + if wantN := int64(2); n != wantN || err != nil { + t.Errorf("ZeroOut: 
got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := b.Bytes, []byte("A\x00\x00D"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOZeroOutFailure(t *testing.T) { + b := newBytesIOString("ABC") + n, err := b.ZeroOut(newContext(), 1, 3, IOOpts{}) + if wantN, wantErr := int64(2), syserror.EFAULT; n != wantN || err != wantErr { + t.Errorf("ZeroOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) + } + if got, want := b.Bytes, []byte("A\x00\x00"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyOutFromSuccess(t *testing.T) { + b := newBytesIOString("ABCDEFGH") + n, err := b.CopyOutFrom(newContext(), AddrRangeSeqFromSlice([]AddrRange{ + {Start: 4, End: 7}, + {Start: 1, End: 4}, + }), safemem.FromIOReader{bytes.NewBufferString("barfoo")}, IOOpts{}) + if wantN := int64(6); n != wantN || err != nil { + t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := b.Bytes, []byte("AfoobarH"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyOutFromFailure(t *testing.T) { + b := newBytesIOString("ABCDE") + n, err := b.CopyOutFrom(newContext(), AddrRangeSeqFromSlice([]AddrRange{ + {Start: 1, End: 4}, + {Start: 4, End: 7}, + }), safemem.FromIOReader{bytes.NewBufferString("foobar")}, IOOpts{}) + if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr { + t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) + } + if got, want := b.Bytes, []byte("Afoob"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyInToSuccess(t *testing.T) { + b := newBytesIOString("AfoobarH") + var dst bytes.Buffer + n, err := b.CopyInTo(newContext(), AddrRangeSeqFromSlice([]AddrRange{ + {Start: 4, End: 7}, + {Start: 1, End: 4}, + }), safemem.FromIOWriter{&dst}, IOOpts{}) + if wantN 
:= int64(6); n != wantN || err != nil { + t.Errorf("CopyInTo: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := dst.Bytes(), []byte("barfoo"); !bytes.Equal(got, want) { + t.Errorf("dst.Bytes(): got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyInToFailure(t *testing.T) { + b := newBytesIOString("Afoob") + var dst bytes.Buffer + n, err := b.CopyInTo(newContext(), AddrRangeSeqFromSlice([]AddrRange{ + {Start: 1, End: 4}, + {Start: 4, End: 7}, + }), safemem.FromIOWriter{&dst}, IOOpts{}) + if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr { + t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) + } + if got, want := dst.Bytes(), []byte("foob"); !bytes.Equal(got, want) { + t.Errorf("dst.Bytes(): got %q, wanted %q", got, want) + } +} + +type testStruct struct { + Int8 int8 + Uint8 uint8 + Int16 int16 + Uint16 uint16 + Int32 int32 + Uint32 uint32 + Int64 int64 + Uint64 uint64 +} + +func TestCopyObject(t *testing.T) { + wantObj := testStruct{1, 2, 3, 4, 5, 6, 7, 8} + wantN := binary.Size(wantObj) + b := &BytesIO{make([]byte, wantN)} + ctx := newContext() + if n, err := CopyObjectOut(ctx, b, 0, &wantObj, IOOpts{}); n != wantN || err != nil { + t.Fatalf("CopyObjectOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + var gotObj testStruct + if n, err := CopyObjectIn(ctx, b, 0, &gotObj, IOOpts{}); n != wantN || err != nil { + t.Errorf("CopyObjectIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if gotObj != wantObj { + t.Errorf("CopyObject round trip: got %+v, wanted %+v", gotObj, wantObj) + } +} + +func TestCopyStringInShort(t *testing.T) { + // Tests for string length <= copyStringIncrement. 
+ want := strings.Repeat("A", copyStringIncrement-2) + mem := want + "\x00" + if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil { + t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want) + } +} + +func TestCopyStringInLong(t *testing.T) { + // Tests for copyStringIncrement < string length <= copyStringMaxInitBufLen + // (requiring multiple calls to IO.CopyIn()). + want := strings.Repeat("A", copyStringIncrement*3/4) + strings.Repeat("B", copyStringIncrement*3/4) + mem := want + "\x00" + if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil { + t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want) + } +} + +func TestCopyStringInVeryLong(t *testing.T) { + // Tests for string length > copyStringMaxInitBufLen (requiring buffer + // reallocation). + want := strings.Repeat("A", copyStringMaxInitBufLen*3/4) + strings.Repeat("B", copyStringMaxInitBufLen*3/4) + mem := want + "\x00" + if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringMaxInitBufLen, IOOpts{}); got != want || err != nil { + t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want) + } +} + +func TestCopyStringInNoTerminatingZeroByte(t *testing.T) { + want := strings.Repeat("A", copyStringIncrement-1) + got, err := CopyStringIn(newContext(), newBytesIOString(want), 0, 2*copyStringIncrement, IOOpts{}) + if wantErr := syserror.EFAULT; got != want || err != wantErr { + t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, wantErr) + } +} + +func TestCopyStringInTruncatedByMaxlen(t *testing.T) { + got, err := CopyStringIn(newContext(), newBytesIOString(strings.Repeat("A", 10)), 0, 5, IOOpts{}) + if want, wantErr := strings.Repeat("A", 5), syserror.ENAMETOOLONG; got != want || err != wantErr { + t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, 
wantErr) + } +} + +func TestCopyInt32StringsInVec(t *testing.T) { + for _, test := range []struct { + str string + n int + initial []int32 + final []int32 + }{ + { + str: "100 200", + n: len("100 200"), + initial: []int32{1, 2}, + final: []int32{100, 200}, + }, + { + // Fewer values ok + str: "100", + n: len("100"), + initial: []int32{1, 2}, + final: []int32{100, 2}, + }, + { + // Extra values ok + str: "100 200 300", + n: len("100 200 "), + initial: []int32{1, 2}, + final: []int32{100, 200}, + }, + { + // Leading and trailing whitespace ok + str: " 100\t200\n", + n: len(" 100\t200\n"), + initial: []int32{1, 2}, + final: []int32{100, 200}, + }, + } { + t.Run(fmt.Sprintf("%q", test.str), func(t *testing.T) { + src := BytesIOSequence([]byte(test.str)) + dsts := append([]int32(nil), test.initial...) + if n, err := CopyInt32StringsInVec(newContext(), src.IO, src.Addrs, dsts, src.Opts); n != int64(test.n) || err != nil { + t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (%d, nil)", n, err, test.n) + } + if !reflect.DeepEqual(dsts, test.final) { + t.Errorf("dsts: got %v, wanted %v", dsts, test.final) + } + }) + } +} + +func TestCopyInt32StringsInVecRequiresOneValidValue(t *testing.T) { + for _, s := range []string{"", "\n", "a123"} { + t.Run(fmt.Sprintf("%q", s), func(t *testing.T) { + src := BytesIOSequence([]byte(s)) + initial := []int32{1, 2} + dsts := append([]int32(nil), initial...) + if n, err := CopyInt32StringsInVec(newContext(), src.IO, src.Addrs, dsts, src.Opts); err != syserror.EINVAL { + t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (_, %v)", n, err, syserror.EINVAL) + } + if !reflect.DeepEqual(dsts, initial) { + t.Errorf("dsts: got %v, wanted %v", dsts, initial) + } + }) + } +} + +func TestIOSequenceCopyOut(t *testing.T) { + buf := []byte("ABCD") + s := BytesIOSequence(buf) + + // CopyOut limited by len(src). 
+ n, err := s.CopyOut(newContext(), []byte("fo")) + if wantN := 2; n != wantN || err != nil { + t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("foCD"); !bytes.Equal(buf, want) { + t.Errorf("buf: got %q, wanted %q", buf, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(2); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + // CopyOut limited by s.NumBytes(). + n, err = s.CopyOut(newContext(), []byte("obar")) + if wantN := 2; n != wantN || err != nil { + t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("foob"); !bytes.Equal(buf, want) { + t.Errorf("buf: got %q, wanted %q", buf, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(0); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } +} + +func TestIOSequenceCopyIn(t *testing.T) { + s := BytesIOSequence([]byte("foob")) + dst := []byte("ABCDEF") + + // CopyIn limited by len(dst). + n, err := s.CopyIn(newContext(), dst[:2]) + if wantN := 2; n != wantN || err != nil { + t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("foCDEF"); !bytes.Equal(dst, want) { + t.Errorf("dst: got %q, wanted %q", dst, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(2); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + // CopyIn limited by s.Remaining(). + n, err = s.CopyIn(newContext(), dst[2:]) + if wantN := 2; n != wantN || err != nil { + t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("foobEF"); !bytes.Equal(dst, want) { + t.Errorf("dst: got %q, wanted %q", dst, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(0); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } +} + +func TestIOSequenceZeroOut(t *testing.T) { + buf := []byte("ABCD") + s := BytesIOSequence(buf) + + // ZeroOut limited by toZero. 
+ n, err := s.ZeroOut(newContext(), 2) + if wantN := int64(2); n != wantN || err != nil { + t.Errorf("ZeroOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("\x00\x00CD"); !bytes.Equal(buf, want) { + t.Errorf("buf: got %q, wanted %q", buf, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(2); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + // ZeroOut limited by s.NumBytes(). + n, err = s.ZeroOut(newContext(), 4) + if wantN := int64(2); n != wantN || err != nil { + t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("\x00\x00\x00\x00"); !bytes.Equal(buf, want) { + t.Errorf("buf: got %q, wanted %q", buf, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(0); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } +} + +func TestIOSequenceTakeFirst(t *testing.T) { + s := BytesIOSequence([]byte("foobar")) + if got, want := s.NumBytes(), int64(6); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + s = s.TakeFirst(3) + if got, want := s.NumBytes(), int64(3); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + // TakeFirst(n) where n > s.NumBytes() is a no-op. 
+ s = s.TakeFirst(9) + if got, want := s.NumBytes(), int64(3); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + var dst [3]byte + n, err := s.CopyIn(newContext(), dst[:]) + if wantN := 3; n != wantN || err != nil { + t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := dst[:], []byte("foo"); !bytes.Equal(got, want) { + t.Errorf("dst: got %q, wanted %q", got, want) + } + s = s.DropFirst(3) + if got, want := s.NumBytes(), int64(0); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } +} diff --git a/pkg/usermem/usermem_unsafe.go b/pkg/usermem/usermem_unsafe.go new file mode 100644 index 000000000..876783e78 --- /dev/null +++ b/pkg/usermem/usermem_unsafe.go @@ -0,0 +1,27 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "unsafe" +) + +// stringFromImmutableBytes is equivalent to string(bs), except that it never +// copies even if escape analysis can't prove that bs does not escape. This is +// only valid if bs is never mutated after stringFromImmutableBytes returns. +func stringFromImmutableBytes(bs []byte) string { + // Compare strings.Builder.String(). 
+ return *(*string)(unsafe.Pointer(&bs)) +} diff --git a/pkg/usermem/usermem_x86.go b/pkg/usermem/usermem_x86.go new file mode 100644 index 000000000..8059b72d2 --- /dev/null +++ b/pkg/usermem/usermem_x86.go @@ -0,0 +1,38 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 i386 + +package usermem + +import "encoding/binary" + +const ( + // PageSize is the system page size. + PageSize = 1 << PageShift + + // HugePageSize is the system huge page size. + HugePageSize = 1 << HugePageShift + + // PageShift is the binary log of the system page size. + PageShift = 12 + + // HugePageShift is the binary log of the system huge page size. + HugePageShift = 21 +) + +var ( + // ByteOrder is the native byte order (little endian). 
+ ByteOrder = binary.LittleEndian +) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index f3ebc0231..a96c80261 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -30,6 +30,7 @@ go_library( deps = [ "//pkg/abi", "//pkg/abi/linux", + "//pkg/context", "//pkg/control/server", "//pkg/cpuid", "//pkg/eventchannel", @@ -39,7 +40,6 @@ go_library( "//pkg/refs", "//pkg/sentry/arch", "//pkg/sentry/arch:registers_go_proto", - "//pkg/sentry/context", "//pkg/sentry/control", "//pkg/sentry/fs", "//pkg/sentry/fs/dev", @@ -71,7 +71,6 @@ go_library( "//pkg/sentry/time", "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sentry/watchdog", "//pkg/sync", "//pkg/syserror", @@ -88,6 +87,7 @@ go_library( "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", "//pkg/urpc", + "//pkg/usermem", "//runsc/boot/filter", "//runsc/boot/platforms", "//runsc/specutils", @@ -111,7 +111,7 @@ go_test( "//pkg/control/server", "//pkg/log", "//pkg/p9", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sync", diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index e5de1f3d7..417d2d5fb 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -17,7 +17,7 @@ package boot import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/kernel" diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 421ccd255..0f62842ea 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -32,8 +32,8 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/gofer" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" diff --git a/runsc/boot/loader_test.go 
b/runsc/boot/loader_test.go index bec0dc292..44aa63196 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -27,7 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/control/server" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" diff --git a/runsc/boot/user.go b/runsc/boot/user.go index 56cc12ee0..f0aa52135 100644 --- a/runsc/boot/user.go +++ b/runsc/boot/user.go @@ -22,10 +22,10 @@ import ( "strings" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) type fileReader struct { diff --git a/runsc/boot/user_test.go b/runsc/boot/user_test.go index 9aee2ad07..fb4e13dfb 100644 --- a/runsc/boot/user_test.go +++ b/runsc/boot/user_test.go @@ -23,7 +23,7 @@ import ( "testing" specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) diff --git a/tools/go_marshal/defs.bzl b/tools/go_marshal/defs.bzl index 2918ceffe..d79786a68 100644 --- a/tools/go_marshal/defs.bzl +++ b/tools/go_marshal/defs.bzl @@ -54,8 +54,8 @@ go_marshal = rule( # marshal_deps are the dependencies requied by generated code. marshal_deps = [ "//tools/go_marshal/marshal", - "//pkg/sentry/platform/safecopy", - "//pkg/sentry/usermem", + "//pkg/safecopy", + "//pkg/usermem", ] # marshal_test_deps are required by test targets. 
diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go index 8392f3f6d..af90bdecb 100644 --- a/tools/go_marshal/gomarshal/generator.go +++ b/tools/go_marshal/gomarshal/generator.go @@ -27,8 +27,8 @@ import ( const ( marshalImport = "gvisor.dev/gvisor/tools/go_marshal/marshal" - usermemImport = "gvisor.dev/gvisor/pkg/sentry/usermem" - safecopyImport = "gvisor.dev/gvisor/pkg/sentry/platform/safecopy" + safecopyImport = "gvisor.dev/gvisor/pkg/safecopy" + usermemImport = "gvisor.dev/gvisor/pkg/usermem" ) // List of identifiers we use in generated code, that may conflict a diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD index 38ba49fed..e345e3a8e 100644 --- a/tools/go_marshal/test/BUILD +++ b/tools/go_marshal/test/BUILD @@ -15,7 +15,7 @@ go_test( deps = [ ":test", "//pkg/binary", - "//pkg/sentry/usermem", + "//pkg/usermem", "//tools/go_marshal/analysis", ], ) diff --git a/tools/go_marshal/test/benchmark_test.go b/tools/go_marshal/test/benchmark_test.go index e70db06d8..e12403741 100644 --- a/tools/go_marshal/test/benchmark_test.go +++ b/tools/go_marshal/test/benchmark_test.go @@ -22,7 +22,7 @@ import ( "testing" "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/tools/go_marshal/analysis" test "gvisor.dev/gvisor/tools/go_marshal/test" ) -- cgit v1.2.3 From 437c986c6a0ed0e1fccfbfb6706f43d2c801c444 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 28 Jan 2020 15:13:46 -0800 Subject: Add vfs.FileDescription to FD table FD table now holds both VFS1 and VFS2 types and uses the correct one based on what's set. Parts of this CL are just initial changes (e.g. sys_read.go, runsc/main.go) to serve as a template for the remaining changes. 
Updates #1487 Updates #1623 PiperOrigin-RevId: 292023223 --- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/fd_table.go | 166 ++++++++++++++++----- pkg/sentry/kernel/fd_table_test.go | 4 +- pkg/sentry/kernel/fd_table_unsafe.go | 98 ++++++++++-- pkg/sentry/kernel/kernel.go | 31 ++-- pkg/sentry/kernel/task.go | 9 ++ pkg/sentry/kernel/task_exec.go | 3 +- pkg/sentry/syscalls/linux/BUILD | 1 + pkg/sentry/syscalls/linux/error.go | 72 ++++++--- pkg/sentry/syscalls/linux/sys_file.go | 2 +- pkg/sentry/syscalls/linux/vfs2/BUILD | 24 +++ pkg/sentry/syscalls/linux/vfs2/linux64.go | 16 ++ .../syscalls/linux/vfs2/linux64_override_amd64.go | 25 ++++ .../syscalls/linux/vfs2/linux64_override_arm64.go | 25 ++++ pkg/sentry/syscalls/linux/vfs2/sys_read.go | 95 ++++++++++++ runsc/boot/BUILD | 1 + runsc/boot/config.go | 3 + runsc/boot/loader.go | 9 ++ 18 files changed, 496 insertions(+), 89 deletions(-) create mode 100644 pkg/sentry/syscalls/linux/vfs2/BUILD create mode 100644 pkg/sentry/syscalls/linux/vfs2/linux64.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/sys_read.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 0738946d9..a27628c0a 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -188,6 +188,7 @@ go_library( "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", + "//pkg/sentry/vfs", "//pkg/state", "//pkg/state/statefile", "//pkg/sync", diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 9460bb235..56b70ce96 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/vfs" 
"gvisor.dev/gvisor/pkg/sync" ) @@ -62,10 +63,14 @@ func (f FDFlags) ToLinuxFDFlags() (mask uint) { // Note that this is immutable and can only be changed via operations on the // descriptorTable. // +// It contains both VFS1 and VFS2 file types, but only one of them can be set. +// // +stateify savable type descriptor struct { - file *fs.File - flags FDFlags + // TODO(gvisor.dev/issue/1624): Remove fs.File. + file *fs.File + fileVFS2 *vfs.FileDescription + flags FDFlags } // FDTable is used to manage File references and flags. @@ -95,10 +100,11 @@ type FDTable struct { func (f *FDTable) saveDescriptorTable() map[int32]descriptor { m := make(map[int32]descriptor) - f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { m[fd] = descriptor{ - file: file, - flags: flags, + file: file, + fileVFS2: fileVFS2, + flags: flags, } }) return m @@ -107,13 +113,17 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor { func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { f.init() // Initialize table. for fd, d := range m { - f.set(fd, d.file, d.flags) - - // Note that we do _not_ need to acquire a extra table - // reference here. The table reference will already be - // accounted for in the file, so we drop the reference taken by - // set above. - d.file.DecRef() + f.setAll(fd, d.file, d.fileVFS2, d.flags) + + // Note that we do _not_ need to acquire a extra table reference here. The + // table reference will already be accounted for in the file, so we drop the + // reference taken by set above. + switch { + case d.file != nil: + d.file.DecRef() + case d.fileVFS2 != nil: + d.fileVFS2.DecRef() + } } } @@ -139,6 +149,15 @@ func (f *FDTable) drop(file *fs.File) { file.DecRef() } +// dropVFS2 drops the table reference. +func (f *FDTable) dropVFS2(file *vfs.FileDescription) { + // TODO(gvisor.dev/issue/1480): Release locks. 
+ // TODO(gvisor.dev/issue/1479): Send inotify events. + + // Drop the table reference. + file.DecRef() +} + // ID returns a unique identifier for this FDTable. func (f *FDTable) ID() uint64 { return f.uid @@ -156,7 +175,7 @@ func (k *Kernel) NewFDTable() *FDTable { // destroy removes all of the file descriptors from the map. func (f *FDTable) destroy() { - f.RemoveIf(func(*fs.File, FDFlags) bool { + f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool { return true }) } @@ -175,19 +194,26 @@ func (f *FDTable) Size() int { // forEach iterates over all non-nil files. // // It is the caller's responsibility to acquire an appropriate lock. -func (f *FDTable) forEach(fn func(fd int32, file *fs.File, flags FDFlags)) { +func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { fd := int32(0) for { - file, flags, ok := f.get(fd) + file, fileVFS2, flags, ok := f.getAll(fd) if !ok { break } - if file != nil { + switch { + case file != nil: if !file.TryIncRef() { continue // Race caught. } - fn(int32(fd), file, flags) + fn(fd, file, nil, flags) file.DecRef() + case fileVFS2 != nil: + if !fileVFS2.TryIncRef() { + continue // Race caught. + } + fn(fd, nil, fileVFS2, flags) + fileVFS2.DecRef() } fd++ } @@ -196,9 +222,21 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, flags FDFlags)) { // String is a stringer for FDTable. 
func (f *FDTable) String() string { var b bytes.Buffer - f.forEach(func(fd int32, file *fs.File, flags FDFlags) { - n, _ := file.Dirent.FullName(nil /* root */) - b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n)) + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + switch { + case file != nil: + n, _ := file.Dirent.FullName(nil /* root */) + b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n)) + + case fileVFS2 != nil: + fs := fileVFS2.VirtualDentry().Mount().Filesystem().VirtualFilesystem() + // TODO(gvisor.dev/issue/1623): We have no context nor root. Will this work? + name, err := fs.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) + if err != nil { + b.WriteString(fmt.Sprintf("<err: %v>\n", err)) + } + b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, name)) + } }) return b.String() } @@ -262,6 +300,17 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags // reference for that FD, the ref count for that existing reference is // decremented. func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error { + return f.newFDAt(ctx, fd, file, nil, flags) +} + +// NewFDAtVFS2 sets the file reference for the given FD. If there is an active +// reference for that FD, the ref count for that existing reference is +// decremented. +func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error { + return f.newFDAt(ctx, fd, nil, file, flags) +} + +func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error { if fd < 0 { // Don't accept negative FDs. return syscall.EBADF @@ -278,7 +327,7 @@ func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FD } // Install the entry. 
- f.set(fd, file, flags) + f.setAll(fd, file, fileVFS2, flags) return nil } @@ -330,10 +379,35 @@ func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) { } } +// GetVFS2 returns a reference to the file and the flags for the FD or nil if no +// file is defined for the given fd. +// +// N.B. Callers are required to use DecRef when they are done. +// +//go:nosplit +func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { + if fd < 0 { + return nil, FDFlags{} + } + + for { + file, flags, _ := f.getVFS2(fd) + if file != nil { + if !file.TryIncRef() { + continue // Race caught. + } + // Reference acquired. + return file, flags + } + // No file available. + return nil, FDFlags{} + } +} + // GetFDs returns a list of valid fds. func (f *FDTable) GetFDs() []int32 { fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) - f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { fds = append(fds, fd) }) return fds @@ -344,7 +418,19 @@ func (f *FDTable) GetFDs() []int32 { // they're done using the slice. func (f *FDTable) GetRefs() []*fs.File { files := make([]*fs.File, 0, f.Size()) - f.forEach(func(_ int32, file *fs.File, flags FDFlags) { + f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + file.IncRef() // Acquire a reference for caller. + files = append(files, file) + }) + return files +} + +// GetRefsVFS2 returns a stable slice of references to all files and bumps the +// reference count on each. The caller must use DecRef on each reference when +// they're done using the slice. +func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription { + files := make([]*vfs.FileDescription, 0, f.Size()) + f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) { file.IncRef() // Acquire a reference for caller. 
files = append(files, file) }) @@ -355,10 +441,15 @@ func (f *FDTable) GetRefs() []*fs.File { func (f *FDTable) Fork() *FDTable { clone := f.k.NewFDTable() - f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { // The set function here will acquire an appropriate table // reference for the clone. We don't need anything else. - clone.set(fd, file, flags) + switch { + case file != nil: + clone.set(fd, file, flags) + case fileVFS2 != nil: + clone.setVFS2(fd, fileVFS2, flags) + } }) return clone } @@ -366,9 +457,9 @@ func (f *FDTable) Fork() *FDTable { // Remove removes an FD from and returns a non-file iff successful. // // N.B. Callers are required to use DecRef when they are done. -func (f *FDTable) Remove(fd int32) *fs.File { +func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) { if fd < 0 { - return nil + return nil, nil } f.mu.Lock() @@ -379,21 +470,26 @@ func (f *FDTable) Remove(fd int32) *fs.File { f.next = fd } - orig, _, _ := f.get(fd) - if orig != nil { - orig.IncRef() // Reference for caller. - f.set(fd, nil, FDFlags{}) // Zap entry. + orig, orig2, _, _ := f.getAll(fd) + + // Add reference for caller. + switch { + case orig != nil: + orig.IncRef() + case orig2 != nil: + orig2.IncRef() } - return orig + f.setAll(fd, nil, nil, FDFlags{}) // Zap entry. + return orig, orig2 } // RemoveIf removes all FDs where cond is true. -func (f *FDTable) RemoveIf(cond func(*fs.File, FDFlags) bool) { +func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) { f.mu.Lock() defer f.mu.Unlock() - f.forEach(func(fd int32, file *fs.File, flags FDFlags) { - if cond(file, flags) { + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + if cond(file, fileVFS2, flags) { f.set(fd, nil, FDFlags{}) // Clear from table. // Update current available position. 
if fd < f.next { diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go index 261b815f2..29f95a2c4 100644 --- a/pkg/sentry/kernel/fd_table_test.go +++ b/pkg/sentry/kernel/fd_table_test.go @@ -150,13 +150,13 @@ func TestFDTable(t *testing.T) { t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref) } - ref := fdTable.Remove(1) + ref, _ := fdTable.Remove(1) if ref == nil { t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success") } ref.DecRef() - if ref := fdTable.Remove(1); ref != nil { + if ref, _ := fdTable.Remove(1); ref != nil { t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") } }) diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go index e9fdb0917..7fd97dc53 100644 --- a/pkg/sentry/kernel/fd_table_unsafe.go +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -19,6 +19,7 @@ import ( "unsafe" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/vfs" ) type descriptorTable struct { @@ -41,15 +42,38 @@ func (f *FDTable) init() { // //go:nosplit func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) { + file, _, flags, ok := f.getAll(fd) + return file, flags, ok +} + +// getVFS2 gets a file entry. +// +// The boolean indicates whether this was in range. +// +//go:nosplit +func (f *FDTable) getVFS2(fd int32) (*vfs.FileDescription, FDFlags, bool) { + _, file, flags, ok := f.getAll(fd) + return file, flags, ok +} + +// getAll gets a file entry. +// +// The boolean indicates whether this was in range. 
+// +//go:nosplit +func (f *FDTable) getAll(fd int32) (*fs.File, *vfs.FileDescription, FDFlags, bool) { slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) if fd >= int32(len(slice)) { - return nil, FDFlags{}, false + return nil, nil, FDFlags{}, false } d := (*descriptor)(atomic.LoadPointer(&slice[fd])) if d == nil { - return nil, FDFlags{}, true + return nil, nil, FDFlags{}, true } - return d.file, d.flags, true + if d.file != nil && d.fileVFS2 != nil { + panic("VFS1 and VFS2 files set") + } + return d.file, d.fileVFS2, d.flags, true } // set sets an entry. @@ -59,6 +83,30 @@ func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) { // // Precondition: mu must be held. func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) { + f.setAll(fd, file, nil, flags) +} + +// setVFS2 sets an entry. +// +// This handles accounting changes, as well as acquiring and releasing the +// reference needed by the table iff the file is different. +// +// Precondition: mu must be held. +func (f *FDTable) setVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) { + f.setAll(fd, nil, file, flags) +} + +// setAll sets an entry. +// +// This handles accounting changes, as well as acquiring and releasing the +// reference needed by the table iff the file is different. +// +// Precondition: mu must be held. +func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + if file != nil && fileVFS2 != nil { + panic("VFS1 and VFS2 files set") + } + slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) // Grow the table as required. @@ -71,33 +119,51 @@ func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) { atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) } - // Create the new element. 
- var d *descriptor - if file != nil { - d = &descriptor{ - file: file, - flags: flags, + var desc *descriptor + if file != nil || fileVFS2 != nil { + desc = &descriptor{ + file: file, + fileVFS2: fileVFS2, + flags: flags, } } // Update the single element. - orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(d))) + orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(desc))) // Acquire a table reference. - if file != nil && (orig == nil || file != orig.file) { - file.IncRef() + if desc != nil { + switch { + case desc.file != nil: + if orig == nil || desc.file != orig.file { + desc.file.IncRef() + } + case desc.fileVFS2 != nil: + if orig == nil || desc.fileVFS2 != orig.fileVFS2 { + desc.fileVFS2.IncRef() + } + } } // Drop the table reference. - if orig != nil && file != orig.file { - f.drop(orig.file) + if orig != nil { + switch { + case orig.file != nil: + if desc == nil || desc.file != orig.file { + f.drop(orig.file) + } + case orig.fileVFS2 != nil: + if desc == nil || desc.fileVFS2 != orig.fileVFS2 { + f.dropVFS2(orig.fileVFS2) + } + } } // Adjust used. switch { - case orig == nil && file != nil: + case orig == nil && desc != nil: atomic.AddInt32(&f.used, 1) - case orig != nil && file == nil: + case orig != nil && desc == nil: atomic.AddInt32(&f.used, -1) } } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 7b90fac5a..dcd6e91c4 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -65,6 +65,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/unimpl" uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/state" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" @@ -435,17 +436,17 @@ func (k *Kernel) flushMountSourceRefs() error { // There may be some open FDs whose filesystems have been unmounted. We // must flush those as well. 
- return k.tasks.forEachFDPaused(func(file *fs.File) error { + return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { file.Dirent.Inode.MountSource.FlushDirentRefs() return nil }) } -// forEachFDPaused applies the given function to each open file descriptor in each -// task. +// forEachFDPaused applies the given function to each open file descriptor in +// each task. // // Precondition: Must be called with the kernel paused. -func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) { +func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { @@ -453,8 +454,8 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) { if t.fdTable == nil { continue } - t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { - if lastErr := f(file); lastErr != nil && err == nil { + t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { + if lastErr := f(file, fileVFS2); lastErr != nil && err == nil { err = lastErr } }) @@ -463,7 +464,8 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) { } func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { - return ts.forEachFDPaused(func(file *fs.File) error { + // TODO(gvisor.dev/issues/1663): Add save support for VFS2. + return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil } @@ -474,12 +476,9 @@ func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) if err := fs.SaveFileFsyncError(syncErr); err != nil { name, _ := file.Dirent.FullName(nil /* root */) - // Wrap this error in ErrSaveRejection - // so that it will trigger a save - // error, rather than a panic. This - // also allows us to distinguish Fsync - // errors from state file errors in - // state.Save. 
+ // Wrap this error in ErrSaveRejection so that it will trigger a save + // error, rather than a panic. This also allows us to distinguish Fsync + // errors from state file errors in state.Save. return fs.ErrSaveRejection{ Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err), } @@ -519,7 +518,7 @@ func (ts *TaskSet) unregisterEpollWaiters() { for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { if e, ok := file.FileOperations.(*epoll.EventPoll); ok { e.UnregisterEpollWaiters() } @@ -921,7 +920,7 @@ func (k *Kernel) pauseTimeLocked() { // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.PauseTimer() } @@ -951,7 +950,7 @@ func (k *Kernel) resumeTimeLocked() { } } if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.ResumeTimer() } diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 95adf2778..981e8c7fe 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -35,6 +35,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -743,6 +744,14 @@ func (t *Task) GetFile(fd int32) *fs.File { return f } +// 
GetFileVFS2 is a convenience wrapper for t.FDTable().GetVFS2. +// +// Precondition: same as FDTable.Get. +func (t *Task) GetFileVFS2(fd int32) *vfs.FileDescription { + f, _ := t.fdTable.GetVFS2(fd) + return f +} + // NewFDs is a convenience wrapper for t.FDTable().NewFDs. // // This automatically passes the task as the context. diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index fa6528386..8f57a34a6 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -69,6 +69,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -198,7 +199,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.Unlock() // Remove FDs with the CloseOnExec flag set. - t.fdTable.RemoveIf(func(file *fs.File, flags FDFlags) bool { + t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool { return flags.CloseOnExec }) diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 8d6c52850..be16ee686 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -93,6 +93,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/syscalls", "//pkg/sentry/usage", + "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 60469549d..64de56ac5 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -31,20 +32,58 @@ var ( partialResultOnce sync.Once ) +// HandleIOErrorVFS2 handles special error cases for partial results. 
For some +// errors, we may consume the error and return only the partial read/write. +// +// op and f are used only for panics. +func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op string, f *vfs.FileDescription) error { + known, err := handleIOErrorImpl(t, partialResult, err, intr, op) + if err != nil { + return err + } + if !known { + // An unknown error is encountered with a partial read/write. + fs := f.Mount().Filesystem().VirtualFilesystem() + root := vfs.RootFromContext(t) + name, _ := fs.PathnameWithDeleted(t, root, f.VirtualDentry()) + log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q", partialResult, err, err, op, name) + partialResultOnce.Do(partialResultMetric.Increment) + } + return nil +} + // handleIOError handles special error cases for partial results. For some // errors, we may consume the error and return only the partial read/write. // // op and f are used only for panics. func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op string, f *fs.File) error { + known, err := handleIOErrorImpl(t, partialResult, err, intr, op) + if err != nil { + return err + } + if !known { + // An unknown error is encountered with a partial read/write. + name, _ := f.Dirent.FullName(nil /* ignore chroot */) + log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations) + partialResultOnce.Do(partialResultMetric.Increment) + } + return nil +} + +// handleIOError handles special error cases for partial results. For some +// errors, we may consume the error and return only the partial read/write. +// +// Returns false if error is unknown. +func handleIOErrorImpl(t *kernel.Task, partialResult bool, err, intr error, op string) (bool, error) { switch err { case nil: // Typical successful syscall. - return nil + return true, nil case io.EOF: // EOF is always consumed. 
If this is a partial read/write // (result != 0), the application will see that, otherwise // they will see 0. - return nil + return true, nil case syserror.ErrExceedsFileSizeLimit: // Ignore partialResult because this error only applies to // normal files, and for those files we cannot accumulate @@ -53,20 +92,20 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // Do not consume the error and return it as EFBIG. // Simultaneously send a SIGXFSZ per setrlimit(2). t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) - return syserror.EFBIG + return true, syserror.EFBIG case syserror.ErrInterrupted: // The syscall was interrupted. Return nil if it completed // partially, otherwise return the error code that the syscall // needs (to indicate to the kernel what it should do). if partialResult { - return nil + return true, nil } - return intr + return true, intr } if !partialResult { // Typical syscall error. - return err + return true, err } switch err { @@ -75,14 +114,14 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // read/write. Like ErrWouldBlock, since we have a // partial read/write, we consume the error and return // the partial result. - return nil + return true, nil case syserror.EFAULT: // EFAULT is only shown the user if nothing was // read/written. If we read something (this case), they see // a partial read/write. They will then presumably try again // with an incremented buffer, which will EFAULT with // result == 0. - return nil + return true, nil case syserror.EPIPE: // Writes to a pipe or socket will return EPIPE if the other // side is gone. The partial write is returned. EPIPE will be @@ -90,32 +129,29 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // // TODO(gvisor.dev/issue/161): In some cases SIGPIPE should // also be sent to the application. - return nil + return true, nil case syserror.ENOSPC: // Similar to EPIPE. 
Return what we wrote this time, and let // ENOSPC be returned on the next call. - return nil + return true, nil case syserror.ECONNRESET: // For TCP sendfile connections, we may have a reset. But we // should just return n as the result. - return nil + return true, nil case syserror.ErrWouldBlock: // Syscall would block, but completed a partial read/write. // This case should only be returned by IssueIO for nonblocking // files. Since we have a partial read/write, we consume // ErrWouldBlock, returning the partial result. - return nil + return true, nil } switch err.(type) { case kernel.SyscallRestartErrno: // Identical to the EINTR case. - return nil + return true, nil } - // An unknown error is encountered with a partial read/write. - name, _ := f.Dirent.FullName(nil /* ignore chroot */) - log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations) - partialResultOnce.Do(partialResultMetric.Increment) - return nil + // Error is unknown and cannot be properly handled. + return false, nil } diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index c54735148..421845ebb 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -767,7 +767,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Note that Remove provides a reference on the file that we may use to // flush. It is still active until we drop the final reference below // (and other reference-holding operations complete). 
- file := t.FDTable().Remove(fd) + file, _ := t.FDTable().Remove(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD new file mode 100644 index 000000000..6b8a00b6e --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -0,0 +1,24 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "vfs2", + srcs = [ + "linux64.go", + "linux64_override_amd64.go", + "linux64_override_arm64.go", + "sys_read.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/sentry/arch", + "//pkg/sentry/kernel", + "//pkg/sentry/syscalls", + "//pkg/sentry/syscalls/linux", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64.go b/pkg/sentry/syscalls/linux/vfs2/linux64.go new file mode 100644 index 000000000..19ee36081 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/linux64.go @@ -0,0 +1,16 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package vfs2 provides syscall implementations that use VFS2. 
+package vfs2 diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go new file mode 100644 index 000000000..c134714ee --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go @@ -0,0 +1,25 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls" +) + +// Override syscall table to add syscalls implementations from this package. +func Override(table map[uintptr]kernel.Syscall) { + table[0] = syscalls.Supported("read", Read) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go new file mode 100644 index 000000000..6af5c400f --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go @@ -0,0 +1,25 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls" +) + +// Override syscall table to add syscalls implementations from this package. +func Override(table map[uintptr]kernel.Syscall) { + table[63] = syscalls.Supported("read", Read) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_read.go b/pkg/sentry/syscalls/linux/vfs2/sys_read.go new file mode 100644 index 000000000..b9fb58464 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/sys_read.go @@ -0,0 +1,95 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// Read implements linux syscall read(2). Note that we try to get a buffer that +// is exactly the size requested because some applications like qemu expect +// they can do large reads all at once. Bug for bug. Same for other read +// calls below. 
+func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is readable. + if !file.IsReadable() { + return 0, nil, syserror.EBADF + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := read(t, file, dst, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, linux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file) +} + +func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + n, err := file.Read(t, dst, opts) + if err != syserror.ErrWouldBlock { + return n, err + } + + // Register for notifications. + _, ch := waiter.NewChannelEntry(nil) + // file.EventRegister(&w, EventMaskRead) + + total := n + for { + // Shorten dst to reflect bytes previously read. + dst = dst.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". 
+ n, err := file.Read(t, dst, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + if err := t.Block(ch); err != nil { + break + } + } + //file.EventUnregister(&w) + + return total, err +} diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index a96c80261..ae4dd102a 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -68,6 +68,7 @@ go_library( "//pkg/sentry/state", "//pkg/sentry/strace", "//pkg/sentry/syscalls/linux", + "//pkg/sentry/syscalls/linux/vfs2", "//pkg/sentry/time", "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", "//pkg/sentry/usage", diff --git a/runsc/boot/config.go b/runsc/boot/config.go index a878bc2ce..35391030f 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -256,6 +256,9 @@ type Config struct { // // E.g. 0.2 CPU quota will result in 1, and 1.9 in 2. CPUNumFromQuota bool + + // Enables VFS2 (not plumbled through yet). + VFS2 bool } // ToFlags returns a slice of flags that correspond to the given Config. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index fad72f4ab..9f0d5d7af 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -26,6 +26,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" @@ -42,6 +43,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/sighandling" + "gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2" "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/watchdog" @@ -184,6 +186,13 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("setting up memory usage: %v", err) } + if args.Conf.VFS2 { + st, ok := kernel.LookupSyscallTable(abi.Linux, arch.Host) + if ok { + vfs2.Override(st.Table) + } + } + // Create kernel and platform. 
p, err := createPlatform(args.Conf, args.Device) if err != nil { -- cgit v1.2.3 From 3d046fef06ece6ba20770fa62e0a21569226adaa Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 28 Jan 2020 16:42:05 -0800 Subject: Changes missing in last submit Updates #1487 Updates #1623 PiperOrigin-RevId: 292040835 --- pkg/sentry/kernel/fd_table.go | 18 +++++++++--------- pkg/sentry/syscalls/linux/sys_read.go | 2 +- pkg/sentry/syscalls/linux/vfs2/sys_read.go | 16 ++++++++-------- 3 files changed, 18 insertions(+), 18 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 56b70ce96..23b88f7a6 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -15,9 +15,9 @@ package kernel import ( - "bytes" "fmt" "math" + "strings" "sync/atomic" "syscall" @@ -221,24 +221,24 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes // String is a stringer for FDTable. func (f *FDTable) String() string { - var b bytes.Buffer + var buf strings.Builder f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { switch { case file != nil: n, _ := file.Dirent.FullName(nil /* root */) - b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n)) + fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, n) case fileVFS2 != nil: - fs := fileVFS2.VirtualDentry().Mount().Filesystem().VirtualFilesystem() - // TODO(gvisor.dev/issue/1623): We have no context nor root. Will this work? 
- name, err := fs.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) + vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem() + name, err := vfsObj.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) if err != nil { - b.WriteString(fmt.Sprintf("<err: %v>\n", err)) + fmt.Fprintf(&buf, "<err: %v>\n", err) + return } - b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, name)) + fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, name) } }) - return b.String() + return buf.String() } // NewFDs allocates new FDs guaranteed to be the lowest number available diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index f9f594190..227692f06 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -29,7 +29,7 @@ import ( ) const ( - // EventMaskRead contains events that can be triggerd on reads. + // EventMaskRead contains events that can be triggered on reads. EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr ) diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_read.go b/pkg/sentry/syscalls/linux/vfs2/sys_read.go index b9fb58464..7667524c7 100644 --- a/pkg/sentry/syscalls/linux/vfs2/sys_read.go +++ b/pkg/sentry/syscalls/linux/vfs2/sys_read.go @@ -24,6 +24,11 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +const ( + // EventMaskRead contains events that can be triggered on reads. + EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr +) + // Read implements linux syscall read(2). Note that we try to get a buffer that // is exactly the size requested because some applications like qemu expect // they can do large reads all at once. Bug for bug. Same for other read @@ -39,11 +44,6 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC } defer file.DecRef() - // Check that the file is readable. 
- if !file.IsReadable() { - return 0, nil, syserror.EBADF - } - // Check that the size is legitimate. si := int(size) if si < 0 { @@ -70,8 +70,8 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt } // Register for notifications. - _, ch := waiter.NewChannelEntry(nil) - // file.EventRegister(&w, EventMaskRead) + w, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&w, EventMaskRead) total := n for { @@ -89,7 +89,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt break } } - //file.EventUnregister(&w) + file.EventUnregister(&w) return total, err } -- cgit v1.2.3 From 1b6a12a768216a99a5e0428c42ea4faf79cf3b50 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 5 Feb 2020 22:45:44 -0800 Subject: Add notes to relevant tests. These were out-of-band notes that can help provide additional context and simplify automated imports. PiperOrigin-RevId: 293525915 --- pkg/metric/metric.go | 1 - pkg/sentry/arch/arch_x86.go | 4 ++ pkg/sentry/arch/signal_amd64.go | 2 +- pkg/sentry/fs/file_overlay_test.go | 1 + pkg/sentry/fs/proc/README.md | 4 ++ pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/kernel.go | 3 ++ pkg/sentry/kernel/kernel_opts.go | 20 +++++++ pkg/sentry/socket/hostinet/BUILD | 1 + pkg/sentry/socket/hostinet/socket.go | 5 +- pkg/sentry/socket/hostinet/sockopt_impl.go | 27 ++++++++++ pkg/tcpip/transport/tcp/endpoint.go | 3 ++ runsc/boot/filter/BUILD | 1 + runsc/boot/filter/config.go | 13 ----- runsc/boot/filter/config_profile.go | 34 ++++++++++++ runsc/container/console_test.go | 5 +- runsc/dockerutil/dockerutil.go | 11 ++-- runsc/testutil/BUILD | 5 +- runsc/testutil/testutil.go | 54 ------------------- runsc/testutil/testutil_runfiles.go | 75 +++++++++++++++++++++++++++ test/image/image_test.go | 8 +-- test/syscalls/build_defs.bzl | 35 +++++++++++-- test/syscalls/linux/chroot.cc | 2 +- test/syscalls/linux/concurrency.cc | 3 +- test/syscalls/linux/exec_proc_exe_workload.cc | 6 +++ 
test/syscalls/linux/fork.cc | 5 +- test/syscalls/linux/mmap.cc | 8 +-- test/syscalls/linux/open_create.cc | 1 + test/syscalls/linux/preadv.cc | 1 + test/syscalls/linux/proc.cc | 46 +++++++++++++--- test/syscalls/linux/readv.cc | 4 +- test/syscalls/linux/rseq.cc | 2 +- test/syscalls/linux/select.cc | 2 +- test/syscalls/linux/shm.cc | 2 +- test/syscalls/linux/sigprocmask.cc | 2 +- test/syscalls/linux/socket_unix_non_stream.cc | 4 +- test/syscalls/linux/symlink.cc | 2 +- test/syscalls/linux/tcp_socket.cc | 3 +- test/syscalls/linux/time.cc | 1 + test/syscalls/linux/tkill.cc | 2 +- test/util/temp_path.cc | 1 + tools/build/tags.bzl | 4 ++ tools/defs.bzl | 17 +++++- 43 files changed, 318 insertions(+), 113 deletions(-) create mode 100644 pkg/sentry/kernel/kernel_opts.go create mode 100644 pkg/sentry/socket/hostinet/sockopt_impl.go create mode 100644 runsc/boot/filter/config_profile.go create mode 100644 runsc/testutil/testutil_runfiles.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go index 93d4f2b8c..006fcd9ab 100644 --- a/pkg/metric/metric.go +++ b/pkg/metric/metric.go @@ -46,7 +46,6 @@ var ( // // TODO(b/67298402): Support non-cumulative metrics. // TODO(b/67298427): Support metric fields. -// type Uint64Metric struct { // value is the actual value of the metric. It must be accessed // atomically. diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index a18093155..3db8bd34b 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -114,6 +114,10 @@ func newX86FPStateSlice() []byte { size, align := cpuid.HostFeatureSet().ExtendedStateSize() capacity := size // Always use at least 4096 bytes. + // + // For the KVM platform, this state is a fixed 4096 bytes, so make sure + // that the underlying array is at _least_ that size otherwise we will + // corrupt random memory. This is not a pleasant thing to debug. 
if capacity < 4096 { capacity = 4096 } diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index 81b92bb43..6fb756f0e 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -55,7 +55,7 @@ type SignalContext64 struct { Trapno uint64 Oldmask linux.SignalSet Cr2 uint64 - // Pointer to a struct _fpstate. + // Pointer to a struct _fpstate. See b/33003106#comment8. Fpstate uint64 Reserved [8]uint64 } diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go index 02538bb4f..a76d87e3a 100644 --- a/pkg/sentry/fs/file_overlay_test.go +++ b/pkg/sentry/fs/file_overlay_test.go @@ -177,6 +177,7 @@ func TestReaddirRevalidation(t *testing.T) { // TestReaddirOverlayFrozen tests that calling Readdir on an overlay file with // a frozen dirent tree does not make Readdir calls to the underlying files. +// This is a regression test for b/114808269. func TestReaddirOverlayFrozen(t *testing.T) { ctx := contexttest.Context(t) diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md index 5d4ec6c7b..6667a0916 100644 --- a/pkg/sentry/fs/proc/README.md +++ b/pkg/sentry/fs/proc/README.md @@ -11,6 +11,8 @@ inconsistency, please file a bug. 
The following files are implemented: + + | File /proc/ | Content | | :------------------------ | :---------------------------------------------------- | | [cpuinfo](#cpuinfo) | Info about the CPU | @@ -22,6 +24,8 @@ The following files are implemented: | [uptime](#uptime) | Wall clock since boot, combined idle time of all cpus | | [version](#version) | Kernel version | + + ### cpuinfo ```bash diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index a27628c0a..2231d6973 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -91,6 +91,7 @@ go_library( "fs_context.go", "ipc_namespace.go", "kernel.go", + "kernel_opts.go", "kernel_state.go", "pending_signals.go", "pending_signals_list.go", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index dcd6e91c4..3ee760ba2 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -235,6 +235,9 @@ type Kernel struct { // events. This is initialized lazily on the first unimplemented // syscall. unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"` + + // SpecialOpts contains special kernel options. + SpecialOpts } // InitKernelArgs holds arguments to Init. diff --git a/pkg/sentry/kernel/kernel_opts.go b/pkg/sentry/kernel/kernel_opts.go new file mode 100644 index 000000000..2e66ec587 --- /dev/null +++ b/pkg/sentry/kernel/kernel_opts.go @@ -0,0 +1,20 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package kernel + +// SpecialOpts contains non-standard options for the kernel. +// +// +stateify savable +type SpecialOpts struct{} diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 5a07d5d0e..023bad156 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -10,6 +10,7 @@ go_library( "save_restore.go", "socket.go", "socket_unsafe.go", + "sockopt_impl.go", "stack.go", ], visibility = ["//pkg/sentry:internal"], diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 34f63986f..de76388ac 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -285,7 +285,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt } // Whitelist options and constrain option length. - var optlen int + optlen := getSockOptLen(t, level, name) switch level { case linux.SOL_IP: switch name { @@ -330,7 +330,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt // SetSockOpt implements socket.Socket.SetSockOpt. func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { // Whitelist options and constrain option length. - var optlen int + optlen := setSockOptLen(t, level, name) switch level { case linux.SOL_IP: switch name { @@ -353,6 +353,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [ optlen = sizeofInt32 } } + if optlen == 0 { // Pretend to accept socket options we don't understand. This seems // dangerous, but it's what netstack does... diff --git a/pkg/sentry/socket/hostinet/sockopt_impl.go b/pkg/sentry/socket/hostinet/sockopt_impl.go new file mode 100644 index 000000000..8a783712e --- /dev/null +++ b/pkg/sentry/socket/hostinet/sockopt_impl.go @@ -0,0 +1,27 @@ +// Copyright 2020 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel" +) + +func getSockOptLen(t *kernel.Task, level, name int) int { + return 0 // No custom options. +} + +func setSockOptLen(t *kernel.Task, level, name int) int { + return 0 // No custom options. +} diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index e4a6b1b8b..f2be0e651 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -2166,6 +2166,9 @@ func (e *endpoint) listen(backlog int) *tcpip.Error { e.isRegistered = true e.setEndpointState(StateListen) + // The channel may be non-nil when we're restoring the endpoint, and it + // may be pre-populated with some previously accepted (but not Accepted) + // endpoints. 
if e.acceptedChan == nil { e.acceptedChan = make(chan *endpoint, backlog) } diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index ce30f6c53..ed18f0047 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -8,6 +8,7 @@ go_library( "config.go", "config_amd64.go", "config_arm64.go", + "config_profile.go", "extra_filters.go", "extra_filters_msan.go", "extra_filters_race.go", diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index f8d351c7b..c69f4c602 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -536,16 +536,3 @@ func controlServerFilters(fd int) seccomp.SyscallRules { }, } } - -// profileFilters returns extra syscalls made by runtime/pprof package. -func profileFilters() seccomp.SyscallRules { - return seccomp.SyscallRules{ - syscall.SYS_OPENAT: []seccomp.Rule{ - { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), - }, - }, - } -} diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go new file mode 100644 index 000000000..194952a7b --- /dev/null +++ b/runsc/boot/filter/config_profile.go @@ -0,0 +1,34 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + +// profileFilters returns extra syscalls made by runtime/pprof package. 
+func profileFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_OPENAT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), + }, + }, + } +} diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 060b63bf3..c2518d52b 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -196,7 +196,10 @@ func TestJobControlSignalExec(t *testing.T) { defer ptyMaster.Close() defer ptySlave.Close() - // Exec bash and attach a terminal. + // Exec bash and attach a terminal. Note that occasionally /bin/sh + // may be a different shell or have a different configuration (such + // as disabling interactive mode and job control). Since we want to + // explicitly test interactive mode, use /bin/bash. See b/116981926. execArgs := &control.ExecArgs{ Filename: "/bin/bash", // Don't let bash execute from profile or rc files, otherwise diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go index 9b6346ca2..1ff5e8cc3 100644 --- a/runsc/dockerutil/dockerutil.go +++ b/runsc/dockerutil/dockerutil.go @@ -143,8 +143,11 @@ func PrepareFiles(names ...string) (string, error) { return "", fmt.Errorf("os.Chmod(%q, 0777) failed: %v", dir, err) } for _, name := range names { - src := getLocalPath(name) - dst := path.Join(dir, name) + src, err := testutil.FindFile(name) + if err != nil { + return "", fmt.Errorf("testutil.Preparefiles(%q) failed: %v", name, err) + } + dst := path.Join(dir, path.Base(name)) if err := testutil.Copy(src, dst); err != nil { return "", fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err) } @@ -152,10 +155,6 @@ func PrepareFiles(names ...string) (string, error) { return dir, nil } -func getLocalPath(file string) string { - return path.Join(".", file) -} - // do executes docker command. 
func do(args ...string) (string, error) { log.Printf("Running: docker %s\n", args) diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD index f845120b0..945405303 100644 --- a/runsc/testutil/BUILD +++ b/runsc/testutil/BUILD @@ -5,7 +5,10 @@ package(licenses = ["notice"]) go_library( name = "testutil", testonly = 1, - srcs = ["testutil.go"], + srcs = [ + "testutil.go", + "testutil_runfiles.go", + ], visibility = ["//:sandbox"], deps = [ "//pkg/log", diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go index edf2e809a..80c2c9680 100644 --- a/runsc/testutil/testutil.go +++ b/runsc/testutil/testutil.go @@ -79,60 +79,6 @@ func ConfigureExePath() error { return nil } -// FindFile searchs for a file inside the test run environment. It returns the -// full path to the file. It fails if none or more than one file is found. -func FindFile(path string) (string, error) { - wd, err := os.Getwd() - if err != nil { - return "", err - } - - // The test root is demarcated by a path element called "__main__". Search for - // it backwards from the working directory. - root := wd - for { - dir, name := filepath.Split(root) - if name == "__main__" { - break - } - if len(dir) == 0 { - return "", fmt.Errorf("directory __main__ not found in %q", wd) - } - // Remove ending slash to loop around. - root = dir[:len(dir)-1] - } - - // Annoyingly, bazel adds the build type to the directory path for go - // binaries, but not for c++ binaries. We use two different patterns to - // to find our file. - patterns := []string{ - // Try the obvious path first. - filepath.Join(root, path), - // If it was a go binary, use a wildcard to match the build - // type. The pattern is: /test-path/__main__/directories/*/file. - filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)), - } - - for _, p := range patterns { - matches, err := filepath.Glob(p) - if err != nil { - // "The only possible returned error is ErrBadPattern, - // when pattern is malformed." 
-godoc - return "", fmt.Errorf("error globbing %q: %v", p, err) - } - switch len(matches) { - case 0: - // Try the next pattern. - case 1: - // We found it. - return matches[0], nil - default: - return "", fmt.Errorf("more than one match found for %q: %s", path, matches) - } - } - return "", fmt.Errorf("file %q not found", path) -} - // TestConfig returns the default configuration to use in tests. Note that // 'RootDir' must be set by caller if required. func TestConfig() *boot.Config { diff --git a/runsc/testutil/testutil_runfiles.go b/runsc/testutil/testutil_runfiles.go new file mode 100644 index 000000000..ece9ea9a1 --- /dev/null +++ b/runsc/testutil/testutil_runfiles.go @@ -0,0 +1,75 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testutil + +import ( + "fmt" + "os" + "path/filepath" +) + +// FindFile searches for a file inside the test run environment. It returns the +// full path to the file. It fails if none or more than one file is found. +func FindFile(path string) (string, error) { + wd, err := os.Getwd() + if err != nil { + return "", err + } + + // The test root is demarcated by a path element called "__main__". Search for + // it backwards from the working directory. + root := wd + for { + dir, name := filepath.Split(root) + if name == "__main__" { + break + } + if len(dir) == 0 { + return "", fmt.Errorf("directory __main__ not found in %q", wd) + } + // Remove ending slash to loop around. 
+ root = dir[:len(dir)-1] + } + + // Annoyingly, bazel adds the build type to the directory path for go + // binaries, but not for c++ binaries. We use two different patterns to + // to find our file. + patterns := []string{ + // Try the obvious path first. + filepath.Join(root, path), + // If it was a go binary, use a wildcard to match the build + // type. The pattern is: /test-path/__main__/directories/*/file. + filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)), + } + + for _, p := range patterns { + matches, err := filepath.Glob(p) + if err != nil { + // "The only possible returned error is ErrBadPattern, + // when pattern is malformed." -godoc + return "", fmt.Errorf("error globbing %q: %v", p, err) + } + switch len(matches) { + case 0: + // Try the next pattern. + case 1: + // We found it. + return matches[0], nil + default: + return "", fmt.Errorf("more than one match found for %q: %s", path, matches) + } + } + return "", fmt.Errorf("file %q not found", path) +} diff --git a/test/image/image_test.go b/test/image/image_test.go index d0dcb1861..0a1e19d6f 100644 --- a/test/image/image_test.go +++ b/test/image/image_test.go @@ -107,7 +107,7 @@ func TestHttpd(t *testing.T) { } d := dockerutil.MakeDocker("http-test") - dir, err := dockerutil.PrepareFiles("latin10k.txt") + dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt") if err != nil { t.Fatalf("PrepareFiles() failed: %v", err) } @@ -139,7 +139,7 @@ func TestNginx(t *testing.T) { } d := dockerutil.MakeDocker("net-test") - dir, err := dockerutil.PrepareFiles("latin10k.txt") + dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt") if err != nil { t.Fatalf("PrepareFiles() failed: %v", err) } @@ -183,7 +183,7 @@ func TestMysql(t *testing.T) { } client := dockerutil.MakeDocker("mysql-client-test") - dir, err := dockerutil.PrepareFiles("mysql.sql") + dir, err := dockerutil.PrepareFiles("test/image/mysql.sql") if err != nil { t.Fatalf("PrepareFiles() failed: %v", err) } @@ -283,7 
+283,7 @@ func TestRuby(t *testing.T) { } d := dockerutil.MakeDocker("ruby-test") - dir, err := dockerutil.PrepareFiles("ruby.rb", "ruby.sh") + dir, err := dockerutil.PrepareFiles("test/image/ruby.rb", "test/image/ruby.sh") if err != nil { t.Fatalf("PrepareFiles() failed: %v", err) } diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl index 1df761dd0..cbab85ef7 100644 --- a/test/syscalls/build_defs.bzl +++ b/test/syscalls/build_defs.bzl @@ -2,8 +2,6 @@ load("//tools:defs.bzl", "loopback") -# syscall_test is a macro that will create targets to run the given test target -# on the host (native) and runsc. def syscall_test( test, shard_count = 5, @@ -13,6 +11,19 @@ def syscall_test( add_uds_tree = False, add_hostinet = False, tags = None): + """syscall_test is a macro that will create targets for all platforms. + + Args: + test: the test target. + shard_count: shards for defined tests. + size: the defined test size. + use_tmpfs: use tmpfs in the defined tests. + add_overlay: add an overlay test. + add_uds_tree: add a UDS test. + add_hostinet: add a hostinet test. + tags: starting test tags. + """ + _syscall_test( test = test, shard_count = shard_count, @@ -111,6 +122,19 @@ def _syscall_test( # all the tests on a specific flavor. Use --test_tag_filters=ptrace,file_shared. tags += [full_platform, "file_" + file_access] + # Hash this target into one of 15 buckets. This can be used to + # randomly split targets between different workflows. + hash15 = hash(native.package_name() + name) % 15 + tags.append("hash15:" + str(hash15)) + + # TODO(b/139838000): Tests using hostinet must be disabled on Guitar until + # we figure out how to request ipv4 sockets on Guitar machines. + if network == "host": + tags.append("noguitar") + + # Disable off-host networking. + tags.append("requires-net:loopback") + # Add tag to prevent the tests from running in a Bazel sandbox. # TODO(b/120560048): Make the tests run without this tag. 
tags.append("no-sandbox") @@ -118,8 +142,11 @@ def _syscall_test( # TODO(b/112165693): KVM tests are tagged "manual" to until the platform is # more stable. if platform == "kvm": - tags += ["manual"] - tags += ["requires-kvm"] + tags.append("manual") + tags.append("requires-kvm") + + # TODO(b/112165693): Remove when tests pass reliably. + tags.append("notap") args = [ # Arguments are passed directly to syscall_test_runner binary. diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc index 0a2d44a2c..85ec013d5 100644 --- a/test/syscalls/linux/chroot.cc +++ b/test/syscalls/linux/chroot.cc @@ -167,7 +167,7 @@ TEST(ChrootTest, DotDotFromOpenFD) { } // Test that link resolution in a chroot can escape the root by following an -// open proc fd. +// open proc fd. Regression test for b/32316719. TEST(ChrootTest, ProcFdLinkResolutionInChroot) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT))); diff --git a/test/syscalls/linux/concurrency.cc b/test/syscalls/linux/concurrency.cc index f41f99900..7cd6a75bd 100644 --- a/test/syscalls/linux/concurrency.cc +++ b/test/syscalls/linux/concurrency.cc @@ -46,7 +46,8 @@ TEST(ConcurrencyTest, SingleProcessMultithreaded) { } // Test that multiple threads in this process continue to execute in parallel, -// even if an unrelated second process is spawned. +// even if an unrelated second process is spawned. Regression test for +// b/32119508. TEST(ConcurrencyTest, MultiProcessMultithreaded) { // In PID 1, start TIDs 1 and 2, and put both to sleep. // diff --git a/test/syscalls/linux/exec_proc_exe_workload.cc b/test/syscalls/linux/exec_proc_exe_workload.cc index b790fe5be..2989379b7 100644 --- a/test/syscalls/linux/exec_proc_exe_workload.cc +++ b/test/syscalls/linux/exec_proc_exe_workload.cc @@ -21,6 +21,12 @@ #include "test/util/posix_error.h" int main(int argc, char** argv, char** envp) { + // This is annoying. 
Because remote build systems may put these binaries + // in a content-addressable-store, you may wind up with /proc/self/exe + // pointing to some random path (but with a sensible argv[0]). + // + // Therefore, this test simply checks that the /proc/self/exe + // is absolute and *doesn't* match argv[1]. std::string exe = gvisor::testing::ProcessExePath(getpid()).ValueOrDie(); if (exe[0] != '/') { diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc index 906f3358d..ff8bdfeb0 100644 --- a/test/syscalls/linux/fork.cc +++ b/test/syscalls/linux/fork.cc @@ -271,7 +271,7 @@ TEST_F(ForkTest, Alarm) { EXPECT_EQ(0, alarmed); } -// Child cannot affect parent private memory. +// Child cannot affect parent private memory. Regression test for b/24137240. TEST_F(ForkTest, PrivateMemory) { std::atomic local(0); @@ -298,6 +298,9 @@ TEST_F(ForkTest, PrivateMemory) { } // Kernel-accessed buffers should remain coherent across COW. +// +// The buffer must be >= usermem.ZeroCopyMinBytes, as UnsafeAccess operates +// differently. Regression test for b/33811887. TEST_F(ForkTest, COWSegment) { constexpr int kBufSize = 1024; char* read_buf = private_; diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc index 1c4d9f1c7..11fb1b457 100644 --- a/test/syscalls/linux/mmap.cc +++ b/test/syscalls/linux/mmap.cc @@ -1418,7 +1418,7 @@ TEST_P(MMapFileParamTest, NoSigBusOnPageContainingEOF) { // // On most platforms this is trivial, but when the file is mapped via the sentry // page cache (which does not yet support writing to shared mappings), a bug -// caused reads to fail unnecessarily on such mappings. +// caused reads to fail unnecessarily on such mappings. See b/28913513. 
TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) { uintptr_t addr; size_t len = strlen(kFileContents); @@ -1435,7 +1435,7 @@ TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) { // Tests that EFAULT is returned when invoking a syscall that requires the OS to // read past end of file (resulting in a fault in sentry context in the gVisor -// case). +// case). See b/28913513. TEST_F(MMapFileTest, InternalSigBus) { uintptr_t addr; ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, @@ -1578,7 +1578,7 @@ TEST_F(MMapFileTest, Bug38498194) { } // Tests that reading from a file to a memory mapping of the same file does not -// deadlock. +// deadlock. See b/34813270. TEST_F(MMapFileTest, SelfRead) { uintptr_t addr; ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, @@ -1590,7 +1590,7 @@ TEST_F(MMapFileTest, SelfRead) { } // Tests that writing to a file from a memory mapping of the same file does not -// deadlock. +// deadlock. Regression test for b/34813270. TEST_F(MMapFileTest, SelfWrite) { uintptr_t addr; ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0), diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc index 431733dbe..902d0a0dc 100644 --- a/test/syscalls/linux/open_create.cc +++ b/test/syscalls/linux/open_create.cc @@ -132,6 +132,7 @@ TEST(CreateTest, CreateFailsOnDirWithoutWritePerms) { } // A file originally created RW, but opened RO can later be opened RW. +// Regression test for b/65385065. TEST(CreateTest, OpenCreateROThenRW) { TempPath file(NewTempAbsPath()); diff --git a/test/syscalls/linux/preadv.cc b/test/syscalls/linux/preadv.cc index f7ea44054..5b0743fe9 100644 --- a/test/syscalls/linux/preadv.cc +++ b/test/syscalls/linux/preadv.cc @@ -37,6 +37,7 @@ namespace testing { namespace { +// Stress copy-on-write. Attempts to reproduce b/38430174. 
TEST(PreadvTest, MMConcurrencyStress) { // Fill a one-page file with zeroes (the contents don't really matter). const auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index 169b723eb..a23fdb58d 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -1352,13 +1352,19 @@ TEST(ProcPidSymlink, SubprocessZombied) { // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux // on proc files. - // 4.17 & gVisor: Syscall succeeds and returns 1 + // + // ~4.3: Syscall fails with EACCES. + // 4.17 & gVisor: Syscall succeeds and returns 1. + // // EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)), // SyscallFailsWithErrno(EACCES)); // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux // on proc files. - // 4.17 & gVisor: Syscall succeeds and returns 1. + // + // ~4.3: Syscall fails with EACCES. + // 4.17 & gVisor: Syscall succeeds and returns 1. + // // EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)), // SyscallFailsWithErrno(EACCES)); } @@ -1431,8 +1437,12 @@ TEST(ProcPidFile, SubprocessRunning) { TEST(ProcPidFile, SubprocessZombie) { char buf[1]; - // 4.17: Succeeds and returns 1 - // gVisor: Succeeds and returns 0 + // FIXME(gvisor.dev/issue/164): Loosen requirement due to inconsistent + // behavior on different kernels. + // + // ~4.3: Succeds and returns 0. + // 4.17: Succeeds and returns 1. + // gVisor: Succeeds and returns 0. EXPECT_THAT(ReadWhileZombied("auxv", buf, sizeof(buf)), SyscallSucceeds()); EXPECT_THAT(ReadWhileZombied("cmdline", buf, sizeof(buf)), @@ -1458,7 +1468,10 @@ TEST(ProcPidFile, SubprocessZombie) { // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux // on proc files. + // + // ~4.3: Fails and returns EACCES. // gVisor & 4.17: Succeeds and returns 1. 
+ // // EXPECT_THAT(ReadWhileZombied("io", buf, sizeof(buf)), // SyscallFailsWithErrno(EACCES)); } @@ -1467,9 +1480,12 @@ TEST(ProcPidFile, SubprocessZombie) { TEST(ProcPidFile, SubprocessExited) { char buf[1]; - // FIXME(gvisor.dev/issue/164): Inconsistent behavior between kernels + // FIXME(gvisor.dev/issue/164): Inconsistent behavior between kernels. + // + // ~4.3: Fails and returns ESRCH. // gVisor: Fails with ESRCH. // 4.17: Succeeds and returns 1. + // // EXPECT_THAT(ReadWhileExited("auxv", buf, sizeof(buf)), // SyscallFailsWithErrno(ESRCH)); @@ -1641,7 +1657,7 @@ TEST(ProcTask, KilledThreadsDisappear) { EXPECT_NO_ERRNO(DirContainsExactly("/proc/self/task", TaskFiles(initial, {child1.Tid()}))); - // Stat child1's task file. + // Stat child1's task file. Regression test for b/32097707. struct stat statbuf; const std::string child1_task_file = absl::StrCat("/proc/self/task/", child1.Tid()); @@ -1669,7 +1685,7 @@ TEST(ProcTask, KilledThreadsDisappear) { EXPECT_NO_ERRNO(EventuallyDirContainsExactly( "/proc/self/task", TaskFiles(initial, {child3.Tid(), child5.Tid()}))); - // Stat child1's task file again. This time it should fail. + // Stat child1's task file again. This time it should fail. See b/32097707. EXPECT_THAT(stat(child1_task_file.c_str(), &statbuf), SyscallFailsWithErrno(ENOENT)); @@ -1824,7 +1840,7 @@ TEST(ProcSysVmOvercommitMemory, HasNumericValue) { } // Check that link for proc fd entries point the target node, not the -// symlink itself. +// symlink itself. Regression test for b/31155070. TEST(ProcTaskFd, FstatatFollowsSymlink) { const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); const FileDescriptor fd = @@ -1883,6 +1899,20 @@ TEST(ProcMounts, IsSymlink) { EXPECT_EQ(link, "self/mounts"); } +TEST(ProcSelfMountinfo, RequiredFieldsArePresent) { + auto mountinfo = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/mountinfo")); + EXPECT_THAT( + mountinfo, + AllOf( + // Root mount. 
+ ContainsRegex( + R"([0-9]+ [0-9]+ [0-9]+:[0-9]+ / / (rw|ro).*- \S+ \S+ (rw|ro)\S*)"), + // Proc mount - always rw. + ContainsRegex( + R"([0-9]+ [0-9]+ [0-9]+:[0-9]+ / /proc rw.*- \S+ \S+ rw\S*)"))); +} + // Check that /proc/self/mounts looks something like a real mounts file. TEST(ProcSelfMounts, RequiredFieldsArePresent) { auto mounts = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/mounts")); diff --git a/test/syscalls/linux/readv.cc b/test/syscalls/linux/readv.cc index 4069cbc7e..baaf9f757 100644 --- a/test/syscalls/linux/readv.cc +++ b/test/syscalls/linux/readv.cc @@ -254,7 +254,9 @@ TEST_F(ReadvTest, IovecOutsideTaskAddressRangeInNonemptyArray) { // This test depends on the maximum extent of a single readv() syscall, so // we can't tolerate interruption from saving. TEST(ReadvTestNoFixture, TruncatedAtMax_NoRandomSave) { - // Ensure that we won't be interrupted by ITIMER_PROF. + // Ensure that we won't be interrupted by ITIMER_PROF. This is particularly + // important in environments where automated profiling tools may start + // ITIMER_PROF automatically. struct itimerval itv = {}; auto const cleanup_itimer = ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_PROF, itv)); diff --git a/test/syscalls/linux/rseq.cc b/test/syscalls/linux/rseq.cc index 106c045e3..4bfb1ff56 100644 --- a/test/syscalls/linux/rseq.cc +++ b/test/syscalls/linux/rseq.cc @@ -36,7 +36,7 @@ namespace { // We must be very careful about how these tests are written. Each thread may // only have one struct rseq registration, which may be done automatically at // thread start (as of 2019-11-13, glibc does *not* support rseq and thus does -// not do so). +// not do so, but other libraries do). // // Testing of rseq is thus done primarily in a child process with no // registration. 
This means exec'ing a nostdlib binary, as rseq registration can diff --git a/test/syscalls/linux/select.cc b/test/syscalls/linux/select.cc index 424e2a67f..be2364fb8 100644 --- a/test/syscalls/linux/select.cc +++ b/test/syscalls/linux/select.cc @@ -146,7 +146,7 @@ TEST_F(SelectTest, IgnoreBitsAboveNfds) { // This test illustrates Linux's behavior of 'select' calls passing after // setrlimit RLIMIT_NOFILE is called. In particular, versions of sshd rely on -// this behavior. +// this behavior. See b/122318458. TEST_F(SelectTest, SetrlimitCallNOFILE) { fd_set read_set; FD_ZERO(&read_set); diff --git a/test/syscalls/linux/shm.cc b/test/syscalls/linux/shm.cc index 7ba752599..c7fdbb924 100644 --- a/test/syscalls/linux/shm.cc +++ b/test/syscalls/linux/shm.cc @@ -473,7 +473,7 @@ TEST(ShmTest, PartialUnmap) { } // Check that sentry does not panic when asked for a zero-length private shm -// segment. +// segment. Regression test for b/110694797. TEST(ShmTest, GracefullyFailOnZeroLenSegmentCreation) { EXPECT_THAT(Shmget(IPC_PRIVATE, 0, 0), PosixErrorIs(EINVAL, _)); } diff --git a/test/syscalls/linux/sigprocmask.cc b/test/syscalls/linux/sigprocmask.cc index 654c6a47f..a603fc1d1 100644 --- a/test/syscalls/linux/sigprocmask.cc +++ b/test/syscalls/linux/sigprocmask.cc @@ -237,7 +237,7 @@ TEST_F(SigProcMaskTest, SignalHandler) { } // Check that sigprocmask correctly handles aliasing of the set and oldset -// pointers. +// pointers. Regression test for b/30502311. TEST_F(SigProcMaskTest, AliasedSets) { sigset_t mask; diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc index 276a94eb8..884319e1d 100644 --- a/test/syscalls/linux/socket_unix_non_stream.cc +++ b/test/syscalls/linux/socket_unix_non_stream.cc @@ -109,7 +109,7 @@ PosixErrorOr> CreateFragmentedRegion(const int size, } // A contiguous iov that is heavily fragmented in FileMem can still be sent -// successfully. +// successfully. See b/115833655. 
TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -165,7 +165,7 @@ TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) { } // A contiguous iov that is heavily fragmented in FileMem can still be received -// into successfully. +// into successfully. Regression test for b/115833655. TEST_P(UnixNonStreamSocketPairTest, FragmentedRecvMsg) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc index b249ff91f..03ee1250d 100644 --- a/test/syscalls/linux/symlink.cc +++ b/test/syscalls/linux/symlink.cc @@ -38,7 +38,7 @@ mode_t FilePermission(const std::string& path) { } // Test that name collisions are checked on the new link path, not the source -// path. +// path. Regression test for b/31782115. TEST(SymlinkTest, CanCreateSymlinkWithCachedSourceDirent) { const std::string srcname = NewTempAbsPath(); const std::string newname = NewTempAbsPath(); diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc index 8a8b68e75..c4591a3b9 100644 --- a/test/syscalls/linux/tcp_socket.cc +++ b/test/syscalls/linux/tcp_socket.cc @@ -244,7 +244,8 @@ TEST_P(TcpSocketTest, ZeroWriteAllowed) { } // Test that a non-blocking write with a buffer that is larger than the send -// buffer size will not actually write the whole thing at once. +// buffer size will not actually write the whole thing at once. Regression test +// for b/64438887. TEST_P(TcpSocketTest, NonblockingLargeWrite) { // Set the FD to O_NONBLOCK. int opts; diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc index c7eead17e..1ccb95733 100644 --- a/test/syscalls/linux/time.cc +++ b/test/syscalls/linux/time.cc @@ -62,6 +62,7 @@ TEST(TimeTest, VsyscallTime_InvalidAddressSIGSEGV) { ::testing::KilledBySignal(SIGSEGV), ""); } +// Mimics the gettimeofday(2) wrapper from the Go runtime <= 1.2. 
int vsyscall_gettimeofday(struct timeval* tv, struct timezone* tz) { constexpr uint64_t kVsyscallGettimeofdayEntry = 0xffffffffff600000; return reinterpret_cast( diff --git a/test/syscalls/linux/tkill.cc b/test/syscalls/linux/tkill.cc index bae377c69..8d8ebbb24 100644 --- a/test/syscalls/linux/tkill.cc +++ b/test/syscalls/linux/tkill.cc @@ -54,7 +54,7 @@ void SigHandler(int sig, siginfo_t* info, void* context) { TEST_CHECK(info->si_code == SI_TKILL); } -// Test with a real signal. +// Test with a real signal. Regression test for b/24790092. TEST(TkillTest, ValidTIDAndRealSignal) { struct sigaction sa; sa.sa_sigaction = SigHandler; diff --git a/test/util/temp_path.cc b/test/util/temp_path.cc index 35aacb172..9c10b6674 100644 --- a/test/util/temp_path.cc +++ b/test/util/temp_path.cc @@ -77,6 +77,7 @@ std::string NewTempAbsPath() { std::string NewTempRelPath() { return NextTempBasename(); } std::string GetAbsoluteTestTmpdir() { + // Note that TEST_TMPDIR is guaranteed to be set. char* env_tmpdir = getenv("TEST_TMPDIR"); std::string tmp_dir = env_tmpdir != nullptr ? 
std::string(env_tmpdir) : "/tmp"; diff --git a/tools/build/tags.bzl b/tools/build/tags.bzl index e99c87f81..a6db44e47 100644 --- a/tools/build/tags.bzl +++ b/tools/build/tags.bzl @@ -33,4 +33,8 @@ go_suffixes = [ "_wasm_unsafe", "_linux", "_linux_unsafe", + "_opts", + "_opts_unsafe", + "_impl", + "_impl_unsafe", ] diff --git a/tools/defs.bzl b/tools/defs.bzl index 5d5fa134a..c03b557ae 100644 --- a/tools/defs.bzl +++ b/tools/defs.bzl @@ -73,6 +73,16 @@ def calculate_sets(srcs): result[target].append(file) return result +def go_imports(name, src, out): + """Simplify a single Go source file by eliminating unused imports.""" + native.genrule( + name = name, + srcs = [src], + outs = [out], + tools = ["@org_golang_x_tools//cmd/goimports:goimports"], + cmd = ("$(location @org_golang_x_tools//cmd/goimports:goimports) $(SRCS) > $@"), + ) + def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, **kwargs): """Wraps the standard go_library and does stateification and marshalling. @@ -107,10 +117,15 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F state_sets = calculate_sets(srcs) for (suffix, srcs) in state_sets.items(): go_stateify( - name = name + suffix + "_state_autogen", + name = name + suffix + "_state_autogen_with_imports", srcs = srcs, imports = imports, package = name, + out = name + suffix + "_state_autogen_with_imports.go", + ) + go_imports( + name = name + suffix + "_state_autogen", + src = name + suffix + "_state_autogen_with_imports.go", out = name + suffix + "_state_autogen.go", ) all_srcs = all_srcs + [ -- cgit v1.2.3 From 4075de11be44372c454aae7f9650cdc814c52229 Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Fri, 14 Feb 2020 11:11:55 -0800 Subject: Plumb VFS2 inside the Sentry - Added fsbridge package with interface that can be used to open and read from VFS1 and VFS2 files. 
- Converted ELF loader to use fsbridge - Added VFS2 types to FSContext - Added vfs.MountNamespace to ThreadGroup Updates #1623 PiperOrigin-RevId: 295183950 --- pkg/sentry/control/BUILD | 5 + pkg/sentry/control/proc.go | 127 +++++++++++++-- pkg/sentry/fs/proc/BUILD | 1 + pkg/sentry/fs/proc/task.go | 17 +- pkg/sentry/fsbridge/BUILD | 24 +++ pkg/sentry/fsbridge/bridge.go | 54 ++++++ pkg/sentry/fsbridge/fs.go | 181 +++++++++++++++++++++ pkg/sentry/fsbridge/vfs.go | 134 +++++++++++++++ pkg/sentry/fsimpl/devtmpfs/devtmpfs.go | 4 + pkg/sentry/fsimpl/gofer/filesystem.go | 5 +- pkg/sentry/fsimpl/gofer/gofer.go | 3 + pkg/sentry/fsimpl/kernfs/filesystem.go | 10 +- pkg/sentry/fsimpl/proc/BUILD | 1 + pkg/sentry/fsimpl/proc/filesystem.go | 18 +- pkg/sentry/fsimpl/proc/tasks_test.go | 17 +- pkg/sentry/fsimpl/sys/BUILD | 1 + pkg/sentry/fsimpl/sys/sys.go | 3 + pkg/sentry/fsimpl/sys/sys_test.go | 7 +- pkg/sentry/fsimpl/testutil/BUILD | 2 +- pkg/sentry/fsimpl/testutil/kernel.go | 24 +-- pkg/sentry/fsimpl/testutil/testutil.go | 12 +- pkg/sentry/fsimpl/tmpfs/filesystem.go | 12 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 3 + pkg/sentry/kernel/BUILD | 2 + pkg/sentry/kernel/fs_context.go | 98 +++++++++-- pkg/sentry/kernel/kernel.go | 145 +++++++++++++---- pkg/sentry/kernel/task.go | 27 +++ pkg/sentry/kernel/task_clone.go | 11 +- pkg/sentry/kernel/task_context.go | 2 +- pkg/sentry/kernel/task_exit.go | 7 + pkg/sentry/kernel/task_log.go | 15 +- pkg/sentry/kernel/task_start.go | 49 +++--- pkg/sentry/kernel/thread_group.go | 6 +- pkg/sentry/loader/BUILD | 2 + pkg/sentry/loader/elf.go | 28 ++-- pkg/sentry/loader/interpreter.go | 6 +- pkg/sentry/loader/loader.go | 179 ++++++-------------- pkg/sentry/loader/vdso.go | 7 +- pkg/sentry/mm/BUILD | 2 +- pkg/sentry/mm/metadata.go | 10 +- pkg/sentry/mm/mm.go | 4 +- pkg/sentry/strace/strace.go | 28 ++++ pkg/sentry/syscalls/linux/BUILD | 1 + pkg/sentry/syscalls/linux/sys_prctl.go | 3 +- pkg/sentry/syscalls/linux/sys_thread.go | 17 +- 
.../syscalls/linux/vfs2/linux64_override_amd64.go | 106 ++++++++++++ pkg/sentry/vfs/BUILD | 1 + pkg/sentry/vfs/context.go | 7 +- pkg/sentry/vfs/mount.go | 10 +- pkg/sentry/vfs/options.go | 2 +- pkg/sentry/vfs/vfs.go | 5 +- runsc/boot/loader.go | 11 +- 52 files changed, 1134 insertions(+), 322 deletions(-) create mode 100644 pkg/sentry/fsbridge/BUILD create mode 100644 pkg/sentry/fsbridge/bridge.go create mode 100644 pkg/sentry/fsbridge/fs.go create mode 100644 pkg/sentry/fsbridge/vfs.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index e69496477..d16d78aa5 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -16,10 +16,13 @@ go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fd", + "//pkg/fspath", "//pkg/log", "//pkg/sentry/fs", "//pkg/sentry/fs/host", + "//pkg/sentry/fsbridge", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", @@ -27,8 +30,10 @@ go_library( "//pkg/sentry/state", "//pkg/sentry/strace", "//pkg/sentry/usage", + "//pkg/sentry/vfs", "//pkg/sentry/watchdog", "//pkg/sync", + "//pkg/syserror", "//pkg/tcpip/link/sniffer", "//pkg/urpc", ], diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index ced51c66c..8973754c8 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -18,19 +18,26 @@ import ( "bytes" "encoding/json" "fmt" + "path" "sort" "strings" "text/tabwriter" "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" 
"gvisor.dev/gvisor/pkg/urpc" ) @@ -60,6 +67,12 @@ type ExecArgs struct { // process's MountNamespace. MountNamespace *fs.MountNamespace + // MountNamespaceVFS2 is the mount namespace to execute the new process in. + // A reference on MountNamespace must be held for the lifetime of the + // ExecArgs. If MountNamespace is nil, it will default to the init + // process's MountNamespace. + MountNamespaceVFS2 *vfs.MountNamespace + // WorkingDirectory defines the working directory for the new process. WorkingDirectory string `json:"wd"` @@ -150,6 +163,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI Envv: args.Envv, WorkingDirectory: args.WorkingDirectory, MountNamespace: args.MountNamespace, + MountNamespaceVFS2: args.MountNamespaceVFS2, Credentials: creds, FDTable: fdTable, Umask: 0022, @@ -166,24 +180,53 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI // be donated to the new process in CreateProcess. initArgs.MountNamespace.IncRef() } + if initArgs.MountNamespaceVFS2 != nil { + // initArgs must hold a reference on MountNamespaceVFS2, which will + // be donated to the new process in CreateProcess. + initArgs.MountNamespaceVFS2.IncRef() + } ctx := initArgs.NewContext(proc.Kernel) if initArgs.Filename == "" { - // Get the full path to the filename from the PATH env variable. - paths := fs.GetPath(initArgs.Envv) - mns := initArgs.MountNamespace - if mns == nil { - mns = proc.Kernel.GlobalInit().Leader().MountNamespace() - } - f, err := mns.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) - if err != nil { - return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + if kernel.VFS2Enabled { + // Get the full path to the filename from the PATH env variable. + if initArgs.MountNamespaceVFS2 == nil { + // Set initArgs so that 'ctx' returns the namespace. 
+ // + // MountNamespaceVFS2 adds a reference to the namespace, which is + // transferred to the new process. + initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2() + } + + paths := fs.GetPath(initArgs.Envv) + vfsObj := proc.Kernel.VFS + file, err := ResolveExecutablePath(ctx, vfsObj, initArgs.WorkingDirectory, initArgs.Argv[0], paths) + if err != nil { + return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + } + initArgs.File = fsbridge.NewVFSFile(file) + } else { + // Get the full path to the filename from the PATH env variable. + paths := fs.GetPath(initArgs.Envv) + if initArgs.MountNamespace == nil { + // Set initArgs so that 'ctx' returns the namespace. + initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace() + + // initArgs must hold a reference on MountNamespace, which will + // be donated to the new process in CreateProcess. + initArgs.MountNamespaceVFS2.IncRef() + } + f, err := initArgs.MountNamespace.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) + if err != nil { + return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + } + initArgs.Filename = f } - initArgs.Filename = f } mounter := fs.FileOwnerFromContext(ctx) + // TODO(gvisor.dev/issue/1623): Use host FD when supported in VFS2. var ttyFile *fs.File for appFD, hostFile := range args.FilePayload.Files { var appFile *fs.File @@ -411,3 +454,67 @@ func ttyName(tty *kernel.TTY) string { } return fmt.Sprintf("pts/%d", tty.Index) } + +// ResolveExecutablePath resolves the given executable name given a set of +// paths that might contain it. 
+func ResolveExecutablePath(ctx context.Context, vfsObj *vfs.VirtualFilesystem, wd, name string, paths []string) (*vfs.FileDescription, error) { + root := vfs.RootFromContext(ctx) + defer root.DecRef() + creds := auth.CredentialsFromContext(ctx) + + // Absolute paths can be used directly. + if path.IsAbs(name) { + return openExecutable(ctx, vfsObj, creds, root, name) + } + + // Paths with '/' in them should be joined to the working directory, or + // to the root if working directory is not set. + if strings.IndexByte(name, '/') > 0 { + if len(wd) == 0 { + wd = "/" + } + if !path.IsAbs(wd) { + return nil, fmt.Errorf("working directory %q must be absolute", wd) + } + return openExecutable(ctx, vfsObj, creds, root, path.Join(wd, name)) + } + + // Otherwise, we must lookup the name in the paths, starting from the + // calling context's root directory. + for _, p := range paths { + if !path.IsAbs(p) { + // Relative paths aren't safe, no one should be using them. + log.Warningf("Skipping relative path %q in $PATH", p) + continue + } + + binPath := path.Join(p, name) + f, err := openExecutable(ctx, vfsObj, creds, root, binPath) + if err != nil { + return nil, err + } + if f == nil { + continue // Not found/no access. + } + return f, nil + } + return nil, syserror.ENOENT +} + +func openExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry, path string) (*vfs.FileDescription, error) { + pop := vfs.PathOperation{ + Root: root, + Start: root, // binPath is absolute, Start can be anything. 
+ Path: fspath.Parse(path), + FollowFinalSymlink: true, + } + opts := &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + FileExec: true, + } + f, err := vfsObj.OpenAt(ctx, creds, &pop, opts) + if err == syserror.ENOENT || err == syserror.EACCES { + return nil, nil + } + return f, err +} diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 280093c5e..77c2c5c0e 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -36,6 +36,7 @@ go_library( "//pkg/sentry/fs/proc/device", "//pkg/sentry/fs/proc/seqfile", "//pkg/sentry/fs/ramfs", + "//pkg/sentry/fsbridge", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index ca020e11e..8ab8d8a02 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -28,6 +28,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/mm" @@ -249,7 +250,7 @@ func newExe(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { return newProcInode(t, exeSymlink, msrc, fs.Symlink, t) } -func (e *exe) executable() (d *fs.Dirent, err error) { +func (e *exe) executable() (file fsbridge.File, err error) { e.t.WithMuLocked(func(t *kernel.Task) { mm := t.MemoryManager() if mm == nil { @@ -262,8 +263,8 @@ func (e *exe) executable() (d *fs.Dirent, err error) { // The MemoryManager may be destroyed, in which case // MemoryManager.destroy will simply set the executable to nil // (with locks held). 
- d = mm.Executable() - if d == nil { + file = mm.Executable() + if file == nil { err = syserror.ENOENT } }) @@ -283,15 +284,7 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { } defer exec.DecRef() - root := fs.RootFromContext(ctx) - if root == nil { - // This doesn't correspond to anything in Linux because the vfs is - // global there. - return "", syserror.EINVAL - } - defer root.DecRef() - n, _ := exec.FullName(root) - return n, nil + return exec.PathnameWithDeleted(ctx), nil } // namespaceSymlink represents a symlink in the namespacefs, such as the files diff --git a/pkg/sentry/fsbridge/BUILD b/pkg/sentry/fsbridge/BUILD new file mode 100644 index 000000000..6c798f0bd --- /dev/null +++ b/pkg/sentry/fsbridge/BUILD @@ -0,0 +1,24 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "fsbridge", + srcs = [ + "bridge.go", + "fs.go", + "vfs.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsbridge/bridge.go b/pkg/sentry/fsbridge/bridge.go new file mode 100644 index 000000000..8e7590721 --- /dev/null +++ b/pkg/sentry/fsbridge/bridge.go @@ -0,0 +1,54 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Package fsbridge provides common interfaces to bridge between VFS1 and VFS2 +// files. +package fsbridge + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +// File provides a common interface to bridge between VFS1 and VFS2 files. +type File interface { + // PathnameWithDeleted returns an absolute pathname to vd, consistent with + // Linux's d_path(). In particular, if vd.Dentry() has been disowned, + // PathnameWithDeleted appends " (deleted)" to the returned pathname. + PathnameWithDeleted(ctx context.Context) string + + // ReadFull reads from the file at offset until dst is full. + ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) + + // ConfigureMMap mutates opts to implement mmap(2) for the file. + ConfigureMMap(context.Context, *memmap.MMapOpts) error + + // Type returns the file type, e.g. linux.S_IFREG. + Type(context.Context) (linux.FileMode, error) + + // IncRef increments reference. + IncRef() + + // DecRef decrements reference. + DecRef() +} + +// Lookup provides a common interface to open files. +type Lookup interface { + // OpenPath opens a file. + OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, remainingTraversals *uint, resolveFinal bool) (File, error) +} diff --git a/pkg/sentry/fsbridge/fs.go b/pkg/sentry/fsbridge/fs.go new file mode 100644 index 000000000..093ce1fb3 --- /dev/null +++ b/pkg/sentry/fsbridge/fs.go @@ -0,0 +1,181 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsbridge + +import ( + "io" + "strings" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// fsFile implements File interface over fs.File. +// +// +stateify savable +type fsFile struct { + file *fs.File +} + +var _ File = (*fsFile)(nil) + +// NewFSFile creates a new File over fs.File. +func NewFSFile(file *fs.File) File { + return &fsFile{file: file} +} + +// PathnameWithDeleted implements File. +func (f *fsFile) PathnameWithDeleted(ctx context.Context) string { + root := fs.RootFromContext(ctx) + if root == nil { + // This doesn't correspond to anything in Linux because the vfs is + // global there. + return "" + } + defer root.DecRef() + + name, _ := f.file.Dirent.FullName(root) + return name +} + +// ReadFull implements File. +func (f *fsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + var total int64 + for dst.NumBytes() > 0 { + n, err := f.file.Preadv(ctx, dst, offset+total) + total += n + if err == io.EOF && total != 0 { + return total, io.ErrUnexpectedEOF + } else if err != nil { + return total, err + } + dst = dst.DropFirst64(n) + } + return total, nil +} + +// ConfigureMMap implements File. +func (f *fsFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return f.file.ConfigureMMap(ctx, opts) +} + +// Type implements File. 
+func (f *fsFile) Type(context.Context) (linux.FileMode, error) { + return linux.FileMode(f.file.Dirent.Inode.StableAttr.Type.LinuxType()), nil +} + +// IncRef implements File. +func (f *fsFile) IncRef() { + f.file.IncRef() +} + +// DecRef implements File. +func (f *fsFile) DecRef() { + f.file.DecRef() +} + +// fsLookup implements Lookup interface using fs.File. +// +// +stateify savable +type fsLookup struct { + mntns *fs.MountNamespace + + root *fs.Dirent + workingDir *fs.Dirent +} + +var _ Lookup = (*fsLookup)(nil) + +// NewFSLookup creates a new Lookup using VFS1. +func NewFSLookup(mntns *fs.MountNamespace, root, workingDir *fs.Dirent) Lookup { + return &fsLookup{ + mntns: mntns, + root: root, + workingDir: workingDir, + } +} + +// OpenPath implements Lookup. +func (l *fsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, remainingTraversals *uint, resolveFinal bool) (File, error) { + var d *fs.Dirent + var err error + if resolveFinal { + d, err = l.mntns.FindInode(ctx, l.root, l.workingDir, path, remainingTraversals) + } else { + d, err = l.mntns.FindLink(ctx, l.root, l.workingDir, path, remainingTraversals) + } + if err != nil { + return nil, err + } + defer d.DecRef() + + if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) { + return nil, syserror.ELOOP + } + + fsPerm := openOptionsToPermMask(&opts) + if err := d.Inode.CheckPermission(ctx, fsPerm); err != nil { + return nil, err + } + + // If they claim it's a directory, then make sure. 
+ if strings.HasSuffix(path, "/") { + if d.Inode.StableAttr.Type != fs.Directory { + return nil, syserror.ENOTDIR + } + } + + if opts.FileExec && d.Inode.StableAttr.Type != fs.RegularFile { + ctx.Infof("%q is not a regular file: %v", path, d.Inode.StableAttr.Type) + return nil, syserror.EACCES + } + + f, err := d.Inode.GetFile(ctx, d, flagsToFileFlags(opts.Flags)) + if err != nil { + return nil, err + } + + return &fsFile{file: f}, nil +} + +func openOptionsToPermMask(opts *vfs.OpenOptions) fs.PermMask { + mode := opts.Flags & linux.O_ACCMODE + return fs.PermMask{ + Read: mode == linux.O_RDONLY || mode == linux.O_RDWR, + Write: mode == linux.O_WRONLY || mode == linux.O_RDWR, + Execute: opts.FileExec, + } +} + +func flagsToFileFlags(flags uint32) fs.FileFlags { + return fs.FileFlags{ + Direct: flags&linux.O_DIRECT != 0, + DSync: flags&(linux.O_DSYNC|linux.O_SYNC) != 0, + Sync: flags&linux.O_SYNC != 0, + NonBlocking: flags&linux.O_NONBLOCK != 0, + Read: (flags & linux.O_ACCMODE) != linux.O_WRONLY, + Write: (flags & linux.O_ACCMODE) != linux.O_RDONLY, + Append: flags&linux.O_APPEND != 0, + Directory: flags&linux.O_DIRECTORY != 0, + Async: flags&linux.O_ASYNC != 0, + LargeFile: flags&linux.O_LARGEFILE != 0, + Truncate: flags&linux.O_TRUNC != 0, + } +} diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go new file mode 100644 index 000000000..e657c39bc --- /dev/null +++ b/pkg/sentry/fsbridge/vfs.go @@ -0,0 +1,134 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package fsbridge + +import ( + "io" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +// vfsFile implements File interface over vfs.FileDescription. +// +// +stateify savable +type vfsFile struct { + file *vfs.FileDescription +} + +var _ File = (*vfsFile)(nil) + +// NewVFSFile creates a new File over vfs.FileDescription. +func NewVFSFile(file *vfs.FileDescription) File { + return &vfsFile{file: file} +} + +// PathnameWithDeleted implements File. +func (f *vfsFile) PathnameWithDeleted(ctx context.Context) string { + root := vfs.RootFromContext(ctx) + defer root.DecRef() + + vfsObj := f.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem() + name, _ := vfsObj.PathnameWithDeleted(ctx, root, f.file.VirtualDentry()) + return name +} + +// ReadFull implements File. +func (f *vfsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + var total int64 + for dst.NumBytes() > 0 { + n, err := f.file.PRead(ctx, dst, offset+total, vfs.ReadOptions{}) + total += n + if err == io.EOF && total != 0 { + return total, io.ErrUnexpectedEOF + } else if err != nil { + return total, err + } + dst = dst.DropFirst64(n) + } + return total, nil +} + +// ConfigureMMap implements File. +func (f *vfsFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return f.file.ConfigureMMap(ctx, opts) +} + +// Type implements File. +func (f *vfsFile) Type(ctx context.Context) (linux.FileMode, error) { + stat, err := f.file.Stat(ctx, vfs.StatOptions{}) + if err != nil { + return 0, err + } + return linux.FileMode(stat.Mode).FileType(), nil +} + +// IncRef implements File. 
+func (f *vfsFile) IncRef() { + f.file.IncRef() +} + +// DecRef implements File. +func (f *vfsFile) DecRef() { + f.file.DecRef() +} + +// vfsLookup implements Lookup interface using VFS2. +// +// +stateify savable +type vfsLookup struct { + mntns *vfs.MountNamespace + + root vfs.VirtualDentry + workingDir vfs.VirtualDentry +} + +var _ Lookup = (*vfsLookup)(nil) + +// NewVFSLookup creates a new Lookup using VFS2. +func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry) Lookup { + return &vfsLookup{ + mntns: mntns, + root: root, + workingDir: workingDir, + } +} + +// OpenPath implements Lookup. +// +// remainingTraversals is not configurable in VFS2, all callers are using the +// default anyways. +// +// TODO(gvisor.dev/issue/1623): Check mount has read and exec permission. +func (l *vfsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) { + vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem() + creds := auth.CredentialsFromContext(ctx) + pop := &vfs.PathOperation{ + Root: l.root, + Start: l.root, + Path: fspath.Parse(path), + FollowFinalSymlink: resolveFinal, + } + fd, err := vfsObj.OpenAt(ctx, creds, pop, &opts) + if err != nil { + return nil, err + } + return &vfsFile{file: fd}, nil +} diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go index e03a0c665..abd4f24e7 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -28,6 +28,9 @@ import ( "gvisor.dev/gvisor/pkg/sync" ) +// Name is the default filesystem name. +const Name = "devtmpfs" + // FilesystemType implements vfs.FilesystemType. 
type FilesystemType struct { initOnce sync.Once @@ -107,6 +110,7 @@ func (a *Accessor) wrapContext(ctx context.Context) *accessorContext { func (ac *accessorContext) Value(key interface{}) interface{} { switch key { case vfs.CtxMountNamespace: + ac.a.mntns.IncRef() return ac.a.mntns case vfs.CtxRoot: ac.a.root.IncRef() diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 138adb9f7..5cfb0dc4c 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -400,6 +400,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() parent.dirMu.Lock() defer parent.dirMu.Unlock() childVFSD := parent.vfsd.Child(name) @@ -934,7 +935,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if oldParent == newParent && oldName == newName { return nil } - if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), &renamed.vfsd, replacedVFSD); err != nil { + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { return err } if err := renamed.file.rename(ctx, newParent.file, newName); err != nil { diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index d0552bd99..d00850e25 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -52,6 +52,9 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// Name is the default filesystem name. +const Name = "9p" + // FilesystemType implements vfs.FilesystemType. 
type FilesystemType struct{} diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index ee98eb66a..292f58afd 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -544,6 +544,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() virtfs := rp.VirtualFilesystem() srcDirDentry := srcDirVFSD.Impl().(*Dentry) @@ -595,7 +596,10 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error parentDentry := vfsd.Parent().Impl().(*Dentry) parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() - if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil { @@ -697,7 +701,9 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error parentDentry := vfsd.Parent().Impl().(*Dentry) parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() - if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil { diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 12aac2e6a..a83245866 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -14,6 +14,7 @@ go_library( "tasks_net.go", "tasks_sys.go", ], + visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index 11477b6a9..5c19d5522 
100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -26,15 +26,18 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" ) -// procFSType is the factory class for procfs. +// Name is the default filesystem name. +const Name = "proc" + +// FilesystemType is the factory class for procfs. // // +stateify savable -type procFSType struct{} +type FilesystemType struct{} -var _ vfs.FilesystemType = (*procFSType)(nil) +var _ vfs.FilesystemType = (*FilesystemType)(nil) // GetFilesystem implements vfs.FilesystemType. -func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +func (ft *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { k := kernel.KernelFromContext(ctx) if k == nil { return nil, nil, fmt.Errorf("procfs requires a kernel") @@ -47,12 +50,13 @@ func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile procfs := &kernfs.Filesystem{} procfs.VFSFilesystem().Init(vfsObj, procfs) - var data *InternalData + var cgroups map[string]string if opts.InternalData != nil { - data = opts.InternalData.(*InternalData) + data := opts.InternalData.(*InternalData) + cgroups = data.Cgroups } - _, dentry := newTasksInode(procfs, k, pidns, data.Cgroups) + _, dentry := newTasksInode(procfs, k, pidns, cgroups) return procfs.VFSFilesystem(), dentry.VFSDentry(), nil } diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 6fc3524db..96c72cbc9 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -90,8 +90,7 @@ func setup(t *testing.T) *testutil.System { ctx := k.SupervisorContext() creds := auth.CredentialsFromContext(ctx) - vfsObj := vfs.New() - 
vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}, &vfs.RegisterFilesystemTypeOptions{ + k.VFS.MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) fsOpts := vfs.GetFilesystemOptions{ @@ -102,11 +101,11 @@ func setup(t *testing.T) *testutil.System { }, }, } - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &fsOpts) + mntns, err := k.VFS.NewMountNamespace(ctx, creds, "", Name, &fsOpts) if err != nil { t.Fatalf("NewMountNamespace(): %v", err) } - return testutil.NewSystem(ctx, t, vfsObj, mntns) + return testutil.NewSystem(ctx, t, k.VFS, mntns) } func TestTasksEmpty(t *testing.T) { @@ -131,7 +130,7 @@ func TestTasks(t *testing.T) { var tasks []*kernel.Task for i := 0; i < 5; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc) + task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root) if err != nil { t.Fatalf("CreateTask(): %v", err) } @@ -213,7 +212,7 @@ func TestTasksOffset(t *testing.T) { k := kernel.KernelFromContext(s.Ctx) for i := 0; i < 3; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc); err != nil { + if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root); err != nil { t.Fatalf("CreateTask(): %v", err) } } @@ -337,7 +336,7 @@ func TestTask(t *testing.T) { k := kernel.KernelFromContext(s.Ctx) tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - _, err := testutil.CreateTask(s.Ctx, "name", tc) + _, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root) if err != nil { t.Fatalf("CreateTask(): %v", err) } @@ -352,7 +351,7 @@ func 
TestProcSelf(t *testing.T) { k := kernel.KernelFromContext(s.Ctx) tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - task, err := testutil.CreateTask(s.Ctx, "name", tc) + task, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root) if err != nil { t.Fatalf("CreateTask(): %v", err) } @@ -433,7 +432,7 @@ func TestTree(t *testing.T) { var tasks []*kernel.Task for i := 0; i < 5; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc) + task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root) if err != nil { t.Fatalf("CreateTask(): %v", err) } diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index 66c0d8bc8..a741e2bb6 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "sys.go", ], + visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index d693fceae..c36c4fa11 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -28,6 +28,9 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// Name is the default filesystem name. +const Name = "sysfs" + // FilesystemType implements vfs.FilesystemType. 
type FilesystemType struct{} diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go index 8b1cf0bd0..5d1ba5867 100644 --- a/pkg/sentry/fsimpl/sys/sys_test.go +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -34,16 +34,15 @@ func newTestSystem(t *testing.T) *testutil.System { } ctx := k.SupervisorContext() creds := auth.CredentialsFromContext(ctx) - v := vfs.New() - v.MustRegisterFilesystemType("sysfs", sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + k.VFS.MustRegisterFilesystemType(sys.Name, sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mns, err := v.NewMountNamespace(ctx, creds, "", "sysfs", &vfs.GetFilesystemOptions{}) + mns, err := k.VFS.NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{}) if err != nil { t.Fatalf("Failed to create new mount namespace: %v", err) } - return testutil.NewSystem(ctx, t, v, mns) + return testutil.NewSystem(ctx, t, k.VFS, mns) } func TestReadCPUFile(t *testing.T) { diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD index efd5974c4..e4f36f4ae 100644 --- a/pkg/sentry/fsimpl/testutil/BUILD +++ b/pkg/sentry/fsimpl/testutil/BUILD @@ -16,7 +16,7 @@ go_library( "//pkg/cpuid", "//pkg/fspath", "//pkg/memutil", - "//pkg/sentry/fs", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/sched", diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index 89f8c4915..a91b3ec4d 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -24,7 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/memutil" - "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" @@ -33,6 +33,7 @@ import ( 
"gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" // Platforms are plugable. _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm" @@ -99,26 +100,27 @@ func Boot() (*kernel.Kernel, error) { return nil, fmt.Errorf("initializing kernel: %v", err) } - ctx := k.SupervisorContext() + kernel.VFS2Enabled = true + + vfsObj := vfs.New() + vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + k.VFS = vfsObj - // Create mount namespace without root as it's the minimum required to create - // the global thread group. - mntns, err := fs.NewMountNamespace(ctx, nil) - if err != nil { - return nil, err - } ls, err := limits.NewLinuxLimitSet() if err != nil { return nil, err } - tg := k.NewThreadGroup(mntns, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls) + tg := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls) k.TestOnly_SetGlobalInit(tg) return k, nil } // CreateTask creates a new bare bones task for tests. 
-func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kernel.Task, error) { +func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) { k := kernel.KernelFromContext(ctx) config := &kernel.TaskConfig{ Kernel: k, @@ -129,6 +131,8 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kern UTSNamespace: kernel.UTSNamespaceFromContext(ctx), IPCNamespace: kernel.IPCNamespaceFromContext(ctx), AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), + MountNamespaceVFS2: mntns, + FSContext: kernel.NewFSContextVFS2(root, cwd, 0022), } return k.TaskSet().NewTask(config) } diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go index 69fd84ddd..b97e3534a 100644 --- a/pkg/sentry/fsimpl/testutil/testutil.go +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -41,12 +41,12 @@ type System struct { Creds *auth.Credentials VFS *vfs.VirtualFilesystem Root vfs.VirtualDentry - mns *vfs.MountNamespace + MntNs *vfs.MountNamespace } // NewSystem constructs a System. // -// Precondition: Caller must hold a reference on mns, whose ownership +// Precondition: Caller must hold a reference on MntNs, whose ownership // is transferred to the new System. 
func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System { s := &System{ @@ -54,7 +54,7 @@ func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns Ctx: ctx, Creds: auth.CredentialsFromContext(ctx), VFS: v, - mns: mns, + MntNs: mns, Root: mns.Root(), } return s @@ -75,7 +75,7 @@ func (s *System) WithSubtest(t *testing.T) *System { Ctx: s.Ctx, Creds: s.Creds, VFS: s.VFS, - mns: s.mns, + MntNs: s.MntNs, Root: s.Root, } } @@ -90,7 +90,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System { Ctx: ctx, Creds: s.Creds, VFS: s.VFS, - mns: s.mns, + MntNs: s.MntNs, Root: s.Root, } } @@ -98,7 +98,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System { // Destroy release resources associated with a test system. func (s *System) Destroy() { s.Root.DecRef() - s.mns.DecRef() // Reference on mns passed to NewSystem. + s.MntNs.DecRef() // Reference on MntNs passed to NewSystem. } // ReadToEnd reads the contents of fd until EOF to a string. 
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 8785452b6..7f7b791c4 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -486,7 +486,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa vfsObj := rp.VirtualFilesystem() oldParentDir := oldParent.inode.impl.(*directory) newParentDir := newParent.inode.impl.(*directory) - if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil { + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := vfsObj.PrepareRenameDentry(mntns, renamedVFSD, replacedVFSD); err != nil { return err } if replaced != nil { @@ -543,7 +545,9 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error } defer mnt.EndWrite() vfsObj := rp.VirtualFilesystem() - if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil { + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { return err } parent.inode.impl.(*directory).childList.Remove(child) @@ -631,7 +635,9 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } defer mnt.EndWrite() vfsObj := rp.VirtualFilesystem() - if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil { + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { return err } parent.inode.impl.(*directory).childList.Remove(child) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 2108d0f4d..c5bb17562 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -40,6 +40,9 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// Name is the default filesystem name. 
+const Name = "tmpfs" + // FilesystemType implements vfs.FilesystemType. type FilesystemType struct{} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 2231d6973..46306945f 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -157,6 +157,7 @@ go_library( "//pkg/context", "//pkg/cpuid", "//pkg/eventchannel", + "//pkg/fspath", "//pkg/log", "//pkg/metric", "//pkg/refs", @@ -167,6 +168,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", + "//pkg/sentry/fsbridge", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 2448c1d99..7218aa24e 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) @@ -37,10 +38,16 @@ type FSContext struct { // destroyed. root *fs.Dirent + // rootVFS2 is the filesystem root. + rootVFS2 vfs.VirtualDentry + // cwd is the current working directory. Will be nil iff the FSContext // has been destroyed. cwd *fs.Dirent + // cwdVFS2 is the current working directory. + cwdVFS2 vfs.VirtualDentry + // umask is the current file mode creation mask. When a thread using this // context invokes a syscall that creates a file, bits set in umask are // removed from the permissions that the file is created with. @@ -60,6 +67,19 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { return &f } +// NewFSContextVFS2 returns a new filesystem context. +func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext { + root.IncRef() + cwd.IncRef() + f := FSContext{ + rootVFS2: root, + cwdVFS2: cwd, + umask: umask, + } + f.EnableLeakCheck("kernel.FSContext") + return &f +} + // destroy is the destructor for an FSContext. // // This will call DecRef on both root and cwd Dirents. 
If either call to @@ -75,11 +95,17 @@ func (f *FSContext) destroy() { f.mu.Lock() defer f.mu.Unlock() - f.root.DecRef() - f.root = nil - - f.cwd.DecRef() - f.cwd = nil + if VFS2Enabled { + f.rootVFS2.DecRef() + f.rootVFS2 = vfs.VirtualDentry{} + f.cwdVFS2.DecRef() + f.cwdVFS2 = vfs.VirtualDentry{} + } else { + f.root.DecRef() + f.root = nil + f.cwd.DecRef() + f.cwd = nil + } } // DecRef implements RefCounter.DecRef with destructor f.destroy. @@ -93,12 +119,21 @@ func (f *FSContext) DecRef() { func (f *FSContext) Fork() *FSContext { f.mu.Lock() defer f.mu.Unlock() - f.cwd.IncRef() - f.root.IncRef() + + if VFS2Enabled { + f.cwdVFS2.IncRef() + f.rootVFS2.IncRef() + } else { + f.cwd.IncRef() + f.root.IncRef() + } + return &FSContext{ - cwd: f.cwd, - root: f.root, - umask: f.umask, + cwd: f.cwd, + root: f.root, + cwdVFS2: f.cwdVFS2, + rootVFS2: f.rootVFS2, + umask: f.umask, } } @@ -109,12 +144,23 @@ func (f *FSContext) Fork() *FSContext { func (f *FSContext) WorkingDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() - if f.cwd != nil { - f.cwd.IncRef() - } + + f.cwd.IncRef() return f.cwd } +// WorkingDirectoryVFS2 returns the current working directory. +// +// The returned VirtualDentry has a reference taken on it. After destroy() the +// stored value is the zero VirtualDentry rather than nil. +func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { + f.mu.Lock() + defer f.mu.Unlock() + + f.cwdVFS2.IncRef() + return f.cwdVFS2 +} + // SetWorkingDirectory sets the current working directory. // This will take an extra reference on the Dirent. // @@ -137,6 +183,20 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { old.DecRef() } +// SetWorkingDirectoryVFS2 sets the current working directory. +// This will take an extra reference on the VirtualDentry. +// +// This is not a valid call after destroy. 
+func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) { + f.mu.Lock() + defer f.mu.Unlock() + + old := f.cwdVFS2 + f.cwdVFS2 = d + d.IncRef() + old.DecRef() +} + // RootDirectory returns the current filesystem root. // // This will return nil if called after destroy(), otherwise it will return a @@ -150,6 +210,18 @@ func (f *FSContext) RootDirectory() *fs.Dirent { return f.root } +// RootDirectoryVFS2 returns the current filesystem root. +// +// The returned VirtualDentry has a reference taken on it. After destroy() the +// stored value is the zero VirtualDentry rather than nil. +func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { + f.mu.Lock() + defer f.mu.Unlock() + + f.rootVFS2.IncRef() + return f.rootVFS2 +} + // SetRootDirectory sets the root directory. // This will take an extra reference on the Dirent. // diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 3ee760ba2..2665f057c 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -43,11 +43,13 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -71,6 +73,10 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" ) +// VFS2Enabled is set to true when VFS2 is enabled. Added as a global to allow +// easy access everywhere. To be removed once VFS2 becomes the default. +var VFS2Enabled = false + // Kernel represents an emulated Linux kernel. It must be initialized by calling // Init() or LoadFrom(). // @@ -238,6 +244,9 @@ type Kernel struct { // SpecialOpts contains special kernel options. SpecialOpts + + // VFS keeps the filesystem state used across the kernel. 
+ VFS *vfs.VirtualFilesystem } // InitKernelArgs holds arguments to Init. @@ -624,7 +633,7 @@ type CreateProcessArgs struct { // File is a passed host FD pointing to a file to load as the init binary. // // This is checked if and only if Filename is "". - File *fs.File + File fsbridge.File // Argvv is a list of arguments. Argv []string @@ -673,6 +682,13 @@ type CreateProcessArgs struct { // increment it). MountNamespace *fs.MountNamespace + // MountNamespaceVFS2 optionally contains the mount namespace for this + // process. If nil, the init process's mount namespace is used. + // + // Anyone setting MountNamespaceVFS2 must donate a reference (i.e. + // increment it). + MountNamespaceVFS2 *vfs.MountNamespace + // ContainerID is the container that the process belongs to. ContainerID string } @@ -711,11 +727,22 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { return ctx.args.Credentials case fs.CtxRoot: if ctx.args.MountNamespace != nil { - // MountNamespace.Root() will take a reference on the root - // dirent for us. + // MountNamespace.Root() will take a reference on the root dirent for us. return ctx.args.MountNamespace.Root() } return nil + case vfs.CtxRoot: + if ctx.args.MountNamespaceVFS2 == nil { + return nil + } + // MountNamespaceVFS2.Root() takes a reference on the root dirent for us. + return ctx.args.MountNamespaceVFS2.Root() + case vfs.CtxMountNamespace: + if ctx.k.globalInit == nil { + return nil + } + // MountNamespaceVFS2 takes a reference for us. + return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter case ktime.CtxRealtimeClock: @@ -757,34 +784,77 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, defer k.extMu.Unlock() log.Infof("EXEC: %v", args.Argv) - // Grab the mount namespace. 
- mounts := args.MountNamespace - if mounts == nil { - mounts = k.GlobalInit().Leader().MountNamespace() - mounts.IncRef() - } - - tg := k.NewThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) ctx := args.NewContext(k) - // Get the root directory from the MountNamespace. - root := mounts.Root() - // The call to newFSContext below will take a reference on root, so we - // don't need to hold this one. - defer root.DecRef() - - // Grab the working directory. - remainingTraversals := uint(args.MaxSymlinkTraversals) - wd := root // Default. - if args.WorkingDirectory != "" { - var err error - wd, err = mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) - if err != nil { - return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + var ( + opener fsbridge.Lookup + fsContext *FSContext + mntns *fs.MountNamespace + ) + + if VFS2Enabled { + mntnsVFS2 := args.MountNamespaceVFS2 + if mntnsVFS2 == nil { + // MountNamespaceVFS2 adds a reference to the namespace, which is + // transferred to the new process. + mntnsVFS2 = k.GlobalInit().Leader().MountNamespaceVFS2() + } + // Get the root directory from the MountNamespace. + root := mntnsVFS2.Root() + // The call to NewFSContextVFS2 below will take a reference on root, so we + // don't need to hold this one. + defer root.DecRef() + + // Grab the working directory. + wd := root // Default. 
+ if args.WorkingDirectory != "" { + pop := vfs.PathOperation{ + Root: root, + Start: wd, + Path: fspath.Parse(args.WorkingDirectory), + FollowFinalSymlink: true, + } + var err error + wd, err = k.VFS.GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + } + defer wd.DecRef() + } + opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd) + fsContext = NewFSContextVFS2(root, wd, args.Umask) + + } else { + mntns = args.MountNamespace + if mntns == nil { + mntns = k.GlobalInit().Leader().MountNamespace() + mntns.IncRef() } - defer wd.DecRef() + // Get the root directory from the MountNamespace. + root := mntns.Root() + // The call to newFSContext below will take a reference on root, so we + // don't need to hold this one. + defer root.DecRef() + + // Grab the working directory. + remainingTraversals := args.MaxSymlinkTraversals + wd := root // Default. + if args.WorkingDirectory != "" { + var err error + wd, err = mntns.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) + if err != nil { + return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + } + defer wd.DecRef() + } + opener = fsbridge.NewFSLookup(mntns, root, wd) + fsContext = newFSContext(root, wd, args.Umask) } + tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) + // Check which file to start from. switch { case args.Filename != "": @@ -805,11 +875,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, } // Create a fresh task context. 
- remainingTraversals = uint(args.MaxSymlinkTraversals) + remainingTraversals := args.MaxSymlinkTraversals loadArgs := loader.LoadArgs{ - Mounts: mounts, - Root: root, - WorkingDirectory: wd, + Opener: opener, RemainingTraversals: &remainingTraversals, ResolveFinal: true, Filename: args.Filename, @@ -834,13 +902,14 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, Kernel: k, ThreadGroup: tg, TaskContext: tc, - FSContext: newFSContext(root, wd, args.Umask), + FSContext: fsContext, FDTable: args.FDTable, Credentials: args.Credentials, AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), UTSNamespace: args.UTSNamespace, IPCNamespace: args.IPCNamespace, AbstractSocketNamespace: args.AbstractSocketNamespace, + MountNamespaceVFS2: args.MountNamespaceVFS2, ContainerID: args.ContainerID, } t, err := k.tasks.NewTask(config) @@ -1378,6 +1447,20 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return ctx.k.globalInit.mounts.Root() } return nil + case vfs.CtxRoot: + if ctx.k.globalInit == nil { + return vfs.VirtualDentry{} + } + mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + defer mntns.DecRef() + // Root() takes a reference on the root dirent for us. + return mntns.Root() + case vfs.CtxMountNamespace: + if ctx.k.globalInit == nil { + return nil + } + // MountNamespaceVFS2() takes a reference for us. + return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter case ktime.CtxRealtimeClock: diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 981e8c7fe..a3443ff21 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -424,6 +424,11 @@ type Task struct { // abstractSockets is protected by mu. abstractSockets *AbstractSocketNamespace + // mountNamespaceVFS2 is the task's mount namespace. + // + // It is protected by mu. It is owned by the task goroutine. 
+ mountNamespaceVFS2 *vfs.MountNamespace + // parentDeathSignal is sent to this task's thread group when its parent exits. // // parentDeathSignal is protected by mu. @@ -638,6 +643,11 @@ func (t *Task) Value(key interface{}) interface{} { return int32(t.ThreadGroup().ID()) case fs.CtxRoot: return t.fsContext.RootDirectory() + case vfs.CtxRoot: + return t.fsContext.RootDirectoryVFS2() + case vfs.CtxMountNamespace: + t.mountNamespaceVFS2.IncRef() + return t.mountNamespaceVFS2 case fs.CtxDirentCacheLimiter: return t.k.DirentCacheLimiter case inet.CtxStack: @@ -701,6 +711,14 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock { // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) IsChrooted() bool { + if VFS2Enabled { + realRoot := t.mountNamespaceVFS2.Root() + defer realRoot.DecRef() + root := t.fsContext.RootDirectoryVFS2() + defer root.DecRef() + return root != realRoot + } + realRoot := t.tg.mounts.Root() defer realRoot.DecRef() root := t.fsContext.RootDirectory() @@ -796,6 +814,15 @@ func (t *Task) MountNamespace() *fs.MountNamespace { return t.tg.mounts } +// MountNamespaceVFS2 returns t's MountNamespace. A reference is taken on the +// returned mount namespace. +func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace { + t.mu.Lock() + defer t.mu.Unlock() + t.mountNamespaceVFS2.IncRef() + return t.mountNamespaceVFS2 +} + // AbstractSockets returns t's AbstractSocketNamespace. func (t *Task) AbstractSockets() *AbstractSocketNamespace { return t.abstractSockets diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 53d4d211b..ba74b4c1c 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -199,6 +199,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { ipcns = NewIPCNamespace(userns) } + // TODO(b/63601033): Implement CLONE_NEWNS. 
+ mntnsVFS2 := t.mountNamespaceVFS2 + if mntnsVFS2 != nil { + mntnsVFS2.IncRef() + } + tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace) if err != nil { return 0, nil, err @@ -241,7 +247,9 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { rseqAddr := usermem.Addr(0) rseqSignature := uint32(0) if opts.NewThreadGroup { - tg.mounts.IncRef() + if tg.mounts != nil { + tg.mounts.IncRef() + } sh := t.tg.signalHandlers if opts.NewSignalHandlers { sh = sh.Fork() @@ -265,6 +273,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { UTSNamespace: utsns, IPCNamespace: ipcns, AbstractSocketNamespace: t.abstractSockets, + MountNamespaceVFS2: mntnsVFS2, RSeqAddr: rseqAddr, RSeqSignature: rseqSignature, ContainerID: t.ContainerID(), diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 2d6e7733c..2be982684 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -136,7 +136,7 @@ func (t *Task) Stack() *arch.Stack { func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) { // If File is not nil, we should load that instead of resolving Filename. if args.File != nil { - args.Filename = args.File.MappedName(ctx) + args.Filename = args.File.PathnameWithDeleted(ctx) } // Prepare a new user address space to load into. diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 435761e5a..c4ade6e8e 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -269,6 +269,13 @@ func (*runExitMain) execute(t *Task) taskRunState { t.fsContext.DecRef() t.fdTable.DecRef() + t.mu.Lock() + if t.mountNamespaceVFS2 != nil { + t.mountNamespaceVFS2.DecRef() + t.mountNamespaceVFS2 = nil + } + t.mu.Unlock() + // If this is the last task to exit from the thread group, release the // thread group's resources. 
if lastExiter { diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index 41259210c..6d737d3e5 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -198,18 +198,11 @@ func (t *Task) traceExecEvent(tc *TaskContext) { if !trace.IsEnabled() { return } - d := tc.MemoryManager.Executable() - if d == nil { + file := tc.MemoryManager.Executable() + if file == nil { trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") return } - defer d.DecRef() - root := t.fsContext.RootDirectory() - if root == nil { - trace.Logf(t.traceContext, traceCategory, "exec: << no root directory >>") - return - } - defer root.DecRef() - n, _ := d.FullName(root) - trace.Logf(t.traceContext, traceCategory, "exec: %s", n) + defer file.DecRef() + trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t)) } diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index de838beef..f9236a842 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -80,6 +81,9 @@ type TaskConfig struct { // AbstractSocketNamespace is the AbstractSocketNamespace of the new task. AbstractSocketNamespace *AbstractSocketNamespace + // MountNamespaceVFS2 is the MountNamespace of the new task. + MountNamespaceVFS2 *vfs.MountNamespace + // RSeqAddr is a pointer to the the userspace linux.RSeq structure. 
RSeqAddr usermem.Addr @@ -116,28 +120,29 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { parent: cfg.Parent, children: make(map[*Task]struct{}), }, - runState: (*runApp)(nil), - interruptChan: make(chan struct{}, 1), - signalMask: cfg.SignalMask, - signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, - tc: *tc, - fsContext: cfg.FSContext, - fdTable: cfg.FDTable, - p: cfg.Kernel.Platform.NewContext(), - k: cfg.Kernel, - ptraceTracees: make(map[*Task]struct{}), - allowedCPUMask: cfg.AllowedCPUMask.Copy(), - ioUsage: &usage.IO{}, - niceness: cfg.Niceness, - netns: cfg.NetworkNamespaced, - utsns: cfg.UTSNamespace, - ipcns: cfg.IPCNamespace, - abstractSockets: cfg.AbstractSocketNamespace, - rseqCPU: -1, - rseqAddr: cfg.RSeqAddr, - rseqSignature: cfg.RSeqSignature, - futexWaiter: futex.NewWaiter(), - containerID: cfg.ContainerID, + runState: (*runApp)(nil), + interruptChan: make(chan struct{}, 1), + signalMask: cfg.SignalMask, + signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, + tc: *tc, + fsContext: cfg.FSContext, + fdTable: cfg.FDTable, + p: cfg.Kernel.Platform.NewContext(), + k: cfg.Kernel, + ptraceTracees: make(map[*Task]struct{}), + allowedCPUMask: cfg.AllowedCPUMask.Copy(), + ioUsage: &usage.IO{}, + niceness: cfg.Niceness, + netns: cfg.NetworkNamespaced, + utsns: cfg.UTSNamespace, + ipcns: cfg.IPCNamespace, + abstractSockets: cfg.AbstractSocketNamespace, + mountNamespaceVFS2: cfg.MountNamespaceVFS2, + rseqCPU: -1, + rseqAddr: cfg.RSeqAddr, + rseqSignature: cfg.RSeqSignature, + futexWaiter: futex.NewWaiter(), + containerID: cfg.ContainerID, } t.creds.Store(cfg.Credentials) t.endStopCond.L = &t.tg.signalHandlers.mu diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 768e958d2..268f62e9d 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -256,7 +256,7 @@ type ThreadGroup struct { tty *TTY } -// NewThreadGroup returns a new, empty thread group in 
PID namespace ns. The +// NewThreadGroup returns a new, empty thread group in PID namespace pidns. The // thread group leader will send its parent terminationSignal when it exits. // The new thread group isn't visible to the system until a task has been // created inside of it by a successful call to TaskSet.NewTask. @@ -317,7 +317,9 @@ func (tg *ThreadGroup) release() { for _, it := range its { it.DestroyTimer() } - tg.mounts.DecRef() + if tg.mounts != nil { + tg.mounts.DecRef() + } } // forEachChildThreadGroupLocked indicates over all child ThreadGroups. diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 23790378a..c6aa65f28 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -33,6 +33,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsbridge", "//pkg/sentry/kernel/auth", "//pkg/sentry/limits", "//pkg/sentry/memmap", @@ -40,6 +41,7 @@ go_library( "//pkg/sentry/pgalloc", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", + "//pkg/sentry/vfs", "//pkg/syserr", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 122ed05c2..616fafa2c 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -27,7 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" @@ -97,11 +97,11 @@ type elfInfo struct { // accepts from the ELF, and it doesn't parse unnecessary parts of the file. // // ctx may be nil if f does not need it. -func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) { +func parseHeader(ctx context.Context, f fsbridge.File) (elfInfo, error) { // Check ident first; it will tell us the endianness of the rest of the // structs. 
var ident [elf.EI_NIDENT]byte - _, err := readFull(ctx, f, usermem.BytesIOSequence(ident[:]), 0) + _, err := f.ReadFull(ctx, usermem.BytesIOSequence(ident[:]), 0) if err != nil { log.Infof("Error reading ELF ident: %v", err) // The entire ident array always exists. @@ -137,7 +137,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) { var hdr elf.Header64 hdrBuf := make([]byte, header64Size) - _, err = readFull(ctx, f, usermem.BytesIOSequence(hdrBuf), 0) + _, err = f.ReadFull(ctx, usermem.BytesIOSequence(hdrBuf), 0) if err != nil { log.Infof("Error reading ELF header: %v", err) // The entire header always exists. @@ -187,7 +187,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) { } phdrBuf := make([]byte, totalPhdrSize) - _, err = readFull(ctx, f, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff)) + _, err = f.ReadFull(ctx, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff)) if err != nil { log.Infof("Error reading ELF phdrs: %v", err) // If phdrs were specified, they should all exist. @@ -227,7 +227,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) { // mapSegment maps a phdr into the Task. offset is the offset to apply to // phdr.Vaddr. -func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf.ProgHeader, offset usermem.Addr) error { +func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr *elf.ProgHeader, offset usermem.Addr) error { // We must make a page-aligned mapping. 
adjust := usermem.Addr(phdr.Vaddr).PageOffset() @@ -395,7 +395,7 @@ type loadedELF struct { // // Preconditions: // * f is an ELF file -func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) { +func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) { first := true var start, end usermem.Addr var interpreter string @@ -431,7 +431,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el } path := make([]byte, phdr.Filesz) - _, err := readFull(ctx, f, usermem.BytesIOSequence(path), int64(phdr.Off)) + _, err := f.ReadFull(ctx, usermem.BytesIOSequence(path), int64(phdr.Off)) if err != nil { // If an interpreter was specified, it should exist. ctx.Infof("Error reading PT_INTERP path: %v", err) @@ -564,7 +564,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el // Preconditions: // * f is an ELF file // * f is the first ELF loaded into m -func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) { +func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f fsbridge.File) (loadedELF, arch.Context, error) { info, err := parseHeader(ctx, f) if err != nil { ctx.Infof("Failed to parse initial ELF: %v", err) @@ -602,7 +602,7 @@ func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureS // // Preconditions: // * f is an ELF file -func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, initial loadedELF) (loadedELF, error) { +func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, initial loadedELF) (loadedELF, error) { info, err := parseHeader(ctx, f) if err != nil { if err == syserror.ENOEXEC { @@ -649,16 +649,14 @@ func loadELF(ctx context.Context, args LoadArgs) (loadedELF, 
arch.Context, error // Refresh the traversal limit. *args.RemainingTraversals = linux.MaxSymlinkTraversals args.Filename = bin.interpreter - d, i, err := openPath(ctx, args) + intFile, err := openPath(ctx, args) if err != nil { ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err) return loadedELF{}, nil, err } - defer i.DecRef() - // We don't need the Dirent. - d.DecRef() + defer intFile.DecRef() - interp, err = loadInterpreterELF(ctx, args.MemoryManager, i, bin) + interp, err = loadInterpreterELF(ctx, args.MemoryManager, intFile, bin) if err != nil { ctx.Infof("Error loading interpreter: %v", err) return loadedELF{}, nil, err diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go index 098a45d36..3886b4d33 100644 --- a/pkg/sentry/loader/interpreter.go +++ b/pkg/sentry/loader/interpreter.go @@ -19,7 +19,7 @@ import ( "io" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -37,9 +37,9 @@ const ( ) // parseInterpreterScript returns the interpreter path and argv. -func parseInterpreterScript(ctx context.Context, filename string, f *fs.File, argv []string) (newpath string, newargv []string, err error) { +func parseInterpreterScript(ctx context.Context, filename string, f fsbridge.File, argv []string) (newpath string, newargv []string, err error) { line := make([]byte, interpMaxLineLength) - n, err := readFull(ctx, f, usermem.BytesIOSequence(line), 0) + n, err := f.ReadFull(ctx, usermem.BytesIOSequence(line), 0) // Short read is OK. 
if err != nil && err != io.ErrUnexpectedEOF { if err == io.EOF { diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 9a613d6b7..d6675b8f0 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -20,7 +20,6 @@ import ( "fmt" "io" "path" - "strings" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" @@ -29,8 +28,10 @@ import ( "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -41,16 +42,6 @@ type LoadArgs struct { // MemoryManager is the memory manager to load the executable into. MemoryManager *mm.MemoryManager - // Mounts is the mount namespace in which to look up Filename. - Mounts *fs.MountNamespace - - // Root is the root directory under which to look up Filename. - Root *fs.Dirent - - // WorkingDirectory is the working directory under which to look up - // Filename. - WorkingDirectory *fs.Dirent - // RemainingTraversals is the maximum number of symlinks to follow to // resolve Filename. This counter is passed by reference to keep it // updated throughout the call stack. @@ -65,7 +56,12 @@ type LoadArgs struct { // File is an open fs.File object of the executable. If File is not // nil, then File will be loaded and Filename will be ignored. - File *fs.File + // + // The caller is responsible for checking that the user can execute this file. + File fsbridge.File + + // Opener is used to open the executable file when 'File' is nil. + Opener fsbridge.Lookup // CloseOnExec indicates that the executable (or one of its parent // directories) was opened with O_CLOEXEC. 
If the executable is an @@ -106,103 +102,32 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in // installed in the Task FDTable. The caller takes ownership of both. // // args.Filename must be a readable, executable, regular file. -func openPath(ctx context.Context, args LoadArgs) (*fs.Dirent, *fs.File, error) { +func openPath(ctx context.Context, args LoadArgs) (fsbridge.File, error) { if args.Filename == "" { ctx.Infof("cannot open empty name") - return nil, nil, syserror.ENOENT - } - - var d *fs.Dirent - var err error - if args.ResolveFinal { - d, err = args.Mounts.FindInode(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals) - } else { - d, err = args.Mounts.FindLink(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals) - } - if err != nil { - return nil, nil, err - } - // Defer a DecRef for the sake of failure cases. - defer d.DecRef() - - if !args.ResolveFinal && fs.IsSymlink(d.Inode.StableAttr) { - return nil, nil, syserror.ELOOP - } - - if err := checkPermission(ctx, d); err != nil { - return nil, nil, err - } - - // If they claim it's a directory, then make sure. - // - // N.B. we reject directories below, but we must first reject - // non-directories passed as directories. - if strings.HasSuffix(args.Filename, "/") && !fs.IsDir(d.Inode.StableAttr) { - return nil, nil, syserror.ENOTDIR - } - - if err := checkIsRegularFile(ctx, d, args.Filename); err != nil { - return nil, nil, err - } - - f, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) - if err != nil { - return nil, nil, err - } - // Defer a DecRef for the sake of failure cases. - defer f.DecRef() - - if err := checkPread(ctx, f, args.Filename); err != nil { - return nil, nil, err - } - - d.IncRef() - f.IncRef() - return d, f, err -} - -// checkFile performs checks on a file to be executed. 
-func checkFile(ctx context.Context, f *fs.File, filename string) error { - if err := checkPermission(ctx, f.Dirent); err != nil { - return err - } - - if err := checkIsRegularFile(ctx, f.Dirent, filename); err != nil { - return err + return nil, syserror.ENOENT } - return checkPread(ctx, f, filename) -} - -// checkPermission checks whether the file is readable and executable. -func checkPermission(ctx context.Context, d *fs.Dirent) error { - perms := fs.PermMask{ - // TODO(gvisor.dev/issue/160): Linux requires only execute - // permission, not read. However, our backing filesystems may - // prevent us from reading the file without read permission. - // - // Additionally, a task with a non-readable executable has - // additional constraints on access via ptrace and procfs. - Read: true, - Execute: true, + // TODO(gvisor.dev/issue/160): Linux requires only execute permission, + // not read. However, our backing filesystems may prevent us from reading + // the file without read permission. Additionally, a task with a + // non-readable executable has additional constraints on access via + // ptrace and procfs. + opts := vfs.OpenOptions{ + Flags: linux.O_RDONLY, + FileExec: true, } - return d.Inode.CheckPermission(ctx, perms) + return args.Opener.OpenPath(ctx, args.Filename, opts, args.RemainingTraversals, args.ResolveFinal) } // checkIsRegularFile prevents us from trying to execute a directory, pipe, etc. -func checkIsRegularFile(ctx context.Context, d *fs.Dirent, filename string) error { - attr := d.Inode.StableAttr - if !fs.IsRegular(attr) { - ctx.Infof("%s is not regular: %v", filename, attr) - return syserror.EACCES +func checkIsRegularFile(ctx context.Context, file fsbridge.File, filename string) error { + t, err := file.Type(ctx) + if err != nil { + return err } - return nil -} - -// checkPread checks whether we can read the file at arbitrary offsets. 
-func checkPread(ctx context.Context, f *fs.File, filename string) error { - if !f.Flags().Pread { - ctx.Infof("%s cannot be read at an offset: %+v", filename, f.Flags()) + if t != linux.ModeRegular { + ctx.Infof("%q is not a regular file: %v", filename, t) return syserror.EACCES } return nil @@ -224,8 +149,10 @@ const ( maxLoaderAttempts = 6 ) -// loadExecutable loads an executable that is pointed to by args.File. If nil, -// the path args.Filename is resolved and loaded. If the executable is an +// loadExecutable loads an executable that is pointed to by args.File. The +// caller is responsible for checking that the user can execute this file. +// If nil, the path args.Filename is resolved and loaded (check that the user +// can execute this file is done here in this case). If the executable is an // interpreter script rather than an ELF, the binary of the corresponding // interpreter will be loaded. // @@ -234,37 +161,27 @@ const ( // * arch.Context matching the binary arch // * fs.Dirent of the binary file // * Possibly updated args.Argv -func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, *fs.Dirent, []string, error) { +func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, fsbridge.File, []string, error) { for i := 0; i < maxLoaderAttempts; i++ { - var ( - d *fs.Dirent - err error - ) if args.File == nil { - d, args.File, err = openPath(ctx, args) - // We will return d in the successful case, but defer a DecRef for the - // sake of intermediate loops and failure cases. - if d != nil { - defer d.DecRef() - } - if args.File != nil { - defer args.File.DecRef() + var err error + args.File, err = openPath(ctx, args) + if err != nil { + ctx.Infof("Error opening %s: %v", args.Filename, err) + return loadedELF{}, nil, nil, nil, err } + // Ensure file is release in case the code loops or errors out. 
+ defer args.File.DecRef() } else { - d = args.File.Dirent - d.IncRef() - defer d.DecRef() - err = checkFile(ctx, args.File, args.Filename) - } - if err != nil { - ctx.Infof("Error opening %s: %v", args.Filename, err) - return loadedELF{}, nil, nil, nil, err + if err := checkIsRegularFile(ctx, args.File, args.Filename); err != nil { + return loadedELF{}, nil, nil, nil, err + } } // Check the header. Is this an ELF or interpreter script? var hdr [4]uint8 // N.B. We assume that reading from a regular file cannot block. - _, err = readFull(ctx, args.File, usermem.BytesIOSequence(hdr[:]), 0) + _, err := args.File.ReadFull(ctx, usermem.BytesIOSequence(hdr[:]), 0) // Allow unexpected EOF, as a valid executable could be only three bytes // (e.g., #!a). if err != nil && err != io.ErrUnexpectedEOF { @@ -281,9 +198,10 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context ctx.Infof("Error loading ELF: %v", err) return loadedELF{}, nil, nil, nil, err } - // An ELF is always terminal. Hold on to d. - d.IncRef() - return loaded, ac, d, args.Argv, err + // An ELF is always terminal. Hold on to file. + args.File.IncRef() + return loaded, ac, args.File, args.Argv, err + case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)): if args.CloseOnExec { return loadedELF{}, nil, nil, nil, syserror.ENOENT @@ -295,6 +213,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context } // Refresh the traversal limit for the interpreter. *args.RemainingTraversals = linux.MaxSymlinkTraversals + default: ctx.Infof("Unknown magic: %v", hdr) return loadedELF{}, nil, nil, nil, syserror.ENOEXEC @@ -317,11 +236,11 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context // * Load is called on the Task goroutine. func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) { // Load the executable itself. 
- loaded, ac, d, newArgv, err := loadExecutable(ctx, args) + loaded, ac, file, newArgv, err := loadExecutable(ctx, args) if err != nil { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux()) } - defer d.DecRef() + defer file.DecRef() // Load the VDSO. vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded) @@ -390,7 +309,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V m.SetEnvvStart(sl.EnvvStart) m.SetEnvvEnd(sl.EnvvEnd) m.SetAuxv(auxv) - m.SetExecutable(d) + m.SetExecutable(file) ac.SetIP(uintptr(loaded.entry)) ac.SetStack(uintptr(stack.Bottom)) diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 52f446ed7..161b28c2c 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -69,6 +70,8 @@ type byteReader struct { var _ fs.FileOperations = (*byteReader)(nil) // newByteReaderFile creates a fake file to read data from. +// +// TODO(gvisor.dev/issue/1623): Convert to VFS2. func newByteReaderFile(ctx context.Context, data []byte) *fs.File { // Create a fake inode. inode := fs.NewInode( @@ -123,7 +126,7 @@ func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSeq // * PT_LOAD segments don't extend beyond the end of the file. // // ctx may be nil if f does not need it. 
-func validateVDSO(ctx context.Context, f *fs.File, size uint64) (elfInfo, error) { +func validateVDSO(ctx context.Context, f fsbridge.File, size uint64) (elfInfo, error) { info, err := parseHeader(ctx, f) if err != nil { log.Infof("Unable to parse VDSO header: %v", err) @@ -221,7 +224,7 @@ type VDSO struct { // PrepareVDSO validates the system VDSO and returns a VDSO, containing the // param page for updating by the kernel. func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, error) { - vdsoFile := newByteReaderFile(ctx, vdsoBin) + vdsoFile := fsbridge.NewFSFile(newByteReaderFile(ctx, vdsoBin)) // First make sure the VDSO is valid. vdsoFile does not use ctx, so a // nil context can be passed. diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index e5729ced5..73591dab7 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -105,8 +105,8 @@ go_library( "//pkg/safecopy", "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/fs", "//pkg/sentry/fs/proc/seqfile", + "//pkg/sentry/fsbridge", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/futex", "//pkg/sentry/kernel/shm", diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go index f550acae0..6a49334f4 100644 --- a/pkg/sentry/mm/metadata.go +++ b/pkg/sentry/mm/metadata.go @@ -16,7 +16,7 @@ package mm import ( "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/usermem" ) @@ -132,7 +132,7 @@ func (mm *MemoryManager) SetAuxv(auxv arch.Auxv) { // // An additional reference will be taken in the case of a non-nil executable, // which must be released by the caller. -func (mm *MemoryManager) Executable() *fs.Dirent { +func (mm *MemoryManager) Executable() fsbridge.File { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() @@ -147,15 +147,15 @@ func (mm *MemoryManager) Executable() *fs.Dirent { // SetExecutable sets the executable. // // This takes a reference on d. 
-func (mm *MemoryManager) SetExecutable(d *fs.Dirent) { +func (mm *MemoryManager) SetExecutable(file fsbridge.File) { mm.metadataMu.Lock() // Grab a new reference. - d.IncRef() + file.IncRef() // Set the executable. orig := mm.executable - mm.executable = d + mm.executable = file mm.metadataMu.Unlock() diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 09e582dd3..637383c7a 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -37,7 +37,7 @@ package mm import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" @@ -215,7 +215,7 @@ type MemoryManager struct { // is not nil, it holds a reference on the Dirent. // // executable is protected by metadataMu. - executable *fs.Dirent + executable fsbridge.File // dumpability describes if and how this MemoryManager may be dumped to // userspace. 
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index a796b2396..46cb2a1cc 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -141,6 +141,10 @@ func path(t *kernel.Task, addr usermem.Addr) string { } func fd(t *kernel.Task, fd int32) string { + if kernel.VFS2Enabled { + return fdVFS2(t, fd) + } + root := t.FSContext().RootDirectory() if root != nil { defer root.DecRef() @@ -169,6 +173,30 @@ func fd(t *kernel.Task, fd int32) string { return fmt.Sprintf("%#x %s", fd, name) } +func fdVFS2(t *kernel.Task, fd int32) string { + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + + vfsObj := root.Mount().Filesystem().VirtualFilesystem() + if fd == linux.AT_FDCWD { + wd := t.FSContext().WorkingDirectoryVFS2() + defer wd.DecRef() + + name, _ := vfsObj.PathnameWithDeleted(t, root, wd) + return fmt.Sprintf("AT_FDCWD %s", name) + } + + file := t.GetFileVFS2(fd) + if file == nil { + // Cast FD to uint64 to avoid printing negative hex. + return fmt.Sprintf("%#x (bad FD)", uint64(fd)) + } + defer file.DecRef() + + name, _ := vfsObj.PathnameWithDeleted(t, root, file.VirtualDentry()) + return fmt.Sprintf("%#x %s", fd, name) +} + func fdpair(t *kernel.Task, addr usermem.Addr) string { var fds [2]int32 _, err := t.CopyIn(addr, &fds) diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index be16ee686..0d24fd3c4 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -74,6 +74,7 @@ go_library( "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/fsbridge", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/epoll", diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 98db32d77..9c6728530 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" 
"gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" @@ -135,7 +136,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } // Set the underlying executable. - t.MemoryManager().SetExecutable(file.Dirent) + t.MemoryManager().SetExecutable(fsbridge.NewFSFile(file)) case linux.PR_SET_MM_AUXV, linux.PR_SET_MM_START_CODE, diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 0c9e2255d..00915fdde 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/loader" @@ -119,7 +120,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user defer root.DecRef() var wd *fs.Dirent - var executable *fs.File + var executable fsbridge.File var closeOnExec bool if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) { // Even if the pathname is absolute, we may still need the wd @@ -136,7 +137,15 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user closeOnExec = fdFlags.CloseOnExec if atEmptyPath && len(pathname) == 0 { - executable = f + // TODO(gvisor.dev/issue/160): Linux requires only execute permission, + // not read. However, our backing filesystems may prevent us from reading + // the file without read permission. Additionally, a task with a + // non-readable executable has additional constraints on access via + // ptrace and procfs. 
+ if err := f.Dirent.Inode.CheckPermission(t, fs.PermMask{Read: true, Execute: true}); err != nil { + return 0, nil, err + } + executable = fsbridge.NewFSFile(f) } else { wd = f.Dirent wd.IncRef() @@ -152,9 +161,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user // Load the new TaskContext. remainingTraversals := uint(linux.MaxSymlinkTraversals) loadArgs := loader.LoadArgs{ - Mounts: t.MountNamespace(), - Root: root, - WorkingDirectory: wd, + Opener: fsbridge.NewFSLookup(t.MountNamespace(), root, wd), RemainingTraversals: &remainingTraversals, ResolveFinal: resolveFinal, Filename: pathname, diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go index c134714ee..e0ac32b33 100644 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go @@ -22,4 +22,110 @@ import ( // Override syscall table to add syscalls implementations from this package. func Override(table map[uintptr]kernel.Syscall) { table[0] = syscalls.Supported("read", Read) + + // Remove syscalls that haven't been converted yet. It's better to get ENOSYS + // rather than a SIGSEGV deep in the stack. 
+ delete(table, 1) // write + delete(table, 2) // open + delete(table, 3) // close + delete(table, 4) // stat + delete(table, 5) // fstat + delete(table, 6) // lstat + delete(table, 7) // poll + delete(table, 8) // lseek + delete(table, 9) // mmap + delete(table, 16) // ioctl + delete(table, 17) // pread64 + delete(table, 18) // pwrite64 + delete(table, 19) // readv + delete(table, 20) // writev + delete(table, 21) // access + delete(table, 22) // pipe + delete(table, 32) // dup + delete(table, 33) // dup2 + delete(table, 40) // sendfile + delete(table, 59) // execve + delete(table, 72) // fcntl + delete(table, 73) // flock + delete(table, 74) // fsync + delete(table, 75) // fdatasync + delete(table, 76) // truncate + delete(table, 77) // ftruncate + delete(table, 78) // getdents + delete(table, 79) // getcwd + delete(table, 80) // chdir + delete(table, 81) // fchdir + delete(table, 82) // rename + delete(table, 83) // mkdir + delete(table, 84) // rmdir + delete(table, 85) // creat + delete(table, 86) // link + delete(table, 87) // unlink + delete(table, 88) // symlink + delete(table, 89) // readlink + delete(table, 90) // chmod + delete(table, 91) // fchmod + delete(table, 92) // chown + delete(table, 93) // fchown + delete(table, 94) // lchown + delete(table, 133) // mknod + delete(table, 137) // statfs + delete(table, 138) // fstatfs + delete(table, 161) // chroot + delete(table, 162) // sync + delete(table, 165) // mount + delete(table, 166) // umount2 + delete(table, 172) // iopl + delete(table, 173) // ioperm + delete(table, 187) // readahead + delete(table, 188) // setxattr + delete(table, 189) // lsetxattr + delete(table, 190) // fsetxattr + delete(table, 191) // getxattr + delete(table, 192) // lgetxattr + delete(table, 193) // fgetxattr + delete(table, 206) // io_setup + delete(table, 207) // io_destroy + delete(table, 208) // io_getevents + delete(table, 209) // io_submit + delete(table, 210) // io_cancel + delete(table, 213) // epoll_create + 
delete(table, 214) // epoll_ctl_old + delete(table, 215) // epoll_wait_old + delete(table, 216) // remap_file_pages + delete(table, 217) // getdents64 + delete(table, 232) // epoll_wait + delete(table, 233) // epoll_ctl + delete(table, 253) // inotify_init + delete(table, 254) // inotify_add_watch + delete(table, 255) // inotify_rm_watch + delete(table, 257) // openat + delete(table, 258) // mkdirat + delete(table, 259) // mknodat + delete(table, 260) // fchownat + delete(table, 261) // futimesat + delete(table, 262) // fstatat + delete(table, 263) // unlinkat + delete(table, 264) // renameat + delete(table, 265) // linkat + delete(table, 266) // symlinkat + delete(table, 267) // readlinkat + delete(table, 268) // fchmodat + delete(table, 269) // faccessat + delete(table, 270) // pselect + delete(table, 271) // ppoll + delete(table, 285) // fallocate + delete(table, 291) // epoll_create1 + delete(table, 292) // dup3 + delete(table, 293) // pipe2 + delete(table, 294) // inotify_init1 + delete(table, 295) // preadv + delete(table, 296) // pwritev + delete(table, 306) // syncfs + delete(table, 316) // renameat2 + delete(table, 319) // memfd_create + delete(table, 322) // execveat + delete(table, 327) // preadv2 + delete(table, 328) // pwritev2 + delete(table, 332) // statx } diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 14b39eb9d..0b4f18ab5 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -43,6 +43,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fspath", + "//pkg/log", "//pkg/sentry/arch", "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go index d97362b9a..82781e6d3 100644 --- a/pkg/sentry/vfs/context.go +++ b/pkg/sentry/vfs/context.go @@ -29,9 +29,10 @@ const ( CtxRoot ) -// MountNamespaceFromContext returns the MountNamespace used by ctx. It does -// not take a reference on the returned MountNamespace. 
If ctx is not -// associated with a MountNamespace, MountNamespaceFromContext returns nil. +// MountNamespaceFromContext returns the MountNamespace used by ctx. If ctx is +// not associated with a MountNamespace, MountNamespaceFromContext returns nil. +// +// A reference is taken on the returned MountNamespace. func MountNamespaceFromContext(ctx context.Context) *MountNamespace { if v := ctx.Value(CtxMountNamespace); v != nil { return v.(*MountNamespace) diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 1fbb420f9..ad2c9fcf4 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -114,6 +114,7 @@ type MountNamespace struct { func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) { rft := vfs.getFilesystemType(fsTypeName) if rft == nil { + ctx.Warningf("Unknown filesystem: %s", fsTypeName) return nil, syserror.ENODEV } fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts) @@ -231,9 +232,12 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti return syserror.EINVAL } vfs.mountMu.Lock() - if mntns := MountNamespaceFromContext(ctx); mntns != nil && mntns != vd.mount.ns { - vfs.mountMu.Unlock() - return syserror.EINVAL + if mntns := MountNamespaceFromContext(ctx); mntns != nil { + defer mntns.DecRef() + if mntns != vd.mount.ns { + vfs.mountMu.Unlock() + return syserror.EINVAL + } } // TODO(jamieliu): Linux special-cases umount of the caller's root, which diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index fdf8be157..6af7fdac1 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -61,7 +61,7 @@ type MountOptions struct { type OpenOptions struct { // Flags contains access mode and flags as specified for open(2). 
// - // FilesystemImpls is reponsible for implementing the following flags: + // FilesystemImpls are responsible for implementing the following flags: // O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC, // O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and // O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 9629afee9..51deae313 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -393,7 +393,8 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential // be executed. return nil, syserror.EACCES } - if linux.FileMode(stat.Mode).FileType() != linux.ModeRegular { + if t := linux.FileMode(stat.Mode).FileType(); t != linux.ModeRegular { + ctx.Infof("%q is not a regular file: %v", pop.Path, t) return nil, syserror.EACCES } } @@ -743,6 +744,8 @@ func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { // VirtualDentry methods require that a reference is held on the VirtualDentry. // // VirtualDentry is analogous to Linux's struct path. +// +// +stateify savable type VirtualDentry struct { mount *Mount dentry *Dentry diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 9f0d5d7af..239ca5302 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -795,16 +795,19 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { return 0, fmt.Errorf("container %q not started", args.ContainerID) } + // TODO(gvisor.dev/issue/1623): Add VFS2 support + // Get the container MountNamespace from the Task. tg.Leader().WithMuLocked(func(t *kernel.Task) { - // task.MountNamespace() does not take a ref, so we must do so - // ourselves. + // task.MountNamespace() does not take a ref, so we must do so ourselves. 
args.MountNamespace = t.MountNamespace() args.MountNamespace.IncRef() }) - defer args.MountNamespace.DecRef() + if args.MountNamespace != nil { + defer args.MountNamespace.DecRef() + } - // Add the HOME enviroment varible if it is not already set. + // Add the HOME environment variable if it is not already set. root := args.MountNamespace.Root() defer root.DecRef() ctx := fs.WithRoot(l.k.SupervisorContext(), root) -- cgit v1.2.3 From 3c26f5ecb0087337b1f194b6d429ce68f3af70eb Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Fri, 14 Feb 2020 12:07:08 -0800 Subject: Enable automated marshalling for struct stat. This requires fixing a few build issues for non-am64 platforms. PiperOrigin-RevId: 295196922 --- pkg/abi/linux/BUILD | 1 + pkg/abi/linux/file_amd64.go | 2 + pkg/abi/linux/file_arm64.go | 2 + pkg/abi/linux/time.go | 2 + pkg/sentry/kernel/BUILD | 1 + pkg/sentry/syscalls/linux/BUILD | 2 - pkg/sentry/syscalls/linux/sys_stat.go | 26 +++++++++- pkg/sentry/syscalls/linux/sys_stat_amd64.go | 75 ---------------------------- pkg/sentry/syscalls/linux/sys_stat_arm64.go | 77 ----------------------------- 9 files changed, 32 insertions(+), 156 deletions(-) delete mode 100644 pkg/sentry/syscalls/linux/sys_stat_amd64.go delete mode 100644 pkg/sentry/syscalls/linux/sys_stat_arm64.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 1f3c0c687..b7015367b 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -59,6 +59,7 @@ go_library( "wait.go", "xattr.go", ], + marshal = True, visibility = ["//visibility:public"], deps = [ "//pkg/abi", diff --git a/pkg/abi/linux/file_amd64.go b/pkg/abi/linux/file_amd64.go index 8693d49c8..6b72364ea 100644 --- a/pkg/abi/linux/file_amd64.go +++ b/pkg/abi/linux/file_amd64.go @@ -25,6 +25,8 @@ const ( ) // Stat represents struct stat. 
+// +// +marshal type Stat struct { Dev uint64 Ino uint64 diff --git a/pkg/abi/linux/file_arm64.go b/pkg/abi/linux/file_arm64.go index ea3adc5f5..6492c9038 100644 --- a/pkg/abi/linux/file_arm64.go +++ b/pkg/abi/linux/file_arm64.go @@ -25,6 +25,8 @@ const ( ) // Stat represents struct stat. +// +// +marshal type Stat struct { Dev uint64 Ino uint64 diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go index 5c5a58cd4..e562b46d9 100644 --- a/pkg/abi/linux/time.go +++ b/pkg/abi/linux/time.go @@ -101,6 +101,8 @@ func NsecToTimeT(nsec int64) TimeT { } // Timespec represents struct timespec in . +// +// +marshal type Timespec struct { Sec int64 Nsec int64 diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 46306945f..beba29a09 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -201,6 +201,7 @@ go_library( "//pkg/tcpip/stack", "//pkg/usermem", "//pkg/waiter", + "//tools/go_marshal/marshal", ], ) diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 0d24fd3c4..c7883e68e 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -42,8 +42,6 @@ go_library( "sys_socket.go", "sys_splice.go", "sys_stat.go", - "sys_stat_amd64.go", - "sys_stat_arm64.go", "sys_sync.go", "sys_sysinfo.go", "sys_syslog.go", diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index c841abccb..8b66a9006 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -23,6 +23,24 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat { + return linux.Stat{ + Dev: sattr.DeviceID, + Ino: sattr.InodeID, + Nlink: uattr.Links, + Mode: sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()), + UID: uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()), + GID: uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()), + Rdev: 
uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)), + Size: uattr.Size, + Blksize: sattr.BlockSize, + Blocks: uattr.Usage / 512, + ATime: uattr.AccessTime.Timespec(), + MTime: uattr.ModificationTime.Timespec(), + CTime: uattr.StatusChangeTime.Timespec(), + } +} + // Stat implements linux syscall stat(2). func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() @@ -112,7 +130,9 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err if err != nil { return err } - return copyOutStat(t, statAddr, d.Inode.StableAttr, uattr) + s := statFromAttrs(t, d.Inode.StableAttr, uattr) + _, err = s.CopyOut(t, statAddr) + return err } // fstat implements fstat for the given *fs.File. @@ -121,7 +141,9 @@ func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error { if err != nil { return err } - return copyOutStat(t, statAddr, f.Dirent.Inode.StableAttr, uattr) + s := statFromAttrs(t, f.Dirent.Inode.StableAttr, uattr) + _, err = s.CopyOut(t, statAddr) + return err } // Statx implements linux syscall statx(2). diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go deleted file mode 100644 index 75a567bd4..000000000 --- a/pkg/sentry/syscalls/linux/sys_stat_amd64.go +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -//+build amd64 - -package linux - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/usermem" -) - -// copyOutStat copies the attributes (sattr, uattr) to the struct stat at -// address dst in t's address space. It encodes the stat struct to bytes -// manually, as stat() is a very common syscall for many applications, and -// t.CopyObjectOut has noticeable performance impact due to its many slice -// allocations and use of reflection. -func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error { - b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0] - - // Dev (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID)) - // Ino (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID)) - // Nlink (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uattr.Links) - // Mode (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode())) - // UID (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow())) - // GID (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow())) - // Padding (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, 0) - // Rdev (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor))) - // Size (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size)) - // Blksize (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.BlockSize)) - // Blocks (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512)) - - // ATime - atime := uattr.AccessTime.Timespec() - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec)) - b = 
binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec)) - - // MTime - mtime := uattr.ModificationTime.Timespec() - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec)) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec)) - - // CTime - ctime := uattr.StatusChangeTime.Timespec() - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec)) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec)) - - _, err := t.CopyOutBytes(dst, b) - return err -} diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go deleted file mode 100644 index 80c98d05c..000000000 --- a/pkg/sentry/syscalls/linux/sys_stat_arm64.go +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//+build arm64 - -package linux - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/usermem" -) - -// copyOutStat copies the attributes (sattr, uattr) to the struct stat at -// address dst in t's address space. It encodes the stat struct to bytes -// manually, as stat() is a very common syscall for many applications, and -// t.CopyObjectOut has noticeable performance impact due to its many slice -// allocations and use of reflection. 
-func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error { - b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0] - - // Dev (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID)) - // Ino (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID)) - // Mode (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode())) - // Nlink (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Links)) - // UID (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow())) - // GID (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow())) - // Rdev (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor))) - // Padding (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, 0) - // Size (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size)) - // Blksize (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, uint32(sattr.BlockSize)) - // Padding (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, 0) - // Blocks (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512)) - - // ATime - atime := uattr.AccessTime.Timespec() - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec)) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec)) - - // MTime - mtime := uattr.ModificationTime.Timespec() - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec)) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec)) - - // CTime - ctime := uattr.StatusChangeTime.Timespec() - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec)) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec)) - - _, err := t.CopyOutBytes(dst, b) - return err -} -- cgit 
v1.2.3 From e4c7f3e6f6c19f3259820a4c41b69e85c0454379 Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Fri, 14 Feb 2020 13:39:51 -0800 Subject: Inline vfs.VirtualFilesystem in Kernel struct This saves one pointer dereference per VFS access. Updates #1623 PiperOrigin-RevId: 295216176 --- pkg/sentry/control/proc.go | 2 +- pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go | 5 +++- pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 5 +++- pkg/sentry/fsimpl/ext/ext_test.go | 5 +++- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 5 +++- pkg/sentry/fsimpl/proc/tasks_test.go | 6 ++-- pkg/sentry/fsimpl/sys/sys_test.go | 6 ++-- pkg/sentry/fsimpl/testutil/kernel.go | 7 +++-- pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 10 +++++-- pkg/sentry/fsimpl/tmpfs/pipe_test.go | 5 +++- pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 6 +++- pkg/sentry/kernel/kernel.go | 9 ++++-- pkg/sentry/vfs/dentry.go | 4 ++- pkg/sentry/vfs/device.go | 3 ++ pkg/sentry/vfs/file_description_impl_util_test.go | 10 +++++-- pkg/sentry/vfs/filesystem.go | 2 ++ pkg/sentry/vfs/filesystem_type.go | 1 + pkg/sentry/vfs/mount.go | 4 +++ pkg/sentry/vfs/mount_unsafe.go | 8 ++++-- pkg/sentry/vfs/vfs.go | 35 +++++++++++------------ 20 files changed, 94 insertions(+), 44 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 8973754c8..5457ba5e7 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -199,7 +199,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI } paths := fs.GetPath(initArgs.Envv) - vfsObj := proc.Kernel.VFS + vfsObj := proc.Kernel.VFS() file, err := ResolveExecutablePath(ctx, vfsObj, initArgs.WorkingDirectory, initArgs.Argv[0], paths) if err != nil { return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go index 73308a2b5..b6d52c015 100644 --- 
a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go @@ -29,7 +29,10 @@ func TestDevtmpfs(t *testing.T) { ctx := contexttest.Context(t) creds := auth.CredentialsFromContext(ctx) - vfsObj := vfs.New() + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } // Register tmpfs just so that we can have a root filesystem that isn't // devtmpfs. vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index 2015a8871..89caee3df 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -52,7 +52,10 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys creds := auth.CredentialsFromContext(ctx) // Create VFS. - vfsObj := vfs.New() + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + return nil, nil, nil, nil, err + } vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 05f992826..ef6127f3c 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -65,7 +65,10 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys creds := auth.CredentialsFromContext(ctx) // Create VFS. 
- vfsObj := vfs.New() + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 96a16e654..0459fb305 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -45,7 +45,10 @@ type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System { ctx := contexttest.Context(t) creds := auth.CredentialsFromContext(ctx) - v := vfs.New() + v := &vfs.VirtualFilesystem{} + if err := v.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 96c72cbc9..c5d531fe0 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -90,7 +90,7 @@ func setup(t *testing.T) *testutil.System { ctx := k.SupervisorContext() creds := auth.CredentialsFromContext(ctx) - k.VFS.MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) fsOpts := vfs.GetFilesystemOptions{ @@ -101,11 +101,11 @@ func setup(t *testing.T) *testutil.System { }, }, } - mntns, err := k.VFS.NewMountNamespace(ctx, creds, "", Name, &fsOpts) + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", Name, &fsOpts) if err != nil { t.Fatalf("NewMountNamespace(): %v", err) } - return testutil.NewSystem(ctx, t, k.VFS, mntns) + return testutil.NewSystem(ctx, t, k.VFS(), mntns) } func TestTasksEmpty(t *testing.T) { diff --git 
a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go index 5d1ba5867..4b3602d47 100644 --- a/pkg/sentry/fsimpl/sys/sys_test.go +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -34,15 +34,15 @@ func newTestSystem(t *testing.T) *testutil.System { } ctx := k.SupervisorContext() creds := auth.CredentialsFromContext(ctx) - k.VFS.MustRegisterFilesystemType(sys.Name, sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + k.VFS().MustRegisterFilesystemType(sys.Name, sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mns, err := k.VFS.NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{}) + mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{}) if err != nil { t.Fatalf("Failed to create new mount namespace: %v", err) } - return testutil.NewSystem(ctx, t, k.VFS, mns) + return testutil.NewSystem(ctx, t, k.VFS(), mns) } func TestReadCPUFile(t *testing.T) { diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index a91b3ec4d..d0be32e72 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -102,12 +102,13 @@ func Boot() (*kernel.Kernel, error) { kernel.VFS2Enabled = true - vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + if err := k.VFS().Init(); err != nil { + return nil, fmt.Errorf("VFS init: %v", err) + } + k.VFS().MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) - k.VFS = vfsObj ls, err := limits.NewLinuxLimitSet() if err != nil { diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index 9fce5e4b4..383133e44 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -175,7 +175,10 @@ func BenchmarkVFS2MemfsStat(b *testing.B) 
{ creds := auth.CredentialsFromContext(ctx) // Create VFS. - vfsObj := vfs.New() + vfsObj := vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + b.Fatalf("VFS init: %v", err) + } vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) @@ -366,7 +369,10 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { creds := auth.CredentialsFromContext(ctx) // Create VFS. - vfsObj := vfs.New() + vfsObj := vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + b.Fatalf("VFS init: %v", err) + } vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go index 5ee7f2a72..1614f2c39 100644 --- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -151,7 +151,10 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy creds := auth.CredentialsFromContext(ctx) // Create VFS. 
- vfsObj := vfs.New() + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index e9f71e334..0399725cf 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -40,7 +40,11 @@ var nextFileID int64 func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) { creds := auth.CredentialsFromContext(ctx) - vfsObj := vfs.New() + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err) + } + vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 2665f057c..ea21af33f 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -246,7 +246,7 @@ type Kernel struct { SpecialOpts // VFS keeps the filesystem state used across the kernel. - VFS *vfs.VirtualFilesystem + vfs vfs.VirtualFilesystem } // InitKernelArgs holds arguments to Init. @@ -815,7 +815,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, FollowFinalSymlink: true, } var err error - wd, err = k.VFS.GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{ + wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{ CheckSearchable: true, }) if err != nil { @@ -1506,3 +1506,8 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { Registers: t.Arch().StateData().Proto(), }) } + +// VFS returns the virtual filesystem for the kernel. 
+func (k *Kernel) VFS() *vfs.VirtualFilesystem { + return &k.vfs +} diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 486a76475..35b208721 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -71,6 +71,8 @@ import ( // lifetime. Dentry reference counts only indicate the extent to which VFS // requires Dentries to exist; Filesystems may elect to cache or discard // Dentries with zero references. +// +// +stateify savable type Dentry struct { // parent is this Dentry's parent in this Filesystem. If this Dentry is // independent, parent is nil. @@ -89,7 +91,7 @@ type Dentry struct { children map[string]*Dentry // mu synchronizes disowning and mounting over this Dentry. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // impl is the DentryImpl associated with this Dentry. impl is immutable. // This should be the last field in Dentry. diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go index 3af2aa58d..bda5576fa 100644 --- a/pkg/sentry/vfs/device.go +++ b/pkg/sentry/vfs/device.go @@ -56,6 +56,7 @@ type Device interface { Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error) } +// +stateify savable type registeredDevice struct { dev Device opts RegisterDeviceOptions @@ -63,6 +64,8 @@ type registeredDevice struct { // RegisterDeviceOptions contains options to // VirtualFilesystem.RegisterDevice(). +// +// +stateify savable type RegisterDeviceOptions struct { // GroupName is the name shown for this device registration in // /proc/devices. 
If GroupName is empty, this registration will not be diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 8fa26418e..3a75d4d62 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -107,7 +107,10 @@ func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error { func TestGenCountFD(t *testing.T) { ctx := contexttest.Context(t) - vfsObj := New() // vfs.New() + vfsObj := &VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{}) defer fd.DecRef() @@ -162,7 +165,10 @@ func TestGenCountFD(t *testing.T) { func TestWritable(t *testing.T) { ctx := contexttest.Context(t) - vfsObj := New() // vfs.New() + vfsObj := &VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"}) defer fd.DecRef() diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index a06a6caf3..556976d0b 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -29,6 +29,8 @@ import ( // Filesystem methods require that a reference is held. // // Filesystem is analogous to Linux's struct super_block. +// +// +stateify savable type Filesystem struct { // refs is the reference count. refs is accessed using atomic memory // operations. 
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go index c58b70728..bb9cada81 100644 --- a/pkg/sentry/vfs/filesystem_type.go +++ b/pkg/sentry/vfs/filesystem_type.go @@ -44,6 +44,7 @@ type GetFilesystemOptions struct { InternalData interface{} } +// +stateify savable type registeredFilesystemType struct { fsType FilesystemType opts RegisterFilesystemTypeOptions diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index ad2c9fcf4..9912df799 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -38,6 +38,8 @@ import ( // // Mount is analogous to Linux's struct mount. (gVisor does not distinguish // between struct mount and struct vfsmount.) +// +// +stateify savable type Mount struct { // vfs, fs, and root are immutable. References are held on fs and root. // @@ -85,6 +87,8 @@ type Mount struct { // MountNamespace methods require that a reference is held. // // MountNamespace is analogous to Linux's struct mnt_namespace. +// +// +stateify savable type MountNamespace struct { // root is the MountNamespace's root mount. root is immutable. root *Mount diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index bd90d36c4..1fe766a44 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -64,6 +64,8 @@ func (mnt *Mount) storeKey(vd VirtualDentry) { // (provided mutation is sufficiently uncommon). // // mountTable.Init() must be called on new mountTables before use. +// +// +stateify savable type mountTable struct { // mountTable is implemented as a seqcount-protected hash table that // resolves collisions with linear probing, featuring Robin Hood insertion @@ -75,8 +77,8 @@ type mountTable struct { // intrinsics and inline assembly, limiting the performance of this // approach.) 
- seq sync.SeqCount - seed uint32 // for hashing keys + seq sync.SeqCount `state:"nosave"` + seed uint32 // for hashing keys // size holds both length (number of elements) and capacity (number of // slots): capacity is stored as its base-2 log (referred to as order) in @@ -89,7 +91,7 @@ type mountTable struct { // length and cap in separate uint32s) for ~free. size uint64 - slots unsafe.Pointer // []mountSlot; never nil after Init + slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init } type mountSlot struct { diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 51deae313..8f29031b2 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -46,11 +46,13 @@ import ( // // There is no analogue to the VirtualFilesystem type in Linux, as the // equivalent state in Linux is global. +// +// +stateify savable type VirtualFilesystem struct { // mountMu serializes mount mutations. // // mountMu is analogous to Linux's namespace_sem. - mountMu sync.Mutex + mountMu sync.Mutex `state:"nosave"` // mounts maps (mount parent, mount point) pairs to mounts. (Since mounts // are uniquely namespaced, including mount parent in the key correctly @@ -89,44 +91,42 @@ type VirtualFilesystem struct { // devices contains all registered Devices. devices is protected by // devicesMu. - devicesMu sync.RWMutex + devicesMu sync.RWMutex `state:"nosave"` devices map[devTuple]*registeredDevice // anonBlockDevMinor contains all allocated anonymous block device minor // numbers. anonBlockDevMinorNext is a lower bound for the smallest // unallocated anonymous block device number. anonBlockDevMinorNext and // anonBlockDevMinor are protected by anonBlockDevMinorMu. - anonBlockDevMinorMu sync.Mutex + anonBlockDevMinorMu sync.Mutex `state:"nosave"` anonBlockDevMinorNext uint32 anonBlockDevMinor map[uint32]struct{} // fsTypes contains all registered FilesystemTypes. fsTypes is protected by // fsTypesMu. 
- fsTypesMu sync.RWMutex + fsTypesMu sync.RWMutex `state:"nosave"` fsTypes map[string]*registeredFilesystemType // filesystems contains all Filesystems. filesystems is protected by // filesystemsMu. - filesystemsMu sync.Mutex + filesystemsMu sync.Mutex `state:"nosave"` filesystems map[*Filesystem]struct{} } -// New returns a new VirtualFilesystem with no mounts or FilesystemTypes. -func New() *VirtualFilesystem { - vfs := &VirtualFilesystem{ - mountpoints: make(map[*Dentry]map[*Mount]struct{}), - devices: make(map[devTuple]*registeredDevice), - anonBlockDevMinorNext: 1, - anonBlockDevMinor: make(map[uint32]struct{}), - fsTypes: make(map[string]*registeredFilesystemType), - filesystems: make(map[*Filesystem]struct{}), - } +// Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes. +func (vfs *VirtualFilesystem) Init() error { + vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{}) + vfs.devices = make(map[devTuple]*registeredDevice) + vfs.anonBlockDevMinorNext = 1 + vfs.anonBlockDevMinor = make(map[uint32]struct{}) + vfs.fsTypes = make(map[string]*registeredFilesystemType) + vfs.filesystems = make(map[*Filesystem]struct{}) vfs.mounts.Init() // Construct vfs.anonMount. anonfsDevMinor, err := vfs.GetAnonBlockDevMinor() if err != nil { - panic(fmt.Sprintf("VirtualFilesystem.GetAnonBlockDevMinor() failed during VirtualFilesystem construction: %v", err)) + return err } anonfs := anonFilesystem{ devMinor: anonfsDevMinor, @@ -137,8 +137,7 @@ func New() *VirtualFilesystem { fs: &anonfs.vfsfs, refs: 1, } - - return vfs + return nil } // PathOperation specifies the path operated on by a VFS method. -- cgit v1.2.3 From 87bc2834c97a958d0762833fe8db749ccc6d5d50 Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Fri, 14 Feb 2020 14:23:35 -0800 Subject: Enable automated marshalling for RSeqCriticalSection. 
PiperOrigin-RevId: 295226468 --- pkg/sentry/kernel/rseq.go | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index efebfd872..18416643b 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -303,26 +303,14 @@ func (t *Task) rseqAddrInterrupt() { return } - buf = t.CopyScratchBuffer(linux.SizeOfRSeqCriticalSection) - if _, err := t.CopyInBytes(critAddr, buf); err != nil { + var cs linux.RSeqCriticalSection + if _, err := cs.CopyIn(t, critAddr); err != nil { t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } - // Manually marshal RSeqCriticalSection as this is in the hot path when - // rseq is enabled. It must be as fast as possible. - // - // TODO(b/130243041): Replace with go_marshal. - cs := linux.RSeqCriticalSection{ - Version: usermem.ByteOrder.Uint32(buf[0:4]), - Flags: usermem.ByteOrder.Uint32(buf[4:8]), - Start: usermem.ByteOrder.Uint64(buf[8:16]), - PostCommitOffset: usermem.ByteOrder.Uint64(buf[16:24]), - Abort: usermem.ByteOrder.Uint64(buf[24:32]), - } - if cs.Version != 0 { t.Debugf("Unknown version in %+v", cs) t.forceSignal(linux.SIGSEGV, false /* unconditional */) -- cgit v1.2.3 From 5baf9dc2fbb459828b4102b0a1c5214879434c03 Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Fri, 14 Feb 2020 15:48:09 -0800 Subject: Synchronize signalling with S/R This is to fix a data race between sending an external signal to a ThreadGroup and kernel saving state for S/R. 
PiperOrigin-RevId: 295244281 --- pkg/sentry/kernel/kernel.go | 8 ++++++++ runsc/boot/loader.go | 8 ++++---- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index ea21af33f..7da0368f1 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -1169,6 +1169,14 @@ func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) { k.sendExternalSignal(info, context) } +// SendExternalSignalThreadGroup injects a signal into an specific ThreadGroup. +// This function doesn't skip signals like SendExternalSignal does. +func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *arch.SignalInfo) error { + k.extMu.Lock() + defer k.extMu.Unlock() + return tg.SendSignal(info) +} + // SendContainerSignal sends the given signal to all processes inside the // namespace that match the given container ID. func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 239ca5302..eef43b9df 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -997,7 +997,7 @@ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) er execTG, _, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}) if err == nil { // Send signal directly to the identified process. - return execTG.SendSignal(&arch.SignalInfo{Signo: signo}) + return l.k.SendExternalSignalThreadGroup(execTG, &arch.SignalInfo{Signo: signo}) } // The caller may be signaling a process not started directly via exec. 
@@ -1014,7 +1014,7 @@ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) er if tg.Leader().ContainerID() != cid { return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID()) } - return tg.SendSignal(&arch.SignalInfo{Signo: signo}) + return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}) } func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error { @@ -1032,7 +1032,7 @@ func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, s // No foreground process group has been set. Signal the // original thread group. log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid) - return tg.SendSignal(&arch.SignalInfo{Signo: signo}) + return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}) } // Send the signal to all processes in the process group. var lastErr error @@ -1040,7 +1040,7 @@ func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, s if tg.ProcessGroup() != pg { continue } - if err := tg.SendSignal(&arch.SignalInfo{Signo: signo}); err != nil { + if err := l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}); err != nil { lastErr = err } } -- cgit v1.2.3 From d90d71474f4c82f742140fdf026821709845cece Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Thu, 20 Feb 2020 14:28:31 -0800 Subject: Remove bytes read/written from marshal.Marshallable API. Users of the API only care about whether the copy in/out succeeds in their entirety, which is already signalled by the returned error. 
PiperOrigin-RevId: 296297843 --- pkg/sentry/kernel/rseq.go | 2 +- pkg/sentry/syscalls/linux/sys_stat.go | 6 ++---- tools/go_marshal/gomarshal/generator_interfaces.go | 21 +++++++++++---------- tools/go_marshal/gomarshal/generator_tests.go | 2 +- tools/go_marshal/marshal/marshal.go | 4 ++-- 5 files changed, 17 insertions(+), 18 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 18416643b..ded95f532 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -304,7 +304,7 @@ func (t *Task) rseqAddrInterrupt() { } var cs linux.RSeqCriticalSection - if _, err := cs.CopyIn(t, critAddr); err != nil { + if err := cs.CopyIn(t, critAddr); err != nil { t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 8b66a9006..11f25e00d 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -131,8 +131,7 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err return err } s := statFromAttrs(t, d.Inode.StableAttr, uattr) - _, err = s.CopyOut(t, statAddr) - return err + return s.CopyOut(t, statAddr) } // fstat implements fstat for the given *fs.File. @@ -142,8 +141,7 @@ func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error { return err } s := statFromAttrs(t, f.Dirent.Inode.StableAttr, uattr) - _, err = s.CopyOut(t, statAddr) - return err + return s.CopyOut(t, statAddr) } // Statx implements linux syscall statx(2). 
diff --git a/tools/go_marshal/gomarshal/generator_interfaces.go b/tools/go_marshal/gomarshal/generator_interfaces.go index 3aa299ccd..834c58cee 100644 --- a/tools/go_marshal/gomarshal/generator_interfaces.go +++ b/tools/go_marshal/gomarshal/generator_interfaces.go @@ -507,13 +507,14 @@ func (g *interfaceGenerator) emitMarshallable() { g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n") g.recordUsedImport("marshal") g.recordUsedImport("usermem") - g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName()) + g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName()) g.inIndent(func() { fallback := func() { g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName()) g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r) g.emit("%s.MarshalBytes(buf)\n", g.r) - g.emit("return task.CopyOutBytes(addr, buf)\n") + g.emit("_, err := task.CopyOutBytes(addr, buf)\n") + g.emit("return err\n") } if thisPacked { g.recordUsedImport("reflect") @@ -539,11 +540,11 @@ func (g *interfaceGenerator) emitMarshallable() { g.emit("hdr.Len = %s.SizeBytes()\n", g.r) g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r) - g.emit("len, err := task.CopyOutBytes(addr, buf)\n") + g.emit("_, err := task.CopyOutBytes(addr, buf)\n") g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r) g.emit("// must live until after the CopyOutBytes.\n") g.emit("runtime.KeepAlive(%s)\n", g.r) - g.emit("return len, err\n") + g.emit("return err\n") } else { fallback() } @@ -553,20 +554,20 @@ func (g *interfaceGenerator) emitMarshallable() { g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n") g.recordUsedImport("marshal") g.recordUsedImport("usermem") - g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName()) + g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) error {\n", 
g.r, g.typeName()) g.inIndent(func() { fallback := func() { g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName()) g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r) - g.emit("n, err := task.CopyInBytes(addr, buf)\n") + g.emit("_, err := task.CopyInBytes(addr, buf)\n") g.emit("if err != nil {\n") g.inIndent(func() { - g.emit("return n, err\n") + g.emit("return err\n") }) g.emit("}\n") g.emit("%s.UnmarshalBytes(buf)\n", g.r) - g.emit("return n, nil\n") + g.emit("return nil\n") } if thisPacked { g.recordUsedImport("reflect") @@ -592,11 +593,11 @@ func (g *interfaceGenerator) emitMarshallable() { g.emit("hdr.Len = %s.SizeBytes()\n", g.r) g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r) - g.emit("len, err := task.CopyInBytes(addr, buf)\n") + g.emit("_, err := task.CopyInBytes(addr, buf)\n") g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r) g.emit("// must live until after the CopyInBytes.\n") g.emit("runtime.KeepAlive(%s)\n", g.r) - g.emit("return len, err\n") + g.emit("return err\n") } else { fallback() } diff --git a/tools/go_marshal/gomarshal/generator_tests.go b/tools/go_marshal/gomarshal/generator_tests.go index 8c28b00d0..2326e7a07 100644 --- a/tools/go_marshal/gomarshal/generator_tests.go +++ b/tools/go_marshal/gomarshal/generator_tests.go @@ -92,7 +92,7 @@ func (g *testGenerator) emitTestNonZeroSize() { g.emit("x := &%s{}\n", g.typeName()) g.emit("if x.SizeBytes() == 0 {\n") g.inIndent(func() { - g.emit("t.Fatal(\"Marshallable.Size() should not return zero\")\n") + g.emit("t.Fatal(\"Marshallable.SizeBytes() should not return zero\")\n") }) g.emit("}\n") }) diff --git a/tools/go_marshal/marshal/marshal.go b/tools/go_marshal/marshal/marshal.go index 20353850d..f129788e0 100644 --- a/tools/go_marshal/marshal/marshal.go +++ b/tools/go_marshal/marshal/marshal.go @@ -91,12 +91,12 @@ type Marshallable interface { // marshalled does not escape. 
The implementation should avoid creating // extra copies in memory by directly deserializing to the object's // underlying memory. - CopyIn(task Task, addr usermem.Addr) (int, error) + CopyIn(task Task, addr usermem.Addr) error // CopyOut serializes a Marshallable type to a task's memory. This may only // be called from a task goroutine. This is more efficient than calling // MarshalUnsafe on Marshallable.Packed types, as the type being serialized // does not escape. The implementation should avoid creating extra copies in // memory by directly serializing from the object's underlying memory. - CopyOut(task Task, addr usermem.Addr) (int, error) + CopyOut(task Task, addr usermem.Addr) error } -- cgit v1.2.3 From 4a73bae269ae9f52a962ae3b08a17ccaacf7ba80 Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Thu, 20 Feb 2020 15:19:40 -0800 Subject: Initial network namespace support. TCP/IP will work with netstack networking. hostinet doesn't work, and sockets will have the same behavior as it is now. Before the userspace is able to create device, the default loopback device can be used to test. /proc/net and /sys/net will still be connected to the root network stack; this is the same behavior now. 
Issue #1833 PiperOrigin-RevId: 296309389 --- pkg/sentry/fs/proc/net.go | 5 +- pkg/sentry/fs/proc/sys_net.go | 4 +- pkg/sentry/fsimpl/proc/tasks_net.go | 5 +- pkg/sentry/fsimpl/proc/tasks_sys.go | 4 +- pkg/sentry/fsimpl/testutil/kernel.go | 1 + pkg/sentry/inet/BUILD | 1 + pkg/sentry/inet/namespace.go | 99 +++++++++++++++++++++++++ pkg/sentry/kernel/kernel.go | 26 ++++--- pkg/sentry/kernel/task.go | 9 +-- pkg/sentry/kernel/task_clone.go | 16 ++-- pkg/sentry/kernel/task_net.go | 19 +++-- pkg/sentry/kernel/task_start.go | 8 +- pkg/tcpip/time_unsafe.go | 2 + runsc/boot/BUILD | 2 +- runsc/boot/controller.go | 11 +-- runsc/boot/loader.go | 121 +++++++++++++++++++++---------- runsc/boot/network.go | 27 +++++++ runsc/boot/pprof.go | 18 ----- runsc/boot/pprof/BUILD | 11 +++ runsc/boot/pprof/pprof.go | 20 +++++ runsc/sandbox/network.go | 25 +------ test/syscalls/BUILD | 2 + test/syscalls/linux/BUILD | 17 +++++ test/syscalls/linux/network_namespace.cc | 121 +++++++++++++++++++++++++++++++ 24 files changed, 451 insertions(+), 123 deletions(-) create mode 100644 pkg/sentry/inet/namespace.go delete mode 100644 runsc/boot/pprof.go create mode 100644 runsc/boot/pprof/BUILD create mode 100644 runsc/boot/pprof/pprof.go create mode 100644 test/syscalls/linux/network_namespace.cc (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 6f2775344..95d5817ff 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -43,7 +43,10 @@ import ( // newNet creates a new proc net entry. func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSource) *fs.Inode { var contents map[string]*fs.Inode - if s := p.k.NetworkStack(); s != nil { + // TODO(gvisor.dev/issue/1833): Support for using the network stack in the + // network namespace of the calling process. We should make this per-process, + // a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net. 
+ if s := p.k.RootNetworkNamespace().Stack(); s != nil { contents = map[string]*fs.Inode{ "dev": seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc), "snmp": seqfile.NewSeqFileInode(ctx, &netSnmp{s: s}, msrc), diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 0772d4ae4..d4c4b533d 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -357,7 +357,9 @@ func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s ine func (p *proc) newSysNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { var contents map[string]*fs.Inode - if s := p.k.NetworkStack(); s != nil { + // TODO(gvisor.dev/issue/1833): Support for using the network stack in the + // network namespace of the calling process. + if s := p.k.RootNetworkNamespace().Stack(); s != nil { contents = map[string]*fs.Inode{ "ipv4": p.newSysNetIPv4Dir(ctx, msrc, s), "core": p.newSysNetCore(ctx, msrc, s), diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go index 608fec017..d4e1812d8 100644 --- a/pkg/sentry/fsimpl/proc/tasks_net.go +++ b/pkg/sentry/fsimpl/proc/tasks_net.go @@ -39,7 +39,10 @@ import ( func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { var contents map[string]*kernfs.Dentry - if stack := k.NetworkStack(); stack != nil { + // TODO(gvisor.dev/issue/1833): Support for using the network stack in the + // network namespace of the calling process. We should make this per-process, + // a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net. 
+ if stack := k.RootNetworkNamespace().Stack(); stack != nil { const ( arp = "IP address HW type Flags HW address Mask Device\n" netlink = "sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode\n" diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index c7ce74883..3d5dc463c 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -50,7 +50,9 @@ func newSysDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *k func newSysNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { var contents map[string]*kernfs.Dentry - if stack := k.NetworkStack(); stack != nil { + // TODO(gvisor.dev/issue/1833): Support for using the network stack in the + // network namespace of the calling process. + if stack := k.RootNetworkNamespace().Stack(); stack != nil { contents = map[string]*kernfs.Dentry{ "ipv4": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ "tcp_sack": newDentry(root, inoGen.NextIno(), 0644, &tcpSackData{stack: stack}), diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index d0be32e72..488478e29 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -128,6 +128,7 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns ThreadGroup: tc, TaskContext: &kernel.TaskContext{Name: name}, Credentials: auth.CredentialsFromContext(ctx), + NetworkNamespace: k.RootNetworkNamespace(), AllowedCPUMask: sched.NewFullCPUSet(k.ApplicationCores()), UTSNamespace: kernel.UTSNamespaceFromContext(ctx), IPCNamespace: kernel.IPCNamespaceFromContext(ctx), diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD index 334432abf..07bf39fed 100644 --- a/pkg/sentry/inet/BUILD +++ b/pkg/sentry/inet/BUILD @@ -10,6 +10,7 @@ go_library( srcs = [ "context.go", "inet.go", + "namespace.go", "test_stack.go", ], deps = [ diff --git 
a/pkg/sentry/inet/namespace.go b/pkg/sentry/inet/namespace.go new file mode 100644 index 000000000..c16667e7f --- /dev/null +++ b/pkg/sentry/inet/namespace.go @@ -0,0 +1,99 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package inet + +// Namespace represents a network namespace. See network_namespaces(7). +// +// +stateify savable +type Namespace struct { + // stack is the network stack implementation of this network namespace. + stack Stack `state:"nosave"` + + // creator allows kernel to create new network stack for network namespaces. + // If nil, no networking will function if network is namespaced. + creator NetworkStackCreator + + // isRoot indicates whether this is the root network namespace. + isRoot bool +} + +// NewRootNamespace creates the root network namespace, with creator +// allowing new network namespaces to be created. If creator is nil, no +// networking will function if the network is namespaced. +func NewRootNamespace(stack Stack, creator NetworkStackCreator) *Namespace { + return &Namespace{ + stack: stack, + creator: creator, + isRoot: true, + } +} + +// NewNamespace creates a new network namespace from the root. +func NewNamespace(root *Namespace) *Namespace { + n := &Namespace{ + creator: root.creator, + } + n.init() + return n +} + +// Stack returns the network stack of n. Stack may return nil if no network +// stack is configured. 
+func (n *Namespace) Stack() Stack { + return n.stack +} + +// IsRoot returns whether n is the root network namespace. +func (n *Namespace) IsRoot() bool { + return n.isRoot +} + +// RestoreRootStack restores the root network namespace with stack. This should +// only be called when restoring kernel. +func (n *Namespace) RestoreRootStack(stack Stack) { + if !n.isRoot { + panic("RestoreRootStack can only be called on root network namespace") + } + if n.stack != nil { + panic("RestoreRootStack called after a stack has already been set") + } + n.stack = stack +} + +func (n *Namespace) init() { + // Root network namespace will have stack assigned later. + if n.isRoot { + return + } + if n.creator != nil { + var err error + n.stack, err = n.creator.CreateStack() + if err != nil { + panic(err) + } + } +} + +// afterLoad is invoked by stateify. +func (n *Namespace) afterLoad() { + n.init() +} + +// NetworkStackCreator allows new instances of a network stack to be created. It +// is used by the kernel to create new network namespaces when requested. +type NetworkStackCreator interface { + // CreateStack creates a new network stack for a network namespace. + CreateStack() (Stack, error) +} diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 7da0368f1..c62fd6eb1 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -111,7 +111,7 @@ type Kernel struct { timekeeper *Timekeeper tasks *TaskSet rootUserNamespace *auth.UserNamespace - networkStack inet.Stack `state:"nosave"` + rootNetworkNamespace *inet.Namespace applicationCores uint useHostCores bool extraAuxv []arch.AuxEntry @@ -260,8 +260,9 @@ type InitKernelArgs struct { // RootUserNamespace is the root user namespace. RootUserNamespace *auth.UserNamespace - // NetworkStack is the TCP/IP network stack. NetworkStack may be nil. - NetworkStack inet.Stack + // RootNetworkNamespace is the root network namespace. If nil, no networking + // will be available. 
+ RootNetworkNamespace *inet.Namespace // ApplicationCores is the number of logical CPUs visible to sandboxed // applications. The set of logical CPU IDs is [0, ApplicationCores); thus @@ -320,7 +321,10 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.rootUTSNamespace = args.RootUTSNamespace k.rootIPCNamespace = args.RootIPCNamespace k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace - k.networkStack = args.NetworkStack + k.rootNetworkNamespace = args.RootNetworkNamespace + if k.rootNetworkNamespace == nil { + k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil) + } k.applicationCores = args.ApplicationCores if args.UseHostCores { k.useHostCores = true @@ -543,8 +547,6 @@ func (ts *TaskSet) unregisterEpollWaiters() { func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error { loadStart := time.Now() - k.networkStack = net - initAppCores := k.applicationCores // Load the pre-saved CPUID FeatureSet. @@ -575,6 +577,10 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) log.Infof("Kernel load stats: %s", &stats) log.Infof("Kernel load took [%s].", time.Since(kernelStart)) + // rootNetworkNamespace should be populated after loading the state file. + // Restore the root network stack. + k.rootNetworkNamespace.RestoreRootStack(net) + // Load the memory file's state. memoryStart := time.Now() if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil { @@ -905,6 +911,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, FSContext: fsContext, FDTable: args.FDTable, Credentials: args.Credentials, + NetworkNamespace: k.RootNetworkNamespace(), AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), UTSNamespace: args.UTSNamespace, IPCNamespace: args.IPCNamespace, @@ -1255,10 +1262,9 @@ func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace { return k.rootAbstractSocketNamespace } -// NetworkStack returns the network stack. 
NetworkStack may return nil if no -// network stack is available. -func (k *Kernel) NetworkStack() inet.Stack { - return k.networkStack +// RootNetworkNamespace returns the root network namespace, always non-nil. +func (k *Kernel) RootNetworkNamespace() *inet.Namespace { + return k.rootNetworkNamespace } // GlobalInit returns the thread group with ID 1 in the root PID namespace, or diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index a3443ff21..e37e23231 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -486,13 +486,10 @@ type Task struct { numaPolicy int32 numaNodeMask uint64 - // If netns is true, the task is in a non-root network namespace. Network - // namespaces aren't currently implemented in full; being in a network - // namespace simply prevents the task from observing any network devices - // (including loopback) or using abstract socket addresses (see unix(7)). + // netns is the task's network namespace. netns is never nil. // - // netns is protected by mu. netns is owned by the task goroutine. - netns bool + // netns is protected by mu. + netns *inet.Namespace // If rseqPreempted is true, before the next call to p.Switch(), // interrupt rseq critical regions as defined by rseqAddr and diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index ba74b4c1c..78866f280 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -17,6 +17,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -54,8 +55,7 @@ type SharingOptions struct { NewUserNamespace bool // If NewNetworkNamespace is true, the task should have an independent - // network namespace. (Note that network namespaces are not really - // implemented; see comment on Task.netns for details.) + // network namespace. 
NewNetworkNamespace bool // If NewFiles is true, the task should use an independent file descriptor @@ -199,6 +199,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { ipcns = NewIPCNamespace(userns) } + netns := t.NetworkNamespace() + if opts.NewNetworkNamespace { + netns = inet.NewNamespace(netns) + } + // TODO(b/63601033): Implement CLONE_NEWNS. mntnsVFS2 := t.mountNamespaceVFS2 if mntnsVFS2 != nil { @@ -268,7 +273,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { FDTable: fdTable, Credentials: creds, Niceness: t.Niceness(), - NetworkNamespaced: t.netns, + NetworkNamespace: netns, AllowedCPUMask: t.CPUMask(), UTSNamespace: utsns, IPCNamespace: ipcns, @@ -283,9 +288,6 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } else { cfg.InheritParent = t } - if opts.NewNetworkNamespace { - cfg.NetworkNamespaced = true - } nt, err := t.tg.pidns.owner.NewTask(cfg) if err != nil { if opts.NewThreadGroup { @@ -482,7 +484,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { t.mu.Unlock() return syserror.EPERM } - t.netns = true + t.netns = inet.NewNamespace(t.netns) } if opts.NewUTSNamespace { if !haveCapSysAdmin { diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go index 172a31e1d..f7711232c 100644 --- a/pkg/sentry/kernel/task_net.go +++ b/pkg/sentry/kernel/task_net.go @@ -22,14 +22,23 @@ import ( func (t *Task) IsNetworkNamespaced() bool { t.mu.Lock() defer t.mu.Unlock() - return t.netns + return !t.netns.IsRoot() } // NetworkContext returns the network stack used by the task. NetworkContext // may return nil if no network stack is available. +// +// TODO(gvisor.dev/issue/1833): Migrate callers of this method to +// NetworkNamespace(). 
func (t *Task) NetworkContext() inet.Stack { - if t.IsNetworkNamespaced() { - return nil - } - return t.k.networkStack + t.mu.Lock() + defer t.mu.Unlock() + return t.netns.Stack() +} + +// NetworkNamespace returns the network namespace observed by the task. +func (t *Task) NetworkNamespace() *inet.Namespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.netns } diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index f9236a842..a5035bb7f 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -17,6 +17,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" @@ -65,9 +66,8 @@ type TaskConfig struct { // Niceness is the niceness of the new task. Niceness int - // If NetworkNamespaced is true, the new task should observe a non-root - // network namespace. - NetworkNamespaced bool + // NetworkNamespace is the network namespace to be used for the new task. + NetworkNamespace *inet.Namespace // AllowedCPUMask contains the cpus that this task can run on. AllowedCPUMask sched.CPUSet @@ -133,7 +133,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { allowedCPUMask: cfg.AllowedCPUMask.Copy(), ioUsage: &usage.IO{}, niceness: cfg.Niceness, - netns: cfg.NetworkNamespaced, + netns: cfg.NetworkNamespace, utsns: cfg.UTSNamespace, ipcns: cfg.IPCNamespace, abstractSockets: cfg.AbstractSocketNamespace, diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go index 48764b978..2f98a996f 100644 --- a/pkg/tcpip/time_unsafe.go +++ b/pkg/tcpip/time_unsafe.go @@ -25,6 +25,8 @@ import ( ) // StdClock implements Clock with the time package. 
+// +// +stateify savable type StdClock struct{} var _ Clock = (*StdClock)(nil) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index ae4dd102a..26f68fe3d 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -19,7 +19,6 @@ go_library( "loader_amd64.go", "loader_arm64.go", "network.go", - "pprof.go", "strace.go", "user.go", ], @@ -91,6 +90,7 @@ go_library( "//pkg/usermem", "//runsc/boot/filter", "//runsc/boot/platforms", + "//runsc/boot/pprof", "//runsc/specutils", "@com_github_golang_protobuf//proto:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 9c9e94864..17e774e0c 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -32,6 +32,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" + "gvisor.dev/gvisor/runsc/boot/pprof" "gvisor.dev/gvisor/runsc/specutils" ) @@ -142,7 +143,7 @@ func newController(fd int, l *Loader) (*controller, error) { } srv.Register(manager) - if eps, ok := l.k.NetworkStack().(*netstack.Stack); ok { + if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok { net := &Network{ Stack: eps.Stack, } @@ -341,7 +342,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { return fmt.Errorf("creating memory file: %v", err) } k.SetMemoryFile(mf) - networkStack := cm.l.k.NetworkStack() + networkStack := cm.l.k.RootNetworkNamespace().Stack() cm.l.k = k // Set up the restore environment. @@ -365,9 +366,9 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { } if cm.l.conf.ProfileEnable { - // initializePProf opens /proc/self/maps, so has to be - // called before installing seccomp filters. - initializePProf() + // pprof.Initialize opens /proc/self/maps, so has to be called before + // installing seccomp filters. 
+ pprof.Initialize() } // Seccomp filters have to be applied before parsing the state file. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index eef43b9df..e7ca98134 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -49,6 +49,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/loopback" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/arp" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" @@ -60,6 +61,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/runsc/boot/filter" _ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms. + "gvisor.dev/gvisor/runsc/boot/pprof" "gvisor.dev/gvisor/runsc/specutils" // Include supported socket providers. @@ -230,11 +232,8 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("enabling strace: %v", err) } - // Create an empty network stack because the network namespace may be empty at - // this point. Netns is configured before Run() is called. Netstack is - // configured using a control uRPC message. Host network is configured inside - // Run(). - networkStack, err := newEmptyNetworkStack(args.Conf, k, k) + // Create root network namespace/stack. + netns, err := newRootNetworkNamespace(args.Conf, k, k) if err != nil { return nil, fmt.Errorf("creating network: %v", err) } @@ -277,7 +276,7 @@ func New(args Args) (*Loader, error) { FeatureSet: cpuid.HostFeatureSet(), Timekeeper: tk, RootUserNamespace: creds.UserNamespace, - NetworkStack: networkStack, + RootNetworkNamespace: netns, ApplicationCores: uint(args.NumCPU), Vdso: vdso, RootUTSNamespace: kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace), @@ -466,7 +465,7 @@ func (l *Loader) run() error { // Delay host network configuration to this point because network namespace // is configured after the loader is created and before Run() is called. 
log.Debugf("Configuring host network") - stack := l.k.NetworkStack().(*hostinet.Stack) + stack := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack) if err := stack.Configure(); err != nil { return err } @@ -485,7 +484,7 @@ func (l *Loader) run() error { // l.restore is set by the container manager when a restore call is made. if !l.restore { if l.conf.ProfileEnable { - initializePProf() + pprof.Initialize() } // Finally done with all configuration. Setup filters before user code @@ -908,48 +907,92 @@ func (l *Loader) WaitExit() kernel.ExitStatus { return l.k.GlobalInit().ExitStatus() } -func newEmptyNetworkStack(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { +func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) { + // Create an empty network stack because the network namespace may be empty at + // this point. Netns is configured before Run() is called. Netstack is + // configured using a control uRPC message. Host network is configured inside + // Run(). switch conf.Network { case NetworkHost: - return hostinet.NewStack(), nil + // No network namespacing support for hostinet yet, hence creator is nil. + return inet.NewRootNamespace(hostinet.NewStack(), nil), nil case NetworkNone, NetworkSandbox: - // NetworkNone sets up loopback using netstack. - netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()} - transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()} - s := netstack.Stack{stack.New(stack.Options{ - NetworkProtocols: netProtos, - TransportProtocols: transProtos, - Clock: clock, - Stats: netstack.Metrics, - HandleLocal: true, - // Enable raw sockets for users with sufficient - // privileges. - RawFactory: raw.EndpointFactory{}, - UniqueID: uniqueID, - })} - - // Enable SACK Recovery. 
- if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { - return nil, fmt.Errorf("failed to enable SACK: %v", err) + s, err := newEmptySandboxNetworkStack(clock, uniqueID) + if err != nil { + return nil, err } + creator := &sandboxNetstackCreator{ + clock: clock, + uniqueID: uniqueID, + } + return inet.NewRootNamespace(s, creator), nil - // Set default TTLs as required by socket/netstack. - s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) - s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + default: + panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) + } - // Enable Receive Buffer Auto-Tuning. - if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { - return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err) - } +} - s.FillDefaultIPTables() +func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { + netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()} + transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()} + s := netstack.Stack{stack.New(stack.Options{ + NetworkProtocols: netProtos, + TransportProtocols: transProtos, + Clock: clock, + Stats: netstack.Metrics, + HandleLocal: true, + // Enable raw sockets for users with sufficient + // privileges. + RawFactory: raw.EndpointFactory{}, + UniqueID: uniqueID, + })} - return &s, nil + // Enable SACK Recovery. + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { + return nil, fmt.Errorf("failed to enable SACK: %v", err) + } - default: - panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) + // Set default TTLs as required by socket/netstack. 
+ s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + + // Enable Receive Buffer Auto-Tuning. + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err) + } + + s.FillDefaultIPTables() + + return &s, nil +} + +// sandboxNetstackCreator implements kernel.NetworkStackCreator. +// +// +stateify savable +type sandboxNetstackCreator struct { + clock tcpip.Clock + uniqueID stack.UniqueID +} + +// CreateStack implements kernel.NetworkStackCreator.CreateStack. +func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) { + s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID) + if err != nil { + return nil, err } + + // Setup loopback. + n := &Network{Stack: s.(*netstack.Stack).Stack} + nicID := tcpip.NICID(f.uniqueID.UniqueID()) + link := DefaultLoopbackLink + linkEP := loopback.New() + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { + return nil, err + } + + return s, nil } // signal sends a signal to one or more processes in a container. If PID is 0, diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 6a8765ec8..bee6ee336 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -17,6 +17,7 @@ package boot import ( "fmt" "net" + "strings" "syscall" "gvisor.dev/gvisor/pkg/log" @@ -31,6 +32,32 @@ import ( "gvisor.dev/gvisor/pkg/urpc" ) +var ( + // DefaultLoopbackLink contains IP addresses and routes of "127.0.0.1/8" and + // "::1/8" on "lo" interface. 
+ DefaultLoopbackLink = LoopbackLink{ + Name: "lo", + Addresses: []net.IP{ + net.IP("\x7f\x00\x00\x01"), + net.IPv6loopback, + }, + Routes: []Route{ + { + Destination: net.IPNet{ + IP: net.IPv4(0x7f, 0, 0, 0), + Mask: net.IPv4Mask(0xff, 0, 0, 0), + }, + }, + { + Destination: net.IPNet{ + IP: net.IPv6loopback, + Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)), + }, + }, + }, + } +) + // Network exposes methods that can be used to configure a network stack. type Network struct { Stack *stack.Stack diff --git a/runsc/boot/pprof.go b/runsc/boot/pprof.go deleted file mode 100644 index 463362f02..000000000 --- a/runsc/boot/pprof.go +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package boot - -func initializePProf() { -} diff --git a/runsc/boot/pprof/BUILD b/runsc/boot/pprof/BUILD new file mode 100644 index 000000000..29cb42b2f --- /dev/null +++ b/runsc/boot/pprof/BUILD @@ -0,0 +1,11 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "pprof", + srcs = ["pprof.go"], + visibility = [ + "//runsc:__subpackages__", + ], +) diff --git a/runsc/boot/pprof/pprof.go b/runsc/boot/pprof/pprof.go new file mode 100644 index 000000000..1ded20dee --- /dev/null +++ b/runsc/boot/pprof/pprof.go @@ -0,0 +1,20 @@ +// Copyright 2019 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pprof provides a stub to initialize custom profilers. +package pprof + +// Initialize will be called at boot for initializing custom profilers. +func Initialize() { +} diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 99e143696..bc093fba5 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -21,7 +21,6 @@ import ( "path/filepath" "runtime" "strconv" - "strings" "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" @@ -75,30 +74,8 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi } func createDefaultLoopbackInterface(conn *urpc.Client) error { - link := boot.LoopbackLink{ - Name: "lo", - Addresses: []net.IP{ - net.IP("\x7f\x00\x00\x01"), - net.IPv6loopback, - }, - Routes: []boot.Route{ - { - Destination: net.IPNet{ - - IP: net.IPv4(0x7f, 0, 0, 0), - Mask: net.IPv4Mask(0xff, 0, 0, 0), - }, - }, - { - Destination: net.IPNet{ - IP: net.IPv6loopback, - Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)), - }, - }, - }, - } if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{ - LoopbackLinks: []boot.LoopbackLink{link}, + LoopbackLinks: []boot.LoopbackLink{boot.DefaultLoopbackLink}, }, nil); err != nil { return fmt.Errorf("creating loopback link and routes: %v", err) } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index d69ac8356..d1977d4de 100644 --- a/test/syscalls/BUILD 
+++ b/test/syscalls/BUILD @@ -258,6 +258,8 @@ syscall_test( syscall_test(test = "//test/syscalls/linux:munmap_test") +syscall_test(test = "//test/syscalls/linux:network_namespace_test") + syscall_test( add_overlay = True, test = "//test/syscalls/linux:open_create_test", diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 05a818795..aa303af84 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -3639,6 +3639,23 @@ cc_binary( ], ) +cc_binary( + name = "network_namespace_test", + testonly = 1, + srcs = ["network_namespace.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + gtest, + "//test/util:capability_util", + "//test/util:memory_util", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/synchronization", + ], +) + cc_binary( name = "semaphore_test", testonly = 1, diff --git a/test/syscalls/linux/network_namespace.cc b/test/syscalls/linux/network_namespace.cc new file mode 100644 index 000000000..6ea48c263 --- /dev/null +++ b/test/syscalls/linux/network_namespace.cc @@ -0,0 +1,121 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <net/if.h> +#include <sched.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/synchronization/notification.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/capability_util.h" +#include "test/util/memory_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +using TestFunc = std::function<PosixError()>; +using RunFunc = std::function<PosixError(TestFunc)>; + +struct NamespaceStrategy { + RunFunc run; + + static NamespaceStrategy Of(RunFunc run) { + NamespaceStrategy s; + s.run = run; + return s; + } +}; + +PosixError RunWithUnshare(TestFunc fn) { + PosixError err = PosixError(-1, "function did not return a value"); + ScopedThread t([&] { + if (unshare(CLONE_NEWNET) != 0) { + err = PosixError(errno); + return; + } + err = fn(); + }); + t.Join(); + return err; +} + +PosixError RunWithClone(TestFunc fn) { + struct Args { + absl::Notification n; + TestFunc fn; + PosixError err; + }; + Args args; + args.fn = fn; + args.err = PosixError(-1, "function did not return a value"); + + ASSIGN_OR_RETURN_ERRNO( + Mapping child_stack, + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + pid_t child = clone( + +[](void *arg) { + Args *args = reinterpret_cast<Args *>(arg); + args->err = args->fn(); + args->n.Notify(); + syscall(SYS_exit, 0); // Exit manually. No return address on stack. + return 0; + }, + reinterpret_cast<void *>(child_stack.addr() + kPageSize), + CLONE_NEWNET | CLONE_THREAD | CLONE_SIGHAND | CLONE_VM, &args); + if (child < 0) { + return PosixError(errno, "clone() failed"); + } + args.n.WaitForNotification(); + return args.err; +} + +class NetworkNamespaceTest + : public ::testing::TestWithParam<NamespaceStrategy> {}; + +TEST_P(NetworkNamespaceTest, LoopbackExists) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); + + EXPECT_NO_ERRNO(GetParam().run([]() { + // TODO(gvisor.dev/issue/1833): Update this to test that only "lo" exists.
+ // Check loopback device exists. + int sock = socket(AF_INET, SOCK_DGRAM, 0); + if (sock < 0) { + return PosixError(errno, "socket() failed"); + } + struct ifreq ifr; + snprintf(ifr.ifr_name, IFNAMSIZ, "lo"); + if (ioctl(sock, SIOCGIFINDEX, &ifr) < 0) { + return PosixError(errno, "ioctl() failed, lo cannot be found"); + } + return NoError(); + })); +} + +INSTANTIATE_TEST_SUITE_P( + AllNetworkNamespaceTest, NetworkNamespaceTest, + ::testing::Values(NamespaceStrategy::Of(RunWithUnshare), + NamespaceStrategy::Of(RunWithClone))); + +} // namespace + +} // namespace testing +} // namespace gvisor -- cgit v1.2.3 From 6def8ea6ac601daa9256a31f818db9f7eb532168 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 25 Feb 2020 12:22:09 -0800 Subject: Fix nested logging. PiperOrigin-RevId: 297175316 --- pkg/log/glog.go | 6 ++--- pkg/log/json.go | 2 +- pkg/log/json_k8s.go | 2 +- pkg/log/log.go | 60 ++++++++++++++++++++++++++++++++----------- pkg/sentry/kernel/task_log.go | 6 ++--- 5 files changed, 52 insertions(+), 24 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/log/glog.go b/pkg/log/glog.go index cab5fae55..b4f7bb5a4 100644 --- a/pkg/log/glog.go +++ b/pkg/log/glog.go @@ -46,7 +46,7 @@ var pid = os.Getpid() // line The line number // msg The user-supplied message // -func (g *GoogleEmitter) Emit(level Level, timestamp time.Time, format string, args ...interface{}) { +func (g *GoogleEmitter) Emit(depth int, level Level, timestamp time.Time, format string, args ...interface{}) { // Log level. prefix := byte('?') switch level { @@ -64,9 +64,7 @@ func (g *GoogleEmitter) Emit(level Level, timestamp time.Time, format string, ar microsecond := int(timestamp.Nanosecond() / 1000) // 0 = this frame. - // 1 = Debugf, etc. - // 2 = Caller. - _, file, line, ok := runtime.Caller(2) + _, file, line, ok := runtime.Caller(depth + 1) if ok { // Trim any directory path from the file. 
slash := strings.LastIndexByte(file, byte('/')) diff --git a/pkg/log/json.go b/pkg/log/json.go index a278c8fc8..0943db1cc 100644 --- a/pkg/log/json.go +++ b/pkg/log/json.go @@ -62,7 +62,7 @@ type JSONEmitter struct { } // Emit implements Emitter.Emit. -func (e JSONEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) { +func (e JSONEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...interface{}) { j := jsonLog{ Msg: fmt.Sprintf(format, v...), Level: level, diff --git a/pkg/log/json_k8s.go b/pkg/log/json_k8s.go index cee6eb514..6c6fc8b6f 100644 --- a/pkg/log/json_k8s.go +++ b/pkg/log/json_k8s.go @@ -33,7 +33,7 @@ type K8sJSONEmitter struct { } // Emit implements Emitter.Emit. -func (e *K8sJSONEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) { +func (e *K8sJSONEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...interface{}) { j := k8sJSONLog{ Log: fmt.Sprintf(format, v...), Level: level, diff --git a/pkg/log/log.go b/pkg/log/log.go index 5056f17e6..a794da1aa 100644 --- a/pkg/log/log.go +++ b/pkg/log/log.go @@ -79,7 +79,7 @@ func (l Level) String() string { type Emitter interface { // Emit emits the given log statement. This allows for control over the // timestamp used for logging. - Emit(level Level, timestamp time.Time, format string, v ...interface{}) + Emit(depth int, level Level, timestamp time.Time, format string, v ...interface{}) } // Writer writes the output to the given writer. @@ -142,7 +142,7 @@ func (l *Writer) Write(data []byte) (int, error) { } // Emit emits the message. -func (l *Writer) Emit(level Level, timestamp time.Time, format string, args ...interface{}) { +func (l *Writer) Emit(_ int, _ Level, _ time.Time, format string, args ...interface{}) { fmt.Fprintf(l, format, args...) } @@ -150,9 +150,9 @@ func (l *Writer) Emit(level Level, timestamp time.Time, format string, args ...i type MultiEmitter []Emitter // Emit emits to all emitters. 
-func (m *MultiEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) { +func (m *MultiEmitter) Emit(depth int, level Level, timestamp time.Time, format string, v ...interface{}) { for _, e := range *m { - e.Emit(level, timestamp, format, v...) + e.Emit(1+depth, level, timestamp, format, v...) } } @@ -167,7 +167,7 @@ type TestEmitter struct { } // Emit emits to the TestLogger. -func (t *TestEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) { +func (t *TestEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...interface{}) { t.Logf(format, v...) } @@ -198,22 +198,37 @@ type BasicLogger struct { // Debugf implements logger.Debugf. func (l *BasicLogger) Debugf(format string, v ...interface{}) { - if l.IsLogging(Debug) { - l.Emit(Debug, time.Now(), format, v...) - } + l.DebugfAtDepth(1, format, v...) } // Infof implements logger.Infof. func (l *BasicLogger) Infof(format string, v ...interface{}) { - if l.IsLogging(Info) { - l.Emit(Info, time.Now(), format, v...) - } + l.InfofAtDepth(1, format, v...) } // Warningf implements logger.Warningf. func (l *BasicLogger) Warningf(format string, v ...interface{}) { + l.WarningfAtDepth(1, format, v...) +} + +// DebugfAtDepth logs at a specific depth. +func (l *BasicLogger) DebugfAtDepth(depth int, format string, v ...interface{}) { + if l.IsLogging(Debug) { + l.Emit(1+depth, Debug, time.Now(), format, v...) + } +} + +// InfofAtDepth logs at a specific depth. +func (l *BasicLogger) InfofAtDepth(depth int, format string, v ...interface{}) { + if l.IsLogging(Info) { + l.Emit(1+depth, Info, time.Now(), format, v...) + } +} + +// WarningfAtDepth logs at a specific depth. +func (l *BasicLogger) WarningfAtDepth(depth int, format string, v ...interface{}) { if l.IsLogging(Warning) { - l.Emit(Warning, time.Now(), format, v...) + l.Emit(1+depth, Warning, time.Now(), format, v...) 
} } @@ -257,17 +272,32 @@ func SetLevel(newLevel Level) { // Debugf logs to the global logger. func Debugf(format string, v ...interface{}) { - Log().Debugf(format, v...) + Log().DebugfAtDepth(1, format, v...) } // Infof logs to the global logger. func Infof(format string, v ...interface{}) { - Log().Infof(format, v...) + Log().InfofAtDepth(1, format, v...) } // Warningf logs to the global logger. func Warningf(format string, v ...interface{}) { - Log().Warningf(format, v...) + Log().WarningfAtDepth(1, format, v...) +} + +// DebugfAtDepth logs to the global logger. +func DebugfAtDepth(depth int, format string, v ...interface{}) { + Log().DebugfAtDepth(1+depth, format, v...) +} + +// InfofAtDepth logs to the global logger. +func InfofAtDepth(depth int, format string, v ...interface{}) { + Log().InfofAtDepth(1+depth, format, v...) +} + +// WarningfAtDepth logs to the global logger. +func WarningfAtDepth(depth int, format string, v ...interface{}) { + Log().WarningfAtDepth(1+depth, format, v...) } // defaultStackSize is the default buffer size to allocate for stack traces. diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index 6d737d3e5..eeccaa197 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -32,21 +32,21 @@ const ( // Infof logs an formatted info message by calling log.Infof. func (t *Task) Infof(fmt string, v ...interface{}) { if log.IsLogging(log.Info) { - log.Infof(t.logPrefix.Load().(string)+fmt, v...) + log.InfofAtDepth(1, t.logPrefix.Load().(string)+fmt, v...) } } // Warningf logs a warning string by calling log.Warningf. func (t *Task) Warningf(fmt string, v ...interface{}) { if log.IsLogging(log.Warning) { - log.Warningf(t.logPrefix.Load().(string)+fmt, v...) + log.WarningfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...) } } // Debugf creates a debug string that includes the task ID. 
func (t *Task) Debugf(fmt string, v ...interface{}) { if log.IsLogging(log.Debug) { - log.Debugf(t.logPrefix.Load().(string)+fmt, v...) + log.DebugfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...) } } -- cgit v1.2.3 From 471b15b212831af31c2fe36cd42cea7ec7b7785b Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 25 Feb 2020 13:25:36 -0800 Subject: Port most syscalls to VFS2. pipe and pipe2 aren't ported, pending a slight rework of pipe FDs for VFS2. mount and umount2 aren't ported out of temporary laziness. access and faccessat need additional FSImpl methods to implement properly, but are stubbed to prevent googletest from CHECK-failing. Other syscalls require additional plumbing. Updates #1623 PiperOrigin-RevId: 297188448 --- pkg/abi/linux/epoll_amd64.go | 2 + pkg/abi/linux/epoll_arm64.go | 2 + pkg/abi/linux/file.go | 2 + pkg/abi/linux/fs.go | 2 + pkg/abi/linux/signal.go | 2 + pkg/abi/linux/time.go | 6 + pkg/abi/linux/xattr.go | 1 + pkg/fspath/BUILD | 4 +- pkg/fspath/builder.go | 8 + pkg/fspath/builder_unsafe.go | 27 - pkg/fspath/fspath.go | 3 +- pkg/gohacks/BUILD | 11 + pkg/gohacks/gohacks_unsafe.go | 57 ++ pkg/sentry/fsbridge/vfs.go | 10 +- pkg/sentry/fsimpl/proc/tasks.go | 4 +- pkg/sentry/kernel/fd_table.go | 49 +- pkg/sentry/kernel/fs_context.go | 22 + pkg/sentry/kernel/task.go | 18 + pkg/sentry/syscalls/linux/sys_epoll.go | 4 + pkg/sentry/syscalls/linux/sys_file.go | 40 ++ pkg/sentry/syscalls/linux/sys_getdents.go | 4 + pkg/sentry/syscalls/linux/sys_lseek.go | 4 + pkg/sentry/syscalls/linux/sys_mmap.go | 4 + pkg/sentry/syscalls/linux/sys_read.go | 4 + pkg/sentry/syscalls/linux/sys_stat.go | 4 + pkg/sentry/syscalls/linux/sys_sync.go | 4 + pkg/sentry/syscalls/linux/sys_write.go | 4 + pkg/sentry/syscalls/linux/sys_xattr.go | 4 + pkg/sentry/syscalls/linux/vfs2/BUILD | 28 +- pkg/sentry/syscalls/linux/vfs2/epoll.go | 225 ++++++++ pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go | 44 ++ pkg/sentry/syscalls/linux/vfs2/execve.go | 137 +++++ 
pkg/sentry/syscalls/linux/vfs2/fd.go | 147 ++++++ pkg/sentry/syscalls/linux/vfs2/filesystem.go | 326 ++++++++++++ pkg/sentry/syscalls/linux/vfs2/fscontext.go | 131 +++++ pkg/sentry/syscalls/linux/vfs2/getdents.go | 149 ++++++ pkg/sentry/syscalls/linux/vfs2/ioctl.go | 35 ++ .../syscalls/linux/vfs2/linux64_override_amd64.go | 216 ++++---- .../syscalls/linux/vfs2/linux64_override_arm64.go | 2 + pkg/sentry/syscalls/linux/vfs2/mmap.go | 92 ++++ pkg/sentry/syscalls/linux/vfs2/path.go | 94 ++++ pkg/sentry/syscalls/linux/vfs2/poll.go | 584 +++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/read_write.go | 511 ++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/setstat.go | 380 ++++++++++++++ pkg/sentry/syscalls/linux/vfs2/stat.go | 346 ++++++++++++ pkg/sentry/syscalls/linux/vfs2/sync.go | 87 +++ pkg/sentry/syscalls/linux/vfs2/sys_read.go | 95 ---- pkg/sentry/syscalls/linux/vfs2/xattr.go | 353 +++++++++++++ pkg/sentry/vfs/BUILD | 1 + pkg/sentry/vfs/epoll.go | 3 + pkg/sentry/vfs/mount_unsafe.go | 12 +- pkg/sentry/vfs/resolving_path.go | 2 +- pkg/sentry/vfs/vfs.go | 10 +- pkg/usermem/BUILD | 2 +- pkg/usermem/usermem.go | 9 +- pkg/usermem/usermem_unsafe.go | 27 - runsc/boot/filter/config.go | 2 + 57 files changed, 4082 insertions(+), 274 deletions(-) delete mode 100644 pkg/fspath/builder_unsafe.go create mode 100644 pkg/gohacks/BUILD create mode 100644 pkg/gohacks/gohacks_unsafe.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/epoll.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/execve.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/fd.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/filesystem.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/fscontext.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/getdents.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/ioctl.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/mmap.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/path.go 
create mode 100644 pkg/sentry/syscalls/linux/vfs2/poll.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/read_write.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/setstat.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/stat.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/sync.go delete mode 100644 pkg/sentry/syscalls/linux/vfs2/sys_read.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/xattr.go delete mode 100644 pkg/usermem/usermem_unsafe.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/abi/linux/epoll_amd64.go b/pkg/abi/linux/epoll_amd64.go index 57041491c..34ff18009 100644 --- a/pkg/abi/linux/epoll_amd64.go +++ b/pkg/abi/linux/epoll_amd64.go @@ -15,6 +15,8 @@ package linux // EpollEvent is equivalent to struct epoll_event from epoll(2). +// +// +marshal type EpollEvent struct { Events uint32 // Linux makes struct epoll_event::data a __u64. We represent it as diff --git a/pkg/abi/linux/epoll_arm64.go b/pkg/abi/linux/epoll_arm64.go index 62ef5821e..f86c35329 100644 --- a/pkg/abi/linux/epoll_arm64.go +++ b/pkg/abi/linux/epoll_arm64.go @@ -15,6 +15,8 @@ package linux // EpollEvent is equivalent to struct epoll_event from epoll(2). +// +// +marshal type EpollEvent struct { Events uint32 // Linux makes struct epoll_event a __u64, necessitating 4 bytes of padding diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index c3ab15a4f..e229ac21c 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -241,6 +241,8 @@ const ( ) // Statx represents struct statx. +// +// +marshal type Statx struct { Mask uint32 Blksize uint32 diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index 2c652baa2..158d2db5b 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -38,6 +38,8 @@ const ( ) // Statfs is struct statfs, from uapi/asm-generic/statfs.h. +// +// +marshal type Statfs struct { // Type is one of the filesystem magic values, defined above. 
Type uint64 diff --git a/pkg/abi/linux/signal.go b/pkg/abi/linux/signal.go index c69b04ea9..1c330e763 100644 --- a/pkg/abi/linux/signal.go +++ b/pkg/abi/linux/signal.go @@ -115,6 +115,8 @@ const ( ) // SignalSet is a signal mask with a bit corresponding to each signal. +// +// +marshal type SignalSet uint64 // SignalSetSize is the size in bytes of a SignalSet. diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go index e562b46d9..e6860ed49 100644 --- a/pkg/abi/linux/time.go +++ b/pkg/abi/linux/time.go @@ -157,6 +157,8 @@ func DurationToTimespec(dur time.Duration) Timespec { const SizeOfTimeval = 16 // Timeval represents struct timeval in . +// +// +marshal type Timeval struct { Sec int64 Usec int64 @@ -230,6 +232,8 @@ type Tms struct { type TimerID int32 // StatxTimestamp represents struct statx_timestamp. +// +// +marshal type StatxTimestamp struct { Sec int64 Nsec uint32 @@ -258,6 +262,8 @@ func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) { } // Utime represents struct utimbuf used by utimes(2). 
+// +// +marshal type Utime struct { Actime int64 Modtime int64 diff --git a/pkg/abi/linux/xattr.go b/pkg/abi/linux/xattr.go index a3b6406fa..99180b208 100644 --- a/pkg/abi/linux/xattr.go +++ b/pkg/abi/linux/xattr.go @@ -18,6 +18,7 @@ package linux const ( XATTR_NAME_MAX = 255 XATTR_SIZE_MAX = 65536 + XATTR_LIST_MAX = 65536 XATTR_CREATE = 1 XATTR_REPLACE = 2 diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD index ee84471b2..67dd1e225 100644 --- a/pkg/fspath/BUILD +++ b/pkg/fspath/BUILD @@ -8,9 +8,11 @@ go_library( name = "fspath", srcs = [ "builder.go", - "builder_unsafe.go", "fspath.go", ], + deps = [ + "//pkg/gohacks", + ], ) go_test( diff --git a/pkg/fspath/builder.go b/pkg/fspath/builder.go index 7ddb36826..6318d3874 100644 --- a/pkg/fspath/builder.go +++ b/pkg/fspath/builder.go @@ -16,6 +16,8 @@ package fspath import ( "fmt" + + "gvisor.dev/gvisor/pkg/gohacks" ) // Builder is similar to strings.Builder, but is used to produce pathnames @@ -102,3 +104,9 @@ func (b *Builder) AppendString(str string) { copy(b.buf[b.start:], b.buf[oldStart:]) copy(b.buf[len(b.buf)-len(str):], str) } + +// String returns the accumulated string. No other methods should be called +// after String. +func (b *Builder) String() string { + return gohacks.StringFromImmutableBytes(b.buf[b.start:]) +} diff --git a/pkg/fspath/builder_unsafe.go b/pkg/fspath/builder_unsafe.go deleted file mode 100644 index 75606808d..000000000 --- a/pkg/fspath/builder_unsafe.go +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package fspath - -import ( - "unsafe" -) - -// String returns the accumulated string. No other methods should be called -// after String. -func (b *Builder) String() string { - bs := b.buf[b.start:] - // Compare strings.Builder.String(). - return *(*string)(unsafe.Pointer(&bs)) -} diff --git a/pkg/fspath/fspath.go b/pkg/fspath/fspath.go index 9fb3fee24..4c983d5fd 100644 --- a/pkg/fspath/fspath.go +++ b/pkg/fspath/fspath.go @@ -67,7 +67,8 @@ func Parse(pathname string) Path { // Path contains the information contained in a pathname string. // -// Path is copyable by value. +// Path is copyable by value. The zero value for Path is equivalent to +// fspath.Parse(""), i.e. the empty path. type Path struct { // Begin is an iterator to the first path component in the relative part of // the path. diff --git a/pkg/gohacks/BUILD b/pkg/gohacks/BUILD new file mode 100644 index 000000000..798a65eca --- /dev/null +++ b/pkg/gohacks/BUILD @@ -0,0 +1,11 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "gohacks", + srcs = [ + "gohacks_unsafe.go", + ], + visibility = ["//:sandbox"], +) diff --git a/pkg/gohacks/gohacks_unsafe.go b/pkg/gohacks/gohacks_unsafe.go new file mode 100644 index 000000000..aad675172 --- /dev/null +++ b/pkg/gohacks/gohacks_unsafe.go @@ -0,0 +1,57 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Package gohacks contains utilities for subverting the Go compiler. +package gohacks + +import ( + "reflect" + "unsafe" +) + +// Noescape hides a pointer from escape analysis. Noescape is the identity +// function but escape analysis doesn't think the output depends on the input. +// Noescape is inlined and currently compiles down to zero instructions. +// USE CAREFULLY! +// +// (Noescape is copy/pasted from Go's runtime/stubs.go:noescape().) +// +//go:nosplit +func Noescape(p unsafe.Pointer) unsafe.Pointer { + x := uintptr(p) + return unsafe.Pointer(x ^ 0) +} + +// ImmutableBytesFromString is equivalent to []byte(s), except that it uses the +// same memory backing s instead of making a heap-allocated copy. This is only +// valid if the returned slice is never mutated. +func ImmutableBytesFromString(s string) []byte { + shdr := (*reflect.StringHeader)(unsafe.Pointer(&s)) + var bs []byte + bshdr := (*reflect.SliceHeader)(unsafe.Pointer(&bs)) + bshdr.Data = shdr.Data + bshdr.Len = shdr.Len + bshdr.Cap = shdr.Len + return bs +} + +// StringFromImmutableBytes is equivalent to string(bs), except that it uses +// the same memory backing bs instead of making a heap-allocated copy. This is +// only valid if bs is never mutated after StringFromImmutableBytes returns. +func StringFromImmutableBytes(bs []byte) string { + // This is cheaper than messing with reflect.StringHeader and + // reflect.SliceHeader, which as of this writing produces many dead stores + // of zeroes. Compare strings.Builder.String(). + return *(*string)(unsafe.Pointer(&bs)) +} diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go index e657c39bc..6aa17bfc1 100644 --- a/pkg/sentry/fsbridge/vfs.go +++ b/pkg/sentry/fsbridge/vfs.go @@ -117,15 +117,19 @@ func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry) // default anyways. 
// // TODO(gvisor.dev/issue/1623): Check mount has read and exec permission. -func (l *vfsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) { +func (l *vfsLookup) OpenPath(ctx context.Context, pathname string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) { vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem() creds := auth.CredentialsFromContext(ctx) + path := fspath.Parse(pathname) pop := &vfs.PathOperation{ Root: l.root, - Start: l.root, - Path: fspath.Parse(path), + Start: l.workingDir, + Path: path, FollowFinalSymlink: resolveFinal, } + if path.Absolute { + pop.Start = l.root + } fd, err := vfsObj.OpenAt(ctx, creds, pop, &opts) if err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index ce08a7d53..10c08fa90 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -73,9 +73,9 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"), "net": newNetDir(root, inoGen, k), - "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{}), + "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{k: k}), "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), - "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}), + "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{k: k}), } inode := &tasksInode{ diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 23b88f7a6..58001d56c 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -296,6 +296,50 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags return fds, nil } +// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for +// the given file 
description. If it succeeds, it takes a reference on file. +func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { + if minfd < 0 { + // Don't accept negative FDs. + return -1, syscall.EINVAL + } + + // Default limit. + end := int32(math.MaxInt32) + + // Ensure we don't get past the provided limit. + if limitSet := limits.FromContext(ctx); limitSet != nil { + lim := limitSet.Get(limits.NumberOfFiles) + if lim.Cur != limits.Infinity { + end = int32(lim.Cur) + } + if minfd >= end { + return -1, syscall.EMFILE + } + } + + f.mu.Lock() + defer f.mu.Unlock() + + // Search for an available fd, starting at the larger of minfd and f.next. + fd := minfd + if fd < f.next { + fd = f.next + } + for fd < end { + if d, _, _ := f.get(fd); d == nil { + f.setVFS2(fd, file, flags) + if fd == f.next { + // Update next search start position. + f.next = fd + 1 + } + return fd, nil + } + fd++ + } + return -1, syscall.EMFILE +} + // NewFDAt sets the file reference for the given FD. If there is an active // reference for that FD, the ref count for that existing reference is // decremented. @@ -316,9 +360,6 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 return syscall.EBADF } - f.mu.Lock() - defer f.mu.Unlock() - // Check the limit for the provided file. if limitSet := limits.FromContext(ctx); limitSet != nil { if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur { @@ -327,6 +368,8 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 } // Install the entry.
+ f.mu.Lock() + defer f.mu.Unlock() f.setAll(fd, file, fileVFS2, flags) return nil } diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 7218aa24e..47f78df9a 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -244,6 +244,28 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) { old.DecRef() } +// SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd. +// +// This is not a valid call after free. +func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) { + if !vd.Ok() { + panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry") + } + + f.mu.Lock() + + if !f.rootVFS2.Ok() { + f.mu.Unlock() + panic(fmt.Sprintf("FSContext.SetRootDirectoryVFS2(%v)) called after destroy", vd)) + } + + old := f.rootVFS2 + vd.IncRef() + f.rootVFS2 = vd + f.mu.Unlock() + old.DecRef() +} + // Umask returns the current umask. func (f *FSContext) Umask() uint { f.mu.Lock() diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index e37e23231..2cee2e6ed 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -789,6 +789,15 @@ func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error) return fds[0], nil } +// NewFDFromVFS2 is a convenience wrapper for t.FDTable().NewFDVFS2. +// +// This automatically passes the task as the context. +// +// Precondition: same as FDTable.Get. +func (t *Task) NewFDFromVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { + return t.fdTable.NewFDVFS2(t, fd, file, flags) +} + // NewFDAt is a convenience wrapper for t.FDTable().NewFDAt. // // This automatically passes the task as the context. @@ -798,6 +807,15 @@ func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error { return t.fdTable.NewFDAt(t, fd, file, flags) } +// NewFDAtVFS2 is a convenience wrapper for t.FDTable().NewFDAtVFS2. +// +// This automatically passes the task as the context. 
+// +// Precondition: same as FDTable. +func (t *Task) NewFDAtVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) error { + return t.fdTable.NewFDAtVFS2(t, fd, file, flags) +} + // WithMuLocked executes f with t.mu locked. func (t *Task) WithMuLocked(f func(*Task)) { t.mu.Lock() diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index fbef5b376..3ab93fbde 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -25,6 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // EpollCreate1 implements the epoll_create1(2) linux syscall. func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() @@ -164,3 +166,5 @@ func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return EpollWait(t, args) } + +// LINT.ThenChange(vfs2/epoll.go) diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 421845ebb..c21f14dc0 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -130,6 +130,8 @@ func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string return path, dirPath, nil } +// LINT.IfChange + func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { @@ -575,6 +577,10 @@ func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, accessAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, mode) } +// LINT.ThenChange(vfs2/filesystem.go) + +// LINT.IfChange + // Ioctl implements linux syscall ioctl(2). 
func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() @@ -650,6 +656,10 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } } +// LINT.ThenChange(vfs2/ioctl.go) + +// LINT.IfChange + // Getcwd implements the linux syscall getcwd(2). func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() @@ -760,6 +770,10 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, nil } +// LINT.ThenChange(vfs2/fscontext.go) + +// LINT.IfChange + // Close implements linux syscall close(2). func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() @@ -1094,6 +1108,8 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } } +// LINT.ThenChange(vfs2/fd.go) + const ( _FADV_NORMAL = 0 _FADV_RANDOM = 1 @@ -1141,6 +1157,8 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, nil } +// LINT.IfChange + func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error { path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { @@ -1421,6 +1439,10 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty) } +// LINT.ThenChange(vfs2/filesystem.go) + +// LINT.IfChange + func readlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { @@ -1480,6 +1502,10 @@ func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return n, nil, err } +// LINT.ThenChange(vfs2/stat.go) + +// LINT.IfChange + func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) 
error { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { @@ -1516,6 +1542,10 @@ func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, unlinkAt(t, dirFD, addr) } +// LINT.ThenChange(vfs2/filesystem.go) + +// LINT.IfChange + // Truncate implements linux syscall truncate(2). func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() @@ -1614,6 +1644,8 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, nil } +// LINT.ThenChange(vfs2/setstat.go) + // Umask implements linux syscall umask(2). func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { mask := args[0].ModeT() @@ -1621,6 +1653,8 @@ func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(mask), nil, nil } +// LINT.IfChange + // Change ownership of a file. // // uid and gid may be -1, in which case they will not be changed. @@ -1987,6 +2021,10 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true) } +// LINT.ThenChange(vfs2/setstat.go) + +// LINT.IfChange + func renameAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr) error { newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */) if err != nil { @@ -2042,6 +2080,8 @@ func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr) } +// LINT.ThenChange(vfs2/filesystem.go) + // Fallocate implements linux system call fallocate(2). 
func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index f66f4ffde..b126fecc0 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -27,6 +27,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// LINT.IfChange + // Getdents implements linux syscall getdents(2) for 64bit systems. func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() @@ -244,3 +246,5 @@ func (ds *direntSerializer) CopyOut(name string, attr fs.DentAttr) error { func (ds *direntSerializer) Written() int { return ds.written } + +// LINT.ThenChange(vfs2/getdents.go) diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go index 297e920c4..3f7691eae 100644 --- a/pkg/sentry/syscalls/linux/sys_lseek.go +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -21,6 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// LINT.IfChange + // Lseek implements linux syscall lseek(2). func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() @@ -52,3 +54,5 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } return uintptr(offset), nil, err } + +// LINT.ThenChange(vfs2/read_write.go) diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 9959f6e61..91694d374 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -35,6 +35,8 @@ func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo return uintptr(addr), nil, nil } +// LINT.IfChange + // Mmap implements linux syscall mmap(2). 
func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { prot := args[2].Int() @@ -104,6 +106,8 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC return uintptr(rv), nil, err } +// LINT.ThenChange(vfs2/mmap.go) + // Munmap implements linux syscall munmap(2). func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64()) diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index 227692f06..78a2cb750 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -28,6 +28,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + const ( // EventMaskRead contains events that can be triggered on reads. EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr @@ -388,3 +390,5 @@ func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (i return total, err } + +// LINT.ThenChange(vfs2/read_write.go) diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 11f25e00d..701b27b4a 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -23,6 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// LINT.IfChange + func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat { return linux.Stat{ Dev: sattr.DeviceID, @@ -297,3 +299,5 @@ func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error { _, err = t.CopyOut(addr, &statfs) return err } + +// LINT.ThenChange(vfs2/stat.go) diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 3e55235bd..5ad465ae3 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -22,6 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// 
LINT.IfChange + // Sync implements linux system call sync(2). func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { t.MountNamespace().SyncAll(t) @@ -135,3 +137,5 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) } + +// LINT.ThenChange(vfs2/sync.go) diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index aba892939..506ee54ce 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -28,6 +28,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + const ( // EventMaskWrite contains events that can be triggered on writes. // @@ -358,3 +360,5 @@ func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) ( return total, err } + +// LINT.ThenChange(vfs2/read_write.go) diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go index 9d8140b8a..2de5e3422 100644 --- a/pkg/sentry/syscalls/linux/sys_xattr.go +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -25,6 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// LINT.IfChange + // GetXattr implements linux syscall getxattr(2). 
func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return getXattrFromPath(t, args, true) @@ -418,3 +420,5 @@ func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error { return d.Inode.RemoveXattr(t, d, name) } + +// LINT.ThenChange(vfs2/xattr.go) diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 6b8a00b6e..f51761e81 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -5,18 +5,44 @@ package(licenses = ["notice"]) go_library( name = "vfs2", srcs = [ + "epoll.go", + "epoll_unsafe.go", + "execve.go", + "fd.go", + "filesystem.go", + "fscontext.go", + "getdents.go", + "ioctl.go", "linux64.go", "linux64_override_amd64.go", "linux64_override_arm64.go", - "sys_read.go", + "mmap.go", + "path.go", + "poll.go", + "read_write.go", + "setstat.go", + "stat.go", + "sync.go", + "xattr.go", ], + marshal = True, visibility = ["//:sandbox"], deps = [ + "//pkg/abi/linux", + "//pkg/fspath", + "//pkg/gohacks", "//pkg/sentry/arch", + "//pkg/sentry/fsbridge", "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/memmap", "//pkg/sentry/syscalls", "//pkg/sentry/syscalls/linux", "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go new file mode 100644 index 000000000..d6cb0e79a --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go @@ -0,0 +1,225 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "math" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// EpollCreate1 implements Linux syscall epoll_create1(2). +func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + if flags&^linux.EPOLL_CLOEXEC != 0 { + return 0, nil, syserror.EINVAL + } + + file, err := t.Kernel().VFS().NewEpollInstanceFD() + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil +} + +// EpollCreate implements Linux syscall epoll_create(2). 
+func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + size := args[0].Int() + + // "Since Linux 2.6.8, the size argument is ignored, but must be greater + // than zero" - epoll_create(2) + if size <= 0 { + return 0, nil, syserror.EINVAL + } + + file, err := t.Kernel().VFS().NewEpollInstanceFD() + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil +} + +// EpollCtl implements Linux syscall epoll_ctl(2). +func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + epfd := args[0].Int() + op := args[1].Int() + fd := args[2].Int() + eventAddr := args[3].Pointer() + + epfile := t.GetFileVFS2(epfd) + if epfile == nil { + return 0, nil, syserror.EBADF + } + defer epfile.DecRef() + ep, ok := epfile.Impl().(*vfs.EpollInstance) + if !ok { + return 0, nil, syserror.EINVAL + } + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + if epfile == file { + return 0, nil, syserror.EINVAL + } + + var event linux.EpollEvent + switch op { + case linux.EPOLL_CTL_ADD: + if err := event.CopyIn(t, eventAddr); err != nil { + return 0, nil, err + } + return 0, nil, ep.AddInterest(file, fd, event) + case linux.EPOLL_CTL_DEL: + return 0, nil, ep.DeleteInterest(file, fd) + case linux.EPOLL_CTL_MOD: + if err := event.CopyIn(t, eventAddr); err != nil { + return 0, nil, err + } + return 0, nil, ep.ModifyInterest(file, fd, event) + default: + return 0, nil, syserror.EINVAL + } +} + +// EpollWait implements Linux syscall epoll_wait(2). 
+func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + epfd := args[0].Int() + eventsAddr := args[1].Pointer() + maxEvents := int(args[2].Int()) + timeout := int(args[3].Int()) + + const _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS + if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS { + return 0, nil, syserror.EINVAL + } + + epfile := t.GetFileVFS2(epfd) + if epfile == nil { + return 0, nil, syserror.EBADF + } + defer epfile.DecRef() + ep, ok := epfile.Impl().(*vfs.EpollInstance) + if !ok { + return 0, nil, syserror.EINVAL + } + + // Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent, + // maxEvents), so that the buffer can be allocated on the stack. + var ( + events [16]linux.EpollEvent + total int + ch chan struct{} + haveDeadline bool + deadline ktime.Time + ) + for { + batchEvents := len(events) + if batchEvents > maxEvents { + batchEvents = maxEvents + } + n := ep.ReadEvents(events[:batchEvents]) + maxEvents -= n + if n != 0 { + // Copy what we read out. + copiedEvents, err := copyOutEvents(t, eventsAddr, events[:n]) + eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent) + total += copiedEvents + if err != nil { + if total != 0 { + return uintptr(total), nil, nil + } + return 0, nil, err + } + // If we've filled the application's event buffer, we're done. + if maxEvents == 0 { + return uintptr(total), nil, nil + } + // Loop if we read a full batch, under the expectation that there + // may be more events to read. + if n == batchEvents { + continue + } + } + // We get here if n != batchEvents. If we read any number of events + // (just now, or in a previous iteration of this loop), or if timeout + // is 0 (such that epoll_wait should be non-blocking), return the + // events we've read so far to the application. 
+ if total != 0 || timeout == 0 { + return uintptr(total), nil, nil + } + // In the first iteration of this loop, register with the epoll + // instance for readability events, but then immediately continue the + // loop since we need to retry ReadEvents() before blocking. In all + // subsequent iterations, block until events are available, the timeout + // expires, or an interrupt arrives. + if ch == nil { + var w waiter.Entry + w, ch = waiter.NewChannelEntry(nil) + epfile.EventRegister(&w, waiter.EventIn) + defer epfile.EventUnregister(&w) + } else { + // Set up the timer if a timeout was specified. + if timeout > 0 && !haveDeadline { + timeoutDur := time.Duration(timeout) * time.Millisecond + deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur) + haveDeadline = true + } + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = nil + } + // total must be 0 since otherwise we would have returned + // above. + return 0, nil, err + } + } + } +} + +// EpollPwait implements Linux syscall epoll_pwait(2). +func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + maskAddr := args[4].Pointer() + maskSize := uint(args[5].Uint()) + + if err := setTempSignalSet(t, maskAddr, maskSize); err != nil { + return 0, nil, err + } + + return EpollWait(t, args) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go b/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go new file mode 100644 index 000000000..825f325bf --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go @@ -0,0 +1,44 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "reflect" + "runtime" + "unsafe" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/gohacks" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/usermem" +) + +const sizeofEpollEvent = int(unsafe.Sizeof(linux.EpollEvent{})) + +func copyOutEvents(t *kernel.Task, addr usermem.Addr, events []linux.EpollEvent) (int, error) { + if len(events) == 0 { + return 0, nil + } + // Cast events to a byte slice for copying. + var eventBytes []byte + eventBytesHdr := (*reflect.SliceHeader)(unsafe.Pointer(&eventBytes)) + eventBytesHdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(&events[0]))) + eventBytesHdr.Len = len(events) * sizeofEpollEvent + eventBytesHdr.Cap = len(events) * sizeofEpollEvent + copiedBytes, err := t.CopyOutBytes(addr, eventBytes) + runtime.KeepAlive(events) + copiedEvents := copiedBytes / sizeofEpollEvent // rounded down + return copiedEvents, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go new file mode 100644 index 000000000..aef0078a8 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/execve.go @@ -0,0 +1,137 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/loader" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Execve implements linux syscall execve(2). +func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathnameAddr := args[0].Pointer() + argvAddr := args[1].Pointer() + envvAddr := args[2].Pointer() + return execveat(t, linux.AT_FDCWD, pathnameAddr, argvAddr, envvAddr, 0 /* flags */) +} + +// Execveat implements linux syscall execveat(2). 
+func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathnameAddr := args[1].Pointer() + argvAddr := args[2].Pointer() + envvAddr := args[3].Pointer() + flags := args[4].Int() + return execveat(t, dirfd, pathnameAddr, argvAddr, envvAddr, flags) +} + +func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr usermem.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return 0, nil, syserror.EINVAL + } + + pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX) + if err != nil { + return 0, nil, err + } + var argv, envv []string + if argvAddr != 0 { + var err error + argv, err = t.CopyInVector(argvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize) + if err != nil { + return 0, nil, err + } + } + if envvAddr != 0 { + var err error + envv, err = t.CopyInVector(envvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize) + if err != nil { + return 0, nil, err + } + } + + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + var executable fsbridge.File + closeOnExec := false + if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute { + // We must open the executable ourselves since dirfd is used as the + // starting point while resolving path, but the task working directory + // is used as the starting point while resolving interpreters (Linux: + // fs/binfmt_script.c:load_script() => fs/exec.c:open_exec() => + // do_open_execat(fd=AT_FDCWD)), and the loader package is currently + // incapable of handling this correctly. 
+ if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { + return 0, nil, syserror.ENOENT + } + dirfile, dirfileFlags := t.FDTable().GetVFS2(dirfd) + if dirfile == nil { + return 0, nil, syserror.EBADF + } + start := dirfile.VirtualDentry() + start.IncRef() + dirfile.DecRef() + closeOnExec = dirfileFlags.CloseOnExec + file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + FileExec: true, + }) + start.DecRef() + if err != nil { + return 0, nil, err + } + defer file.DecRef() + executable = fsbridge.NewVFSFile(file) + } + + // Load the new TaskContext. + mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change + defer mntns.DecRef() + wd := t.FSContext().WorkingDirectoryVFS2() + defer wd.DecRef() + remainingTraversals := uint(linux.MaxSymlinkTraversals) + loadArgs := loader.LoadArgs{ + Opener: fsbridge.NewVFSLookup(mntns, root, wd), + RemainingTraversals: &remainingTraversals, + ResolveFinal: flags&linux.AT_SYMLINK_NOFOLLOW == 0, + Filename: pathname, + File: executable, + CloseOnExec: closeOnExec, + Argv: argv, + Envv: envv, + Features: t.Arch().FeatureSet(), + } + + tc, se := t.Kernel().LoadTaskImage(t, loadArgs) + if se != nil { + return 0, nil, se.ToError() + } + + ctrl, err := t.Execve(tc) + return 0, ctrl, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go new file mode 100644 index 000000000..3afcea665 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -0,0 +1,147 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Close implements Linux syscall close(2). +func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + // Note that Remove provides a reference on the file that we may use to + // flush. It is still active until we drop the final reference below + // (and other reference-holding operations complete). + _, file := t.FDTable().Remove(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + err := file.OnClose(t) + return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file) +} + +// Dup implements Linux syscall dup(2). +func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) + if err != nil { + return 0, nil, syserror.EMFILE + } + return uintptr(newFD), nil, nil +} + +// Dup2 implements Linux syscall dup2(2). +func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldfd := args[0].Int() + newfd := args[1].Int() + + if oldfd == newfd { + // As long as oldfd is valid, dup2() does nothing and returns newfd. 
+ file := t.GetFileVFS2(oldfd) + if file == nil { + return 0, nil, syserror.EBADF + } + file.DecRef() + return uintptr(newfd), nil, nil + } + + return dup3(t, oldfd, newfd, 0) +} + +// Dup3 implements Linux syscall dup3(2). +func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldfd := args[0].Int() + newfd := args[1].Int() + flags := args[2].Uint() + + if oldfd == newfd { + return 0, nil, syserror.EINVAL + } + + return dup3(t, oldfd, newfd, flags) +} + +func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) { + if flags&^linux.O_CLOEXEC != 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(oldfd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + err := t.NewFDAtVFS2(newfd, file, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + return uintptr(newfd), nil, nil +} + +// Fcntl implements linux syscall fcntl(2). 
+func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + cmd := args[1].Int() + + file, flags := t.FDTable().GetVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + switch cmd { + case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: + minfd := args[2].Int() + fd, err := t.NewFDFromVFS2(minfd, file, kernel.FDFlags{ + CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC, + }) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil + case linux.F_GETFD: + return uintptr(flags.ToLinuxFDFlags()), nil, nil + case linux.F_SETFD: + flags := args[2].Uint() + t.FDTable().SetFlags(fd, kernel.FDFlags{ + CloseOnExec: flags&linux.FD_CLOEXEC != 0, + }) + return 0, nil, nil + case linux.F_GETFL: + return uintptr(file.StatusFlags()), nil, nil + case linux.F_SETFL: + return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint()) + default: + // TODO(gvisor.dev/issue/1623): Everything else is not yet supported. + return 0, nil, syserror.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go new file mode 100644 index 000000000..fc5ceea4c --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -0,0 +1,326 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Link implements Linux syscall link(2). +func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldpathAddr := args[0].Pointer() + newpathAddr := args[1].Pointer() + return 0, nil, linkat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */) +} + +// Linkat implements Linux syscall linkat(2). +func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + olddirfd := args[0].Int() + oldpathAddr := args[1].Pointer() + newdirfd := args[2].Int() + newpathAddr := args[3].Pointer() + flags := args[4].Int() + return 0, nil, linkat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags) +} + +func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags int32) error { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 { + return syserror.EINVAL + } + if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) { + return syserror.ENOENT + } + + oldpath, err := copyInPath(t, oldpathAddr) + if err != nil { + return err + } + oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0)) + if err != nil { + return err + } + defer oldtpop.Release() + + newpath, err := copyInPath(t, newpathAddr) + if err != nil { + return err + } + newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer newtpop.Release() + + return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop) +} + +// Mkdir implements Linux syscall mkdir(2). 
+func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := args[1].ModeT() + return 0, nil, mkdirat(t, linux.AT_FDCWD, addr, mode) +} + +// Mkdirat implements Linux syscall mkdirat(2). +func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + addr := args[1].Pointer() + mode := args[2].ModeT() + return 0, nil, mkdirat(t, dirfd, addr, mode) +} + +func mkdirat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint) error { + path, err := copyInPath(t, addr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{ + Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()), + }) +} + +// Mknod implements Linux syscall mknod(2). +func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := args[1].ModeT() + dev := args[2].Uint() + return 0, nil, mknodat(t, linux.AT_FDCWD, addr, mode, dev) +} + +// Mknodat implements Linux syscall mknodat(2). 
+func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + addr := args[1].Pointer() + mode := args[2].ModeT() + dev := args[3].Uint() + return 0, nil, mknodat(t, dirfd, addr, mode, dev) +} + +func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint32) error { + path, err := copyInPath(t, addr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + major, minor := linux.DecodeDeviceID(dev) + return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{ + Mode: linux.FileMode(mode &^ t.FSContext().Umask()), + DevMajor: uint32(major), + DevMinor: minor, + }) +} + +// Open implements Linux syscall open(2). +func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Uint() + mode := args[2].ModeT() + return openat(t, linux.AT_FDCWD, addr, flags, mode) +} + +// Openat implements Linux syscall openat(2). +func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + addr := args[1].Pointer() + flags := args[2].Uint() + mode := args[3].ModeT() + return openat(t, dirfd, addr, flags, mode) +} + +// Creat implements Linux syscall creat(2). 
+func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := args[1].ModeT() + return openat(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_CREAT|linux.O_TRUNC, mode) +} + +func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) { + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink(flags&linux.O_NOFOLLOW == 0)) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{ + Flags: flags, + Mode: linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()), + }) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) + return uintptr(fd), nil, err +} + +// Rename implements Linux syscall rename(2). +func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldpathAddr := args[0].Pointer() + newpathAddr := args[1].Pointer() + return 0, nil, renameat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */) +} + +// Renameat implements Linux syscall renameat(2). +func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + olddirfd := args[0].Int() + oldpathAddr := args[1].Pointer() + newdirfd := args[2].Int() + newpathAddr := args[3].Pointer() + return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, 0 /* flags */) +} + +// Renameat2 implements Linux syscall renameat2(2). 
+func Renameat2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + olddirfd := args[0].Int() + oldpathAddr := args[1].Pointer() + newdirfd := args[2].Int() + newpathAddr := args[3].Pointer() + flags := args[4].Uint() + return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags) +} + +func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags uint32) error { + oldpath, err := copyInPath(t, oldpathAddr) + if err != nil { + return err + } + // "If oldpath refers to a symbolic link, the link is renamed" - rename(2) + oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer oldtpop.Release() + + newpath, err := copyInPath(t, newpathAddr) + if err != nil { + return err + } + newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer newtpop.Release() + + return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{ + Flags: flags, + }) +} + +// Rmdir implements Linux syscall rmdir(2). +func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + return 0, nil, rmdirat(t, linux.AT_FDCWD, pathAddr) +} + +func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error { + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, followFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop) +} + +// Unlink implements Linux syscall unlink(2). 
+func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + return 0, nil, unlinkat(t, linux.AT_FDCWD, pathAddr) +} + +func unlinkat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error { + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop) +} + +// Unlinkat implements Linux syscall unlinkat(2). +func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + flags := args[2].Int() + + if flags&^linux.AT_REMOVEDIR != 0 { + return 0, nil, syserror.EINVAL + } + + if flags&linux.AT_REMOVEDIR != 0 { + return 0, nil, rmdirat(t, dirfd, pathAddr) + } + return 0, nil, unlinkat(t, dirfd, pathAddr) +} + +// Symlink implements Linux syscall symlink(2). +func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + targetAddr := args[0].Pointer() + linkpathAddr := args[1].Pointer() + return 0, nil, symlinkat(t, targetAddr, linux.AT_FDCWD, linkpathAddr) +} + +// Symlinkat implements Linux syscall symlinkat(2). 
+func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + targetAddr := args[0].Pointer() + newdirfd := args[1].Int() + linkpathAddr := args[2].Pointer() + return 0, nil, symlinkat(t, targetAddr, newdirfd, linkpathAddr) +} + +func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpathAddr usermem.Addr) error { + target, err := t.CopyInString(targetAddr, linux.PATH_MAX) + if err != nil { + return err + } + linkpath, err := copyInPath(t, linkpathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, newdirfd, linkpath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go new file mode 100644 index 000000000..317409a18 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go @@ -0,0 +1,131 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Getcwd implements Linux syscall getcwd(2). 
+func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + size := args[1].SizeT() + + root := t.FSContext().RootDirectoryVFS2() + wd := t.FSContext().WorkingDirectoryVFS2() + s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd) + root.DecRef() + wd.DecRef() + if err != nil { + return 0, nil, err + } + + // Note this is >= because we need a terminator. + if uint(len(s)) >= size { + return 0, nil, syserror.ERANGE + } + + // Construct a byte slice containing a NUL terminator. + buf := t.CopyScratchBuffer(len(s) + 1) + copy(buf, s) + buf[len(buf)-1] = 0 + + // Write the pathname slice. + n, err := t.CopyOutBytes(addr, buf) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Chdir implements Linux syscall chdir(2). +func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return 0, nil, err + } + t.FSContext().SetWorkingDirectoryVFS2(vd) + vd.DecRef() + return 0, nil, nil +} + +// Fchdir implements Linux syscall fchdir(2). 
+func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return 0, nil, err + } + t.FSContext().SetWorkingDirectoryVFS2(vd) + vd.DecRef() + return 0, nil, nil +} + +// Chroot implements Linux syscall chroot(2). +func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + if !t.HasCapability(linux.CAP_SYS_CHROOT) { + return 0, nil, syserror.EPERM + } + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return 0, nil, err + } + t.FSContext().SetRootDirectoryVFS2(vd) + vd.DecRef() + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go new file mode 100644 index 000000000..ddc140b65 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go @@ -0,0 +1,149 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Getdents implements Linux syscall getdents(2). +func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getdents(t, args, false /* isGetdents64 */) +} + +// Getdents64 implements Linux syscall getdents64(2). +func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getdents(t, args, true /* isGetdents64 */) +} + +func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := int(args[2].Uint()) + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + cb := getGetdentsCallback(t, addr, size, isGetdents64) + err := file.IterDirents(t, cb) + n := size - cb.remaining + putGetdentsCallback(cb) + if n == 0 { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +type getdentsCallback struct { + t *kernel.Task + addr usermem.Addr + remaining int + isGetdents64 bool +} + +var getdentsCallbackPool = sync.Pool{ + New: func() interface{} { + return &getdentsCallback{} + }, +} + +func getGetdentsCallback(t *kernel.Task, addr usermem.Addr, size int, isGetdents64 bool) *getdentsCallback { + cb := 
getdentsCallbackPool.Get().(*getdentsCallback) + *cb = getdentsCallback{ + t: t, + addr: addr, + remaining: size, + isGetdents64: isGetdents64, + } + return cb +} + +func putGetdentsCallback(cb *getdentsCallback) { + cb.t = nil + getdentsCallbackPool.Put(cb) +} + +// Handle implements vfs.IterDirentsCallback.Handle. +func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error { + var buf []byte + if cb.isGetdents64 { + // struct linux_dirent64 { + // ino64_t d_ino; /* 64-bit inode number */ + // off64_t d_off; /* 64-bit offset to next structure */ + // unsigned short d_reclen; /* Size of this dirent */ + // unsigned char d_type; /* File type */ + // char d_name[]; /* Filename (null-terminated) */ + // }; + size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name) + if size < cb.remaining { + return syserror.EINVAL + } + buf = cb.t.CopyScratchBuffer(size) + usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino) + usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff)) + usermem.ByteOrder.PutUint16(buf[16:18], uint16(size)) + buf[18] = dirent.Type + copy(buf[19:], dirent.Name) + buf[size-1] = 0 // NUL terminator + } else { + // struct linux_dirent { + // unsigned long d_ino; /* Inode number */ + // unsigned long d_off; /* Offset to next linux_dirent */ + // unsigned short d_reclen; /* Length of this linux_dirent */ + // char d_name[]; /* Filename (null-terminated) */ + // /* length is actually (d_reclen - 2 - + // offsetof(struct linux_dirent, d_name)) */ + // /* + // char pad; // Zero padding byte + // char d_type; // File type (only since Linux + // // 2.6.4); offset is (d_reclen - 1) + // */ + // }; + if cb.t.Arch().Width() != 8 { + panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width())) + } + size := 8 + 8 + 2 + 1 + 1 + 1 + len(dirent.Name) + if size < cb.remaining { + return syserror.EINVAL + } + buf = cb.t.CopyScratchBuffer(size) + usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino) + usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff)) + 
usermem.ByteOrder.PutUint16(buf[16:18], uint16(size)) + copy(buf[18:], dirent.Name) + buf[size-3] = 0 // NUL terminator + buf[size-2] = 0 // zero padding byte + buf[size-1] = dirent.Type + } + n, err := cb.t.CopyOutBytes(cb.addr, buf) + if err != nil { + // Don't report partially-written dirents by advancing cb.addr or + // cb.remaining. + return err + } + cb.addr += usermem.Addr(n) + cb.remaining -= n + return nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go new file mode 100644 index 000000000..5a2418da9 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -0,0 +1,35 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Ioctl implements Linux syscall ioctl(2). 
+func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + ret, err := file.Ioctl(t, t.MemoryManager(), args) + return ret, nil, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go index e0ac32b33..7d220bc20 100644 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build amd64 + package vfs2 import ( @@ -22,110 +24,142 @@ import ( // Override syscall table to add syscalls implementations from this package. func Override(table map[uintptr]kernel.Syscall) { table[0] = syscalls.Supported("read", Read) - - // Remove syscalls that haven't been converted yet. It's better to get ENOSYS - // rather than a SIGSEGV deep in the stack. 
- delete(table, 1) // write - delete(table, 2) // open - delete(table, 3) // close - delete(table, 4) // stat - delete(table, 5) // fstat - delete(table, 6) // lstat - delete(table, 7) // poll - delete(table, 8) // lseek - delete(table, 9) // mmap - delete(table, 16) // ioctl - delete(table, 17) // pread64 - delete(table, 18) // pwrite64 - delete(table, 19) // readv - delete(table, 20) // writev - delete(table, 21) // access - delete(table, 22) // pipe - delete(table, 32) // dup - delete(table, 33) // dup2 - delete(table, 40) // sendfile - delete(table, 59) // execve - delete(table, 72) // fcntl - delete(table, 73) // flock - delete(table, 74) // fsync - delete(table, 75) // fdatasync - delete(table, 76) // truncate - delete(table, 77) // ftruncate - delete(table, 78) // getdents - delete(table, 79) // getcwd - delete(table, 80) // chdir - delete(table, 81) // fchdir - delete(table, 82) // rename - delete(table, 83) // mkdir - delete(table, 84) // rmdir - delete(table, 85) // creat - delete(table, 86) // link - delete(table, 87) // unlink - delete(table, 88) // symlink - delete(table, 89) // readlink - delete(table, 90) // chmod - delete(table, 91) // fchmod - delete(table, 92) // chown - delete(table, 93) // fchown - delete(table, 94) // lchown - delete(table, 133) // mknod - delete(table, 137) // statfs - delete(table, 138) // fstatfs - delete(table, 161) // chroot - delete(table, 162) // sync + table[1] = syscalls.Supported("write", Write) + table[2] = syscalls.Supported("open", Open) + table[3] = syscalls.Supported("close", Close) + table[4] = syscalls.Supported("stat", Stat) + table[5] = syscalls.Supported("fstat", Fstat) + table[6] = syscalls.Supported("lstat", Lstat) + table[7] = syscalls.Supported("poll", Poll) + table[8] = syscalls.Supported("lseek", Lseek) + table[9] = syscalls.Supported("mmap", Mmap) + table[16] = syscalls.Supported("ioctl", Ioctl) + table[17] = syscalls.Supported("pread64", Pread64) + table[18] = syscalls.Supported("pwrite64", Pwrite64) 
+ table[19] = syscalls.Supported("readv", Readv) + table[20] = syscalls.Supported("writev", Writev) + table[21] = syscalls.Supported("access", Access) + delete(table, 22) // pipe + table[23] = syscalls.Supported("select", Select) + table[32] = syscalls.Supported("dup", Dup) + table[33] = syscalls.Supported("dup2", Dup2) + delete(table, 40) // sendfile + delete(table, 41) // socket + delete(table, 42) // connect + delete(table, 43) // accept + delete(table, 44) // sendto + delete(table, 45) // recvfrom + delete(table, 46) // sendmsg + delete(table, 47) // recvmsg + delete(table, 48) // shutdown + delete(table, 49) // bind + delete(table, 50) // listen + delete(table, 51) // getsockname + delete(table, 52) // getpeername + delete(table, 53) // socketpair + delete(table, 54) // setsockopt + delete(table, 55) // getsockopt + table[59] = syscalls.Supported("execve", Execve) + table[72] = syscalls.Supported("fcntl", Fcntl) + delete(table, 73) // flock + table[74] = syscalls.Supported("fsync", Fsync) + table[75] = syscalls.Supported("fdatasync", Fdatasync) + table[76] = syscalls.Supported("truncate", Truncate) + table[77] = syscalls.Supported("ftruncate", Ftruncate) + table[78] = syscalls.Supported("getdents", Getdents) + table[79] = syscalls.Supported("getcwd", Getcwd) + table[80] = syscalls.Supported("chdir", Chdir) + table[81] = syscalls.Supported("fchdir", Fchdir) + table[82] = syscalls.Supported("rename", Rename) + table[83] = syscalls.Supported("mkdir", Mkdir) + table[84] = syscalls.Supported("rmdir", Rmdir) + table[85] = syscalls.Supported("creat", Creat) + table[86] = syscalls.Supported("link", Link) + table[87] = syscalls.Supported("unlink", Unlink) + table[88] = syscalls.Supported("symlink", Symlink) + table[89] = syscalls.Supported("readlink", Readlink) + table[90] = syscalls.Supported("chmod", Chmod) + table[91] = syscalls.Supported("fchmod", Fchmod) + table[92] = syscalls.Supported("chown", Chown) + table[93] = syscalls.Supported("fchown", Fchown) + table[94] 
= syscalls.Supported("lchown", Lchown) + table[132] = syscalls.Supported("utime", Utime) + table[133] = syscalls.Supported("mknod", Mknod) + table[137] = syscalls.Supported("statfs", Statfs) + table[138] = syscalls.Supported("fstatfs", Fstatfs) + table[161] = syscalls.Supported("chroot", Chroot) + table[162] = syscalls.Supported("sync", Sync) delete(table, 165) // mount delete(table, 166) // umount2 - delete(table, 172) // iopl - delete(table, 173) // ioperm delete(table, 187) // readahead - delete(table, 188) // setxattr - delete(table, 189) // lsetxattr - delete(table, 190) // fsetxattr - delete(table, 191) // getxattr - delete(table, 192) // lgetxattr - delete(table, 193) // fgetxattr + table[188] = syscalls.Supported("setxattr", Setxattr) + table[189] = syscalls.Supported("lsetxattr", Lsetxattr) + table[190] = syscalls.Supported("fsetxattr", Fsetxattr) + table[191] = syscalls.Supported("getxattr", Getxattr) + table[192] = syscalls.Supported("lgetxattr", Lgetxattr) + table[193] = syscalls.Supported("fgetxattr", Fgetxattr) + table[194] = syscalls.Supported("listxattr", Listxattr) + table[195] = syscalls.Supported("llistxattr", Llistxattr) + table[196] = syscalls.Supported("flistxattr", Flistxattr) + table[197] = syscalls.Supported("removexattr", Removexattr) + table[198] = syscalls.Supported("lremovexattr", Lremovexattr) + table[199] = syscalls.Supported("fremovexattr", Fremovexattr) delete(table, 206) // io_setup delete(table, 207) // io_destroy delete(table, 208) // io_getevents delete(table, 209) // io_submit delete(table, 210) // io_cancel - delete(table, 213) // epoll_create - delete(table, 214) // epoll_ctl_old - delete(table, 215) // epoll_wait_old - delete(table, 216) // remap_file_pages - delete(table, 217) // getdents64 - delete(table, 232) // epoll_wait - delete(table, 233) // epoll_ctl + table[213] = syscalls.Supported("epoll_create", EpollCreate) + table[217] = syscalls.Supported("getdents64", Getdents64) + delete(table, 221) // fdavise64 + 
table[232] = syscalls.Supported("epoll_wait", EpollWait) + table[233] = syscalls.Supported("epoll_ctl", EpollCtl) + table[235] = syscalls.Supported("utimes", Utimes) delete(table, 253) // inotify_init delete(table, 254) // inotify_add_watch delete(table, 255) // inotify_rm_watch - delete(table, 257) // openat - delete(table, 258) // mkdirat - delete(table, 259) // mknodat - delete(table, 260) // fchownat - delete(table, 261) // futimesat - delete(table, 262) // fstatat - delete(table, 263) // unlinkat - delete(table, 264) // renameat - delete(table, 265) // linkat - delete(table, 266) // symlinkat - delete(table, 267) // readlinkat - delete(table, 268) // fchmodat - delete(table, 269) // faccessat - delete(table, 270) // pselect - delete(table, 271) // ppoll + table[257] = syscalls.Supported("openat", Openat) + table[258] = syscalls.Supported("mkdirat", Mkdirat) + table[259] = syscalls.Supported("mknodat", Mknodat) + table[260] = syscalls.Supported("fchownat", Fchownat) + table[261] = syscalls.Supported("futimens", Futimens) + table[262] = syscalls.Supported("newfstatat", Newfstatat) + table[263] = syscalls.Supported("unlinkat", Unlinkat) + table[264] = syscalls.Supported("renameat", Renameat) + table[265] = syscalls.Supported("linkat", Linkat) + table[266] = syscalls.Supported("symlinkat", Symlinkat) + table[267] = syscalls.Supported("readlinkat", Readlinkat) + table[268] = syscalls.Supported("fchmodat", Fchmodat) + table[269] = syscalls.Supported("faccessat", Faccessat) + table[270] = syscalls.Supported("pselect", Pselect) + table[271] = syscalls.Supported("ppoll", Ppoll) + delete(table, 275) // splice + delete(table, 276) // tee + table[277] = syscalls.Supported("sync_file_range", SyncFileRange) + table[280] = syscalls.Supported("utimensat", Utimensat) + table[281] = syscalls.Supported("epoll_pwait", EpollPwait) + delete(table, 282) // signalfd + delete(table, 283) // timerfd_create + delete(table, 284) // eventfd delete(table, 285) // fallocate - delete(table, 
291) // epoll_create1 - delete(table, 292) // dup3 + delete(table, 286) // timerfd_settime + delete(table, 287) // timerfd_gettime + delete(table, 288) // accept4 + delete(table, 289) // signalfd4 + delete(table, 290) // eventfd2 + table[291] = syscalls.Supported("epoll_create1", EpollCreate1) + table[292] = syscalls.Supported("dup3", Dup3) delete(table, 293) // pipe2 delete(table, 294) // inotify_init1 - delete(table, 295) // preadv - delete(table, 296) // pwritev - delete(table, 306) // syncfs - delete(table, 316) // renameat2 + table[295] = syscalls.Supported("preadv", Preadv) + table[296] = syscalls.Supported("pwritev", Pwritev) + delete(table, 299) // recvmmsg + table[306] = syscalls.Supported("syncfs", Syncfs) + delete(table, 307) // sendmmsg + table[316] = syscalls.Supported("renameat2", Renameat2) delete(table, 319) // memfd_create - delete(table, 322) // execveat - delete(table, 327) // preadv2 - delete(table, 328) // pwritev2 - delete(table, 332) // statx + table[322] = syscalls.Supported("execveat", Execveat) + table[327] = syscalls.Supported("preadv2", Preadv2) + table[328] = syscalls.Supported("pwritev2", Pwritev2) + table[332] = syscalls.Supported("statx", Statx) } diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go index 6af5c400f..a6b367468 100644 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build arm64 + package vfs2 import ( diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go new file mode 100644 index 000000000..60a43f0a0 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go @@ -0,0 +1,92 @@ +// Copyright 2020 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Mmap implements Linux syscall mmap(2). +func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + prot := args[2].Int() + flags := args[3].Int() + fd := args[4].Int() + fixed := flags&linux.MAP_FIXED != 0 + private := flags&linux.MAP_PRIVATE != 0 + shared := flags&linux.MAP_SHARED != 0 + anon := flags&linux.MAP_ANONYMOUS != 0 + map32bit := flags&linux.MAP_32BIT != 0 + + // Require exactly one of MAP_PRIVATE and MAP_SHARED. 
+ if private == shared { + return 0, nil, syserror.EINVAL + } + + opts := memmap.MMapOpts{ + Length: args[1].Uint64(), + Offset: args[5].Uint64(), + Addr: args[0].Pointer(), + Fixed: fixed, + Unmap: fixed, + Map32Bit: map32bit, + Private: private, + Perms: usermem.AccessType{ + Read: linux.PROT_READ&prot != 0, + Write: linux.PROT_WRITE&prot != 0, + Execute: linux.PROT_EXEC&prot != 0, + }, + MaxPerms: usermem.AnyAccess, + GrowsDown: linux.MAP_GROWSDOWN&flags != 0, + Precommit: linux.MAP_POPULATE&flags != 0, + } + if linux.MAP_LOCKED&flags != 0 { + opts.MLockMode = memmap.MLockEager + } + defer func() { + if opts.MappingIdentity != nil { + opts.MappingIdentity.DecRef() + } + }() + + if !anon { + // Convert the passed FD to a file reference. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // mmap unconditionally requires that the FD is readable. + if !file.IsReadable() { + return 0, nil, syserror.EACCES + } + // MAP_SHARED requires that the FD be writable for PROT_WRITE. + if shared && !file.IsWritable() { + opts.MaxPerms.Write = false + } + + if err := file.ConfigureMMap(t, &opts); err != nil { + return 0, nil, err + } + } + + rv, err := t.MemoryManager().MMap(t, opts) + return uintptr(rv), nil, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go new file mode 100644 index 000000000..97da6c647 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/path.go @@ -0,0 +1,94 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +func copyInPath(t *kernel.Task, addr usermem.Addr) (fspath.Path, error) { + pathname, err := t.CopyInString(addr, linux.PATH_MAX) + if err != nil { + return fspath.Path{}, err + } + return fspath.Parse(pathname), nil +} + +type taskPathOperation struct { + pop vfs.PathOperation + haveStartRef bool +} + +func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink) (taskPathOperation, error) { + root := t.FSContext().RootDirectoryVFS2() + start := root + haveStartRef := false + if !path.Absolute { + if !path.HasComponents() && !bool(shouldAllowEmptyPath) { + root.DecRef() + return taskPathOperation{}, syserror.ENOENT + } + if dirfd == linux.AT_FDCWD { + start = t.FSContext().WorkingDirectoryVFS2() + haveStartRef = true + } else { + dirfile := t.GetFileVFS2(dirfd) + if dirfile == nil { + root.DecRef() + return taskPathOperation{}, syserror.EBADF + } + start = dirfile.VirtualDentry() + start.IncRef() + haveStartRef = true + dirfile.DecRef() + } + } + return taskPathOperation{ + pop: vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: bool(shouldFollowFinalSymlink), + }, + haveStartRef: haveStartRef, + }, nil +} + +func (tpop *taskPathOperation) Release() { + tpop.pop.Root.DecRef() + if tpop.haveStartRef { + tpop.pop.Start.DecRef() + tpop.haveStartRef = false + } +} + +type shouldAllowEmptyPath bool + +const ( + disallowEmptyPath shouldAllowEmptyPath = false + allowEmptyPath shouldAllowEmptyPath = true +) + +type shouldFollowFinalSymlink bool + +const ( + nofollowFinalSymlink 
shouldFollowFinalSymlink = false + followFinalSymlink shouldFollowFinalSymlink = true +) diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go new file mode 100644 index 000000000..dbf4882da --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/poll.go @@ -0,0 +1,584 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "fmt" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// fileCap is the maximum allowable files for poll & select. This has no +// equivalent in Linux; it exists in gVisor since allocation failure in Go is +// unrecoverable. +const fileCap = 1024 * 1024 + +// Masks for "readable", "writable", and "exceptional" events as defined by +// select(2). +const ( + // selectReadEvents is analogous to the Linux kernel's + // fs/select.c:POLLIN_SET. + selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR + + // selectWriteEvents is analogous to the Linux kernel's + // fs/select.c:POLLOUT_SET. + selectWriteEvents = linux.POLLOUT | linux.POLLERR + + // selectExceptEvents is analogous to the Linux kernel's + // fs/select.c:POLLEX_SET. 
+ selectExceptEvents = linux.POLLPRI +) + +// pollState tracks the associated file description and waiter of a PollFD. +type pollState struct { + file *vfs.FileDescription + waiter waiter.Entry +} + +// initReadiness gets the current ready mask for the file represented by the FD +// stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is +// used to register with the file for event notifications, and a reference to +// the file is stored in "state". +func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) { + if pfd.FD < 0 { + pfd.REvents = 0 + return + } + + file := t.GetFileVFS2(pfd.FD) + if file == nil { + pfd.REvents = linux.POLLNVAL + return + } + + if ch == nil { + defer file.DecRef() + } else { + state.file = file + state.waiter, _ = waiter.NewChannelEntry(ch) + file.EventRegister(&state.waiter, waiter.EventMaskFromLinux(uint32(pfd.Events))) + } + + r := file.Readiness(waiter.EventMaskFromLinux(uint32(pfd.Events))) + pfd.REvents = int16(r.ToLinux()) & pfd.Events +} + +// releaseState releases all the pollState in "state". +func releaseState(state []pollState) { + for i := range state { + if state[i].file != nil { + state[i].file.EventUnregister(&state[i].waiter) + state[i].file.DecRef() + } + } +} + +// pollBlock polls the PollFDs in "pfd" with a bounded time specified in "timeout" +// when "timeout" is greater than zero. +// +// pollBlock returns the remaining timeout, which is always 0 on a timeout; and 0 or +// positive if interrupted by a signal. +func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) { + var ch chan struct{} + if timeout != 0 { + ch = make(chan struct{}, 1) + } + + // Register for event notification in the files involved if we may + // block (timeout not zero). Once we find a file that has a non-zero + // result, we stop registering for events but still go through all files + // to get their ready masks. 
+ state := make([]pollState, len(pfd)) + defer releaseState(state) + n := uintptr(0) + for i := range pfd { + initReadiness(t, &pfd[i], &state[i], ch) + if pfd[i].REvents != 0 { + n++ + ch = nil + } + } + + if timeout == 0 { + return timeout, n, nil + } + + haveTimeout := timeout >= 0 + + for n == 0 { + var err error + // Wait for a notification. + timeout, err = t.BlockWithTimeout(ch, haveTimeout, timeout) + if err != nil { + if err == syserror.ETIMEDOUT { + err = nil + } + return timeout, 0, err + } + + // We got notified, count how many files are ready. If none, + // then this was a spurious notification, and we just go back + // to sleep with the remaining timeout. + for i := range state { + if state[i].file == nil { + continue + } + + r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events))) + rl := int16(r.ToLinux()) & pfd[i].Events + if rl != 0 { + pfd[i].REvents = rl + n++ + } + } + } + + return timeout, n, nil +} + +// copyInPollFDs copies an array of struct pollfd unless nfds exceeds the max. +func copyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD, error) { + if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { + return nil, syserror.EINVAL + } + + pfd := make([]linux.PollFD, nfds) + if nfds > 0 { + if _, err := t.CopyIn(addr, &pfd); err != nil { + return nil, err + } + } + + return pfd, nil +} + +func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) { + pfd, err := copyInPollFDs(t, addr, nfds) + if err != nil { + return timeout, 0, err + } + + // Compatibility warning: Linux adds POLLHUP and POLLERR just before + // polling, in fs/select.c:do_pollfd(). Since pfd is copied out after + // polling, changing event masks here is an application-visible difference. + // (Linux also doesn't copy out event masks at all, only revents.) 
+ for i := range pfd { + pfd[i].Events |= linux.POLLHUP | linux.POLLERR + } + remainingTimeout, n, err := pollBlock(t, pfd, timeout) + err = syserror.ConvertIntr(err, syserror.EINTR) + + // The poll entries are copied out regardless of whether + // any are set or not. This aligns with the Linux behavior. + if nfds > 0 && err == nil { + if _, err := t.CopyOut(addr, pfd); err != nil { + return remainingTimeout, 0, err + } + } + + return remainingTimeout, n, err +} + +// CopyInFDSet copies an fd set from select(2)/pselect(2). +func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) { + set := make([]byte, nBytes) + + if addr != 0 { + if _, err := t.CopyIn(addr, &set); err != nil { + return nil, err + } + // If we only use part of the last byte, mask out the extraneous bits. + // + // N.B. This only works on little-endian architectures. + if nBitsInLastPartialByte != 0 { + set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte + } + } + return set, nil +} + +func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) { + if nfds < 0 || nfds > fileCap { + return 0, syserror.EINVAL + } + + // Calculate the size of the fd sets (one bit per fd). + nBytes := (nfds + 7) / 8 + nBitsInLastPartialByte := nfds % 8 + + // Capture all the provided input vectors. + r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err + } + w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err + } + e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err + } + + // Count how many FDs are actually being requested so that we can build + // a PollFD array. + fdCount := 0 + for i := 0; i < nBytes; i++ { + v := r[i] | w[i] | e[i] + for v != 0 { + v &= (v - 1) + fdCount++ + } + } + + // Build the PollFD array. 
+ pfd := make([]linux.PollFD, 0, fdCount) + var fd int32 + for i := 0; i < nBytes; i++ { + rV, wV, eV := r[i], w[i], e[i] + v := rV | wV | eV + m := byte(1) + for j := 0; j < 8; j++ { + if (v & m) != 0 { + // Make sure the fd is valid and decrement the reference + // immediately to ensure we don't leak. Note, another thread + // might be about to close fd. This is racy, but that's + // OK. Linux is racy in the same way. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, syserror.EBADF + } + file.DecRef() + + var mask int16 + if (rV & m) != 0 { + mask |= selectReadEvents + } + + if (wV & m) != 0 { + mask |= selectWriteEvents + } + + if (eV & m) != 0 { + mask |= selectExceptEvents + } + + pfd = append(pfd, linux.PollFD{ + FD: fd, + Events: mask, + }) + } + + fd++ + m <<= 1 + } + } + + // Do the syscall, then count the number of bits set. + if _, _, err = pollBlock(t, pfd, timeout); err != nil { + return 0, syserror.ConvertIntr(err, syserror.EINTR) + } + + // r, w, and e are currently event mask bitsets; unset bits corresponding + // to events that *didn't* occur. + bitSetCount := uintptr(0) + for idx := range pfd { + events := pfd[idx].REvents + i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8) + m := byte(1) << j + if r[i]&m != 0 { + if (events & selectReadEvents) != 0 { + bitSetCount++ + } else { + r[i] &^= m + } + } + if w[i]&m != 0 { + if (events & selectWriteEvents) != 0 { + bitSetCount++ + } else { + w[i] &^= m + } + } + if e[i]&m != 0 { + if (events & selectExceptEvents) != 0 { + bitSetCount++ + } else { + e[i] &^= m + } + } + } + + // Copy updated vectors back. 
+ if readFDs != 0 { + if _, err := t.CopyOut(readFDs, r); err != nil { + return 0, err + } + } + + if writeFDs != 0 { + if _, err := t.CopyOut(writeFDs, w); err != nil { + return 0, err + } + } + + if exceptFDs != 0 { + if _, err := t.CopyOut(exceptFDs, e); err != nil { + return 0, err + } + } + + return bitSetCount, nil +} + +// timeoutRemaining returns the amount of time remaining for the specified +// timeout or 0 if it has elapsed. +// +// startNs must be from CLOCK_MONOTONIC. +func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration { + now := t.Kernel().MonotonicClock().Now() + remaining := timeout - now.Sub(startNs) + if remaining < 0 { + remaining = 0 + } + return remaining +} + +// copyOutTimespecRemaining copies the time remaining in timeout to timespecAddr. +// +// startNs must be from CLOCK_MONOTONIC. +func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error { + if timeout <= 0 { + return nil + } + remaining := timeoutRemaining(t, startNs, timeout) + tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds()) + return tsRemaining.CopyOut(t, timespecAddr) +} + +// copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr. +// +// startNs must be from CLOCK_MONOTONIC. +func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error { + if timeout <= 0 { + return nil + } + remaining := timeoutRemaining(t, startNs, timeout) + tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds()) + return tvRemaining.CopyOut(t, timevalAddr) +} + +// pollRestartBlock encapsulates the state required to restart poll(2) via +// restart_syscall(2). +// +// +stateify savable +type pollRestartBlock struct { + pfdAddr usermem.Addr + nfds uint + timeout time.Duration +} + +// Restart implements kernel.SyscallRestartBlock.Restart. 
+func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) { + return poll(t, p.pfdAddr, p.nfds, p.timeout) +} + +func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) { + remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout) + // On an interrupt poll(2) is restarted with the remaining timeout. + if err == syserror.EINTR { + t.SetSyscallRestartBlock(&pollRestartBlock{ + pfdAddr: pfdAddr, + nfds: nfds, + timeout: remainingTimeout, + }) + return 0, kernel.ERESTART_RESTARTBLOCK + } + return n, err +} + +// Poll implements linux syscall poll(2). +func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pfdAddr := args[0].Pointer() + nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. + timeout := time.Duration(args[2].Int()) * time.Millisecond + n, err := poll(t, pfdAddr, nfds, timeout) + return n, nil, err +} + +// Ppoll implements linux syscall ppoll(2). +func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pfdAddr := args[0].Pointer() + nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. + timespecAddr := args[2].Pointer() + maskAddr := args[3].Pointer() + maskSize := uint(args[4].Uint()) + + timeout, err := copyTimespecInToDuration(t, timespecAddr) + if err != nil { + return 0, nil, err + } + + var startNs ktime.Time + if timeout > 0 { + startNs = t.Kernel().MonotonicClock().Now() + } + + if err := setTempSignalSet(t, maskAddr, maskSize); err != nil { + return 0, nil, err + } + + _, n, err := doPoll(t, pfdAddr, nfds, timeout) + copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) + // doPoll returns EINTR if interrupted, but ppoll is normally restartable + // if interrupted by something other than a signal handled by the + // application (i.e. returns ERESTARTNOHAND). 
However, if + // copyOutTimespecRemaining failed, then the restarted ppoll would use the + // wrong timeout, so the error should be left as EINTR. + // + // Note that this means that if err is nil but copyErr is not, copyErr is + // ignored. This is consistent with Linux. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// Select implements linux syscall select(2). +func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nfds := int(args[0].Int()) // select(2) uses an int. + readFDs := args[1].Pointer() + writeFDs := args[2].Pointer() + exceptFDs := args[3].Pointer() + timevalAddr := args[4].Pointer() + + // Use a negative Duration to indicate "no timeout". + timeout := time.Duration(-1) + if timevalAddr != 0 { + var timeval linux.Timeval + if err := timeval.CopyIn(t, timevalAddr); err != nil { + return 0, nil, err + } + if timeval.Sec < 0 || timeval.Usec < 0 { + return 0, nil, syserror.EINVAL + } + timeout = time.Duration(timeval.ToNsecCapped()) + } + startNs := t.Kernel().MonotonicClock().Now() + n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) + copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr) + // See comment in Ppoll. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// Pselect implements linux syscall pselect(2). +func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nfds := int(args[0].Int()) // select(2) uses an int. 
+ readFDs := args[1].Pointer() + writeFDs := args[2].Pointer() + exceptFDs := args[3].Pointer() + timespecAddr := args[4].Pointer() + maskWithSizeAddr := args[5].Pointer() + + timeout, err := copyTimespecInToDuration(t, timespecAddr) + if err != nil { + return 0, nil, err + } + + var startNs ktime.Time + if timeout > 0 { + startNs = t.Kernel().MonotonicClock().Now() + } + + if maskWithSizeAddr != 0 { + if t.Arch().Width() != 8 { + panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width())) + } + var maskStruct sigSetWithSize + if err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil { + return 0, nil, err + } + if err := setTempSignalSet(t, usermem.Addr(maskStruct.sigsetAddr), uint(maskStruct.sizeofSigset)); err != nil { + return 0, nil, err + } + } + + n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) + copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) + // See comment in Ppoll. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// +marshal +type sigSetWithSize struct { + sigsetAddr uint64 + sizeofSigset uint64 +} + +// copyTimespecInToDuration copies a Timespec from the untrusted app range, +// validates it and converts it to a Duration. +// +// If the Timespec is larger than what can be represented in a Duration, the +// returned value is the maximum that Duration will allow. +// +// If timespecAddr is NULL, the returned value is negative. +func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.Duration, error) { + // Use a negative Duration to indicate "no timeout". 
+ timeout := time.Duration(-1) + if timespecAddr != 0 { + var timespec linux.Timespec + if err := timespec.CopyIn(t, timespecAddr); err != nil { + return 0, err + } + if !timespec.Valid() { + return 0, syserror.EINVAL + } + timeout = time.Duration(timespec.ToNsecCapped()) + } + return timeout, nil +} + +func setTempSignalSet(t *kernel.Task, maskAddr usermem.Addr, maskSize uint) error { + if maskAddr == 0 { + return nil + } + if maskSize != linux.SignalSetSize { + return syserror.EINVAL + } + var mask linux.SignalSet + if err := mask.CopyIn(t, maskAddr); err != nil { + return err + } + mask &^= kernel.UnblockableSignals + oldmask := t.SignalMask() + t.SetSignalMask(mask) + t.SetSavedSignalMask(oldmask) + return nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go new file mode 100644 index 000000000..35f6308d6 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -0,0 +1,511 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +const ( + eventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr + eventMaskWrite = waiter.EventOut | waiter.EventHUp | waiter.EventErr +) + +// Read implements Linux syscall read(2). +func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := read(t, file, dst, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file) +} + +// Readv implements Linux syscall readv(2). +func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Get the destination of the read. 
+ dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := read(t, file, dst, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "readv", file) +} + +func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + n, err := file.Read(t, dst, opts) + if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 { + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&w, eventMaskRead) + + total := n + for { + // Shorten dst to reflect bytes previously read. + dst = dst.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". + n, err := file.Read(t, dst, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + if err := t.Block(ch); err != nil { + break + } + } + file.EventUnregister(&w) + + return total, err +} + +// Pread64 implements Linux syscall pread64(2). +func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + offset := args[3].Int64() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. 
+ dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file) +} + +// Preadv implements Linux syscall preadv(2). +func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file) +} + +// Preadv2 implements Linux syscall preadv2(2). +func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // While the glibc signature is + // preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) + // the actual syscall + // (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1142) + // splits the offset argument into a high/low value for compatibility with + // 32-bit architectures. The flags argument is the 6th argument (index 5). + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + flags := args[5].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. 
+ if offset < -1 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + opts := vfs.ReadOptions{ + Flags: uint32(flags), + } + var n int64 + if offset == -1 { + n, err = read(t, file, dst, opts) + } else { + n, err = pread(t, file, dst, offset, opts) + } + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file) +} + +func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + n, err := file.PRead(t, dst, offset, opts) + if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 { + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&w, eventMaskRead) + + total := n + for { + // Shorten dst to reflect bytes previously read. + dst = dst.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". + n, err := file.PRead(t, dst, offset+total, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + if err := t.Block(ch); err != nil { + break + } + } + file.EventUnregister(&w) + + return total, err +} + +// Write implements Linux syscall write(2). +func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. 
+ src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := write(t, file, src, vfs.WriteOptions{}) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "write", file) +} + +// Writev implements Linux syscall writev(2). +func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Get the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := write(t, file, src, vfs.WriteOptions{}) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "writev", file) +} + +func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + n, err := file.Write(t, src, opts) + if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 { + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&w, eventMaskWrite) + + total := n + for { + // Shorten src to reflect bytes previously written. + src = src.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". + n, err := file.Write(t, src, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + if err := t.Block(ch); err != nil { + break + } + } + file.EventUnregister(&w) + + return total, err +} + +// Pwrite64 implements Linux syscall pwrite64(2). 
+func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + offset := args[3].Int64() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file) +} + +// Pwritev implements Linux syscall pwritev(2). +func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file) +} + +// Pwritev2 implements Linux syscall pwritev2(2). 
+func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // While the glibc signature is + // pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) + // the actual syscall + // (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1162) + // splits the offset argument into a high/low value for compatibility with + // 32-bit architectures. The flags argument is the 6th argument (index 5). + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + flags := args[5].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < -1 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + opts := vfs.WriteOptions{ + Flags: uint32(flags), + } + var n int64 + if offset == -1 { + n, err = write(t, file, src, opts) + } else { + n, err = pwrite(t, file, src, offset, opts) + } + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file) +} + +func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + n, err := file.PWrite(t, src, offset, opts) + if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 { + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&w, eventMaskWrite) + + total := n + for { + // Shorten src to reflect bytes previously written. + src = src.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". 
+ n, err := file.PWrite(t, src, offset+total, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + if err := t.Block(ch); err != nil { + break + } + } + file.EventUnregister(&w) + + return total, err +} + +// Lseek implements Linux syscall lseek(2). +func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + offset := args[1].Int64() + whence := args[2].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + newoff, err := file.Seek(t, offset, whence) + return uintptr(newoff), nil, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go new file mode 100644 index 000000000..9250659ff --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -0,0 +1,380 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX + +// Chmod implements Linux syscall chmod(2). 
+func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + mode := args[1].ModeT() + return 0, nil, fchmodat(t, linux.AT_FDCWD, pathAddr, mode) +} + +// Fchmodat implements Linux syscall fchmodat(2). +func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + mode := args[2].ModeT() + return 0, nil, fchmodat(t, dirfd, pathAddr, mode) +} + +func fchmodat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) error { + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + + return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_MODE, + Mode: uint16(mode & chmodMask), + }, + }) +} + +// Fchmod implements Linux syscall fchmod(2). +func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + mode := args[1].ModeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, file.SetStat(t, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_MODE, + Mode: uint16(mode & chmodMask), + }, + }) +} + +// Chown implements Linux syscall chown(2). +func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + owner := args[1].Int() + group := args[2].Int() + return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, 0 /* flags */) +} + +// Lchown implements Linux syscall lchown(2). 
+func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + owner := args[1].Int() + group := args[2].Int() + return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, linux.AT_SYMLINK_NOFOLLOW) +} + +// Fchownat implements Linux syscall fchownat(2). +func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + owner := args[2].Int() + group := args[3].Int() + flags := args[4].Int() + return 0, nil, fchownat(t, dirfd, pathAddr, owner, group, flags) +} + +func fchownat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, owner, group, flags int32) error { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return syserror.EINVAL + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { + return err + } + + return setstatat(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts) +} + +func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error { + userns := t.UserNamespace() + if owner != -1 { + kuid := userns.MapToKUID(auth.UID(owner)) + if !kuid.Ok() { + return syserror.EINVAL + } + opts.Stat.Mask |= linux.STATX_UID + opts.Stat.UID = uint32(kuid) + } + if group != -1 { + kgid := userns.MapToKGID(auth.GID(group)) + if !kgid.Ok() { + return syserror.EINVAL + } + opts.Stat.Mask |= linux.STATX_GID + opts.Stat.GID = uint32(kgid) + } + return nil +} + +// Fchown implements Linux syscall fchown(2). 
+func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + owner := args[1].Int() + group := args[2].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { + return 0, nil, err + } + return 0, nil, file.SetStat(t, opts) +} + +// Truncate implements Linux syscall truncate(2). +func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].Int64() + + if length < 0 { + return 0, nil, syserror.EINVAL + } + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + + return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_SIZE, + Size: uint64(length), + }, + }) +} + +// Ftruncate implements Linux syscall ftruncate(2). +func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + length := args[1].Int64() + + if length < 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, file.SetStat(t, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_SIZE, + Size: uint64(length), + }, + }) +} + +// Utime implements Linux syscall utime(2). 
+func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + timesAddr := args[1].Pointer() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + + opts := vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_ATIME | linux.STATX_MTIME, + }, + } + if timesAddr == 0 { + opts.Stat.Atime.Nsec = linux.UTIME_NOW + opts.Stat.Mtime.Nsec = linux.UTIME_NOW + } else { + var times linux.Utime + if err := times.CopyIn(t, timesAddr); err != nil { + return 0, nil, err + } + opts.Stat.Atime.Sec = times.Actime + opts.Stat.Mtime.Sec = times.Modtime + } + + return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts) +} + +// Utimes implements Linux syscall utimes(2). +func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + timesAddr := args[1].Pointer() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + + opts := vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_ATIME | linux.STATX_MTIME, + }, + } + if timesAddr == 0 { + opts.Stat.Atime.Nsec = linux.UTIME_NOW + opts.Stat.Mtime.Nsec = linux.UTIME_NOW + } else { + var times [2]linux.Timeval + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return 0, nil, err + } + opts.Stat.Atime = linux.StatxTimestamp{ + Sec: times[0].Sec, + Nsec: uint32(times[0].Usec * 1000), + } + opts.Stat.Mtime = linux.StatxTimestamp{ + Sec: times[1].Sec, + Nsec: uint32(times[1].Usec * 1000), + } + } + + return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts) +} + +// Utimensat implements Linux syscall utimensat(2). 
+func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + timesAddr := args[2].Pointer() + flags := args[3].Int() + + if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 { + return 0, nil, syserror.EINVAL + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil { + return 0, nil, err + } + + return 0, nil, setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &opts) +} + +// Futimens implements Linux syscall futimens(2). +func Futimens(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + timesAddr := args[1].Pointer() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil { + return 0, nil, err + } + + return 0, nil, file.SetStat(t, opts) +} + +func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error { + if timesAddr == 0 { + opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME + opts.Stat.Atime.Nsec = linux.UTIME_NOW + opts.Stat.Mtime.Nsec = linux.UTIME_NOW + return nil + } + var times [2]linux.Timespec + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return err + } + if times[0].Nsec != linux.UTIME_OMIT { + opts.Stat.Mask |= linux.STATX_ATIME + opts.Stat.Atime = linux.StatxTimestamp{ + Sec: times[0].Sec, + Nsec: uint32(times[0].Nsec), + } + } + if times[1].Nsec != linux.UTIME_OMIT { + opts.Stat.Mask |= linux.STATX_MTIME + opts.Stat.Mtime = linux.StatxTimestamp{ + Sec: times[1].Sec, + Nsec: uint32(times[1].Nsec), + } + } + return nil +} + +func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, 
shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error { + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + start := root + if !path.Absolute { + if !path.HasComponents() && !bool(shouldAllowEmptyPath) { + return syserror.ENOENT + } + if dirfd == linux.AT_FDCWD { + start = t.FSContext().WorkingDirectoryVFS2() + defer start.DecRef() + } else { + dirfile := t.GetFileVFS2(dirfd) + if dirfile == nil { + return syserror.EBADF + } + if !path.HasComponents() { + // Use FileDescription.SetStat() instead of + // VirtualFilesystem.SetStatAt(), since the former may be able + // to use opened file state to expedite the SetStat. + err := dirfile.SetStat(t, *opts) + dirfile.DecRef() + return err + } + start = dirfile.VirtualDentry() + start.IncRef() + defer start.DecRef() + dirfile.DecRef() + } + } + return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: bool(shouldFollowFinalSymlink), + }, opts) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go new file mode 100644 index 000000000..dca8d7011 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/stat.go @@ -0,0 +1,346 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/gohacks" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Stat implements Linux syscall stat(2). +func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + statAddr := args[1].Pointer() + return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, 0 /* flags */) +} + +// Lstat implements Linux syscall lstat(2). +func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + statAddr := args[1].Pointer() + return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, linux.AT_SYMLINK_NOFOLLOW) +} + +// Newfstatat implements Linux syscall newfstatat, which backs fstatat(2). 
+func Newfstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + statAddr := args[2].Pointer() + flags := args[3].Int() + return 0, nil, fstatat(t, dirfd, pathAddr, statAddr, flags) +} + +func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags int32) error { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return syserror.EINVAL + } + + opts := vfs.StatOptions{ + Mask: linux.STATX_BASIC_STATS, + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + start := root + if !path.Absolute { + if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { + return syserror.ENOENT + } + if dirfd == linux.AT_FDCWD { + start = t.FSContext().WorkingDirectoryVFS2() + defer start.DecRef() + } else { + dirfile := t.GetFileVFS2(dirfd) + if dirfile == nil { + return syserror.EBADF + } + if !path.HasComponents() { + // Use FileDescription.Stat() instead of + // VirtualFilesystem.StatAt() for fstatat(fd, ""), since the + // former may be able to use opened file state to expedite the + // Stat. + statx, err := dirfile.Stat(t, opts) + dirfile.DecRef() + if err != nil { + return err + } + var stat linux.Stat + convertStatxToUserStat(t, &statx, &stat) + return stat.CopyOut(t, statAddr) + } + start = dirfile.VirtualDentry() + start.IncRef() + defer start.DecRef() + dirfile.DecRef() + } + } + + statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, + }, &opts) + if err != nil { + return err + } + var stat linux.Stat + convertStatxToUserStat(t, &statx, &stat) + return stat.CopyOut(t, statAddr) +} + +// This takes both input and output as pointer arguments to avoid copying large +// structs. 
+func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) { + // Linux just copies fields from struct kstat without regard to struct + // kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too. + userns := t.UserNamespace() + *stat = linux.Stat{ + Dev: uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)), + Ino: statx.Ino, + Nlink: uint64(statx.Nlink), + Mode: uint32(statx.Mode), + UID: uint32(auth.KUID(statx.UID).In(userns).OrOverflow()), + GID: uint32(auth.KGID(statx.GID).In(userns).OrOverflow()), + Rdev: uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)), + Size: int64(statx.Size), + Blksize: int64(statx.Blksize), + Blocks: int64(statx.Blocks), + ATime: timespecFromStatxTimestamp(statx.Atime), + MTime: timespecFromStatxTimestamp(statx.Mtime), + CTime: timespecFromStatxTimestamp(statx.Ctime), + } +} + +func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec { + return linux.Timespec{ + Sec: sxts.Sec, + Nsec: int64(sxts.Nsec), + } +} + +// Fstat implements Linux syscall fstat(2). +func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + statAddr := args[1].Pointer() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + statx, err := file.Stat(t, vfs.StatOptions{ + Mask: linux.STATX_BASIC_STATS, + }) + if err != nil { + return 0, nil, err + } + var stat linux.Stat + convertStatxToUserStat(t, &statx, &stat) + return 0, nil, stat.CopyOut(t, statAddr) +} + +// Statx implements Linux syscall statx(2). 
+func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + flags := args[2].Int() + mask := args[3].Uint() + statxAddr := args[4].Pointer() + + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return 0, nil, syserror.EINVAL + } + + opts := vfs.StatOptions{ + Mask: mask, + Sync: uint32(flags & linux.AT_STATX_SYNC_TYPE), + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + start := root + if !path.Absolute { + if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { + return 0, nil, syserror.ENOENT + } + if dirfd == linux.AT_FDCWD { + start = t.FSContext().WorkingDirectoryVFS2() + defer start.DecRef() + } else { + dirfile := t.GetFileVFS2(dirfd) + if dirfile == nil { + return 0, nil, syserror.EBADF + } + if !path.HasComponents() { + // Use FileDescription.Stat() instead of + // VirtualFilesystem.StatAt() for statx(fd, ""), since the + // former may be able to use opened file state to expedite the + // Stat. 
+ statx, err := dirfile.Stat(t, opts) + dirfile.DecRef() + if err != nil { + return 0, nil, err + } + userifyStatx(t, &statx) + return 0, nil, statx.CopyOut(t, statxAddr) + } + start = dirfile.VirtualDentry() + start.IncRef() + defer start.DecRef() + dirfile.DecRef() + } + } + + statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, + }, &opts) + if err != nil { + return 0, nil, err + } + userifyStatx(t, &statx) + return 0, nil, statx.CopyOut(t, statxAddr) +} + +func userifyStatx(t *kernel.Task, statx *linux.Statx) { + userns := t.UserNamespace() + statx.UID = uint32(auth.KUID(statx.UID).In(userns).OrOverflow()) + statx.GID = uint32(auth.KGID(statx.GID).In(userns).OrOverflow()) +} + +// Readlink implements Linux syscall readlink(2). +func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + bufAddr := args[1].Pointer() + size := args[2].SizeT() + return readlinkat(t, linux.AT_FDCWD, pathAddr, bufAddr, size) +} + +// Access implements Linux syscall access(2). +func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // FIXME(jamieliu): actually implement + return 0, nil, nil +} + +// Faccessat implements Linux syscall access(2). +func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // FIXME(jamieliu): actually implement + return 0, nil, nil +} + +// Readlinkat implements Linux syscall mknodat(2). 
+func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + bufAddr := args[2].Pointer() + size := args[3].SizeT() + return readlinkat(t, dirfd, pathAddr, bufAddr, size) +} + +func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr usermem.Addr, size uint) (uintptr, *kernel.SyscallControl, error) { + if int(size) <= 0 { + return 0, nil, syserror.EINVAL + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + // "Since Linux 2.6.39, pathname can be an empty string, in which case the + // call operates on the symbolic link referred to by dirfd ..." - + // readlinkat(2) + tpop, err := getTaskPathOperation(t, dirfd, path, allowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop) + if err != nil { + return 0, nil, err + } + + if len(target) > int(size) { + target = target[:size] + } + n, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(target)) + if n == 0 { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Statfs implements Linux syscall statfs(2). +func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + bufAddr := args[1].Pointer() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) + if err != nil { + return 0, nil, err + } + + return 0, nil, statfs.CopyOut(t, bufAddr) +} + +// Fstatfs implements Linux syscall fstatfs(2). 
+func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + bufAddr := args[1].Pointer() + + tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) + if err != nil { + return 0, nil, err + } + + return 0, nil, statfs.CopyOut(t, bufAddr) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go new file mode 100644 index 000000000..365250b0b --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/sync.go @@ -0,0 +1,87 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Sync implements Linux syscall sync(2). +func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, t.Kernel().VFS().SyncAllFilesystems(t) +} + +// Syncfs implements Linux syscall syncfs(2). 
+func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, file.SyncFS(t) +} + +// Fsync implements Linux syscall fsync(2). +func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, file.Sync(t) +} + +// Fdatasync implements Linux syscall fdatasync(2). +func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // TODO(gvisor.dev/issue/1897): Avoid writeback of unnecessary metadata. + return Fsync(t, args) +} + +// SyncFileRange implements Linux syscall sync_file_range(2). +func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + offset := args[1].Int64() + nbytes := args[2].Int64() + flags := args[3].Uint() + + if offset < 0 { + return 0, nil, syserror.EINVAL + } + if nbytes < 0 { + return 0, nil, syserror.EINVAL + } + if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // TODO(gvisor.dev/issue/1897): Avoid writeback of data ranges outside of + // [offset, offset+nbytes). + return 0, nil, file.Sync(t) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_read.go b/pkg/sentry/syscalls/linux/vfs2/sys_read.go deleted file mode 100644 index 7667524c7..000000000 --- a/pkg/sentry/syscalls/linux/vfs2/sys_read.go +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright 2020 The gVisor Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs2 - -import ( - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/pkg/waiter" -) - -const ( - // EventMaskRead contains events that can be triggered on reads. - EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr -) - -// Read implements linux syscall read(2). Note that we try to get a buffer that -// is exactly the size requested because some applications like qemu expect -// they can do large reads all at once. Bug for bug. Same for other read -// calls below. -func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := args[0].Int() - addr := args[1].Pointer() - size := args[2].SizeT() - - file := t.GetFileVFS2(fd) - if file == nil { - return 0, nil, syserror.EBADF - } - defer file.DecRef() - - // Check that the size is legitimate. - si := int(size) - if si < 0 { - return 0, nil, syserror.EINVAL - } - - // Get the destination of the read. 
- dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ - AddressSpaceActive: true, - }) - if err != nil { - return 0, nil, err - } - - n, err := read(t, file, dst, vfs.ReadOptions{}) - t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, linux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file) -} - -func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - n, err := file.Read(t, dst, opts) - if err != syserror.ErrWouldBlock { - return n, err - } - - // Register for notifications. - w, ch := waiter.NewChannelEntry(nil) - file.EventRegister(&w, EventMaskRead) - - total := n - for { - // Shorten dst to reflect bytes previously read. - dst = dst.DropFirst(int(n)) - - // Issue the request and break out if it completes with anything other than - // "would block". - n, err := file.Read(t, dst, opts) - total += n - if err != syserror.ErrWouldBlock { - break - } - if err := t.Block(ch); err != nil { - break - } - } - file.EventUnregister(&w) - - return total, err -} diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go new file mode 100644 index 000000000..89e9ff4d7 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go @@ -0,0 +1,353 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package vfs2 + +import ( + "bytes" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/gohacks" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Listxattr implements Linux syscall listxattr(2). +func Listxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return listxattr(t, args, followFinalSymlink) +} + +// Llistxattr implements Linux syscall llistxattr(2). +func Llistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return listxattr(t, args, nofollowFinalSymlink) +} + +func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + listAddr := args[1].Pointer() + size := args[2].SizeT() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop) + if err != nil { + return 0, nil, err + } + n, err := copyOutXattrNameList(t, listAddr, size, names) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Flistxattr implements Linux syscall flistxattr(2). 
+func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + listAddr := args[1].Pointer() + size := args[2].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + names, err := file.Listxattr(t) + if err != nil { + return 0, nil, err + } + n, err := copyOutXattrNameList(t, listAddr, size, names) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Getxattr implements Linux syscall getxattr(2). +func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getxattr(t, args, followFinalSymlink) +} + +// Lgetxattr implements Linux syscall lgetxattr(2). +func Lgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getxattr(t, args, nofollowFinalSymlink) +} + +func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return 0, nil, err + } + + value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, name) + if err != nil { + return 0, nil, err + } + n, err := copyOutXattrValue(t, valueAddr, size, value) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Fgetxattr implements Linux syscall fgetxattr(2). 
+func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return 0, nil, err + } + + value, err := file.Getxattr(t, name) + if err != nil { + return 0, nil, err + } + n, err := copyOutXattrValue(t, valueAddr, size, value) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Setxattr implements Linux syscall setxattr(2). +func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, setxattr(t, args, followFinalSymlink) +} + +// Lsetxattr implements Linux syscall lsetxattr(2). +func Lsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, setxattr(t, args, nofollowFinalSymlink) +} + +func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + flags := args[4].Int() + + if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { + return syserror.EINVAL + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return err + } + value, err := copyInXattrValue(t, valueAddr, size) + if err != nil { + return err + } + + return t.Kernel().VFS().SetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetxattrOptions{ + Name: name, + Value: value, + Flags: uint32(flags), + }) +} + +// Fsetxattr 
implements Linux syscall fsetxattr(2). +func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + flags := args[4].Int() + + if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return 0, nil, err + } + value, err := copyInXattrValue(t, valueAddr, size) + if err != nil { + return 0, nil, err + } + + return 0, nil, file.Setxattr(t, vfs.SetxattrOptions{ + Name: name, + Value: value, + Flags: uint32(flags), + }) +} + +// Removexattr implements Linux syscall removexattr(2). +func Removexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, removexattr(t, args, followFinalSymlink) +} + +// Lremovexattr implements Linux syscall lremovexattr(2). +func Lremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, removexattr(t, args, nofollowFinalSymlink) +} + +func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return err + } + + return t.Kernel().VFS().RemovexattrAt(t, t.Credentials(), &tpop.pop, name) +} + +// Fremovexattr implements Linux syscall fremovexattr(2). 
+func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return 0, nil, err + } + + return 0, nil, file.Removexattr(t, name) +} + +func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) { + name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1) + if err != nil { + if err == syserror.ENAMETOOLONG { + return "", syserror.ERANGE + } + return "", err + } + if len(name) == 0 { + return "", syserror.ERANGE + } + return name, nil +} + +func copyOutXattrNameList(t *kernel.Task, listAddr usermem.Addr, size uint, names []string) (int, error) { + if size > linux.XATTR_LIST_MAX { + size = linux.XATTR_LIST_MAX + } + var buf bytes.Buffer + for _, name := range names { + buf.WriteString(name) + buf.WriteByte(0) + } + if size == 0 { + // Return the size that would be required to accommodate the list. + return buf.Len(), nil + } + if buf.Len() > int(size) { + if size >= linux.XATTR_LIST_MAX { + return 0, syserror.E2BIG + } + return 0, syserror.ERANGE + } + return t.CopyOutBytes(listAddr, buf.Bytes()) +} + +func copyInXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint) (string, error) { + if size > linux.XATTR_SIZE_MAX { + return "", syserror.E2BIG + } + buf := make([]byte, size) + if _, err := t.CopyInBytes(valueAddr, buf); err != nil { + return "", err + } + return gohacks.StringFromImmutableBytes(buf), nil +} + +func copyOutXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint, value string) (int, error) { + if size > linux.XATTR_SIZE_MAX { + size = linux.XATTR_SIZE_MAX + } + if size == 0 { + // Return the size that would be required to accommodate the value. 
+ return len(value), nil + } + if len(value) > int(size) { + if size >= linux.XATTR_SIZE_MAX { + return 0, syserror.E2BIG + } + return 0, syserror.ERANGE + } + return t.CopyOutBytes(valueAddr, gohacks.ImmutableBytesFromString(value)) +} diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 0b4f18ab5..07c8383e6 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -43,6 +43,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fspath", + "//pkg/gohacks", "//pkg/log", "//pkg/sentry/arch", "//pkg/sentry/fs/lock", diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index eed41139b..3da45d744 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -202,6 +202,9 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin // Add epi to file.epolls so that it is removed when the last // FileDescription reference is dropped. file.epollMu.Lock() + if file.epolls == nil { + file.epolls = make(map[*epollInterest]struct{}) + } file.epolls[epi] = struct{}{} file.epollMu.Unlock() diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index 1fe766a44..bc7581698 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -26,6 +26,7 @@ import ( "sync/atomic" "unsafe" + "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/sync" ) @@ -160,7 +161,7 @@ func newMountTableSlots(cap uintptr) unsafe.Pointer { // Lookup may be called even if there are concurrent mutators of mt. 
func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount { key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)} - hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes) + hash := memhash(gohacks.Noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes) loop: for { @@ -361,12 +362,3 @@ func memhash(p unsafe.Pointer, seed, s uintptr) uintptr //go:linkname rand32 runtime.fastrand func rand32() uint32 - -// This is copy/pasted from runtime.noescape(), and is needed because arguments -// apparently escape from all functions defined by linkname. -// -//go:nosplit -func noescape(p unsafe.Pointer) unsafe.Pointer { - x := uintptr(p) - return unsafe.Pointer(x ^ 0) -} diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 8a0b382f6..eb4ebb511 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -228,7 +228,7 @@ func (rp *ResolvingPath) Advance() { rp.pit = next } else { // at end of path segment, continue with next one rp.curPart-- - rp.pit = rp.parts[rp.curPart-1] + rp.pit = rp.parts[rp.curPart] } } diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 8f29031b2..73f8043be 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -385,15 +385,11 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential // Only a regular file can be executed. stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE}) if err != nil { + fd.DecRef() return nil, err } - if stat.Mask&linux.STATX_TYPE != 0 { - // This shouldn't happen, but if type can't be retrieved, file can't - // be executed. 
- return nil, syserror.EACCES - } - if t := linux.FileMode(stat.Mode).FileType(); t != linux.ModeRegular { - ctx.Infof("%q is not a regular file: %v", pop.Path, t) + if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG { + fd.DecRef() return nil, syserror.EACCES } } diff --git a/pkg/usermem/BUILD b/pkg/usermem/BUILD index ff8b9e91a..6c9ada9c7 100644 --- a/pkg/usermem/BUILD +++ b/pkg/usermem/BUILD @@ -25,7 +25,6 @@ go_library( "bytes_io_unsafe.go", "usermem.go", "usermem_arm64.go", - "usermem_unsafe.go", "usermem_x86.go", ], visibility = ["//:sandbox"], @@ -33,6 +32,7 @@ go_library( "//pkg/atomicbitops", "//pkg/binary", "//pkg/context", + "//pkg/gohacks", "//pkg/log", "//pkg/safemem", "//pkg/syserror", diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go index 71fd4e155..d2f4403b0 100644 --- a/pkg/usermem/usermem.go +++ b/pkg/usermem/usermem.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/syserror" ) @@ -251,7 +252,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt } end, ok := addr.AddLength(uint64(readlen)) if !ok { - return stringFromImmutableBytes(buf[:done]), syserror.EFAULT + return gohacks.StringFromImmutableBytes(buf[:done]), syserror.EFAULT } // Shorten the read to avoid crossing page boundaries, since faulting // in a page unnecessarily is expensive. This also ensures that partial @@ -272,16 +273,16 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt // Look for the terminating zero byte, which may have occurred before // hitting err. 
if i := bytes.IndexByte(buf[done:done+n], byte(0)); i >= 0 { - return stringFromImmutableBytes(buf[:done+i]), nil + return gohacks.StringFromImmutableBytes(buf[:done+i]), nil } done += n if err != nil { - return stringFromImmutableBytes(buf[:done]), err + return gohacks.StringFromImmutableBytes(buf[:done]), err } addr = end } - return stringFromImmutableBytes(buf), syserror.ENAMETOOLONG + return gohacks.StringFromImmutableBytes(buf), syserror.ENAMETOOLONG } // CopyOutVec copies bytes from src to the memory mapped at ars in uio. The diff --git a/pkg/usermem/usermem_unsafe.go b/pkg/usermem/usermem_unsafe.go deleted file mode 100644 index 876783e78..000000000 --- a/pkg/usermem/usermem_unsafe.go +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "unsafe" -) - -// stringFromImmutableBytes is equivalent to string(bs), except that it never -// copies even if escape analysis can't prove that bs does not escape. This is -// only valid if bs is never mutated after stringFromImmutableBytes returns. -func stringFromImmutableBytes(bs []byte) string { - // Compare strings.Builder.String(). 
- return *(*string)(unsafe.Pointer(&bs)) -} diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index c69f4c602..a4627905e 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -229,7 +229,9 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_NANOSLEEP: {}, syscall.SYS_PPOLL: {}, syscall.SYS_PREAD64: {}, + syscall.SYS_PREADV: {}, syscall.SYS_PWRITE64: {}, + syscall.SYS_PWRITEV: {}, syscall.SYS_READ: {}, syscall.SYS_RECVMSG: []seccomp.Rule{ { -- cgit v1.2.3 From 72e3f3a3eef3a1dc02db0ff71f98a5d7fe89a6e3 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 25 Feb 2020 13:42:34 -0800 Subject: Add option to skip stuck tasks waiting for address space PiperOrigin-RevId: 297192390 --- pkg/sentry/kernel/kernel.go | 4 ++++ pkg/sentry/kernel/task_context.go | 2 +- pkg/sentry/kernel/task_exec.go | 2 +- pkg/sentry/kernel/task_usermem.go | 2 +- pkg/sentry/mm/address_space.go | 23 ++++++++++++++--------- pkg/sentry/mm/lifecycle.go | 26 ++++++++++++++------------ pkg/sentry/mm/mm.go | 5 +++++ pkg/sentry/mm/mm_test.go | 2 +- 8 files changed, 41 insertions(+), 25 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index c62fd6eb1..8b76750e9 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -247,6 +247,10 @@ type Kernel struct { // VFS keeps the filesystem state used across the kernel. vfs vfs.VirtualFilesystem + + // If set to true, report address space activation waits as if the task is in + // external wait so that the watchdog doesn't report the task stuck. + SleepForAddressSpaceActivation bool } // InitKernelArgs holds arguments to Init. 
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 2be982684..0158b1788 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -140,7 +140,7 @@ func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*Task } // Prepare a new user address space to load into. - m := mm.NewMemoryManager(k, k) + m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) defer m.DecUsers(ctx) args.MemoryManager = m diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 8f57a34a6..00c425cca 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -220,7 +220,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.mu.Unlock() t.unstopVforkParent() // NOTE(b/30316266): All locks must be dropped prior to calling Activate. - t.MemoryManager().Activate() + t.MemoryManager().Activate(t) t.ptraceExec(oldTID) return (*runSyscallExit)(nil) diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index 2bf3ce8a8..b02044ad2 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -30,7 +30,7 @@ var MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown()) // Activate ensures that the task has an active address space. 
func (t *Task) Activate() { if mm := t.MemoryManager(); mm != nil { - if err := mm.Activate(); err != nil { + if err := mm.Activate(t); err != nil { panic("unable to activate mm: " + err.Error()) } } diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index 94d39af60..0332fc71c 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -18,6 +18,7 @@ import ( "fmt" "sync/atomic" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/usermem" ) @@ -38,7 +39,7 @@ func (mm *MemoryManager) AddressSpace() platform.AddressSpace { // // When this MemoryManager is no longer needed by a task, it should call // Deactivate to release the reference. -func (mm *MemoryManager) Activate() error { +func (mm *MemoryManager) Activate(ctx context.Context) error { // Fast path: the MemoryManager already has an active // platform.AddressSpace, and we just need to indicate that we need it too. for { @@ -91,16 +92,20 @@ func (mm *MemoryManager) Activate() error { if as == nil { // AddressSpace is unavailable, we must wait. // - // activeMu must not be held while waiting, as the user - // of the address space we are waiting on may attempt - // to take activeMu. - // - // Don't call UninterruptibleSleepStart to register the - // wait to allow the watchdog stuck task to trigger in - // case a process is starved waiting for the address - // space. + // activeMu must not be held while waiting, as the user of the address + // space we are waiting on may attempt to take activeMu. mm.activeMu.Unlock() + + sleep := mm.p.CooperativelySchedulesAddressSpace() && mm.sleepForActivation + if sleep { + // Mark this task sleeping while waiting for the address space to + // prevent the watchdog from reporting it as a stuck task. 
+ ctx.UninterruptibleSleepStart(false) + } <-c + if sleep { + ctx.UninterruptibleSleepFinish(false) + } continue } diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index 3c263ebaa..d8a5b9d29 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -28,16 +28,17 @@ import ( ) // NewMemoryManager returns a new MemoryManager with no mappings and 1 user. -func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider) *MemoryManager { +func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider, sleepForActivation bool) *MemoryManager { return &MemoryManager{ - p: p, - mfp: mfp, - haveASIO: p.SupportsAddressSpaceIO(), - privateRefs: &privateRefs{}, - users: 1, - auxv: arch.Auxv{}, - dumpability: UserDumpable, - aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + p: p, + mfp: mfp, + haveASIO: p.SupportsAddressSpaceIO(), + privateRefs: &privateRefs{}, + users: 1, + auxv: arch.Auxv{}, + dumpability: UserDumpable, + aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + sleepForActivation: sleepForActivation, } } @@ -79,9 +80,10 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { envv: mm.envv, auxv: append(arch.Auxv(nil), mm.auxv...), // IncRef'd below, once we know that there isn't an error. - executable: mm.executable, - dumpability: mm.dumpability, - aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + executable: mm.executable, + dumpability: mm.dumpability, + aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + sleepForActivation: mm.sleepForActivation, } // Copy vmas. diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 637383c7a..c2195ae11 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -226,6 +226,11 @@ type MemoryManager struct { // aioManager keeps track of AIOContexts used for async IOs. AIOManager // must be cloned when CLONE_VM is used. 
aioManager aioManager + + // sleepForActivation indicates whether the task should report to be sleeping + // before trying to activate the address space. When set to true, delays in + // activation are not reported as stuck tasks by the watchdog. + sleepForActivation bool } // vma represents a virtual memory area. diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go index edacca741..fdc308542 100644 --- a/pkg/sentry/mm/mm_test.go +++ b/pkg/sentry/mm/mm_test.go @@ -31,7 +31,7 @@ import ( func testMemoryManager(ctx context.Context) *MemoryManager { p := platform.FromContext(ctx) mfp := pgalloc.MemoryFileProviderFromContext(ctx) - mm := NewMemoryManager(p, mfp) + mm := NewMemoryManager(p, mfp, false) mm.layout = arch.MmapLayout{ MinAddr: p.MinUserAddress(), MaxAddr: p.MaxUserAddress(), -- cgit v1.2.3 From 6b4d36e3253238dd72d0861ac1220d147e1de8dd Mon Sep 17 00:00:00 2001 From: Ting-Yu Wang Date: Fri, 28 Feb 2020 10:37:52 -0800 Subject: Hide /dev/net/tun when using hostinet. /dev/net/tun does not currently work with hostinet. This has caused some program starts failing because it thinks the feature exists. 
PiperOrigin-RevId: 297876196 --- pkg/sentry/fs/dev/BUILD | 1 + pkg/sentry/fs/dev/dev.go | 7 +++++-- pkg/sentry/fs/dev/net_tun.go | 7 +++++++ pkg/sentry/kernel/kernel.go | 4 ++++ test/syscalls/BUILD | 5 +++++ test/syscalls/linux/BUILD | 12 +++++++++++ test/syscalls/linux/dev.cc | 7 ------- test/syscalls/linux/tuntap.cc | 7 +++++++ test/syscalls/linux/tuntap_hostinet.cc | 37 ++++++++++++++++++++++++++++++++++ 9 files changed, 78 insertions(+), 9 deletions(-) create mode 100644 test/syscalls/linux/tuntap_hostinet.cc (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index 9b6bb26d0..9379a4d7b 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -26,6 +26,7 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/ramfs", "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/memmap", "//pkg/sentry/mm", diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index 7e66c29b0..acbd401a0 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/usermem" ) @@ -124,10 +125,12 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode { "ptmx": newSymlink(ctx, "pts/ptmx", msrc), "tty": newCharacterDevice(ctx, newTTYDevice(ctx, fs.RootOwner, 0666), msrc, ttyDevMajor, ttyDevMinor), + } - "net": newDirectory(ctx, map[string]*fs.Inode{ + if isNetTunSupported(inet.StackFromContext(ctx)) { + contents["net"] = newDirectory(ctx, map[string]*fs.Inode{ "tun": newCharacterDevice(ctx, newNetTunDevice(ctx, fs.RootOwner, 0666), msrc, netTunDevMajor, netTunDevMinor), - }, msrc), + }, msrc) } iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go index 755644488..dc7ad075a 100644 
--- a/pkg/sentry/fs/dev/net_tun.go +++ b/pkg/sentry/fs/dev/net_tun.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/syserror" @@ -168,3 +169,9 @@ func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.Eve func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) { fops.device.EventUnregister(e) } + +// isNetTunSupported returns whether /dev/net/tun device is supported for s. +func isNetTunSupported(s inet.Stack) bool { + _, ok := s.(*netstack.Stack) + return ok +} diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 8b76750e9..1d627564f 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -755,6 +755,8 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter + case inet.CtxStack: + return ctx.k.RootNetworkNamespace().Stack() case ktime.CtxRealtimeClock: return ctx.k.RealtimeClock() case limits.CtxLimits: @@ -1481,6 +1483,8 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter + case inet.CtxStack: + return ctx.k.RootNetworkNamespace().Stack() case ktime.CtxRealtimeClock: return ctx.k.RealtimeClock() case limits.CtxLimits: diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 3518e862d..a69b0ce13 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -680,6 +680,11 @@ syscall_test( syscall_test(test = "//test/syscalls/linux:tuntap_test") +syscall_test( + add_hostinet = True, + test = "//test/syscalls/linux:tuntap_hostinet_test", +) + syscall_test(test = 
"//test/syscalls/linux:udp_bind_test") syscall_test( diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 704bae17b..70c120e42 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -3460,6 +3460,18 @@ cc_binary( ], ) +cc_binary( + name = "tuntap_hostinet_test", + testonly = 1, + srcs = ["tuntap_hostinet.cc"], + linkstatic = 1, + deps = [ + gtest, + "//test/util:test_main", + "//test/util:test_util", + ], +) + cc_library( name = "udp_socket_test_cases", testonly = 1, diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc index 4e473268c..4dd302eed 100644 --- a/test/syscalls/linux/dev.cc +++ b/test/syscalls/linux/dev.cc @@ -153,13 +153,6 @@ TEST(DevTest, TTYExists) { EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666); } -TEST(DevTest, NetTunExists) { - struct stat statbuf = {}; - ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallSucceeds()); - // Check that it's a character device with rw-rw-rw- permissions. - EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666); -} - } // namespace } // namespace testing diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc index f6ac9d7b8..f734511d6 100644 --- a/test/syscalls/linux/tuntap.cc +++ b/test/syscalls/linux/tuntap.cc @@ -153,6 +153,13 @@ std::string CreateArpPacket(const uint8_t srcmac[ETH_ALEN], const char* srcip, } // namespace +TEST(TuntapStaticTest, NetTunExists) { + struct stat statbuf; + ASSERT_THAT(stat(kDevNetTun, &statbuf), SyscallSucceeds()); + // Check that it's a character device with rw-rw-rw- permissions. + EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666); +} + class TuntapTest : public ::testing::Test { protected: void TearDown() override { diff --git a/test/syscalls/linux/tuntap_hostinet.cc b/test/syscalls/linux/tuntap_hostinet.cc new file mode 100644 index 000000000..0c527419e --- /dev/null +++ b/test/syscalls/linux/tuntap_hostinet.cc @@ -0,0 +1,37 @@ +// Copyright 2020 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(TuntapHostInetTest, NoNetTun) { + SKIP_IF(!IsRunningOnGvisor()); + + struct stat statbuf; + ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallFailsWithErrno(ENOENT)); +} + +} // namespace +} // namespace testing + +} // namespace gvisor -- cgit v1.2.3 From 463f4217d109ded8af758fe51a5daf8670da9794 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 28 Feb 2020 12:28:10 -0800 Subject: Make pipe buffer implementation standard. A follow-up change will convert the networking code to use this standard pipe implementation. 
PiperOrigin-RevId: 297903206 --- pkg/buffer/BUILD | 39 ++++ pkg/buffer/buffer.go | 67 ++++++ pkg/buffer/safemem.go | 131 ++++++++++++ pkg/buffer/view.go | 382 ++++++++++++++++++++++++++++++++++ pkg/buffer/view_test.go | 233 +++++++++++++++++++++ pkg/buffer/view_unsafe.go | 25 +++ pkg/sentry/kernel/pipe/BUILD | 18 +- pkg/sentry/kernel/pipe/buffer.go | 115 ---------- pkg/sentry/kernel/pipe/buffer_test.go | 32 --- pkg/sentry/kernel/pipe/pipe.go | 118 ++--------- pkg/sentry/kernel/pipe/pipe_util.go | 25 +-- 11 files changed, 912 insertions(+), 273 deletions(-) create mode 100644 pkg/buffer/BUILD create mode 100644 pkg/buffer/buffer.go create mode 100644 pkg/buffer/safemem.go create mode 100644 pkg/buffer/view.go create mode 100644 pkg/buffer/view_test.go create mode 100644 pkg/buffer/view_unsafe.go delete mode 100644 pkg/sentry/kernel/pipe/buffer.go delete mode 100644 pkg/sentry/kernel/pipe/buffer_test.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/buffer/BUILD b/pkg/buffer/BUILD new file mode 100644 index 000000000..a77a3beea --- /dev/null +++ b/pkg/buffer/BUILD @@ -0,0 +1,39 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "buffer_list", + out = "buffer_list.go", + package = "buffer", + prefix = "buffer", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Buffer", + "Linker": "*Buffer", + }, +) + +go_library( + name = "buffer", + srcs = [ + "buffer.go", + "buffer_list.go", + "safemem.go", + "view.go", + "view_unsafe.go", + ], + visibility = ["//visibility:public"], + deps = [ + "//pkg/log", + "//pkg/safemem", + ], +) + +go_test( + name = "buffer_test", + size = "small", + srcs = ["view_test.go"], + library = ":buffer", +) diff --git a/pkg/buffer/buffer.go b/pkg/buffer/buffer.go new file mode 100644 index 000000000..d5f64609b --- /dev/null +++ b/pkg/buffer/buffer.go @@ -0,0 +1,67 @@ +// Copyright 2020 The gVisor 
Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package buffer provides the implementation of a buffer view. +package buffer + +import ( + "sync" +) + +const bufferSize = 8144 // See below. + +// Buffer encapsulates a queueable byte buffer. +// +// Note that the total size is slightly less than two pages. This is done +// intentionally to ensure that the buffer object aligns with runtime +// internals. We have no hard size or alignment requirements. This two page +// size will effectively minimize internal fragmentation, but still have a +// large enough chunk to limit excessive segmentation. +// +// +stateify savable +type Buffer struct { + data [bufferSize]byte + read int + write int + bufferEntry +} + +// Reset resets internal data. +// +// This must be called before use. +func (b *Buffer) Reset() { + b.read = 0 + b.write = 0 +} + +// Empty indicates the buffer is empty. +// +// This indicates there is no data left to read. +func (b *Buffer) Empty() bool { + return b.read == b.write +} + +// Full indicates the buffer is full. +// +// This indicates there is no capacity left to write. +func (b *Buffer) Full() bool { + return b.write == len(b.data) +} + +// bufferPool is a pool for buffers. 
+var bufferPool = sync.Pool{ + New: func() interface{} { + return new(Buffer) + }, +} diff --git a/pkg/buffer/safemem.go b/pkg/buffer/safemem.go new file mode 100644 index 000000000..071aaa488 --- /dev/null +++ b/pkg/buffer/safemem.go @@ -0,0 +1,131 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package buffer + +import ( + "io" + + "gvisor.dev/gvisor/pkg/safemem" +) + +// WriteBlock returns this buffer as a write Block. +func (b *Buffer) WriteBlock() safemem.Block { + return safemem.BlockFromSafeSlice(b.data[b.write:]) +} + +// ReadBlock returns this buffer as a read Block. +func (b *Buffer) ReadBlock() safemem.Block { + return safemem.BlockFromSafeSlice(b.data[b.read:b.write]) +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +// +// This will advance the write index. +func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + need := int(srcs.NumBytes()) + if need == 0 { + return 0, nil + } + + var ( + dst safemem.BlockSeq + blocks []safemem.Block + ) + + // Need at least one buffer. + firstBuf := v.data.Back() + if firstBuf == nil { + firstBuf = bufferPool.Get().(*Buffer) + v.data.PushBack(firstBuf) + } + + // Does the last block have sufficient capacity alone? + if l := len(firstBuf.data) - firstBuf.write; l >= need { + dst = safemem.BlockSeqOf(firstBuf.WriteBlock()) + } else { + // Append blocks until sufficient. 
+ need -= l + blocks = append(blocks, firstBuf.WriteBlock()) + for need > 0 { + emptyBuf := bufferPool.Get().(*Buffer) + v.data.PushBack(emptyBuf) + need -= len(emptyBuf.data) // Full block. + blocks = append(blocks, emptyBuf.WriteBlock()) + } + dst = safemem.BlockSeqFromSlice(blocks) + } + + // Perform the copy. + n, err := safemem.CopySeq(dst, srcs) + v.size += int64(n) + + // Update all indices. + for left := int(n); left > 0; firstBuf = firstBuf.Next() { + if l := len(firstBuf.data) - firstBuf.write; left >= l { + firstBuf.write += l // Whole block. + left -= l + } else { + firstBuf.write += left // Partial block. + left = 0 + } + } + + return n, err +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +// +// This will not advance the read index; the caller should follow +// this call with a call to TrimFront in order to remove the read +// data from the buffer. This is done to support pipe semantics. +func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + need := int(dsts.NumBytes()) + if need == 0 { + return 0, nil + } + + var ( + src safemem.BlockSeq + blocks []safemem.Block + ) + + firstBuf := v.data.Front() + if firstBuf == nil { + return 0, io.EOF + } + + // Is all the data in a single block? + if l := firstBuf.write - firstBuf.read; l >= need { + src = safemem.BlockSeqOf(firstBuf.ReadBlock()) + } else { + // Build a list of all the buffers. + need -= l + blocks = append(blocks, firstBuf.ReadBlock()) + for buf := firstBuf.Next(); buf != nil && need > 0; buf = buf.Next() { + need -= buf.write - buf.read + blocks = append(blocks, buf.ReadBlock()) + } + src = safemem.BlockSeqFromSlice(blocks) + } + + // Perform the copy. + n, err := safemem.CopySeq(dsts, src) + + // See above: we would normally advance the read index here, but we + // don't do that in order to support pipe semantics. We rely on a + // separate call to TrimFront() in this case.
+ + return n, err +} diff --git a/pkg/buffer/view.go b/pkg/buffer/view.go new file mode 100644 index 000000000..00fc11e9c --- /dev/null +++ b/pkg/buffer/view.go @@ -0,0 +1,382 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package buffer + +import ( + "fmt" + "io" +) + +// View is a non-linear buffer. +// +// All methods are thread compatible. +// +// +stateify savable +type View struct { + data bufferList + size int64 +} + +// TrimFront removes the first count bytes from the buffer. +func (v *View) TrimFront(count int64) { + if count >= v.size { + v.advanceRead(v.size) + } else { + v.advanceRead(count) + } +} + +// Read implements io.Reader.Read. +// +// Note that reading does not advance the read index. This must be done +// manually using TrimFront or other methods. +func (v *View) Read(p []byte) (int, error) { + return v.ReadAt(p, 0) +} + +// ReadAt implements io.ReaderAt.ReadAt. +func (v *View) ReadAt(p []byte, offset int64) (int, error) { + var ( + skipped int64 + done int64 + ) + for buf := v.data.Front(); buf != nil && done < int64(len(p)); buf = buf.Next() { + needToSkip := int(offset - skipped) + if l := buf.write - buf.read; l <= needToSkip { + skipped += int64(l) + continue + } + + // Actually read data. 
+ n := copy(p[done:], buf.data[buf.read+needToSkip:buf.write]) + skipped += int64(needToSkip) + done += int64(n) + } + if int(done) < len(p) { + return int(done), io.EOF + } + return int(done), nil +} + +// Write implements io.Writer.Write. +func (v *View) Write(p []byte) (int, error) { + v.Append(p) // Does not fail. + return len(p), nil +} + +// advanceRead advances the view's read index. +// +// Precondition: there must be sufficient bytes in the buffer. +func (v *View) advanceRead(count int64) { + for buf := v.data.Front(); buf != nil && count > 0; { + l := int64(buf.write - buf.read) + if l > count { + // There is still data for reading. + buf.read += int(count) + v.size -= count + count = 0 + break + } + + // Read from this buffer. + buf.read += int(l) + count -= l + v.size -= l + + // When all data has been read from a buffer, we push + // it into the empty buffer pool for reuse. + oldBuf := buf + buf = buf.Next() // Iterate. + v.data.Remove(oldBuf) + oldBuf.Reset() + bufferPool.Put(oldBuf) + } + if count > 0 { + panic(fmt.Sprintf("advanceRead still has %d bytes remaining", count)) + } +} + +// Truncate truncates the view to the given bytes. +func (v *View) Truncate(length int64) { + if length < 0 || length >= v.size { + return // Nothing to do. + } + for buf := v.data.Back(); buf != nil && v.size > length; buf = v.data.Back() { + l := int64(buf.write - buf.read) // Local bytes. + switch { + case v.size-l >= length: + // Drop the buffer completely; see above. + v.data.Remove(buf) + v.size -= l + buf.Reset() + bufferPool.Put(buf) + + case v.size > length && v.size-l < length: + // Just truncate the buffer locally. + delta := (length - (v.size - l)) + buf.write = buf.read + int(delta) + v.size = length + + default: + // Should never happen. + panic("invalid buffer during truncation") + } + } + v.size = length // Save the new size. +} + +// Grow grows the given view to the number of bytes. If zero +// is true, all these bytes will be zero. 
If zero is false, +// then this is the caller's responsibility. +// +// Precondition: length must be >= 0. +func (v *View) Grow(length int64, zero bool) { + if length < 0 { + panic("negative length provided") + } + for v.size < length { + buf := v.data.Back() + + // Is there at least one buffer? + if buf == nil || buf.Full() { + buf = bufferPool.Get().(*Buffer) + v.data.PushBack(buf) + } + + // Write up to length bytes. + l := len(buf.data) - buf.write + if int64(l) > length-v.size { + l = int(length - v.size) + } + + // Zero the written section; note that this pattern is + // specifically recognized and optimized by the compiler. + if zero { + for i := buf.write; i < buf.write+l; i++ { + buf.data[i] = 0 + } + } + + // Advance the index. + buf.write += l + v.size += int64(l) + } +} + +// Prepend prepends the given data. +func (v *View) Prepend(data []byte) { + // Is there any space in the first buffer? + if buf := v.data.Front(); buf != nil && buf.read > 0 { + // Fill up before the first write. + avail := buf.read + copy(buf.data[0:], data[len(data)-avail:]) + data = data[:len(data)-avail] + v.size += int64(avail) + } + + for len(data) > 0 { + // Do we need an empty buffer? + buf := bufferPool.Get().(*Buffer) + v.data.PushFront(buf) + + // The buffer is empty; copy last chunk. + start := len(data) - len(buf.data) + if start < 0 { + start = 0 // Everything. + } + + // We have to put the data at the end of the current + // buffer in order to ensure that the next prepend will + // correctly fill up the beginning of this buffer. + bStart := len(buf.data) - len(data[start:]) + n := copy(buf.data[bStart:], data[start:]) + buf.read = bStart + buf.write = len(buf.data) + data = data[:start] + v.size += int64(n) + } +} + +// Append appends the given data. +func (v *View) Append(data []byte) { + for done := 0; done < len(data); { + buf := v.data.Back() + + // Find the first empty buffer. 
+ if buf == nil || buf.Full() { + buf = bufferPool.Get().(*Buffer) + v.data.PushBack(buf) + } + + // Copy in to the given buffer. + n := copy(buf.data[buf.write:], data[done:]) + done += n + buf.write += n + v.size += int64(n) + } +} + +// Flatten returns a flattened copy of this data. +// +// This method should not be used in any performance-sensitive paths. It may +// allocate a fresh byte slice sufficiently large to contain all the data in +// the buffer. +// +// N.B. The data still belongs to this view, as if there is a single buffer +// present, then it will be returned directly. This should be used for +// temporary use only, and a reference to the given slice should not be held. +func (v *View) Flatten() []byte { + if buf := v.data.Front(); buf.Next() == nil { + return buf.data[buf.read:buf.write] // Only one buffer. + } + data := make([]byte, 0, v.size) // Need to flatten. + for buf := v.data.Front(); buf != nil; buf = buf.Next() { + // Copy to the allocated slice. + data = append(data, buf.data[buf.read:buf.write]...) + } + return data +} + +// Size indicates the total amount of data available in this view. +func (v *View) Size() (sz int64) { + sz = v.size // Pre-calculated. + return sz +} + +// Copy makes a strict copy of this view. +func (v *View) Copy() (other View) { + for buf := v.data.Front(); buf != nil; buf = buf.Next() { + other.Append(buf.data[buf.read:buf.write]) + } + return other +} + +// Apply applies the given function across all valid data. +func (v *View) Apply(fn func([]byte)) { + for buf := v.data.Front(); buf != nil; buf = buf.Next() { + if l := int64(buf.write - buf.read); l > 0 { + fn(buf.data[buf.read:buf.write]) + } + } +} + +// Merge merges the provided View with this one. +// +// The other view will be empty after this operation. +func (v *View) Merge(other *View) { + // Copy over all buffers.
+ for buf := other.data.Front(); buf != nil && !buf.Empty(); buf = other.data.Front() { + other.data.Remove(buf) + v.data.PushBack(buf) + } + + // Adjust sizes. + v.size += other.size + other.size = 0 +} + +// WriteFromReader writes to the buffer from an io.Reader. +func (v *View) WriteFromReader(r io.Reader, count int64) (int64, error) { + var ( + done int64 + n int + err error + ) + for done < count { + buf := v.data.Back() + + // Find the first empty buffer. + if buf == nil || buf.Full() { + buf = bufferPool.Get().(*Buffer) + v.data.PushBack(buf) + } + + // Is this less than the minimum batch? + if len(buf.data[buf.write:]) < minBatch && (count-done) >= int64(minBatch) { + tmp := make([]byte, minBatch) + n, err = r.Read(tmp) + v.Write(tmp[:n]) + done += int64(n) + if err != nil { + break + } + continue + } + + // Limit the read, if necessary. + end := len(buf.data) + if int64(end-buf.write) > (count - done) { + end = buf.write + int(count-done) + } + + // Pass the relevant portion of the buffer. + n, err = r.Read(buf.data[buf.write:end]) + buf.write += n + done += int64(n) + v.size += int64(n) + if err == io.EOF { + err = nil // Short write allowed. + break + } else if err != nil { + break + } + } + return done, err +} + +// ReadToWriter reads from the buffer into an io.Writer. +// +// N.B. This does not consume the bytes read. TrimFront should +// be called appropriately after this call in order to do so. +func (v *View) ReadToWriter(w io.Writer, count int64) (int64, error) { + var ( + done int64 + n int + err error + ) + offset := 0 // Spill-over for batching. + for buf := v.data.Front(); buf != nil && done < count; buf = buf.Next() { + l := buf.write - buf.read - offset + + // Is this less than the minimum batch? + if l < minBatch && (count-done) >= int64(minBatch) && (v.size-done) >= int64(minBatch) { + tmp := make([]byte, minBatch) + n, err = v.ReadAt(tmp, done) + w.Write(tmp[:n]) + done += int64(n) + offset = n - l // Reset below. 
+ if err != nil { + break + } + continue + } + + // Limit the write if necessary. + if int64(l) >= (count - done) { + l = int(count - done) + } + + // Perform the actual write. + n, err = w.Write(buf.data[buf.read+offset : buf.read+offset+l]) + done += int64(n) + if err != nil { + break + } + + // Reset spill-over. + offset = 0 + } + return done, err +} diff --git a/pkg/buffer/view_test.go b/pkg/buffer/view_test.go new file mode 100644 index 000000000..37e652f16 --- /dev/null +++ b/pkg/buffer/view_test.go @@ -0,0 +1,233 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package buffer + +import ( + "bytes" + "strings" + "testing" +) + +func TestView(t *testing.T) { + testCases := []struct { + name string + input string + output string + ops []func(*View) + }{ + // Prepend. 
+ { + name: "prepend", + input: "world", + ops: []func(*View){ + func(v *View) { + v.Prepend([]byte("hello ")) + }, + }, + output: "hello world", + }, + { + name: "prepend fill", + input: strings.Repeat("1", bufferSize-1), + ops: []func(*View){ + func(v *View) { + v.Prepend([]byte("0")) + }, + }, + output: "0" + strings.Repeat("1", bufferSize-1), + }, + { + name: "prepend overflow", + input: strings.Repeat("1", bufferSize), + ops: []func(*View){ + func(v *View) { + v.Prepend([]byte("0")) + }, + }, + output: "0" + strings.Repeat("1", bufferSize), + }, + { + name: "prepend multiple buffers", + input: strings.Repeat("1", bufferSize-1), + ops: []func(*View){ + func(v *View) { + v.Prepend([]byte(strings.Repeat("0", bufferSize*3))) + }, + }, + output: strings.Repeat("0", bufferSize*3) + strings.Repeat("1", bufferSize-1), + }, + + // Append. + { + name: "append", + input: "hello", + ops: []func(*View){ + func(v *View) { + v.Append([]byte(" world")) + }, + }, + output: "hello world", + }, + { + name: "append fill", + input: strings.Repeat("1", bufferSize-1), + ops: []func(*View){ + func(v *View) { + v.Append([]byte("0")) + }, + }, + output: strings.Repeat("1", bufferSize-1) + "0", + }, + { + name: "append overflow", + input: strings.Repeat("1", bufferSize), + ops: []func(*View){ + func(v *View) { + v.Append([]byte("0")) + }, + }, + output: strings.Repeat("1", bufferSize) + "0", + }, + { + name: "append multiple buffers", + input: strings.Repeat("1", bufferSize-1), + ops: []func(*View){ + func(v *View) { + v.Append([]byte(strings.Repeat("0", bufferSize*3))) + }, + }, + output: strings.Repeat("1", bufferSize-1) + strings.Repeat("0", bufferSize*3), + }, + + // Truncate. 
+ { + name: "truncate", + input: "hello world", + ops: []func(*View){ + func(v *View) { + v.Truncate(5) + }, + }, + output: "hello", + }, + { + name: "truncate multiple buffers", + input: strings.Repeat("1", bufferSize*2), + ops: []func(*View){ + func(v *View) { + v.Truncate(bufferSize*2 - 1) + }, + }, + output: strings.Repeat("1", bufferSize*2-1), + }, + { + name: "truncate multiple buffers to one buffer", + input: strings.Repeat("1", bufferSize*2), + ops: []func(*View){ + func(v *View) { + v.Truncate(5) + }, + }, + output: "11111", + }, + + // TrimFront. + { + name: "trim", + input: "hello world", + ops: []func(*View){ + func(v *View) { + v.TrimFront(6) + }, + }, + output: "world", + }, + { + name: "trim multiple buffers", + input: strings.Repeat("1", bufferSize*2), + ops: []func(*View){ + func(v *View) { + v.TrimFront(1) + }, + }, + output: strings.Repeat("1", bufferSize*2-1), + }, + { + name: "trim multiple buffers to one buffer", + input: strings.Repeat("1", bufferSize*2), + ops: []func(*View){ + func(v *View) { + v.TrimFront(bufferSize*2 - 1) + }, + }, + output: "1", + }, + + // Grow. + { + name: "grow", + input: "hello world", + ops: []func(*View){ + func(v *View) { + v.Grow(1, true) + }, + }, + output: "hello world", + }, + { + name: "grow from zero", + ops: []func(*View){ + func(v *View) { + v.Grow(1024, true) + }, + }, + output: strings.Repeat("\x00", 1024), + }, + { + name: "grow from non-zero", + input: strings.Repeat("1", bufferSize), + ops: []func(*View){ + func(v *View) { + v.Grow(bufferSize*2, true) + }, + }, + output: strings.Repeat("1", bufferSize) + strings.Repeat("\x00", bufferSize), + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Construct the new view. + var view View + view.Append([]byte(tc.input)) + + // Run all operations. + for _, op := range tc.ops { + op(&view) + } + + // Flatten and validate. 
+ out := view.Flatten() + if !bytes.Equal([]byte(tc.output), out) { + t.Errorf("expected %q, got %q", tc.output, string(out)) + } + + // Ensure the size is correct. + if len(out) != int(view.Size()) { + t.Errorf("size is wrong: expected %d, got %d", len(out), view.Size()) + } + }) + } +} diff --git a/pkg/buffer/view_unsafe.go b/pkg/buffer/view_unsafe.go new file mode 100644 index 000000000..d1ef39b26 --- /dev/null +++ b/pkg/buffer/view_unsafe.go @@ -0,0 +1,25 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package buffer + +import ( + "unsafe" +) + +// minBatch is the smallest Read or Write operation that the +// WriteFromReader and ReadToWriter functions will use. +// +// This is defined as the size of a native pointer. 
+const minBatch = int(unsafe.Sizeof(uintptr(0))) diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 4c049d5b4..f29dc0472 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -1,25 +1,10 @@ load("//tools:defs.bzl", "go_library", "go_test") -load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) -go_template_instance( - name = "buffer_list", - out = "buffer_list.go", - package = "pipe", - prefix = "buffer", - template = "//pkg/ilist:generic_list", - types = { - "Element": "*buffer", - "Linker": "*buffer", - }, -) - go_library( name = "pipe", srcs = [ - "buffer.go", - "buffer_list.go", "device.go", "node.go", "pipe.go", @@ -33,8 +18,8 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/amutex", + "//pkg/buffer", "//pkg/context", - "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs", @@ -51,7 +36,6 @@ go_test( name = "pipe_test", size = "small", srcs = [ - "buffer_test.go", "node_test.go", "pipe_test.go", ], diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go deleted file mode 100644 index fe3be5dbd..000000000 --- a/pkg/sentry/kernel/pipe/buffer.go +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package pipe - -import ( - "io" - - "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/sync" -) - -// buffer encapsulates a queueable byte buffer. 
-// -// Note that the total size is slightly less than two pages. This -// is done intentionally to ensure that the buffer object aligns -// with runtime internals. We have no hard size or alignment -// requirements. This two page size will effectively minimize -// internal fragmentation, but still have a large enough chunk -// to limit excessive segmentation. -// -// +stateify savable -type buffer struct { - data [8144]byte - read int - write int - bufferEntry -} - -// Reset resets internal data. -// -// This must be called before use. -func (b *buffer) Reset() { - b.read = 0 - b.write = 0 -} - -// Empty indicates the buffer is empty. -// -// This indicates there is no data left to read. -func (b *buffer) Empty() bool { - return b.read == b.write -} - -// Full indicates the buffer is full. -// -// This indicates there is no capacity left to write. -func (b *buffer) Full() bool { - return b.write == len(b.data) -} - -// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. -func (b *buffer) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { - dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.write:])) - n, err := safemem.CopySeq(dst, srcs) - b.write += int(n) - return n, err -} - -// WriteFromReader writes to the buffer from an io.Reader. -func (b *buffer) WriteFromReader(r io.Reader, count int64) (int64, error) { - dst := b.data[b.write:] - if count < int64(len(dst)) { - dst = b.data[b.write:][:count] - } - n, err := r.Read(dst) - b.write += n - return int64(n), err -} - -// ReadToBlocks implements safemem.Reader.ReadToBlocks. -func (b *buffer) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { - src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.read:b.write])) - n, err := safemem.CopySeq(dsts, src) - b.read += int(n) - return n, err -} - -// ReadToWriter reads from the buffer into an io.Writer. 
-func (b *buffer) ReadToWriter(w io.Writer, count int64, dup bool) (int64, error) { - src := b.data[b.read:b.write] - if count < int64(len(src)) { - src = b.data[b.read:][:count] - } - n, err := w.Write(src) - if !dup { - b.read += n - } - return int64(n), err -} - -// bufferPool is a pool for buffers. -var bufferPool = sync.Pool{ - New: func() interface{} { - return new(buffer) - }, -} - -// newBuffer grabs a new buffer from the pool. -func newBuffer() *buffer { - b := bufferPool.Get().(*buffer) - b.Reset() - return b -} diff --git a/pkg/sentry/kernel/pipe/buffer_test.go b/pkg/sentry/kernel/pipe/buffer_test.go deleted file mode 100644 index 4d54b8b8f..000000000 --- a/pkg/sentry/kernel/pipe/buffer_test.go +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package pipe - -import ( - "testing" - "unsafe" - - "gvisor.dev/gvisor/pkg/usermem" -) - -func TestBufferSize(t *testing.T) { - bufferSize := unsafe.Sizeof(buffer{}) - if bufferSize < usermem.PageSize { - t.Errorf("buffer is less than a page") - } - if bufferSize > (2 * usermem.PageSize) { - t.Errorf("buffer is greater than two pages") - } -} diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 08410283f..725e9db7d 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -20,6 +20,7 @@ import ( "sync/atomic" "syscall" + "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sync" @@ -70,10 +71,10 @@ type Pipe struct { // mu protects all pipe internal state below. mu sync.Mutex `state:"nosave"` - // data is the buffer queue of pipe contents. + // view is the underlying set of buffers. // // This is protected by mu. - data bufferList + view buffer.View // max is the maximum size of the pipe in bytes. When this max has been // reached, writers will get EWOULDBLOCK. @@ -81,11 +82,6 @@ type Pipe struct { // This is protected by mu. max int64 - // size is the current size of the pipe in bytes. - // - // This is protected by mu. - size int64 - // hadWriter indicates if this pipe ever had a writer. Note that this // does not necessarily indicate there is *currently* a writer, just // that there has been a writer at some point since the pipe was @@ -196,7 +192,7 @@ type readOps struct { limit func(int64) // read performs the actual read operation. - read func(*buffer) (int64, error) + read func(*buffer.View) (int64, error) } // read reads data from the pipe into dst and returns the number of bytes @@ -213,7 +209,7 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) { defer p.mu.Unlock() // Is the pipe empty? - if p.size == 0 { + if p.view.Size() == 0 { if !p.HasWriters() { // There are no writers, return EOF. 
return 0, nil @@ -222,71 +218,13 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) { } // Limit how much we consume. - if ops.left() > p.size { - ops.limit(p.size) + if ops.left() > p.view.Size() { + ops.limit(p.view.Size()) } - done := int64(0) - for ops.left() > 0 { - // Pop the first buffer. - first := p.data.Front() - if first == nil { - break - } - - // Copy user data. - n, err := ops.read(first) - done += int64(n) - p.size -= n - - // Empty buffer? - if first.Empty() { - // Push to the free list. - p.data.Remove(first) - bufferPool.Put(first) - } - - // Handle errors. - if err != nil { - return done, err - } - } - - return done, nil -} - -// dup duplicates all data from this pipe into the given writer. -// -// There is no blocking behavior implemented here. The writer may propagate -// some blocking error. All the writes must be complete writes. -func (p *Pipe) dup(ctx context.Context, ops readOps) (int64, error) { - p.mu.Lock() - defer p.mu.Unlock() - - // Is the pipe empty? - if p.size == 0 { - if !p.HasWriters() { - // See above. - return 0, nil - } - return 0, syserror.ErrWouldBlock - } - - // Limit how much we consume. - if ops.left() > p.size { - ops.limit(p.size) - } - - done := int64(0) - for buf := p.data.Front(); buf != nil; buf = buf.Next() { - n, err := ops.read(buf) - done += n - if err != nil { - return done, err - } - } - - return done, nil + // Copy user data; the read op is responsible for trimming. + done, err := ops.read(&p.view) + return done, err } type writeOps struct { @@ -297,7 +235,7 @@ type writeOps struct { limit func(int64) // write should write to the provided buffer. 
- write func(*buffer) (int64, error) + write func(*buffer.View) (int64, error) } // write writes data from sv into the pipe and returns the number of bytes @@ -317,33 +255,19 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) { // POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be // atomic, but requires no atomicity for writes larger than this. wanted := ops.left() - if avail := p.max - p.size; wanted > avail { + if avail := p.max - p.view.Size(); wanted > avail { if wanted <= p.atomicIOBytes { return 0, syserror.ErrWouldBlock } ops.limit(avail) } - done := int64(0) - for ops.left() > 0 { - // Need a new buffer? - last := p.data.Back() - if last == nil || last.Full() { - // Add a new buffer to the data list. - last = newBuffer() - p.data.PushBack(last) - } - - // Copy user data. - n, err := ops.write(last) - done += int64(n) - p.size += n - - // Handle errors. - if err != nil { - return done, err - } + // Copy user data. + done, err := ops.write(&p.view) + if err != nil { + return done, err } + if wanted > done { // Partial write due to full pipe. return done, syserror.ErrWouldBlock @@ -396,7 +320,7 @@ func (p *Pipe) HasWriters() bool { // Precondition: mu must be held. func (p *Pipe) rReadinessLocked() waiter.EventMask { ready := waiter.EventMask(0) - if p.HasReaders() && p.data.Front() != nil { + if p.HasReaders() && p.view.Size() != 0 { ready |= waiter.EventIn } if !p.HasWriters() && p.hadWriter { @@ -422,7 +346,7 @@ func (p *Pipe) rReadiness() waiter.EventMask { // Precondition: mu must be held. 
func (p *Pipe) wReadinessLocked() waiter.EventMask { ready := waiter.EventMask(0) - if p.HasWriters() && p.size < p.max { + if p.HasWriters() && p.view.Size() < p.max { ready |= waiter.EventOut } if !p.HasReaders() { @@ -451,7 +375,7 @@ func (p *Pipe) rwReadiness() waiter.EventMask { func (p *Pipe) queued() int64 { p.mu.Lock() defer p.mu.Unlock() - return p.size + return p.view.Size() } // FifoSize implements fs.FifoSizer.FifoSize. @@ -474,7 +398,7 @@ func (p *Pipe) SetFifoSize(size int64) (int64, error) { } p.mu.Lock() defer p.mu.Unlock() - if size < p.size { + if size < p.view.Size() { return 0, syserror.EBUSY } p.max = size diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index 80158239e..5a1d4fd57 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/amutex" + "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sync" @@ -49,9 +50,10 @@ func (p *Pipe) Read(ctx context.Context, dst usermem.IOSequence) (int64, error) limit: func(l int64) { dst = dst.TakeFirst64(l) }, - read: func(buf *buffer) (int64, error) { - n, err := dst.CopyOutFrom(ctx, buf) + read: func(view *buffer.View) (int64, error) { + n, err := dst.CopyOutFrom(ctx, view) dst = dst.DropFirst64(n) + view.TrimFront(n) return n, err }, }) @@ -70,16 +72,15 @@ func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool) limit: func(l int64) { count = l }, - read: func(buf *buffer) (int64, error) { - n, err := buf.ReadToWriter(w, count, dup) + read: func(view *buffer.View) (int64, error) { + n, err := view.ReadToWriter(w, count) + if !dup { + view.TrimFront(n) + } count -= n return n, err }, } - if dup { - // There is no notification for dup operations. 
- return p.dup(ctx, ops) - } n, err := p.read(ctx, ops) if n > 0 { p.Notify(waiter.EventOut) @@ -96,8 +97,8 @@ func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error) limit: func(l int64) { src = src.TakeFirst64(l) }, - write: func(buf *buffer) (int64, error) { - n, err := src.CopyInTo(ctx, buf) + write: func(view *buffer.View) (int64, error) { + n, err := src.CopyInTo(ctx, view) src = src.DropFirst64(n) return n, err }, @@ -117,8 +118,8 @@ func (p *Pipe) ReadFrom(ctx context.Context, r io.Reader, count int64) (int64, e limit: func(l int64) { count = l }, - write: func(buf *buffer) (int64, error) { - n, err := buf.WriteFromReader(r, count) + write: func(view *buffer.View) (int64, error) { + n, err := view.WriteFromReader(r, count) count -= n return n, err }, -- cgit v1.2.3 From 413a9b7fdc14f8bff660e1988e3ef0355dd4e6c6 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 21 Feb 2020 11:00:11 -0800 Subject: Define CPUIDInstruction for arm64 There is no cpuid instruction on arm64, so we need to defined it just to avoid a compile time error. Signed-off-by: Andrei Vagin --- pkg/sentry/arch/arch_arm64.go | 5 +++++ pkg/sentry/kernel/task_run.go | 41 ++++++++++++++++++++++++++++------------- 2 files changed, 33 insertions(+), 13 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go index 372b650b9..885115ae2 100644 --- a/pkg/sentry/arch/arch_arm64.go +++ b/pkg/sentry/arch/arch_arm64.go @@ -53,6 +53,11 @@ const ( preferredPIELoadAddr usermem.Addr = maxAddr64 / 6 * 5 ) +var ( + // CPUIDInstruction doesn't exist on ARM64. + CPUIDInstruction = []byte{} +) + // These constants are selected as heuristics to help make the Platform's // potentially limited address space conform as closely to Linux as possible. 
const ( diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 5568c91bc..799cbcd93 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -126,13 +126,39 @@ func (t *Task) doStop() { } } +func (*runApp) handleCPUIDInstruction(t *Task) error { + if len(arch.CPUIDInstruction) == 0 { + // CPUID emulation isn't supported, but this code can be + // executed, because the ptrace platform returns + // ErrContextSignalCPUID on page faults too. Look at + // pkg/sentry/platform/ptrace/ptrace.go:context.Switch for more + // details. + return platform.ErrContextSignal + } + // Is this a CPUID instruction? + region := trace.StartRegion(t.traceContext, cpuidRegion) + expected := arch.CPUIDInstruction[:] + found := make([]byte, len(expected)) + _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found) + if err == nil && bytes.Equal(expected, found) { + // Skip the cpuid instruction. + t.Arch().CPUIDEmulate(t) + t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected))) + region.End() + + return nil + } + region.End() // Not an actual CPUID, but required copy-in. + return platform.ErrContextSignal +} + // The runApp state checks for interrupts before executing untrusted // application code. // // +stateify savable type runApp struct{} -func (*runApp) execute(t *Task) taskRunState { +func (app *runApp) execute(t *Task) taskRunState { if t.interrupted() { // Checkpointing instructs tasks to stop by sending an interrupt, so we // must check for stops before entering runInterrupt (instead of @@ -237,21 +263,10 @@ func (*runApp) execute(t *Task) taskRunState { return (*runApp)(nil) case platform.ErrContextSignalCPUID: - // Is this a CPUID instruction? - region := trace.StartRegion(t.traceContext, cpuidRegion) - expected := arch.CPUIDInstruction[:] - found := make([]byte, len(expected)) - _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found) - if err == nil && bytes.Equal(expected, found) { - // Skip the cpuid instruction. 
- t.Arch().CPUIDEmulate(t) - t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected))) - region.End() - + if err := app.handleCPUIDInstruction(t); err == nil { // Resume execution. return (*runApp)(nil) } - region.End() // Not an actual CPUID, but required copy-in. // The instruction at the given RIP was not a CPUID, and we // fallthrough to the default signal deliver behavior below. -- cgit v1.2.3 From da48fc6cca23a38faef51c5b5f8ae609940773a0 Mon Sep 17 00:00:00 2001 From: Ian Lewis Date: Thu, 5 Mar 2020 18:21:39 -0800 Subject: Stub oom_score_adj and oom_score. Adds an oom_score_adj and oom_score proc file stub. oom_score_adj accepts writes of values -1000 to 1000 and persists the value with the task. New tasks inherit the parent's oom_score_adj. oom_score is a read-only stub that always returns the value '0'. Issue #202 PiperOrigin-RevId: 299245355 --- pkg/sentry/fs/proc/task.go | 126 ++++++++++++++++++++++++++----- pkg/sentry/fsimpl/proc/task.go | 12 +-- pkg/sentry/fsimpl/proc/task_files.go | 43 +++++++++++ pkg/sentry/fsimpl/proc/tasks_test.go | 32 ++++---- pkg/sentry/kernel/task.go | 33 ++++++++ pkg/sentry/kernel/task_clone.go | 6 ++ pkg/sentry/kernel/task_start.go | 4 + test/syscalls/BUILD | 8 +- test/syscalls/linux/BUILD | 13 ++++ test/syscalls/linux/proc.cc | 21 ++++++ test/syscalls/linux/proc_pid_oomscore.cc | 72 ++++++++++++++++++ 11 files changed, 330 insertions(+), 40 deletions(-) create mode 100644 test/syscalls/linux/proc_pid_oomscore.cc (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 8ab8d8a02..4e9b0fc00 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -72,24 +72,26 @@ var _ fs.InodeOperations = (*taskDir)(nil) // newTaskDir creates a new proc task entry. 
func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode { contents := map[string]*fs.Inode{ - "auxv": newAuxvec(t, msrc), - "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), - "comm": newComm(t, msrc), - "environ": newExecArgInode(t, msrc, environExecArg), - "exe": newExe(t, msrc), - "fd": newFdDir(t, msrc), - "fdinfo": newFdInfoDir(t, msrc), - "gid_map": newGIDMap(t, msrc), - "io": newIO(t, msrc, isThreadGroup), - "maps": newMaps(t, msrc), - "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), - "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), - "ns": newNamespaceDir(t, msrc), - "smaps": newSmaps(t, msrc), - "stat": newTaskStat(t, msrc, isThreadGroup, p.pidns), - "statm": newStatm(t, msrc), - "status": newStatus(t, msrc, p.pidns), - "uid_map": newUIDMap(t, msrc), + "auxv": newAuxvec(t, msrc), + "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), + "comm": newComm(t, msrc), + "environ": newExecArgInode(t, msrc, environExecArg), + "exe": newExe(t, msrc), + "fd": newFdDir(t, msrc), + "fdinfo": newFdInfoDir(t, msrc), + "gid_map": newGIDMap(t, msrc), + "io": newIO(t, msrc, isThreadGroup), + "maps": newMaps(t, msrc), + "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), + "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), + "ns": newNamespaceDir(t, msrc), + "oom_score": newOOMScore(t, msrc), + "oom_score_adj": newOOMScoreAdj(t, msrc), + "smaps": newSmaps(t, msrc), + "stat": newTaskStat(t, msrc, isThreadGroup, p.pidns), + "statm": newStatm(t, msrc), + "status": newStatus(t, msrc, p.pidns), + "uid_map": newUIDMap(t, msrc), } if isThreadGroup { contents["task"] = p.newSubtasks(t, msrc) @@ -796,4 +798,92 @@ func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequenc return int64(n), err } +// newOOMScore returns a oom_score file. It is a stub that always returns 0. 
+// TODO(gvisor.dev/issue/1967) +func newOOMScore(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newStaticProcInode(t, msrc, []byte("0\n")) +} + +// oomScoreAdj is a file containing the oom_score adjustment for a task. +// +// +stateify savable +type oomScoreAdj struct { + fsutil.SimpleFileInode + + t *kernel.Task +} + +// +stateify savable +type oomScoreAdjFile struct { + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + + t *kernel.Task +} + +// newOOMScoreAdj returns a oom_score_adj file. +func newOOMScoreAdj(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + i := &oomScoreAdj{ + SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC), + t: t, + } + return newProcInode(t, i, msrc, fs.SpecialFile, t) +} + +// Truncate implements fs.InodeOperations.Truncate. Truncate is called when +// O_TRUNC is specified for any kind of existing Dirent but is not called via +// (f)truncate for proc files. +func (*oomScoreAdj) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// GetFile implements fs.InodeOperations.GetFile. +func (o *oomScoreAdj) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &oomScoreAdjFile{t: o.t}), nil +} + +// Read implements fs.FileOperations.Read. 
+func (f *oomScoreAdjFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + return 0, io.EOF + } + adj, err := f.t.OOMScoreAdj() + if err != nil { + return 0, err + } + adjBytes := []byte(strconv.FormatInt(int64(adj), 10) + "\n") + n, err := dst.CopyOut(ctx, adjBytes) + return int64(n), err +} + +// Write implements fs.FileOperations.Write. +func (f *oomScoreAdjFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + + // Limit input size so as not to impact performance if input size is large. + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return 0, err + } + + if err := f.t.SetOOMScoreAdj(v); err != nil { + return 0, err + } + + return n, nil +} + // LINT.ThenChange(../../fsimpl/proc/task.go|../../fsimpl/proc/task_files.go) diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 2d814668a..18e5cd6f6 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -62,11 +62,13 @@ func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNames "pid": newNamespaceSymlink(task, inoGen.NextIno(), "pid"), "user": newNamespaceSymlink(task, inoGen.NextIno(), "user"), }), - "smaps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}), - "stat": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), - "statm": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}), - "status": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}), - "uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}), + "oom_score": newTaskOwnedFile(task, inoGen.NextIno(), 0444, newStaticFile("0\n")), + 
"oom_score_adj": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &oomScoreAdj{task: task}), + "smaps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}), + "stat": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), + "statm": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}), + "status": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}), + "uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}), } if isThreadGroup { contents["task"] = newSubtasks(task, pidns, inoGen, cgroupControllers) diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index efd3b3453..5a231ac86 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -525,3 +525,46 @@ func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled) return nil } + +// oomScoreAdj is a stub of the /proc//oom_score_adj file. +// +// +stateify savable +type oomScoreAdj struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error { + adj, err := o.task.OOMScoreAdj() + if err != nil { + return err + } + fmt.Fprintf(buf, "%d\n", adj) + return nil +} + +// Write implements vfs.WritableDynamicBytesSource.Write. +func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + + // Limit input size so as not to impact performance if input size is large. 
+ src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return 0, err + } + + if err := o.task.SetOOMScoreAdj(v); err != nil { + return 0, err + } + + return n, nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index c5d531fe0..0eb401619 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -63,21 +63,23 @@ var ( "thread-self": threadSelfLink.NextOff, } taskStaticFiles = map[string]testutil.DirentType{ - "auxv": linux.DT_REG, - "cgroup": linux.DT_REG, - "cmdline": linux.DT_REG, - "comm": linux.DT_REG, - "environ": linux.DT_REG, - "gid_map": linux.DT_REG, - "io": linux.DT_REG, - "maps": linux.DT_REG, - "ns": linux.DT_DIR, - "smaps": linux.DT_REG, - "stat": linux.DT_REG, - "statm": linux.DT_REG, - "status": linux.DT_REG, - "task": linux.DT_DIR, - "uid_map": linux.DT_REG, + "auxv": linux.DT_REG, + "cgroup": linux.DT_REG, + "cmdline": linux.DT_REG, + "comm": linux.DT_REG, + "environ": linux.DT_REG, + "gid_map": linux.DT_REG, + "io": linux.DT_REG, + "maps": linux.DT_REG, + "ns": linux.DT_DIR, + "oom_score": linux.DT_REG, + "oom_score_adj": linux.DT_REG, + "smaps": linux.DT_REG, + "stat": linux.DT_REG, + "statm": linux.DT_REG, + "status": linux.DT_REG, + "task": linux.DT_DIR, + "uid_map": linux.DT_REG, } ) diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 2cee2e6ed..c0dbbe890 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -37,6 +37,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -554,6 +555,13 @@ type Task struct { // // startTime is protected by mu. startTime ktime.Time + + // oomScoreAdj is the task's OOM score adjustment. 
This is currently not + // used but is maintained for consistency. + // TODO(gvisor.dev/issue/1967) + // + // oomScoreAdj is protected by mu, and is owned by the task goroutine. + oomScoreAdj int32 } func (t *Task) savePtraceTracer() *Task { @@ -847,3 +855,28 @@ func (t *Task) AbstractSockets() *AbstractSocketNamespace { func (t *Task) ContainerID() string { return t.containerID } + +// OOMScoreAdj gets the task's OOM score adjustment. +func (t *Task) OOMScoreAdj() (int32, error) { + t.mu.Lock() + defer t.mu.Unlock() + if t.ExitState() == TaskExitDead { + return 0, syserror.ESRCH + } + return t.oomScoreAdj, nil +} + +// SetOOMScoreAdj sets the task's OOM score adjustment. The value should be +// between -1000 and 1000 inclusive. +func (t *Task) SetOOMScoreAdj(adj int32) error { + t.mu.Lock() + defer t.mu.Unlock() + if t.ExitState() == TaskExitDead { + return syserror.ESRCH + } + if adj > 1000 || adj < -1000 { + return syserror.EINVAL + } + t.oomScoreAdj = adj + return nil +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 78866f280..dda502bb8 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -264,6 +264,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { rseqSignature = t.rseqSignature } + adj, err := t.OOMScoreAdj() + if err != nil { + return 0, nil, err + } + cfg := &TaskConfig{ Kernel: t.k, ThreadGroup: tg, @@ -282,6 +287,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { RSeqAddr: rseqAddr, RSeqSignature: rseqSignature, ContainerID: t.ContainerID(), + OOMScoreAdj: adj, } if opts.NewThreadGroup { cfg.Parent = t diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index a5035bb7f..2bbf48bb8 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -93,6 +93,9 @@ type TaskConfig struct { // ContainerID is the container the new task belongs to. 
ContainerID string + + // oomScoreAdj is the task's OOM score adjustment. + OOMScoreAdj int32 } // NewTask creates a new task defined by cfg. @@ -143,6 +146,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { rseqSignature: cfg.RSeqSignature, futexWaiter: futex.NewWaiter(), containerID: cfg.ContainerID, + oomScoreAdj: cfg.OOMScoreAdj, } t.creds.Store(cfg.Credentials) t.endStopCond.L = &t.tg.signalHandlers.mu diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index a69b0ce13..9800a0cdf 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -318,10 +318,14 @@ syscall_test( test = "//test/syscalls/linux:proc_test", ) -syscall_test(test = "//test/syscalls/linux:proc_pid_uid_gid_map_test") - syscall_test(test = "//test/syscalls/linux:proc_net_test") +syscall_test(test = "//test/syscalls/linux:proc_pid_oomscore_test") + +syscall_test(test = "//test/syscalls/linux:proc_pid_smaps_test") + +syscall_test(test = "//test/syscalls/linux:proc_pid_uid_gid_map_test") + syscall_test( size = "medium", test = "//test/syscalls/linux:pselect_test", diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 0fbd556de..43455f1a3 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1631,6 +1631,19 @@ cc_binary( ], ) +cc_binary( + name = "proc_pid_oomscore_test", + testonly = 1, + srcs = ["proc_pid_oomscore.cc"], + linkstatic = 1, + deps = [ + "//test/util:fs_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + ], +) + cc_binary( name = "proc_pid_smaps_test", testonly = 1, diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index f91187e75..5a70f6c3b 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -1431,6 +1431,12 @@ TEST(ProcPidFile, SubprocessRunning) { EXPECT_THAT(ReadWhileRunning("uid_map", buf, sizeof(buf)), SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileRunning("oom_score", buf, sizeof(buf)), + 
SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileRunning("oom_score_adj", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); } // Test whether /proc/PID/ files can be read for a zombie process. @@ -1466,6 +1472,12 @@ TEST(ProcPidFile, SubprocessZombie) { EXPECT_THAT(ReadWhileZombied("uid_map", buf, sizeof(buf)), SyscallSucceedsWithValue(sizeof(buf))); + EXPECT_THAT(ReadWhileZombied("oom_score", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileZombied("oom_score_adj", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux // on proc files. // @@ -1527,6 +1539,15 @@ TEST(ProcPidFile, SubprocessExited) { EXPECT_THAT(ReadWhileExited("uid_map", buf, sizeof(buf)), SyscallSucceedsWithValue(sizeof(buf))); + + if (!IsRunningOnGvisor()) { + // FIXME(gvisor.dev/issue/164): Succeeds on gVisor. + EXPECT_THAT(ReadWhileExited("oom_score", buf, sizeof(buf)), + SyscallFailsWithErrno(ESRCH)); + } + + EXPECT_THAT(ReadWhileExited("oom_score_adj", buf, sizeof(buf)), + SyscallFailsWithErrno(ESRCH)); } PosixError DirContainsImpl(absl::string_view path, diff --git a/test/syscalls/linux/proc_pid_oomscore.cc b/test/syscalls/linux/proc_pid_oomscore.cc new file mode 100644 index 000000000..707821a3f --- /dev/null +++ b/test/syscalls/linux/proc_pid_oomscore.cc @@ -0,0 +1,72 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include "test/util/fs_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +PosixErrorOr ReadProcNumber(std::string path) { + ASSIGN_OR_RETURN_ERRNO(std::string contents, GetContents(path)); + EXPECT_EQ(contents[contents.length() - 1], '\n'); + + int num; + if (!absl::SimpleAtoi(contents, &num)) { + return PosixError(EINVAL, "invalid value: " + contents); + } + + return num; +} + +TEST(ProcPidOomscoreTest, BasicRead) { + auto const oom_score = + ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score")); + EXPECT_LE(oom_score, 1000); + EXPECT_GE(oom_score, -1000); +} + +TEST(ProcPidOomscoreAdjTest, BasicRead) { + auto const oom_score = + ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score_adj")); + + // oom_score_adj defaults to 0. + EXPECT_EQ(oom_score, 0); +} + +TEST(ProcPidOomscoreAdjTest, BasicWrite) { + constexpr int test_value = 7; + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/oom_score_adj", O_WRONLY)); + ASSERT_THAT( + RetryEINTR(write)(fd.get(), std::to_string(test_value).c_str(), 1), + SyscallSucceeds()); + + auto const oom_score = + ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score_adj")); + EXPECT_EQ(oom_score, test_value); +} + +} // namespace + +} // namespace testing +} // namespace gvisor -- cgit v1.2.3 From 6fa5cee82c0f515b001dee5f3840e1f875b2f477 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Fri, 6 Mar 2020 12:30:37 -0800 Subject: Prevent memory leaks in ilist When list elements are removed from a list but not discarded, it becomes important to invalidate the references they hold to their former neighbors to prevent memory leaks. 
PiperOrigin-RevId: 299412421 --- pkg/ilist/list.go | 8 ++++++-- pkg/sentry/fs/dirent_cache.go | 2 -- pkg/sentry/fs/inotify.go | 5 ++++- pkg/sentry/kernel/epoll/epoll_state.go | 13 ++++++++----- pkg/tcpip/network/fragmentation/fragmentation.go | 8 +++++--- 5 files changed, 23 insertions(+), 13 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/ilist/list.go b/pkg/ilist/list.go index f3a609b57..8f93e4d6d 100644 --- a/pkg/ilist/list.go +++ b/pkg/ilist/list.go @@ -169,8 +169,9 @@ func (l *List) InsertBefore(a, e Element) { // Remove removes e from l. func (l *List) Remove(e Element) { - prev := ElementMapper{}.linkerFor(e).Prev() - next := ElementMapper{}.linkerFor(e).Next() + linker := ElementMapper{}.linkerFor(e) + prev := linker.Prev() + next := linker.Next() if prev != nil { ElementMapper{}.linkerFor(prev).SetNext(next) @@ -183,6 +184,9 @@ func (l *List) Remove(e Element) { } else { l.tail = prev } + + linker.SetNext(nil) + linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index 25514ace4..33de32c69 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -101,8 +101,6 @@ func (c *DirentCache) remove(d *Dirent) { panic(fmt.Sprintf("trying to remove %v, which is not in the dirent cache", d)) } c.list.Remove(d) - d.SetPrev(nil) - d.SetNext(nil) d.DecRef() c.currentSize-- if c.limit != nil { diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 928c90aa0..e3a715c1f 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -143,7 +143,10 @@ func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ i } var writeLen int64 - for event := i.events.Front(); event != nil; event = event.Next() { + for it := i.events.Front(); it != nil; { + event := it + it = it.Next() + // Does the buffer have enough remaining space to hold the event we're // about to write out? 
if dst.NumBytes() < int64(event.sizeOf()) { diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go index a0d35d350..8e9f200d0 100644 --- a/pkg/sentry/kernel/epoll/epoll_state.go +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -38,11 +38,14 @@ func (e *EventPoll) afterLoad() { } } - for it := e.waitingList.Front(); it != nil; it = it.Next() { - if it.id.File.Readiness(it.mask) != 0 { - e.waitingList.Remove(it) - e.readyList.PushBack(it) - it.curList = &e.readyList + for it := e.waitingList.Front(); it != nil; { + entry := it + it = it.Next() + + if entry.id.File.Readiness(entry.mask) != 0 { + e.waitingList.Remove(entry) + e.readyList.PushBack(entry) + entry.curList = &e.readyList e.Notify(waiter.EventIn) } } diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go index 92f2aa13a..f42abc4bb 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation.go +++ b/pkg/tcpip/network/fragmentation/fragmentation.go @@ -115,10 +115,12 @@ func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buf // Evict reassemblers if we are consuming more memory than highLimit until // we reach lowLimit. if f.size > f.highLimit { - tail := f.rList.Back() - for f.size > f.lowLimit && tail != nil { + for f.size > f.lowLimit { + tail := f.rList.Back() + if tail == nil { + break + } f.release(tail) - tail = tail.Prev() } } f.mu.Unlock() -- cgit v1.2.3 From b78cee3bae142eb5c602d51874d0cbad274777e2 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 13 Mar 2020 12:16:59 -0700 Subject: Fix lock recursion in kernel.ProcessGroup.SendSignal(). 
PiperOrigin-RevId: 300803515 --- pkg/sentry/kernel/sessions.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 047b5214d..0e19286de 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -246,7 +246,7 @@ func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error { var lastErr error for tg := range tasks.Root.tgids { - if tg.ProcessGroup() == pg { + if tg.processGroup == pg { tg.signalHandlers.mu.Lock() infoCopy := *info if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil { -- cgit v1.2.3 From 1c0535297067179a822ba2dd9a6fe13a8be5a666 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 13 Mar 2020 13:17:59 -0700 Subject: Fix oom_score_adj. - Make oomScoreAdj a ThreadGroup field (Linux: signal_struct::oom_score_adj). - Avoid deadlock caused by Task.OOMScoreAdj()/SetOOMScoreAdj() locking Task.mu and TaskSet.mu in the wrong order (via Task.ExitState()). PiperOrigin-RevId: 300814698 --- pkg/sentry/fs/proc/task.go | 17 ++++++++++------- pkg/sentry/fsimpl/proc/task_files.go | 10 ++++++---- pkg/sentry/kernel/task.go | 29 ++++++----------------------- pkg/sentry/kernel/task_clone.go | 9 +++------ pkg/sentry/kernel/task_start.go | 4 ---- pkg/sentry/kernel/thread_group.go | 7 +++++++ 6 files changed, 32 insertions(+), 44 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 03cc788c8..d6c5dd2c1 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -853,15 +853,15 @@ func (o *oomScoreAdj) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.F // Read implements fs.FileOperations.Read. 
func (f *oomScoreAdjFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { - if offset != 0 { - return 0, io.EOF + if f.t.ExitState() == kernel.TaskExitDead { + return 0, syserror.ESRCH } - adj, err := f.t.OOMScoreAdj() - if err != nil { - return 0, err + var buf bytes.Buffer + fmt.Fprintf(&buf, "%d\n", f.t.OOMScoreAdj()) + if offset >= int64(buf.Len()) { + return 0, io.EOF } - adjBytes := []byte(strconv.FormatInt(int64(adj), 10) + "\n") - n, err := dst.CopyOut(ctx, adjBytes) + n, err := dst.CopyOut(ctx, buf.Bytes()[offset:]) return int64(n), err } @@ -880,6 +880,9 @@ func (f *oomScoreAdjFile) Write(ctx context.Context, _ *fs.File, src usermem.IOS return 0, err } + if f.t.ExitState() == kernel.TaskExitDead { + return 0, syserror.ESRCH + } if err := f.t.SetOOMScoreAdj(v); err != nil { return 0, err } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 5a231ac86..4d3332771 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -539,11 +539,10 @@ var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil) // Generate implements vfs.DynamicBytesSource.Generate. 
func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error { - adj, err := o.task.OOMScoreAdj() - if err != nil { - return err + if o.task.ExitState() == kernel.TaskExitDead { + return syserror.ESRCH } - fmt.Fprintf(buf, "%d\n", adj) + fmt.Fprintf(buf, "%d\n", o.task.OOMScoreAdj()) return nil } @@ -562,6 +561,9 @@ func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset return 0, err } + if o.task.ExitState() == kernel.TaskExitDead { + return 0, syserror.ESRCH + } if err := o.task.SetOOMScoreAdj(v); err != nil { return 0, err } diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index c0dbbe890..8452ddf5b 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -555,13 +555,6 @@ type Task struct { // // startTime is protected by mu. startTime ktime.Time - - // oomScoreAdj is the task's OOM score adjustment. This is currently not - // used but is maintained for consistency. - // TODO(gvisor.dev/issue/1967) - // - // oomScoreAdj is protected by mu, and is owned by the task goroutine. - oomScoreAdj int32 } func (t *Task) savePtraceTracer() *Task { @@ -856,27 +849,17 @@ func (t *Task) ContainerID() string { return t.containerID } -// OOMScoreAdj gets the task's OOM score adjustment. -func (t *Task) OOMScoreAdj() (int32, error) { - t.mu.Lock() - defer t.mu.Unlock() - if t.ExitState() == TaskExitDead { - return 0, syserror.ESRCH - } - return t.oomScoreAdj, nil +// OOMScoreAdj gets the task's thread group's OOM score adjustment. +func (t *Task) OOMScoreAdj() int32 { + return atomic.LoadInt32(&t.tg.oomScoreAdj) } -// SetOOMScoreAdj sets the task's OOM score adjustment. The value should be -// between -1000 and 1000 inclusive. +// SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The +// value should be between -1000 and 1000 inclusive. 
func (t *Task) SetOOMScoreAdj(adj int32) error { - t.mu.Lock() - defer t.mu.Unlock() - if t.ExitState() == TaskExitDead { - return syserror.ESRCH - } if adj > 1000 || adj < -1000 { return syserror.EINVAL } - t.oomScoreAdj = adj + atomic.StoreInt32(&t.tg.oomScoreAdj, adj) return nil } diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index dda502bb8..e1ecca99e 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -15,6 +15,8 @@ package kernel import ( + "sync/atomic" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -260,15 +262,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { sh = sh.Fork() } tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy()) + tg.oomScoreAdj = atomic.LoadInt32(&t.tg.oomScoreAdj) rseqAddr = t.rseqAddr rseqSignature = t.rseqSignature } - adj, err := t.OOMScoreAdj() - if err != nil { - return 0, nil, err - } - cfg := &TaskConfig{ Kernel: t.k, ThreadGroup: tg, @@ -287,7 +285,6 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { RSeqAddr: rseqAddr, RSeqSignature: rseqSignature, ContainerID: t.ContainerID(), - OOMScoreAdj: adj, } if opts.NewThreadGroup { cfg.Parent = t diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 2bbf48bb8..a5035bb7f 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -93,9 +93,6 @@ type TaskConfig struct { // ContainerID is the container the new task belongs to. ContainerID string - - // oomScoreAdj is the task's OOM score adjustment. - OOMScoreAdj int32 } // NewTask creates a new task defined by cfg. 
@@ -146,7 +143,6 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { rseqSignature: cfg.RSeqSignature, futexWaiter: futex.NewWaiter(), containerID: cfg.ContainerID, - oomScoreAdj: cfg.OOMScoreAdj, } t.creds.Store(cfg.Credentials) t.endStopCond.L = &t.tg.signalHandlers.mu diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 268f62e9d..52849f5b3 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -254,6 +254,13 @@ type ThreadGroup struct { // // tty is protected by the signal mutex. tty *TTY + + // oomScoreAdj is the thread group's OOM score adjustment. This is + // currently not used but is maintained for consistency. + // TODO(gvisor.dev/issue/1967) + // + // oomScoreAdj is accessed using atomic memory operations. + oomScoreAdj int32 } // NewThreadGroup returns a new, empty thread group in PID namespace pidns. The -- cgit v1.2.3 From b0f2c3e7646df603156f1b8e8b3382f33353eb04 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 13 Mar 2020 16:08:06 -0700 Subject: Fix infinite loop in semaphore.sem.wakeWaiters(). PiperOrigin-RevId: 300845134 --- pkg/sentry/kernel/semaphore/semaphore.go | 1 + 1 file changed, 1 insertion(+) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 1000f3287..c00fa1138 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -554,6 +554,7 @@ func (s *sem) wakeWaiters() { for w := s.waiters.Front(); w != nil; { if s.value < w.value { // Still blocked, skip it. + w = w.Next() continue } w.ch <- struct{}{} -- cgit v1.2.3 From 829beebf0b67e20e50dd5ec4a5030636e38cc576 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 13 Mar 2020 17:16:59 -0700 Subject: Panic if file in FDTable has been destroyed This will give more information about the file to identify where possibly the extra DecRef() would be. 
PiperOrigin-RevId: 300855874 --- pkg/sentry/kernel/fd_table.go | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 58001d56c..00f914564 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -195,6 +195,8 @@ func (f *FDTable) Size() int { // // It is the caller's responsibility to acquire an appropriate lock. func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { + // retries tracks the number of failed TryIncRef attempts for the same FD. + retries := 0 fd := int32(0) for { file, fileVFS2, flags, ok := f.getAll(fd) @@ -204,17 +206,26 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes switch { case file != nil: if !file.TryIncRef() { + retries++ + if retries > 1000 { + panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, FileOps: %+v", fd, file, file.FileOperations)) + } continue // Race caught. } fn(fd, file, nil, flags) file.DecRef() case fileVFS2 != nil: if !fileVFS2.TryIncRef() { + retries++ + if retries > 1000 { + panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, Impl: %+v", fd, fileVFS2, fileVFS2.Impl())) + } continue // Race caught. 
} fn(fd, nil, fileVFS2, flags) fileVFS2.DecRef() } + retries = 0 fd++ } } -- cgit v1.2.3 From 45a8ae240dd180f1b8b4c56e77ac67e4cd3af96f Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 13 Mar 2020 18:56:05 -0700 Subject: Add remaining procfs files Closes #1195 PiperOrigin-RevId: 300867055 --- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 8 + pkg/sentry/fsimpl/kernfs/kernfs.go | 2 +- pkg/sentry/fsimpl/proc/BUILD | 4 + pkg/sentry/fsimpl/proc/subtasks.go | 6 +- pkg/sentry/fsimpl/proc/task.go | 26 +-- pkg/sentry/fsimpl/proc/task_fds.go | 287 ++++++++++++++++++++++++++++ pkg/sentry/fsimpl/proc/task_files.go | 251 +++++++++++++++++++++++- pkg/sentry/fsimpl/proc/tasks.go | 30 ++- pkg/sentry/fsimpl/proc/tasks_files.go | 52 +++-- pkg/sentry/fsimpl/proc/tasks_test.go | 87 +++++++-- pkg/sentry/fsimpl/testutil/BUILD | 2 + pkg/sentry/fsimpl/testutil/kernel.go | 27 ++- pkg/sentry/kernel/fd_table.go | 7 +- 13 files changed, 709 insertions(+), 80 deletions(-) create mode 100644 pkg/sentry/fsimpl/proc/task_fds.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index d50018b18..94ca3dbdd 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -554,3 +554,11 @@ func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } + +// AlwaysValid partially implements kernfs.inodeDynamicLookup. +type AlwaysValid struct{} + +// Valid implements kernfs.inodeDynamicLookup. 
+func (*AlwaysValid) Valid(context.Context) bool { + return true +} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index a8ab2a2ba..18a34a590 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -319,7 +319,7 @@ type inodeMetadata interface { // CheckPermissions checks that creds may access this inode for the // requested access type, per the the rules of // fs/namei.c:generic_permission(). - CheckPermissions(ctx context.Context, creds *auth.Credentials, atx vfs.AccessTypes) error + CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error // Mode returns the (struct stat)::st_mode value for this inode. This is // separated from Stat for performance. diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index bb609a305..8156984eb 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -8,6 +8,7 @@ go_library( "filesystem.go", "subtasks.go", "task.go", + "task_fds.go", "task_files.go", "task_net.go", "tasks.go", @@ -19,8 +20,10 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/log", + "//pkg/refs", "//pkg/safemem", "//pkg/sentry/fs", + "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", @@ -53,6 +56,7 @@ go_test( "//pkg/fspath", "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/testutil", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 611645f3f..a3a7c16a5 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -34,6 +34,7 @@ type subtasksInode struct { kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs kernfs.OrderedChildren + kernfs.AlwaysValid task *kernel.Task pidns *kernel.PIDNamespace @@ -61,11 +62,6 @@ func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen 
InoGenera return dentry } -// Valid implements kernfs.inodeDynamicLookup. -func (i *subtasksInode) Valid(ctx context.Context) bool { - return true -} - // Lookup implements kernfs.inodeDynamicLookup. func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { tid, err := strconv.ParseUint(name, 10, 32) diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 493acbd1b..4891caab6 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -45,19 +45,19 @@ var _ kernfs.Inode = (*taskInode)(nil) func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry { contents := map[string]*kernfs.Dentry{ - "auxv": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}), - "cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), - "comm": newComm(task, inoGen.NextIno(), 0444), - "environ": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), - //"exe": newExe(t, msrc), - //"fd": newFdDir(t, msrc), - //"fdinfo": newFdInfoDir(t, msrc), - "gid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: true}), - "io": newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)), - "maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}), - //"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), - //"mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), - "net": newTaskNetDir(task, inoGen), + "auxv": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}), + "cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), + "comm": newComm(task, inoGen.NextIno(), 0444), + "environ": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: 
environDataArg}), + "exe": newExeSymlink(task, inoGen.NextIno()), + "fd": newFDDirInode(task, inoGen), + "fdinfo": newFDInfoDirInode(task, inoGen), + "gid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: true}), + "io": newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)), + "maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}), + "mountinfo": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mountInfoData{task: task}), + "mounts": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mountsData{task: task}), + "net": newTaskNetDir(task, inoGen), "ns": newTaskOwnedDir(task, inoGen.NextIno(), 0511, map[string]*kernfs.Dentry{ "net": newNamespaceSymlink(task, inoGen.NextIno(), "net"), "pid": newNamespaceSymlink(task, inoGen.NextIno(), "pid"), diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go new file mode 100644 index 000000000..76bfc5307 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -0,0 +1,287 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package proc + +import ( + "bytes" + "fmt" + "sort" + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +type fdDir struct { + inoGen InoGenerator + task *kernel.Task + + // When produceSymlinks is set, dirents produces for the FDs are reported + // as symlink. Otherwise, they are reported as regular files. + produceSymlink bool +} + +func (i *fdDir) lookup(name string) (*vfs.FileDescription, kernel.FDFlags, error) { + fd, err := strconv.ParseUint(name, 10, 64) + if err != nil { + return nil, kernel.FDFlags{}, syserror.ENOENT + } + + var ( + file *vfs.FileDescription + flags kernel.FDFlags + ) + i.task.WithMuLocked(func(t *kernel.Task) { + if fdTable := t.FDTable(); fdTable != nil { + file, flags = fdTable.GetVFS2(int32(fd)) + } + }) + if file == nil { + return nil, kernel.FDFlags{}, syserror.ENOENT + } + return file, flags, nil +} + +// IterDirents implements kernfs.inodeDynamicLookup. +func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, absOffset, relOffset int64) (int64, error) { + var fds []int32 + i.task.WithMuLocked(func(t *kernel.Task) { + if fdTable := t.FDTable(); fdTable != nil { + fds = fdTable.GetFDs() + } + }) + + offset := absOffset + relOffset + typ := uint8(linux.DT_REG) + if i.produceSymlink { + typ = linux.DT_LNK + } + + // Find the appropriate starting point. 
+ idx := sort.Search(len(fds), func(i int) bool { return fds[i] >= int32(relOffset) }) + if idx >= len(fds) { + return offset, nil + } + for _, fd := range fds[idx:] { + dirent := vfs.Dirent{ + Name: strconv.FormatUint(uint64(fd), 10), + Type: typ, + Ino: i.inoGen.NextIno(), + NextOff: offset + 1, + } + if err := cb.Handle(dirent); err != nil { + return offset, err + } + offset++ + } + return offset, nil +} + +// fdDirInode represents the inode for /proc/[pid]/fd directory. +// +// +stateify savable +type fdDirInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeAttrs + kernfs.OrderedChildren + kernfs.AlwaysValid + fdDir +} + +var _ kernfs.Inode = (*fdDirInode)(nil) + +func newFDDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { + inode := &fdDirInode{ + fdDir: fdDir{ + inoGen: inoGen, + task: task, + produceSymlink: true, + }, + } + inode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + + dentry := &kernfs.Dentry{} + dentry.Init(inode) + inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + + return dentry +} + +// Lookup implements kernfs.inodeDynamicLookup. +func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + file, _, err := i.lookup(name) + if err != nil { + return nil, err + } + taskDentry := newFDSymlink(i.task.Credentials(), file, i.inoGen.NextIno()) + return taskDentry.VFSDentry(), nil +} + +// Open implements kernfs.Inode. +func (i *fdDirInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + return fd.VFSFileDescription(), nil +} + +// CheckPermissions implements kernfs.Inode. +// +// This is to match Linux, which uses a special permission handler to guarantee +// that a process can still access /proc/self/fd after it has executed +// setuid. See fs/proc/fd.c:proc_fd_permission. 
+func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { + err := i.InodeAttrs.CheckPermissions(ctx, creds, ats) + if err == nil { + // Access granted, no extra check needed. + return nil + } + if t := kernel.TaskFromContext(ctx); t != nil { + // Allow access if the task trying to access it is in the thread group + // corresponding to this directory. + if i.task.ThreadGroup() == t.ThreadGroup() { + // Access granted (overridden). + return nil + } + } + return err +} + +// fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file. +// +// +stateify savable +type fdSymlink struct { + refs.AtomicRefCount + kernfs.InodeAttrs + kernfs.InodeSymlink + + file *vfs.FileDescription +} + +var _ kernfs.Inode = (*fdSymlink)(nil) + +func newFDSymlink(creds *auth.Credentials, file *vfs.FileDescription, ino uint64) *kernfs.Dentry { + file.IncRef() + inode := &fdSymlink{file: file} + inode.Init(creds, ino, linux.ModeSymlink|0777) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +func (s *fdSymlink) Readlink(ctx context.Context) (string, error) { + root := vfs.RootFromContext(ctx) + defer root.DecRef() + + vfsObj := s.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem() + return vfsObj.PathnameWithDeleted(ctx, root, s.file.VirtualDentry()) +} + +func (s *fdSymlink) DecRef() { + s.AtomicRefCount.DecRefWithDestructor(func() { + s.Destroy() + }) +} + +func (s *fdSymlink) Destroy() { + s.file.DecRef() +} + +// fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory. 
+// +// +stateify savable +type fdInfoDirInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeAttrs + kernfs.OrderedChildren + kernfs.AlwaysValid + fdDir +} + +var _ kernfs.Inode = (*fdInfoDirInode)(nil) + +func newFDInfoDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { + inode := &fdInfoDirInode{ + fdDir: fdDir{ + inoGen: inoGen, + task: task, + }, + } + inode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + + dentry := &kernfs.Dentry{} + dentry.Init(inode) + inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + + return dentry +} + +// Lookup implements kernfs.inodeDynamicLookup. +func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + file, flags, err := i.lookup(name) + if err != nil { + return nil, err + } + + data := &fdInfoData{file: file, flags: flags} + dentry := newTaskOwnedFile(i.task, i.inoGen.NextIno(), 0444, data) + return dentry.VFSDentry(), nil +} + +// Open implements kernfs.Inode. +func (i *fdInfoDirInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + return fd.VFSFileDescription(), nil +} + +// fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd]. +// +// +stateify savable +type fdInfoData struct { + kernfs.DynamicBytesFile + refs.AtomicRefCount + + file *vfs.FileDescription + flags kernel.FDFlags +} + +var _ dynamicInode = (*fdInfoData)(nil) + +func (d *fdInfoData) DecRef() { + d.AtomicRefCount.DecRefWithDestructor(d.destroy) +} + +func (d *fdInfoData) destroy() { + d.file.DecRef() +} + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { + // TODO(b/121266871): Include pos, locks, and other data. For now we only + // have flags. 
+ // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt + flags := uint(d.file.StatusFlags()) | d.flags.ToLinuxFileFlags() + fmt.Fprintf(buf, "flags:\t0%o\n", flags) + return nil +} diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 4d3332771..8c743df8d 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -18,10 +18,14 @@ import ( "bytes" "fmt" "io" + "sort" + "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -496,7 +500,7 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } -// ioUsage is the /proc//io and /proc//task//io data provider. +// ioUsage is the /proc/[pid]/io and /proc/[pid]/task/[tid]/io data provider. type ioUsage interface { // IOUsage returns the io usage data. IOUsage() *usage.IO @@ -570,3 +574,248 @@ func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset return n, nil } + +// exeSymlink is an symlink for the /proc/[pid]/exe file. +// +// +stateify savable +type exeSymlink struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeSymlink + + task *kernel.Task +} + +var _ kernfs.Inode = (*exeSymlink)(nil) + +func newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry { + inode := &exeSymlink{task: task} + inode.Init(task.Credentials(), ino, linux.ModeSymlink|0777) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +// Readlink implements kernfs.Inode. +func (s *exeSymlink) Readlink(ctx context.Context) (string, error) { + if !kernel.ContextCanTrace(ctx, s.task, false) { + return "", syserror.EACCES + } + + // Pull out the executable for /proc/[pid]/exe. 
+ exec, err := s.executable() + if err != nil { + return "", err + } + defer exec.DecRef() + + return exec.PathnameWithDeleted(ctx), nil +} + +func (s *exeSymlink) executable() (file fsbridge.File, err error) { + s.task.WithMuLocked(func(t *kernel.Task) { + mm := t.MemoryManager() + if mm == nil { + // TODO(b/34851096): Check shouldn't allow Readlink once the + // Task is zombied. + err = syserror.EACCES + return + } + + // The MemoryManager may be destroyed, in which case + // MemoryManager.destroy will simply set the executable to nil + // (with locks held). + file = mm.Executable() + if file == nil { + err = syserror.ENOENT + } + }) + return +} + +// forEachMountSource runs f for the process root mount and each mount that is +// a descendant of the root. +func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) { + var fsctx *kernel.FSContext + t.WithMuLocked(func(t *kernel.Task) { + fsctx = t.FSContext() + }) + if fsctx == nil { + // The task has been destroyed. Nothing to show here. + return + } + + // All mount points must be relative to the rootDir, and mounts outside + // will be excluded. + rootDir := fsctx.RootDirectory() + if rootDir == nil { + // The task has been destroyed. Nothing to show here. + return + } + defer rootDir.DecRef() + + mnt := t.MountNamespace().FindMount(rootDir) + if mnt == nil { + // Has it just been unmounted? + return + } + ms := t.MountNamespace().AllMountsUnder(mnt) + sort.Slice(ms, func(i, j int) bool { + return ms[i].ID < ms[j].ID + }) + for _, m := range ms { + mroot := m.Root() + if mroot == nil { + continue // No longer valid. + } + mountPath, desc := mroot.FullName(rootDir) + mroot.DecRef() + if !desc { + // MountSources that are not descendants of the chroot jail are ignored. + continue + } + fn(mountPath, m) + } +} + +// mountInfoData is used to implement /proc/[pid]/mountinfo. 
+// +// +stateify savable +type mountInfoData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*mountInfoData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { + forEachMount(i.task, func(mountPath string, m *fs.Mount) { + mroot := m.Root() + if mroot == nil { + return // No longer valid. + } + defer mroot.DecRef() + + // Format: + // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue + // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) + + // (1) MountSource ID. + fmt.Fprintf(buf, "%d ", m.ID) + + // (2) Parent ID (or this ID if there is no parent). + pID := m.ID + if !m.IsRoot() && !m.IsUndo() { + pID = m.ParentID + } + fmt.Fprintf(buf, "%d ", pID) + + // (3) Major:Minor device ID. We don't have a superblock, so we + // just use the root inode device number. + sa := mroot.Inode.StableAttr + fmt.Fprintf(buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor) + + // (4) Root: the pathname of the directory in the filesystem + // which forms the root of this mount. + // + // NOTE(b/78135857): This will always be "/" until we implement + // bind mounts. + fmt.Fprintf(buf, "/ ") + + // (5) Mount point (relative to process root). + fmt.Fprintf(buf, "%s ", mountPath) + + // (6) Mount options. + flags := mroot.Inode.MountSource.Flags + opts := "rw" + if flags.ReadOnly { + opts = "ro" + } + if flags.NoAtime { + opts += ",noatime" + } + if flags.NoExec { + opts += ",noexec" + } + fmt.Fprintf(buf, "%s ", opts) + + // (7) Optional fields: zero or more fields of the form "tag[:value]". + // (8) Separator: the end of the optional fields is marked by a single hyphen. + fmt.Fprintf(buf, "- ") + + // (9) Filesystem type. + fmt.Fprintf(buf, "%s ", mroot.Inode.MountSource.FilesystemType) + + // (10) Mount source: filesystem-specific information or "none". 
+ fmt.Fprintf(buf, "none ") + + // (11) Superblock options, and final newline. + fmt.Fprintf(buf, "%s\n", superBlockOpts(mountPath, mroot.Inode.MountSource)) + }) + return nil +} + +func superBlockOpts(mountPath string, msrc *fs.MountSource) string { + // gVisor doesn't (yet) have a concept of super block options, so we + // use the ro/rw bit from the mount flag. + opts := "rw" + if msrc.Flags.ReadOnly { + opts = "ro" + } + + // NOTE(b/147673608): If the mount is a cgroup, we also need to include + // the cgroup name in the options. For now we just read that from the + // path. + // TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we + // should get this value from the cgroup itself, and not rely on the + // path. + if msrc.FilesystemType == "cgroup" { + splitPath := strings.Split(mountPath, "/") + cgroupType := splitPath[len(splitPath)-1] + opts += "," + cgroupType + } + return opts +} + +// mountsData is used to implement /proc/[pid]/mounts. +// +// +stateify savable +type mountsData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*mountInfoData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { + forEachMount(i.task, func(mountPath string, m *fs.Mount) { + // Format: + // + // + // We use the filesystem name as the first field, since there + // is no real block device we can point to, and we also should + // not expose anything about the remote filesystem. + // + // Only ro/rw option is supported for now. + // + // The "needs dump"and fsck flags are always 0, which is allowed. + root := m.Root() + if root == nil { + return // No longer valid. 
+ } + defer root.DecRef() + + flags := root.Inode.MountSource.Flags + opts := "rw" + if flags.ReadOnly { + opts = "ro" + } + fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", mountPath, root.Inode.MountSource.FilesystemType, opts, 0, 0) + }) + return nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index d203cebd4..07115664c 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -46,6 +46,7 @@ type tasksInode struct { kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs kernfs.OrderedChildren + kernfs.AlwaysValid inoGen InoGenerator pidns *kernel.PIDNamespace @@ -66,23 +67,23 @@ var _ kernfs.Inode = (*tasksInode)(nil) func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) { root := auth.NewRootCredentials(pidns.UserNamespace()) contents := map[string]*kernfs.Dentry{ - "cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))), - //"filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}), - "loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}), - "sys": newSysDir(root, inoGen, k), - "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), - "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"), - "net": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/net"), - "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{k: k}), - "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), - "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{k: k}), + "cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))), + "filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}), + "loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}), + "sys": newSysDir(root, inoGen, k), + "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), + "mounts": 
kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"), + "net": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/net"), + "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{}), + "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), + "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}), } inode := &tasksInode{ pidns: pidns, inoGen: inoGen, - selfSymlink: newSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(), - threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(), + selfSymlink: newSelfSymlink(root, inoGen.NextIno(), pidns).VFSDentry(), + threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), pidns).VFSDentry(), cgroupControllers: cgroupControllers, } inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555) @@ -121,11 +122,6 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro return taskDentry.VFSDentry(), nil } -// Valid implements kernfs.inodeDynamicLookup. -func (i *tasksInode) Valid(ctx context.Context) bool { - return true -} - // IterDirents implements kernfs.inodeDynamicLookup. 
func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) { // fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256 diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 434998910..b99badba8 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -40,9 +40,9 @@ type selfSymlink struct { var _ kernfs.Inode = (*selfSymlink)(nil) -func newSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry { +func newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { inode := &selfSymlink{pidns: pidns} - inode.Init(creds, ino, linux.ModeSymlink|perm) + inode.Init(creds, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) @@ -72,9 +72,9 @@ type threadSelfSymlink struct { var _ kernfs.Inode = (*threadSelfSymlink)(nil) -func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry { +func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { inode := &threadSelfSymlink{pidns: pidns} - inode.Init(creds, ino, linux.ModeSymlink|perm) + inode.Init(creds, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) @@ -138,21 +138,19 @@ func (c cpuStats) String() string { // +stateify savable type statData struct { kernfs.DynamicBytesFile - - // k is the owning Kernel. - k *kernel.Kernel } var _ dynamicInode = (*statData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. -func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error { +func (*statData) Generate(ctx context.Context, buf *bytes.Buffer) error { // TODO(b/37226836): We currently export only zero CPU stats. We could // at least provide some aggregate stats. 
var cpu cpuStats fmt.Fprintf(buf, "cpu %s\n", cpu) - for c, max := uint(0), s.k.ApplicationCores(); c < max; c++ { + k := kernel.KernelFromContext(ctx) + for c, max := uint(0), k.ApplicationCores(); c < max; c++ { fmt.Fprintf(buf, "cpu%d %s\n", c, cpu) } @@ -176,7 +174,7 @@ func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "ctxt 0\n") // CLOCK_REALTIME timestamp from boot, in seconds. - fmt.Fprintf(buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds()) + fmt.Fprintf(buf, "btime %d\n", k.Timekeeper().BootTime().Seconds()) // Total number of clones. // TODO(b/37226836): Count this. @@ -209,7 +207,7 @@ type loadavgData struct { var _ dynamicInode = (*loadavgData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. -func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { +func (*loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { // TODO(b/62345059): Include real data in fields. // Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods. // Column 4-5: currently running processes and the total number of processes. @@ -223,16 +221,14 @@ func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { // +stateify savable type meminfoData struct { kernfs.DynamicBytesFile - - // k is the owning Kernel. - k *kernel.Kernel } var _ dynamicInode = (*meminfoData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. 
-func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { - mf := d.k.MemoryFile() +func (*meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { + k := kernel.KernelFromContext(ctx) + mf := k.MemoryFile() mf.UpdateUsage() snapshot, totalUsage := usage.MemoryAccounting.Copy() totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) @@ -295,16 +291,14 @@ func (*uptimeData) Generate(ctx context.Context, buf *bytes.Buffer) error { // +stateify savable type versionData struct { kernfs.DynamicBytesFile - - // k is the owning Kernel. - k *kernel.Kernel } var _ dynamicInode = (*versionData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. -func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { - init := v.k.GlobalInit() +func (*versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { + k := kernel.KernelFromContext(ctx) + init := k.GlobalInit() if init == nil { // Attempted to read before the init Task is created. This can // only occur during startup, which should never need to read @@ -335,3 +329,19 @@ func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version) return nil } + +// filesystemsData backs /proc/filesystems. +// +// +stateify savable +type filesystemsData struct { + kernfs.DynamicBytesFile +} + +var _ dynamicInode = (*filesystemsData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. 
+func (d *filesystemsData) Generate(ctx context.Context, buf *bytes.Buffer) error { + k := kernel.KernelFromContext(ctx) + k.VFS().GenerateProcFilesystems(buf) + return nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 1bb9430c0..d0f97c137 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -47,6 +48,7 @@ var ( var ( tasksStaticFiles = map[string]testutil.DirentType{ "cpuinfo": linux.DT_REG, + "filesystems": linux.DT_REG, "loadavg": linux.DT_REG, "meminfo": linux.DT_REG, "mounts": linux.DT_LNK, @@ -68,9 +70,14 @@ var ( "cmdline": linux.DT_REG, "comm": linux.DT_REG, "environ": linux.DT_REG, + "exe": linux.DT_LNK, + "fd": linux.DT_DIR, + "fdinfo": linux.DT_DIR, "gid_map": linux.DT_REG, "io": linux.DT_REG, "maps": linux.DT_REG, + "mountinfo": linux.DT_REG, + "mounts": linux.DT_REG, "net": linux.DT_DIR, "ns": linux.DT_DIR, "oom_score": linux.DT_REG, @@ -96,17 +103,37 @@ func setup(t *testing.T) *testutil.System { k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - fsOpts := vfs.GetFilesystemOptions{ - InternalData: &InternalData{ - Cgroups: map[string]string{ - "cpuset": "/foo/cpuset", - "memory": "/foo/memory", + + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{}) + if err != nil { + t.Fatalf("NewMountNamespace(): %v", err) + } + pop := &vfs.PathOperation{ + Root: mntns.Root(), + Start: mntns.Root(), + Path: fspath.Parse("/proc"), + } + if err := k.VFS().MkdirAt(ctx, creds, pop, &vfs.MkdirOptions{Mode: 0777}); err != nil { + t.Fatalf("MkDir(/proc): %v", err) + } + 
+ pop = &vfs.PathOperation{ + Root: mntns.Root(), + Start: mntns.Root(), + Path: fspath.Parse("/proc"), + } + mntOpts := &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: &InternalData{ + Cgroups: map[string]string{ + "cpuset": "/foo/cpuset", + "memory": "/foo/memory", + }, }, }, } - mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", Name, &fsOpts) - if err != nil { - t.Fatalf("NewMountNamespace(): %v", err) + if err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil { + t.Fatalf("MountAt(/proc): %v", err) } return testutil.NewSystem(ctx, t, k.VFS(), mntns) } @@ -115,7 +142,7 @@ func TestTasksEmpty(t *testing.T) { s := setup(t) defer s.Destroy() - collector := s.ListDirents(s.PathOpAtRoot("/")) + collector := s.ListDirents(s.PathOpAtRoot("/proc")) s.AssertAllDirentTypes(collector, tasksStaticFiles) s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs) } @@ -141,7 +168,7 @@ func TestTasks(t *testing.T) { expectedDirents[fmt.Sprintf("%d", i+1)] = linux.DT_DIR } - collector := s.ListDirents(s.PathOpAtRoot("/")) + collector := s.ListDirents(s.PathOpAtRoot("/proc")) s.AssertAllDirentTypes(collector, expectedDirents) s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs) @@ -181,7 +208,7 @@ func TestTasks(t *testing.T) { } // Test lookup. 
- for _, path := range []string{"/1", "/2"} { + for _, path := range []string{"/proc/1", "/proc/2"} { fd, err := s.VFS.OpenAt( s.Ctx, s.Creds, @@ -191,6 +218,7 @@ func TestTasks(t *testing.T) { if err != nil { t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err) } + defer fd.DecRef() buf := make([]byte, 1) bufIOSeq := usermem.BytesIOSequence(buf) if _, err := fd.Read(s.Ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR { @@ -201,10 +229,10 @@ func TestTasks(t *testing.T) { if _, err := s.VFS.OpenAt( s.Ctx, s.Creds, - s.PathOpAtRoot("/9999"), + s.PathOpAtRoot("/proc/9999"), &vfs.OpenOptions{}, ); err != syserror.ENOENT { - t.Fatalf("wrong error from vfsfs.OpenAt(/9999): %v", err) + t.Fatalf("wrong error from vfsfs.OpenAt(/proc/9999): %v", err) } } @@ -302,12 +330,13 @@ func TestTasksOffset(t *testing.T) { fd, err := s.VFS.OpenAt( s.Ctx, s.Creds, - s.PathOpAtRoot("/"), + s.PathOpAtRoot("/proc"), &vfs.OpenOptions{}, ) if err != nil { t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) } + defer fd.DecRef() if _, err := fd.Seek(s.Ctx, tc.offset, linux.SEEK_SET); err != nil { t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err) } @@ -344,7 +373,7 @@ func TestTask(t *testing.T) { t.Fatalf("CreateTask(): %v", err) } - collector := s.ListDirents(s.PathOpAtRoot("/1")) + collector := s.ListDirents(s.PathOpAtRoot("/proc/1")) s.AssertAllDirentTypes(collector, taskStaticFiles) } @@ -362,14 +391,14 @@ func TestProcSelf(t *testing.T) { collector := s.WithTemporaryContext(task).ListDirents(&vfs.PathOperation{ Root: s.Root, Start: s.Root, - Path: fspath.Parse("/self/"), + Path: fspath.Parse("/proc/self/"), FollowFinalSymlink: true, }) s.AssertAllDirentTypes(collector, taskStaticFiles) } func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.FileDescription) { - t.Logf("Iterating: /proc%s", fd.MappedName(ctx)) + t.Logf("Iterating: %s", fd.MappedName(ctx)) var collector testutil.DirentCollector if err := fd.IterDirents(ctx, &collector); err != nil { @@ -412,6 
+441,7 @@ func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.F t.Errorf("vfsfs.OpenAt(%v) failed: %v", childPath, err) continue } + defer child.DecRef() stat, err := child.Stat(ctx, vfs.StatOptions{}) if err != nil { t.Errorf("Stat(%v) failed: %v", childPath, err) @@ -432,6 +462,22 @@ func TestTree(t *testing.T) { defer s.Destroy() k := kernel.KernelFromContext(s.Ctx) + + pop := &vfs.PathOperation{ + Root: s.Root, + Start: s.Root, + Path: fspath.Parse("test-file"), + } + opts := &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_CREAT, + Mode: 0777, + } + file, err := s.VFS.OpenAt(s.Ctx, s.Creds, pop, opts) + if err != nil { + t.Fatalf("failed to create test file: %v", err) + } + defer file.DecRef() + var tasks []*kernel.Task for i := 0; i < 5; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) @@ -439,6 +485,8 @@ func TestTree(t *testing.T) { if err != nil { t.Fatalf("CreateTask(): %v", err) } + // Add file to populate /proc/[pid]/fd and fdinfo directories. 
+ task.FDTable().NewFDVFS2(task, 0, file, kernel.FDFlags{}) tasks = append(tasks, task) } @@ -446,11 +494,12 @@ func TestTree(t *testing.T) { fd, err := s.VFS.OpenAt( ctx, auth.CredentialsFromContext(s.Ctx), - &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse("/")}, + &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse("/proc")}, &vfs.OpenOptions{}, ) if err != nil { - t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) + t.Fatalf("vfsfs.OpenAt(/proc) failed: %v", err) } iterateDir(ctx, t, s, fd) + fd.DecRef() } diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD index e4f36f4ae..0e4053a46 100644 --- a/pkg/sentry/fsimpl/testutil/BUILD +++ b/pkg/sentry/fsimpl/testutil/BUILD @@ -16,12 +16,14 @@ go_library( "//pkg/cpuid", "//pkg/fspath", "//pkg/memutil", + "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/sched", "//pkg/sentry/limits", "//pkg/sentry/loader", + "//pkg/sentry/mm", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/platform/kvm", diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index 488478e29..c16a36cdb 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -23,13 +23,16 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/time" @@ -123,10 +126,17 @@ func Boot() (*kernel.Kernel, error) { // 
CreateTask creates a new bare bones task for tests. func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) { k := kernel.KernelFromContext(ctx) + exe, err := newFakeExecutable(ctx, k.VFS(), auth.CredentialsFromContext(ctx), root) + if err != nil { + return nil, err + } + m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) + m.SetExecutable(fsbridge.NewVFSFile(exe)) + config := &kernel.TaskConfig{ Kernel: k, ThreadGroup: tc, - TaskContext: &kernel.TaskContext{Name: name}, + TaskContext: &kernel.TaskContext{Name: name, MemoryManager: m}, Credentials: auth.CredentialsFromContext(ctx), NetworkNamespace: k.RootNetworkNamespace(), AllowedCPUMask: sched.NewFullCPUSet(k.ApplicationCores()), @@ -135,10 +145,25 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), MountNamespaceVFS2: mntns, FSContext: kernel.NewFSContextVFS2(root, cwd, 0022), + FDTable: k.NewFDTable(), } return k.TaskSet().NewTask(config) } +func newFakeExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry) (*vfs.FileDescription, error) { + const name = "executable" + pop := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(name), + } + opts := &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_CREAT, + Mode: 0777, + } + return vfsObj.OpenAt(ctx, creds, pop, opts) +} + func createMemoryFile() (*pgalloc.MemoryFile, error) { const memfileName = "test-memory" memfd, err := memutil.CreateMemFD(memfileName, 0) diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 00f914564..7de2e509e 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -191,7 +191,7 @@ func (f *FDTable) Size() int { return int(size) } -// forEach iterates over all non-nil files. 
+// forEach iterates over all non-nil files in sorted order. // // It is the caller's responsibility to acquire an appropriate lock. func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { @@ -458,7 +458,10 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { } } -// GetFDs returns a list of valid fds. +// GetFDs returns a sorted list of valid fds. +// +// Precondition: The caller must be running on the task goroutine, or Task.mu +// must be locked. func (f *FDTable) GetFDs() []int32 { fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { -- cgit v1.2.3 From 5e413cad10d2358a21dd08216953faee70e62a0b Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Sat, 14 Mar 2020 07:13:15 -0700 Subject: Plumb VFS2 imported fds into virtual filesystem. - When setting up the virtual filesystem, mount a host.filesystem to contain all files that need to be imported. - Make read/preadv syscalls to the host in cases where preadv2 may not be supported yet (likewise for writing). - Make save/restore functions in kernel/kernel.go return early if vfs2 is enabled. 
PiperOrigin-RevId: 300922353 --- pkg/abi/linux/file.go | 3 + pkg/sentry/fs/host/control.go | 2 + pkg/sentry/fsimpl/host/BUILD | 2 + pkg/sentry/fsimpl/host/default_file.go | 45 +++++++----- pkg/sentry/fsimpl/host/host.go | 124 ++++++++++++++++++++++++++++++--- pkg/sentry/fsimpl/host/util.go | 28 ++------ pkg/sentry/kernel/kernel.go | 40 +++++++---- pkg/sentry/syscalls/linux/sys_stat.go | 5 +- pkg/sentry/syscalls/linux/vfs2/stat.go | 6 +- runsc/boot/filter/config.go | 1 + test/syscalls/linux/stat.cc | 60 ++++++++++++++-- 11 files changed, 246 insertions(+), 70 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index e229ac21c..dbe58acbe 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -266,6 +266,9 @@ type Statx struct { DevMinor uint32 } +// SizeOfStatx is the size of a Statx struct. +var SizeOfStatx = binary.Size(Statx{}) + // FileMode represents a mode_t. type FileMode uint16 diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index 1658979fc..cd84e1337 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -32,6 +32,8 @@ func newSCMRights(fds []int) control.SCMRights { } // Files implements control.SCMRights.Files. +// +// TODO(gvisor.dev/issue/2017): Port to VFS2. 
func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFiles, bool) { n := max var trunc bool diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 731f192b3..5d67f88e3 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -9,9 +9,11 @@ go_library( "host.go", "util.go", ], + visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/fd", "//pkg/log", "//pkg/refs", "//pkg/safemem", diff --git a/pkg/sentry/fsimpl/host/default_file.go b/pkg/sentry/fsimpl/host/default_file.go index 172cdb161..98682ba5e 100644 --- a/pkg/sentry/fsimpl/host/default_file.go +++ b/pkg/sentry/fsimpl/host/default_file.go @@ -21,6 +21,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -64,9 +65,7 @@ func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts v panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") } - f.mu.Lock() n, err := readFromHostFD(ctx, f.inode.hostFD, dst, -1, int(opts.Flags)) - f.mu.Unlock() if isBlockError(err) { // If we got any data at all, return it as a "completed" partial read // rather than retrying until complete. @@ -86,16 +85,22 @@ func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts v return n, err } -func readFromHostFD(ctx context.Context, fd int, dst usermem.IOSequence, offset int64, flags int) (int64, error) { - if flags&^(linux.RWF_VALID) != 0 { +func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags int) (int64, error) { + // TODO(gvisor.dev/issue/1672): Support select preadv2 flags. 
+ if flags != 0 { return 0, syserror.EOPNOTSUPP } - reader := safemem.FromVecReaderFunc{ - func(srcs [][]byte) (int64, error) { - n, err := unix.Preadv2(fd, srcs, offset, flags) - return int64(n), err - }, + var reader safemem.Reader + if offset == -1 { + reader = safemem.FromIOReader{fd.NewReadWriter(hostFD)} + } else { + reader = safemem.FromVecReaderFunc{ + func(srcs [][]byte) (int64, error) { + n, err := unix.Preadv(hostFD, srcs, offset) + return int64(n), err + }, + } } n, err := dst.CopyOutFrom(ctx, reader) return int64(n), err @@ -120,9 +125,7 @@ func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") } - f.mu.Lock() n, err := writeToHostFD(ctx, f.inode.hostFD, src, -1, int(opts.Flags)) - f.mu.Unlock() if isBlockError(err) { err = syserror.ErrWouldBlock } @@ -137,16 +140,22 @@ func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts return n, err } -func writeToHostFD(ctx context.Context, fd int, src usermem.IOSequence, offset int64, flags int) (int64, error) { - if flags&^(linux.RWF_VALID) != 0 { +func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags int) (int64, error) { + // TODO(gvisor.dev/issue/1672): Support select pwritev2 flags. 
+ if flags != 0 { return 0, syserror.EOPNOTSUPP } - writer := safemem.FromVecWriterFunc{ - func(srcs [][]byte) (int64, error) { - n, err := unix.Pwritev2(fd, srcs, offset, flags) - return int64(n), err - }, + var writer safemem.Writer + if offset == -1 { + writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)} + } else { + writer = safemem.FromVecWriterFunc{ + func(srcs [][]byte) (int64, error) { + n, err := unix.Pwritev(hostFD, srcs, offset) + return int64(n), err + }, + } } n, err := src.CopyInTo(ctx, writer) return int64(n), err diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index c205e6a0b..0be812d13 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -38,10 +38,19 @@ type filesystem struct { kernfs.Filesystem } +// NewMount returns a new disconnected mount in vfsObj that may be passed to ImportFD. +func NewMount(vfsObj *vfs.VirtualFilesystem) (*vfs.Mount, error) { + fs := &filesystem{} + fs.Init(vfsObj) + vfsfs := fs.VFSFilesystem() + // NewDisconnectedMount will take an additional reference on vfsfs. + defer vfsfs.DecRef() + return vfsObj.NewDisconnectedMount(vfsfs, nil, &vfs.MountOptions{}) +} + // ImportFD sets up and returns a vfs.FileDescription from a donated fd. func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID, isTTY bool) (*vfs.FileDescription, error) { - // Must be importing to a mount of host.filesystem. - fs, ok := mnt.Filesystem().Impl().(*filesystem) + fs, ok := mnt.Filesystem().Impl().(*kernfs.Filesystem) if !ok { return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) } @@ -54,8 +63,7 @@ func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID fileMode := linux.FileMode(s.Mode) fileType := fileMode.FileType() - // Pipes, character devices, and sockets can return EWOULDBLOCK for - // operations that would block. + // Pipes, character devices, and sockets. 
isStream := fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK i := &inode{ @@ -143,11 +151,109 @@ func (i *inode) Mode() linux.FileMode { // Stat implements kernfs.Inode. func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + if opts.Mask&linux.STATX__RESERVED != 0 { + return linux.Statx{}, syserror.EINVAL + } + if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE { + return linux.Statx{}, syserror.EINVAL + } + + // Limit our host call only to known flags. + mask := opts.Mask & linux.STATX_ALL var s unix.Statx_t - if err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(opts.Mask), &s); err != nil { + err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s) + // Fallback to fstat(2), if statx(2) is not supported on the host. + // + // TODO(b/151263641): Remove fallback. + if err == syserror.ENOSYS { + return i.fstat(opts) + } else if err != nil { + return linux.Statx{}, err + } + + ls := linux.Statx{Mask: mask} + // Unconditionally fill blksize, attributes, and device numbers, as indicated + // by /include/uapi/linux/stat.h. + // + // RdevMajor/RdevMinor are left as zero, so as not to expose host device + // numbers. + // + // TODO(gvisor.dev/issue/1672): Use kernfs-specific, internally defined + // device numbers. If we use the device number from the host, it may collide + // with another sentry-internal device number. We handle device/inode + // numbers without relying on the host to prevent collisions. 
+ ls.Blksize = s.Blksize + ls.Attributes = s.Attributes + ls.AttributesMask = s.Attributes_mask + + if mask|linux.STATX_TYPE != 0 { + ls.Mode |= s.Mode & linux.S_IFMT + } + if mask|linux.STATX_MODE != 0 { + ls.Mode |= s.Mode &^ linux.S_IFMT + } + if mask|linux.STATX_NLINK != 0 { + ls.Nlink = s.Nlink + } + if mask|linux.STATX_ATIME != 0 { + ls.Atime = unixToLinuxStatxTimestamp(s.Atime) + } + if mask|linux.STATX_BTIME != 0 { + ls.Btime = unixToLinuxStatxTimestamp(s.Btime) + } + if mask|linux.STATX_CTIME != 0 { + ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime) + } + if mask|linux.STATX_MTIME != 0 { + ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime) + } + if mask|linux.STATX_SIZE != 0 { + ls.Size = s.Size + } + if mask|linux.STATX_BLOCKS != 0 { + ls.Blocks = s.Blocks + } + + // Use our own internal inode number and file owner. + if mask|linux.STATX_INO != 0 { + ls.Ino = i.ino + } + if mask|linux.STATX_UID != 0 { + ls.UID = uint32(i.uid) + } + if mask|linux.STATX_GID != 0 { + ls.GID = uint32(i.gid) + } + + return ls, nil +} + +// fstat is a best-effort fallback for inode.Stat() if the host does not +// support statx(2). +// +// We ignore the mask and sync flags in opts and simply supply +// STATX_BASIC_STATS, as fstat(2) itself does not allow the specification +// of a mask or sync flags. fstat(2) does not provide any metadata +// equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so +// those fields remain empty. +func (i *inode) fstat(opts vfs.StatOptions) (linux.Statx, error) { + var s unix.Stat_t + if err := unix.Fstat(i.hostFD, &s); err != nil { return linux.Statx{}, err } - ls := unixToLinuxStatx(s) + + // Note that rdev numbers are left as 0; do not expose host device numbers. 
+ ls := linux.Statx{ + Mask: linux.STATX_BASIC_STATS, + Blksize: uint32(s.Blksize), + Nlink: uint32(s.Nlink), + Mode: uint16(s.Mode), + Size: uint64(s.Size), + Blocks: uint64(s.Blocks), + Atime: timespecToStatxTimestamp(s.Atim), + Ctime: timespecToStatxTimestamp(s.Ctim), + Mtime: timespecToStatxTimestamp(s.Mtim), + } // Use our own internal inode number and file owner. // @@ -159,9 +265,6 @@ func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, erro ls.UID = uint32(i.uid) ls.GID = uint32(i.gid) - // Update file mode from the host. - i.mode = linux.FileMode(ls.Mode) - return ls, nil } @@ -217,7 +320,6 @@ func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptio } func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) { - fileType := i.mode.FileType() if fileType == syscall.S_IFSOCK { if i.isTTY { @@ -227,6 +329,8 @@ func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error return nil, errors.New("importing host sockets not supported") } + // TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that + // we don't allow importing arbitrary file types without proper support. if i.isTTY { // TODO(gvisor.dev/issue/1672): support importing host fd as TTY. 
return nil, errors.New("importing host fd as TTY not supported") diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go index e1ccacb4d..d519feef5 100644 --- a/pkg/sentry/fsimpl/host/util.go +++ b/pkg/sentry/fsimpl/host/util.go @@ -35,34 +35,14 @@ func toTimespec(ts linux.StatxTimestamp, omit bool) unix.Timespec { } } -func unixToLinuxStatx(s unix.Statx_t) linux.Statx { - return linux.Statx{ - Mask: s.Mask, - Blksize: s.Blksize, - Attributes: s.Attributes, - Nlink: s.Nlink, - UID: s.Uid, - GID: s.Gid, - Mode: s.Mode, - Ino: s.Ino, - Size: s.Size, - Blocks: s.Blocks, - AttributesMask: s.Attributes_mask, - Atime: unixToLinuxStatxTimestamp(s.Atime), - Btime: unixToLinuxStatxTimestamp(s.Btime), - Ctime: unixToLinuxStatxTimestamp(s.Ctime), - Mtime: unixToLinuxStatxTimestamp(s.Mtime), - RdevMajor: s.Rdev_major, - RdevMinor: s.Rdev_minor, - DevMajor: s.Dev_major, - DevMinor: s.Dev_minor, - } -} - func unixToLinuxStatxTimestamp(ts unix.StatxTimestamp) linux.StatxTimestamp { return linux.StatxTimestamp{Sec: ts.Sec, Nsec: ts.Nsec} } +func timespecToStatxTimestamp(ts unix.Timespec) linux.StatxTimestamp { + return linux.StatxTimestamp{Sec: int64(ts.Sec), Nsec: uint32(ts.Nsec)} +} + // wouldBlock returns true for file types that can return EWOULDBLOCK // for blocking operations, e.g. pipes, character devices, and sockets. func wouldBlock(fileType uint32) bool { diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 1d627564f..6feda8fa1 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -467,6 +467,11 @@ func (k *Kernel) flushMountSourceRefs() error { // // Precondition: Must be called with the kernel paused. func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. 
+ if VFS2Enabled { + return nil + } + ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { @@ -484,7 +489,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) } func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { - // TODO(gvisor.dev/issues/1663): Add save support for VFS2. + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil @@ -533,6 +538,11 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { } func (ts *TaskSet) unregisterEpollWaiters() { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if VFS2Enabled { + return + } + ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { @@ -1005,11 +1015,14 @@ func (k *Kernel) pauseTimeLocked() { // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { - tfd.PauseTimer() - } - }) + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if !VFS2Enabled { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.PauseTimer() + } + }) + } } } k.timekeeper.PauseUpdates() @@ -1034,12 +1047,15 @@ func (k *Kernel) resumeTimeLocked() { it.ResumeTimer() } } - if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { - tfd.ResumeTimer() - } - }) + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. 
+ if !VFS2Enabled { + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.ResumeTimer() + } + }) + } } } } diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 9bd2df104..a11a87cd1 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -136,7 +136,10 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall mask := args[3].Uint() statxAddr := args[4].Pointer() - if mask&linux.STATX__RESERVED > 0 { + if mask&linux.STATX__RESERVED != 0 { + return 0, nil, syserror.EINVAL + } + if flags&^(linux.AT_SYMLINK_NOFOLLOW|linux.AT_EMPTY_PATH|linux.AT_STATX_SYNC_TYPE) != 0 { return 0, nil, syserror.EINVAL } if flags&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE { diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go index a74ea6fd5..97eaedd66 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat.go @@ -150,7 +150,11 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall mask := args[3].Uint() statxAddr := args[4].Pointer() - if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 { + return 0, nil, syserror.EINVAL + } + + if mask&linux.STATX__RESERVED != 0 { return 0, nil, syserror.EINVAL } diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index a4627905e..f459d1973 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -284,6 +284,7 @@ var allowedSyscalls = seccomp.SyscallRules{ {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, }, syscall.SYS_SIGALTSTACK: {}, + unix.SYS_STATX: {}, syscall.SYS_SYNC_FILE_RANGE: {}, syscall.SYS_TGKILL: []seccomp.Rule{ { diff --git 
a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc index c951ac3b3..513b9cd1c 100644 --- a/test/syscalls/linux/stat.cc +++ b/test/syscalls/linux/stat.cc @@ -607,7 +607,7 @@ int statx(int dirfd, const char* pathname, int flags, unsigned int mask, } TEST_F(StatTest, StatxAbsPath) { - SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 && + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && errno == ENOSYS); struct kernel_statx stx; @@ -617,7 +617,7 @@ TEST_F(StatTest, StatxAbsPath) { } TEST_F(StatTest, StatxRelPathDirFD) { - SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 && + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && errno == ENOSYS); struct kernel_statx stx; @@ -631,7 +631,7 @@ TEST_F(StatTest, StatxRelPathDirFD) { } TEST_F(StatTest, StatxRelPathCwd) { - SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 && + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && errno == ENOSYS); ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds()); @@ -643,7 +643,7 @@ TEST_F(StatTest, StatxRelPathCwd) { } TEST_F(StatTest, StatxEmptyPath) { - SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 && + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && errno == ENOSYS); const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY)); @@ -653,6 +653,58 @@ TEST_F(StatTest, StatxEmptyPath) { EXPECT_TRUE(S_ISREG(stx.stx_mode)); } +TEST_F(StatTest, StatxDoesNotRejectExtraneousMaskBits) { + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && + errno == ENOSYS); + + struct kernel_statx stx; + // Set all mask bits except for STATX__RESERVED. 
+ uint mask = 0xffffffff & ~0x80000000; + EXPECT_THAT(statx(-1, test_file_name_.c_str(), 0, mask, &stx), + SyscallSucceeds()); + EXPECT_TRUE(S_ISREG(stx.stx_mode)); +} + +TEST_F(StatTest, StatxRejectsReservedMaskBit) { + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && + errno == ENOSYS); + + struct kernel_statx stx; + // Set STATX__RESERVED in the mask. + EXPECT_THAT(statx(-1, test_file_name_.c_str(), 0, 0x80000000, &stx), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_F(StatTest, StatxSymlink) { + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && + errno == ENOSYS); + + std::string parent_dir = "/tmp"; + TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo(parent_dir, test_file_name_)); + std::string p = link.path(); + + struct kernel_statx stx; + EXPECT_THAT(statx(AT_FDCWD, p.c_str(), AT_SYMLINK_NOFOLLOW, STATX_ALL, &stx), + SyscallSucceeds()); + EXPECT_TRUE(S_ISLNK(stx.stx_mode)); + EXPECT_THAT(statx(AT_FDCWD, p.c_str(), 0, STATX_ALL, &stx), + SyscallSucceeds()); + EXPECT_TRUE(S_ISREG(stx.stx_mode)); +} + +TEST_F(StatTest, StatxInvalidFlags) { + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && + errno == ENOSYS); + + struct kernel_statx stx; + EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(), 12345, 0, &stx), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(), + 0x6000 /* AT_STATX_SYNC_TYPE */, 0, &stx), + SyscallFailsWithErrno(EINVAL)); +} + } // namespace } // namespace testing -- cgit v1.2.3 From b55f0e5d40c17cadf68d6238564d675ed12f8f49 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 16 Mar 2020 18:28:29 -0700 Subject: fdtable: don't try to zap fdtable entry if close is called for non-existing fd FDTable.setAll is used to zap entries, but it grows the table up to a specified fd. 
Reported-by: syzbot+9e281b0750d2d4caa190@syzkaller.appspotmail.com PiperOrigin-RevId: 301280000 --- pkg/sentry/kernel/fd_table.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 7de2e509e..dddc28d5a 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -536,7 +536,9 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) { case orig2 != nil: orig2.IncRef() } - f.setAll(fd, nil, nil, FDFlags{}) // Zap entry. + if orig != nil || orig2 != nil { + f.setAll(fd, nil, nil, FDFlags{}) // Zap entry. + } return orig, orig2 } -- cgit v1.2.3 From f1d1af2a4ad35dd20a7c56bd9e842e347b126c31 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 18 Mar 2020 15:12:11 -0700 Subject: Fix FDTable.NewFDVFS2 It was looking at VFS1 table to determine where to allocate the next FD from. Updates #1035 PiperOrigin-RevId: 301678858 --- pkg/sentry/kernel/fd_table.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index dddc28d5a..d09d97825 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -338,7 +338,7 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc fd = f.next } for fd < end { - if d, _, _ := f.get(fd); d == nil { + if d, _, _ := f.getVFS2(fd); d == nil { f.setVFS2(fd, file, flags) if fd == f.next { // Update next search start position. -- cgit v1.2.3 From e9e399c25d4fcad2adfe92d73b192b9784774964 Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Thu, 19 Mar 2020 07:18:47 -0700 Subject: Remove workMu from tcpip.Endpoint. workMu is removed and e.mu is now a mutex that supports TryLock. The packet processing path tries to lock the mutex and if its locked it will just queue the packet and move on. 
The endpoint.UnlockUser() will process any backlog of packets before unlocking the socket. This simplifies the locking inside tcp endpoints a lot. Further the endpoint.LockUser() implements spinning as long as the lock is not held by another syscall goroutine. This ensures low latency as not spinning leads to the task thread being put to sleep if the lock is held by the packet dispatch path. This is suboptimal as the lower layer rarely holds the lock for long so implementing spinning here helps. If the lock is held by another task goroutine then we just proceed to call LockUser() and the task could be put to sleep. The protocol goroutines themselves just call e.mu.Lock() and block if the lock is currently not available. Updates #231, #357 PiperOrigin-RevId: 301808349 --- pkg/sentry/kernel/epoll/epoll.go | 2 + pkg/sentry/socket/netstack/netstack.go | 24 +- pkg/tcpip/transport/tcp/accept.go | 90 +++--- pkg/tcpip/transport/tcp/connect.go | 78 ++--- pkg/tcpip/transport/tcp/dispatcher.go | 8 +- pkg/tcpip/transport/tcp/endpoint.go | 495 ++++++++++++++++-------------- pkg/tcpip/transport/tcp/endpoint_state.go | 5 +- pkg/tcpip/transport/tcp/protocol.go | 38 +-- pkg/tcpip/transport/tcp/rcv.go | 6 - pkg/tcpip/transport/tcp/segment_queue.go | 8 +- pkg/tcpip/transport/tcp/snd.go | 3 - pkg/tcpip/transport/tcp/tcp_test.go | 41 ++- 12 files changed, 424 insertions(+), 374 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 8bffb78fc..592650923 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -296,8 +296,10 @@ func (*readyCallback) Callback(w *waiter.Entry) { e.waitingList.Remove(entry) e.readyList.PushBack(entry) entry.curList = &e.readyList + e.listsMu.Unlock() e.Notify(waiter.EventIn) + return } e.listsMu.Unlock() diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 13a9a60b4..a2e1da02f 100644 --- 
a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -29,6 +29,7 @@ import ( "io" "math" "reflect" + "sync/atomic" "syscall" "time" @@ -264,6 +265,12 @@ type SocketOperations struct { skType linux.SockType protocol int + // readViewHasData is 1 iff readView has data to be read, 0 otherwise. + // Must be accessed using atomic operations. It must only be written + // with readMu held but can be read without holding readMu. The latter + // is required to avoid deadlocks in epoll Readiness checks. + readViewHasData uint32 + // readMu protects access to the below fields. readMu sync.Mutex `state:"nosave"` // readView contains the remaining payload from the last packet. @@ -410,21 +417,24 @@ func (s *SocketOperations) isPacketBased() bool { // fetchReadView updates the readView field of the socket if it's currently // empty. It assumes that the socket is locked. +// +// Precondition: s.readMu must be held. func (s *SocketOperations) fetchReadView() *syserr.Error { if len(s.readView) > 0 { return nil } - s.readView = nil s.sender = tcpip.FullAddress{} v, cms, err := s.Endpoint.Read(&s.sender) if err != nil { + atomic.StoreUint32(&s.readViewHasData, 0) return syserr.TranslateNetstackError(err) } s.readView = v s.readCM = cms + atomic.StoreUint32(&s.readViewHasData, 1) return nil } @@ -623,11 +633,9 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { // Check our cached value iff the caller asked for readability and the // endpoint itself is currently not readable. 
if (mask & ^r & waiter.EventIn) != 0 { - s.readMu.Lock() - if len(s.readView) > 0 { + if atomic.LoadUint32(&s.readViewHasData) == 1 { r |= waiter.EventIn } - s.readMu.Unlock() } return r @@ -2334,6 +2342,10 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq } copied += n s.readView.TrimFront(n) + if len(s.readView) == 0 { + atomic.StoreUint32(&s.readViewHasData, 0) + } + dst = dst.DropFirst(n) if e != nil { err = syserr.FromError(e) @@ -2456,6 +2468,10 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe s.readView.TrimFront(int(n)) } + if len(s.readView) == 0 { + atomic.StoreUint32(&s.readViewHasData, 0) + } + var flags int if msgLen > int(n) { flags |= linux.MSG_TRUNC diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index 85049e54e..4d7602d54 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -221,7 +221,8 @@ func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnu } // createConnectingEndpoint creates a new endpoint in a connecting state, with -// the connection parameters given by the arguments. +// the connection parameters given by the arguments. The endpoint is returned +// with n.mu held. func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) { // Create a new endpoint. netProto := l.netProto @@ -243,21 +244,6 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i n.initGSO() - // Now inherit any socket options that should be inherited from the - // listening endpoint. - // In case of Forwarder listenEP will be nil and hence this check. - if l.listenEP != nil { - l.listenEP.propagateInheritableOptions(n) - } - - // Register new endpoint so that packets are routed to it. 
- if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.reusePort, n.boundBindToDevice); err != nil { - n.Close() - return nil, err - } - - n.isRegistered = true - // Create sender and receiver. // // The receiver at least temporarily has a zero receive window scale, @@ -269,11 +255,27 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i // window to grow to a really large value. n.rcvAutoParams.prevCopied = n.initialReceiveWindow() + // Lock the endpoint before registering to ensure that no out of + // band changes are possible due to incoming packets etc till + // the endpoint is done initializing. + n.mu.Lock() + + // Register new endpoint so that packets are routed to it. + if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.reusePort, n.boundBindToDevice); err != nil { + n.mu.Unlock() + n.Close() + return nil, err + } + + n.isRegistered = true + return n, nil } // createEndpointAndPerformHandshake creates a new endpoint in connected state // and then performs the TCP 3-way handshake. +// +// The new endpoint is returned with e.mu held. func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) { // Create new endpoint. irs := s.sequenceNumber @@ -289,9 +291,25 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head l.listenEP.mu.Lock() if l.listenEP.EndpointState() != StateListen { l.listenEP.mu.Unlock() + // Ensure we release any registrations done by the newly + // created endpoint. + ep.mu.Unlock() + ep.Close() + + // Wake up any waiters. This is strictly not required normally + // as a socket that was never accepted can't really have any + // registered waiters except when stack.Wait() is called which + // waits for all registered endpoints to stop and expects an + // EventHUp. 
+ ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) return nil, tcpip.ErrConnectionAborted } l.addPendingEndpoint(ep) + + // Propagate any inheritable options from the listening endpoint + // to the newly created endpoint. + l.listenEP.propagateInheritableOptionsLocked(ep) + deferAccept = l.listenEP.deferAccept l.listenEP.mu.Unlock() } @@ -299,6 +317,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head // Perform the 3-way handshake. h := newPassiveHandshake(ep, seqnum.Size(ep.initialReceiveWindow()), isn, irs, opts, deferAccept) if err := h.execute(); err != nil { + ep.mu.Unlock() ep.Close() // Wake up any waiters. This is strictly not required normally // as a socket that was never accepted can't really have any @@ -312,9 +331,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head } return nil, err } - ep.mu.Lock() ep.isConnectNotified = true - ep.mu.Unlock() // Update the receive window scaling. We can't do it before the // handshake because it's possible that the peer doesn't support window @@ -366,12 +383,12 @@ func (e *endpoint) deliverAccepted(n *endpoint) { } } -// propagateInheritableOptions propagates any options set on the listening +// propagateInheritableOptionsLocked propagates any options set on the listening // endpoint to the newly created endpoint. -func (e *endpoint) propagateInheritableOptions(n *endpoint) { - e.mu.Lock() +// +// Precondition: e.mu and n.mu must be held. +func (e *endpoint) propagateInheritableOptionsLocked(n *endpoint) { n.userTimeout = e.userTimeout - e.mu.Unlock() } // handleSynSegment is called in its own goroutine once the listening endpoint @@ -382,7 +399,11 @@ func (e *endpoint) propagateInheritableOptions(n *endpoint) { // cookies to accept connections. 
func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header.TCPSynOptions) { defer decSynRcvdCount() - defer e.decSynRcvdCount() + defer func() { + e.mu.Lock() + e.decSynRcvdCount() + e.mu.Unlock() + }() defer s.decRef() n, err := ctx.createEndpointAndPerformHandshake(s, opts, &waiter.Queue{}) @@ -399,29 +420,21 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header } func (e *endpoint) incSynRcvdCount() bool { - e.mu.Lock() - if e.synRcvdCount >= cap(e.acceptedChan) { - e.mu.Unlock() + if e.synRcvdCount >= (cap(e.acceptedChan)) { return false } e.synRcvdCount++ - e.mu.Unlock() return true } func (e *endpoint) decSynRcvdCount() { - e.mu.Lock() e.synRcvdCount-- - e.mu.Unlock() } func (e *endpoint) acceptQueueIsFull() bool { - e.mu.Lock() if l, c := len(e.acceptedChan)+e.synRcvdCount, cap(e.acceptedChan); l >= c { - e.mu.Unlock() return true } - e.mu.Unlock() return false } @@ -559,6 +572,10 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { return } + // Propagate any inheritable options from the listening endpoint + // to the newly created endpoint. + e.propagateInheritableOptionsLocked(n) + // clear the tsOffset for the newly created // endpoint as the Timestamp was already // randomly offset when the original SYN-ACK was @@ -593,14 +610,12 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error { e.mu.Lock() v6only := e.v6only - e.mu.Unlock() ctx := newListenContext(e.stack, e, rcvWnd, v6only, e.NetProto) defer func() { // Mark endpoint as closed. This will prevent goroutines running // handleSynSegment() from attempting to queue new connections // to the endpoint. - e.mu.Lock() e.setEndpointState(StateClose) // close any endpoints in SYN-RCVD state. 
@@ -622,7 +637,10 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error { s.AddWaker(&e.notificationWaker, wakerForNotification) s.AddWaker(&e.newSegmentWaker, wakerForNewSegment) for { - switch index, _ := s.Fetch(true); index { + e.mu.Unlock() + index, _ := s.Fetch(true) + e.mu.Lock() + switch index { case wakerForNotification: n := e.fetchNotifications() if n&notifyClose != 0 { @@ -635,7 +653,9 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error { s.decRef() } close(e.drainDone) + e.mu.Unlock() <-e.undrain + e.mu.Lock() } case wakerForNewSegment: diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index be86af502..edb37a549 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -61,6 +61,9 @@ const ( ) // handshake holds the state used during a TCP 3-way handshake. +// +// NOTE: handshake.ep.mu is held during handshake processing. It is released if +// we are going to block and reacquired when we start processing an event. type handshake struct { ep *endpoint state handshakeState @@ -209,9 +212,7 @@ func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *hea h.mss = opts.MSS h.sndWndScale = opts.WS h.deferAccept = deferAccept - h.ep.mu.Lock() h.ep.setEndpointState(StateSynRecv) - h.ep.mu.Unlock() } // checkAck checks if the ACK number, if present, of a segment received during @@ -241,9 +242,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error { // RFC 793, page 67, states that "If the RST bit is set [and] If the ACK // was acceptable then signal the user "error: connection reset", drop // the segment, enter CLOSED state, delete TCB, and return." - h.ep.mu.Lock() h.ep.workerCleanup = true - h.ep.mu.Unlock() // Although the RFC above calls out ECONNRESET, Linux actually returns // ECONNREFUSED here so we do as well.
return tcpip.ErrConnectionRefused @@ -281,9 +280,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error { if s.flagIsSet(header.TCPFlagAck) { h.state = handshakeCompleted - h.ep.mu.Lock() h.ep.transitionToStateEstablishedLocked(h) - h.ep.mu.Unlock() h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale()) return nil @@ -293,11 +290,9 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error { // but resend our own SYN and wait for it to be acknowledged in the // SYN-RCVD state. h.state = handshakeSynRcvd - h.ep.mu.Lock() ttl := h.ep.ttl amss := h.ep.amss h.ep.setEndpointState(StateSynRecv) - h.ep.mu.Unlock() synOpts := header.TCPSynOptions{ WS: int(h.effectiveRcvWndScale()), TS: rcvSynOpts.TS, @@ -357,10 +352,6 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error { return tcpip.ErrInvalidEndpointState } - h.ep.mu.RLock() - amss := h.ep.amss - h.ep.mu.RUnlock() - h.resetState() synOpts := header.TCPSynOptions{ WS: h.rcvWndScale, @@ -368,7 +359,7 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error { TSVal: h.ep.timestamp(), TSEcr: h.ep.recentTimestamp(), SACKPermitted: h.ep.sackPermitted, - MSS: amss, + MSS: h.ep.amss, } h.ep.sendSynTCP(&s.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts) return nil @@ -399,15 +390,14 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error { } h.state = handshakeCompleted - h.ep.mu.Lock() h.ep.transitionToStateEstablishedLocked(h) + // If the segment has data then requeue it for the receiver // to process it again once main loop is started. 
if s.data.Size() > 0 { s.incRef() h.ep.enqueueSegment(s) } - h.ep.mu.Unlock() return nil } @@ -493,7 +483,9 @@ func (h *handshake) resolveRoute() *tcpip.Error { } if n&notifyDrain != 0 { close(h.ep.drainDone) + h.ep.mu.Unlock() <-h.ep.undrain + h.ep.mu.Lock() } } @@ -535,7 +527,6 @@ func (h *handshake) execute() *tcpip.Error { // Send the initial SYN segment and loop until the handshake is // completed. - h.ep.mu.Lock() h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route) synOpts := header.TCPSynOptions{ @@ -546,7 +537,6 @@ func (h *handshake) execute() *tcpip.Error { SACKPermitted: bool(sackEnabled), MSS: h.ep.amss, } - h.ep.mu.Unlock() // Execute is also called in a listen context so we want to make sure we // only send the TS/SACK option when we received the TS/SACK in the @@ -563,7 +553,11 @@ func (h *handshake) execute() *tcpip.Error { h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts) for h.state != handshakeCompleted { - switch index, _ := s.Fetch(true); index { + h.ep.mu.Unlock() + index, _ := s.Fetch(true) + h.ep.mu.Lock() + switch index { + case wakerForResend: timeOut *= 2 if timeOut > MaxRTO { @@ -600,7 +594,9 @@ func (h *handshake) execute() *tcpip.Error { } } close(h.ep.drainDone) + h.ep.mu.Unlock() <-h.ep.undrain + h.ep.mu.Lock() } case wakerForNewSegment: @@ -1016,7 +1012,6 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) { // except SYN-SENT, all reset (RST) segments are // validated by checking their SEQ-fields." So // we only process it if it's acceptable.
- e.mu.Lock() switch e.EndpointState() { // In case of a RST in CLOSE-WAIT linux moves // the socket to closed state with an error set @@ -1040,11 +1035,9 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) { case StateCloseWait: e.transitionToStateCloseLocked() e.HardError = tcpip.ErrAborted - e.mu.Unlock() e.notifyProtocolGoroutine(notifyTickleWorker) return false, nil default: - e.mu.Unlock() // RFC 793, page 37 states that "in all states // except SYN-SENT, all reset (RST) segments are // validated by checking their SEQ-fields." So @@ -1157,9 +1150,7 @@ func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) { // Now check if the received segment has caused us to transition // to a CLOSED state, if yes then terminate processing and do // not invoke the sender. - e.mu.RLock() state := e.state - e.mu.RUnlock() if state == StateClose { // When we get into StateClose while processing from the queue, // return immediately and let the protocolMainloop handle it. @@ -1182,9 +1173,7 @@ func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) { // keepalive packets periodically when the connection is idle. If we don't hear // from the other side after a number of tries, we terminate the connection. func (e *endpoint) keepaliveTimerExpired() *tcpip.Error { - e.mu.RLock() userTimeout := e.userTimeout - e.mu.RUnlock() e.keepalive.Lock() if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() { @@ -1248,6 +1237,7 @@ func (e *endpoint) disableKeepaliveTimer() { // goroutine and is responsible for sending segments and handling received // segments. func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) *tcpip.Error { + e.mu.Lock() var closeTimer *time.Timer var closeWaker sleep.Waker @@ -1269,7 +1259,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ } e.mu.Unlock() - e.workMu.Unlock() // When the protocol loop exits we should wake up our waiters. 
e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) } @@ -1280,16 +1269,13 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ // completion. initialRcvWnd := e.initialReceiveWindow() h := newHandshake(e, seqnum.Size(initialRcvWnd)) - e.mu.Lock() h.ep.setEndpointState(StateSynSent) - e.mu.Unlock() if err := h.execute(); err != nil { e.lastErrorMu.Lock() e.lastError = err e.lastErrorMu.Unlock() - e.mu.Lock() e.setEndpointState(StateError) e.HardError = err @@ -1302,9 +1288,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ e.keepalive.timer.init(&e.keepalive.waker) defer e.keepalive.timer.cleanup() - e.mu.Lock() drained := e.drainDone != nil - e.mu.Unlock() if drained { close(e.drainDone) <-e.undrain @@ -1330,10 +1314,8 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ // This means the socket is being closed due // to the TCP-FIN-WAIT2 timeout was hit. Just // mark the socket as closed. - e.mu.Lock() e.transitionToStateCloseLocked() e.workerCleanup = true - e.mu.Unlock() return nil }, }, @@ -1388,7 +1370,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ } if n&notifyClose != 0 && closeTimer == nil { - e.mu.Lock() if e.EndpointState() == StateFinWait2 && e.closed { // The socket has been closed and we are in FIN_WAIT2 // so start the FIN_WAIT2 timer. @@ -1397,7 +1378,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ }) e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) } - e.mu.Unlock() } if n&notifyKeepaliveChanged != 0 { @@ -1417,7 +1397,9 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ // Only block the worker if the endpoint // is not in closed state or error state.
close(e.drainDone) + e.mu.Unlock() <-e.undrain + e.mu.Lock() } } @@ -1460,7 +1442,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ } e.rcvListMu.Unlock() - e.mu.Lock() if e.workerCleanup { e.notifyProtocolGoroutine(notifyClose) } @@ -1468,7 +1449,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ // Main loop. Handle segments until both send and receive ends of the // connection have completed. cleanupOnError := func(err *tcpip.Error) { - e.mu.Lock() e.workerCleanup = true if err != nil { e.resetConnectionLocked(err) @@ -1480,16 +1460,11 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ loop: for e.EndpointState() != StateTimeWait && e.EndpointState() != StateClose && e.EndpointState() != StateError { e.mu.Unlock() - e.workMu.Unlock() v, _ := s.Fetch(true) - e.workMu.Lock() + e.mu.Lock() - // We need to double check here because the notification maybe + // We need to double check here because the notification may be // stale by the time we got around to processing it. - // - // NOTE: since we now hold the workMu the processors cannot - // change the state of the endpoint so it's safe to proceed - // after this check. switch e.EndpointState() { case StateError: // If the endpoint has already transitioned to an ERROR @@ -1502,21 +1477,17 @@ loop: case StateTimeWait: fallthrough case StateClose: - e.mu.Lock() break loop default: if err := funcs[v].f(); err != nil { cleanupOnError(err) return nil } - e.mu.Lock() } } - state := e.EndpointState() - e.mu.Unlock() var reuseTW func() - if state == StateTimeWait { + if e.EndpointState() == StateTimeWait { // Disable close timer as we now entering real TIME_WAIT. if closeTimer != nil { closeTimer.Stop() @@ -1526,14 +1497,11 @@ loop: s.Done() // Wake up any waiters before we enter TIME_WAIT. 
e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) - e.mu.Lock() e.workerCleanup = true - e.mu.Unlock() reuseTW = e.doTimeWait() } // Mark endpoint as closed. - e.mu.Lock() if e.EndpointState() != StateError { e.transitionToStateCloseLocked() } @@ -1649,9 +1617,9 @@ func (e *endpoint) doTimeWait() (twReuse func()) { defer timeWaitTimer.Stop() for { - e.workMu.Unlock() + e.mu.Unlock() v, _ := s.Fetch(true) - e.workMu.Lock() + e.mu.Lock() switch v { case newSegment: extendTimeWait, reuseTW := e.handleTimeWaitSegments() @@ -1674,7 +1642,9 @@ func (e *endpoint) doTimeWait() (twReuse func()) { e.handleTimeWaitSegments() } close(e.drainDone) + e.mu.Unlock() <-e.undrain + e.mu.Lock() return nil } case timeWaitDone: diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go index d792b07d6..90ac956a9 100644 --- a/pkg/tcpip/transport/tcp/dispatcher.go +++ b/pkg/tcpip/transport/tcp/dispatcher.go @@ -128,7 +128,7 @@ func (p *processor) handleSegments() { continue } - if !ep.workMu.TryLock() { + if !ep.mu.TryLock() { ep.newSegmentWaker.Assert() continue } @@ -138,12 +138,10 @@ func (p *processor) handleSegments() { if err := ep.handleSegments(true /* fastPath */); err != nil || ep.EndpointState() == StateClose { // Send any active resets if required. 
if err != nil { - ep.mu.Lock() ep.resetConnectionLocked(err) - ep.mu.Unlock() } ep.notifyProtocolGoroutine(notifyTickleWorker) - ep.workMu.Unlock() + ep.mu.Unlock() continue } @@ -151,7 +149,7 @@ func (p *processor) handleSegments() { p.epQ.enqueue(ep) } - ep.workMu.Unlock() + ep.mu.Unlock() } } } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 5187a5e25..eb8a9d73e 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -18,6 +18,7 @@ import ( "encoding/binary" "fmt" "math" + "runtime" "strings" "sync/atomic" "time" @@ -33,7 +34,6 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/ports" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" - "gvisor.dev/gvisor/pkg/tmutex" "gvisor.dev/gvisor/pkg/waiter" ) @@ -283,6 +283,37 @@ func (*EndpointInfo) IsEndpointInfo() {} // synchronized. The protocol implementation, however, runs in a single // goroutine. // +// Each endpoint has a few mutexes: +// +// e.mu -> Primary mutex for an endpoint must be held for all operations except +// in e.Readiness where acquiring it will result in a deadlock in epoll +// implementation. +// +// The following three mutexes can be acquired independent of e.mu but if +// acquired with e.mu then e.mu must be acquired first. +// +// e.rcvListMu -> Protects the rcvList and associated fields. +// e.sndBufMu -> Protects the sndQueue and associated fields. +// e.lastErrorMu -> Protects the lastError field. +// +// LOCKING/UNLOCKING of the endpoint. The locking of an endpoint is different +// based on the context in which the lock is acquired. In the syscall context +// e.LockUser/e.UnlockUser should be used and when doing background processing +// e.mu.Lock/e.mu.Unlock should be used. The distinction is described below +// in brief. +// +// The reason for this locking behaviour is to avoid wakeups to handle packets. 
+// In cases where the endpoint is already locked the background processor can +// queue the packet up and go its merry way and the lock owner will eventually +// process the backlog when releasing the lock. Similarly when acquiring the +// lock from say a syscall goroutine we can implement a bit of spinning if we +// know that the lock is not held by another syscall goroutine. Background +// processors should never hold the lock for long and we can avoid an expensive +// sleep/wakeup by spinning for a shortwhile. +// +// For more details please see the detailed documentation on +// e.LockUser/e.UnlockUser methods. +// // +stateify savable type endpoint struct { EndpointInfo @@ -299,12 +330,6 @@ type endpoint struct { // Precondition: epQueue.mu must be held to read/write this field.. pendingProcessing bool `state:"nosave"` - // workMu is used to arbitrate which goroutine may perform protocol - // work. Only the main protocol goroutine is expected to call Lock() on - // it, but other goroutines (e.g., send) may call TryLock() to eagerly - // perform work without having to wait for the main one to wake up. - workMu tmutex.Mutex `state:"nosave"` - // The following fields are initialized at creation time and do not // change throughout the lifetime of the endpoint. stack *stack.Stack `state:"manual"` @@ -330,15 +355,11 @@ type endpoint struct { rcvBufSize int rcvBufUsed int rcvAutoParams rcvBufAutoTuneParams - // zeroWindow indicates that the window was closed due to receive buffer - // space being filled up. This is set by the worker goroutine before - // moving a segment to the rcvList. This setting is cleared by the - // endpoint when a Read() call reads enough data for the new window to - // be non-zero. - zeroWindow bool - // The following fields are protected by the mutex. - mu sync.RWMutex `state:"nosave"` + // mu protects all endpoint fields unless documented otherwise. mu must + // be acquired before interacting with the endpoint fields. 
+ mu sync.Mutex `state:"nosave"` + ownedByUser uint32 // state must be read/set using the EndpointState()/setEndpointState() methods. state EndpointState `state:".(EndpointState)"` @@ -583,14 +604,93 @@ func calculateAdvertisedMSS(userMSS uint16, r stack.Route) uint16 { return maxMSS } +// LockUser tries to lock e.mu and if it fails it will check if the lock is held +// by another syscall goroutine. If yes, then it will goto sleep waiting for the +// lock to be released, if not then it will spin till it acquires the lock or +// another syscall goroutine acquires it in which case it will goto sleep as +// described above. +// +// The assumption behind spinning here being that background packet processing +// should not be holding the lock for long and spinning reduces latency as we +// avoid an expensive sleep/wakeup of of the syscall goroutine). +func (e *endpoint) LockUser() { + for { + // Try first if the sock is locked then check if it's owned + // by another user goroutine if not then we spin, otherwise + // we just goto sleep on the Lock() and wait. + if !e.mu.TryLock() { + // If socket is owned by the user then just goto sleep + // as the lock could be held for a reasonably long time. + if atomic.LoadUint32(&e.ownedByUser) == 1 { + e.mu.Lock() + atomic.StoreUint32(&e.ownedByUser, 1) + return + } + // Spin but yield the processor since the lower half + // should yield the lock soon. + runtime.Gosched() + continue + } + atomic.StoreUint32(&e.ownedByUser, 1) + return + } +} + +// UnlockUser will check if there are any segments already queued for processing +// and process any such segments before unlocking e.mu. This is required because +// we when packets arrive and endpoint lock is already held then such packets +// are queued up to be processed. 
If the lock is held by the endpoint goroutine +// then it will process these packets but if the lock is instead held by the +// syscall goroutine then we can have the syscall goroutine process the backlog +// before unlocking. +// +// This avoids an unnecessary wakeup of the endpoint protocol goroutine for the +// endpoint. It's also required eventually when we get rid of the endpoint +// protocol goroutine altogether. +// +// Precondition: e.LockUser() must have been called before calling e.UnlockUser() +func (e *endpoint) UnlockUser() { + // Lock segment queue before checking so that we avoid a race where + // segments can be queued between the time we check if queue is empty + // and actually unlock the endpoint mutex. + for { + e.segmentQueue.mu.Lock() + if e.segmentQueue.emptyLocked() { + if atomic.SwapUint32(&e.ownedByUser, 0) != 1 { + panic("e.UnlockUser() called without calling e.LockUser()") + } + e.mu.Unlock() + e.segmentQueue.mu.Unlock() + return + } + e.segmentQueue.mu.Unlock() + + switch e.EndpointState() { + case StateEstablished: + if err := e.handleSegments(true /* fastPath */); err != nil { + e.notifyProtocolGoroutine(notifyTickleWorker) + } + default: + // Since we are waking the endpoint goroutine here just unlock + // and let it process the queued segments. + e.newSegmentWaker.Assert() + if atomic.SwapUint32(&e.ownedByUser, 0) != 1 { + panic("e.UnlockUser() called without calling e.LockUser()") + } + e.mu.Unlock() + return + } + } +} + // StopWork halts packet processing. Only to be used in tests. func (e *endpoint) StopWork() { - e.workMu.Lock() + e.mu.Lock() } // ResumeWork resumes packet processing. Only to be used in tests. func (e *endpoint) ResumeWork() { - e.workMu.Unlock() + e.mu.Unlock() } // setEndpointState updates the state of the endpoint to state atomically. 
This @@ -709,8 +809,6 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue } e.segmentQueue.setLimit(MaxUnprocessedSegments) - e.workMu.Init() - e.workMu.Lock() e.tsOffset = timeStampOffset() return e @@ -721,9 +819,6 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { result := waiter.EventMask(0) - e.mu.RLock() - defer e.mu.RUnlock() - switch e.EndpointState() { case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv: // Ready for nothing. @@ -823,20 +918,22 @@ func (e *endpoint) Abort() { // with it. It must be called only once and with no other concurrent calls to // the endpoint. func (e *endpoint) Close() { - e.mu.Lock() - closed := e.closed - e.closed = true - e.mu.Unlock() - if closed { + e.LockUser() + defer e.UnlockUser() + if e.closed { return } // Issue a shutdown so that the peer knows we won't send any more data // if we're connected, or stop accepting if we're listening. - e.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead) - - e.mu.Lock() + e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead) + e.closeNoShutdownLocked() +} +// closeNoShutdown closes the endpoint without doing a full shutdown. This is +// used when a connection needs to be aborted with a RST and we want to skip +// a full 4 way TCP shutdown. +func (e *endpoint) closeNoShutdownLocked() { // For listening sockets, we always release ports inline so that they // are immediately available for reuse after Close() is called. If also // registered, we unregister as well otherwise the next user would fail @@ -853,6 +950,8 @@ func (e *endpoint) Close() { e.boundPortFlags = ports.Flags{} } + // Mark endpoint as closed. + e.closed = true // Either perform the local cleanup or kick the worker to make sure it // knows it needs to cleanup. switch e.EndpointState() { @@ -873,8 +972,6 @@ func (e *endpoint) Close() { // goroutine terminates. 
e.notifyProtocolGoroutine(notifyClose) } - - e.mu.Unlock() } // closePendingAcceptableConnections closes all connections that have completed @@ -909,7 +1006,6 @@ func (e *endpoint) closePendingAcceptableConnectionsLocked() { // after Close() is called and the worker goroutine (if any) is done with its // work. func (e *endpoint) cleanupLocked() { - // Close all endpoints that might have been accepted by TCP but not by // the client. if e.acceptedChan != nil { @@ -954,18 +1050,18 @@ func (e *endpoint) initialReceiveWindow() int { // ModerateRecvBuf adjusts the receive buffer and the advertised window // based on the number of bytes copied to user space. func (e *endpoint) ModerateRecvBuf(copied int) { - e.mu.RLock() + e.LockUser() + defer e.UnlockUser() + e.rcvListMu.Lock() if e.rcvAutoParams.disabled { e.rcvListMu.Unlock() - e.mu.RUnlock() return } now := time.Now() if rtt := e.rcvAutoParams.rtt; rtt == 0 || now.Sub(e.rcvAutoParams.measureTime) < rtt { e.rcvAutoParams.copied += copied e.rcvListMu.Unlock() - e.mu.RUnlock() return } prevRTTCopied := e.rcvAutoParams.copied + copied @@ -1021,7 +1117,6 @@ func (e *endpoint) ModerateRecvBuf(copied int) { e.rcvAutoParams.measureTime = now e.rcvAutoParams.copied = 0 e.rcvListMu.Unlock() - e.mu.RUnlock() } // IPTables implements tcpip.Endpoint.IPTables. @@ -1031,7 +1126,7 @@ func (e *endpoint) IPTables() (iptables.IPTables, error) { // Read reads data from the endpoint. func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { - e.mu.RLock() + e.LockUser() // The endpoint can be read if it's connected, or if it's already closed // but has some pending unread data. 
Also note that a RST being received // would cause the state to become StateError so we should allow the @@ -1041,7 +1136,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 { e.rcvListMu.Unlock() he := e.HardError - e.mu.RUnlock() + e.UnlockUser() if s == StateError { return buffer.View{}, tcpip.ControlMessages{}, he } @@ -1051,7 +1146,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, v, err := e.readLocked() e.rcvListMu.Unlock() - e.mu.RUnlock() + e.UnlockUser() if err == tcpip.ErrClosedForReceive { e.stats.ReadErrors.ReadClosed.Increment() @@ -1124,13 +1219,13 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c // (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More // and opts.EndOfRecord are also ignored. - e.mu.RLock() + e.LockUser() e.sndBufMu.Lock() avail, err := e.isEndpointWritableLocked() if err != nil { e.sndBufMu.Unlock() - e.mu.RUnlock() + e.UnlockUser() e.stats.WriteErrors.WriteClosed.Increment() return 0, nil, err } @@ -1142,113 +1237,68 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c // are copying data in. if !opts.Atomic { e.sndBufMu.Unlock() - e.mu.RUnlock() + e.UnlockUser() } // Fetch data. v, perr := p.Payload(avail) if perr != nil || len(v) == 0 { - if opts.Atomic { // See above. + // Note that perr may be nil if len(v) == 0. + if opts.Atomic { e.sndBufMu.Unlock() - e.mu.RUnlock() + e.UnlockUser() } - // Note that perr may be nil if len(v) == 0. return 0, nil, perr } - if opts.Atomic { + queueAndSend := func() (int64, <-chan struct{}, *tcpip.Error) { // Add data to the send queue. 
s := newSegmentFromView(&e.route, e.ID, v) e.sndBufUsed += len(v) e.sndBufInQueue += seqnum.Size(len(v)) e.sndQueue.PushBack(s) e.sndBufMu.Unlock() - // Release the endpoint lock to prevent deadlocks due to lock - // order inversion when acquiring workMu. - e.mu.RUnlock() - } - if e.workMu.TryLock() { - // Since we released locks in between it's possible that the - // endpoint transitioned to a CLOSED/ERROR states so make - // sure endpoint is still writable before trying to write. - if !opts.Atomic { // See above. - e.mu.RLock() - e.sndBufMu.Lock() - - // Because we released the lock before copying, check state again - // to make sure the endpoint is still in a valid state for a write. - avail, err = e.isEndpointWritableLocked() - if err != nil { - e.sndBufMu.Unlock() - e.mu.RUnlock() - e.stats.WriteErrors.WriteClosed.Increment() - return 0, nil, err - } - - // Discard any excess data copied in due to avail being reduced due - // to a simultaneous write call to the socket. - if avail < len(v) { - v = v[:avail] - } - // Add data to the send queue. - s := newSegmentFromView(&e.route, e.ID, v) - e.sndBufUsed += len(v) - e.sndBufInQueue += seqnum.Size(len(v)) - e.sndQueue.PushBack(s) - e.sndBufMu.Unlock() - // Release the endpoint lock to prevent deadlocks due to lock - // order inversion when acquiring workMu. - e.mu.RUnlock() - - } // Do the work inline. e.handleWrite() - e.workMu.Unlock() - } else { - if !opts.Atomic { // See above. - e.mu.RLock() - e.sndBufMu.Lock() + e.UnlockUser() + return int64(len(v)), nil, nil + } - // Because we released the lock before copying, check state again - // to make sure the endpoint is still in a valid state for a write. 
- avail, err = e.isEndpointWritableLocked() - if err != nil { - e.sndBufMu.Unlock() - e.mu.RUnlock() - e.stats.WriteErrors.WriteClosed.Increment() - return 0, nil, err - } + if opts.Atomic { + // Locks released in queueAndSend() + return queueAndSend() + } - // Discard any excess data copied in due to avail being reduced due - // to a simultaneous write call to the socket. - if avail < len(v) { - v = v[:avail] - } - // Add data to the send queue. - s := newSegmentFromView(&e.route, e.ID, v) - e.sndBufUsed += len(v) - e.sndBufInQueue += seqnum.Size(len(v)) - e.sndQueue.PushBack(s) - e.sndBufMu.Unlock() - // Release the endpoint lock to prevent deadlocks due to lock - // order inversion when acquiring workMu. - e.mu.RUnlock() + // Since we released locks in between it's possible that the + // endpoint transitioned to a CLOSED/ERROR states so make + // sure endpoint is still writable before trying to write. + e.LockUser() + e.sndBufMu.Lock() + avail, err = e.isEndpointWritableLocked() + if err != nil { + e.sndBufMu.Unlock() + e.UnlockUser() + e.stats.WriteErrors.WriteClosed.Increment() + return 0, nil, err + } - } - // Let the protocol goroutine do the work. - e.sndWaker.Assert() + // Discard any excess data copied in due to avail being reduced due + // to a simultaneous write call to the socket. + if avail < len(v) { + v = v[:avail] } - return int64(len(v)), nil, nil + // Locks released in queueAndSend() + return queueAndSend() } // Peek reads data without consuming it from the endpoint. // // This method does not block if there is no data pending. func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { - e.mu.RLock() - defer e.mu.RUnlock() + e.LockUser() + defer e.UnlockUser() // The endpoint can be read if it's connected, or if it's already closed // but has some pending unread data. @@ -1339,6 +1389,9 @@ func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed boo // SetSockOptBool sets a socket option. 
func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { + e.LockUser() + defer e.UnlockUser() + switch opt { case tcpip.V6OnlyOption: // We only recognize this option on v6 endpoints. @@ -1346,9 +1399,6 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { return tcpip.ErrInvalidEndpointState } - e.mu.Lock() - defer e.mu.Unlock() - // We only allow this to be set when we're in the initial state. if e.EndpointState() != StateInitial { return tcpip.ErrInvalidEndpointState @@ -1379,7 +1429,7 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { mask := uint32(notifyReceiveWindowChanged) - e.mu.RLock() + e.LockUser() e.rcvListMu.Lock() // Make sure the receive buffer size allows us to send a @@ -1409,8 +1459,9 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above { mask |= notifyNonZeroReceiveWindow } + e.rcvListMu.Unlock() - e.mu.RUnlock() + e.UnlockUser() e.notifyProtocolGoroutine(mask) return nil @@ -1466,15 +1517,15 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { return nil case tcpip.ReuseAddressOption: - e.mu.Lock() + e.LockUser() e.reuseAddr = v != 0 - e.mu.Unlock() + e.UnlockUser() return nil case tcpip.ReusePortOption: - e.mu.Lock() + e.LockUser() e.reusePort = v != 0 - e.mu.Unlock() + e.UnlockUser() return nil case tcpip.BindToDeviceOption: @@ -1482,9 +1533,9 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { if id != 0 && !e.stack.HasNIC(id) { return tcpip.ErrUnknownDevice } - e.mu.Lock() + e.LockUser() e.bindToDevice = id - e.mu.Unlock() + e.UnlockUser() return nil case tcpip.QuickAckOption: @@ -1500,16 +1551,16 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS { return tcpip.ErrInvalidOptionValue } - e.mu.Lock() + e.LockUser() e.userMSS = 
uint16(userMSS) - e.mu.Unlock() + e.UnlockUser() e.notifyProtocolGoroutine(notifyMSSChanged) return nil case tcpip.TTLOption: - e.mu.Lock() + e.LockUser() e.ttl = uint8(v) - e.mu.Unlock() + e.UnlockUser() return nil case tcpip.KeepaliveEnabledOption: @@ -1541,15 +1592,15 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { return nil case tcpip.TCPUserTimeoutOption: - e.mu.Lock() + e.LockUser() e.userTimeout = time.Duration(v) - e.mu.Unlock() + e.UnlockUser() return nil case tcpip.BroadcastOption: - e.mu.Lock() + e.LockUser() e.broadcast = v != 0 - e.mu.Unlock() + e.UnlockUser() return nil case tcpip.CongestionControlOption: @@ -1563,22 +1614,16 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { availCC := strings.Split(string(avail), " ") for _, cc := range availCC { if v == tcpip.CongestionControlOption(cc) { - // Acquire the work mutex as we may need to - // reinitialize the congestion control state. - e.mu.Lock() + e.LockUser() state := e.EndpointState() e.cc = v - e.mu.Unlock() switch state { case StateEstablished: - e.workMu.Lock() - e.mu.Lock() if e.EndpointState() == state { e.snd.cc = e.snd.initCongestionControl(e.cc) } - e.mu.Unlock() - e.workMu.Unlock() } + e.UnlockUser() return nil } } @@ -1588,23 +1633,23 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { return tcpip.ErrNoSuchFile case tcpip.IPv4TOSOption: - e.mu.Lock() + e.LockUser() // TODO(gvisor.dev/issue/995): ECN is not currently supported, // ignore the bits for now. e.sendTOS = uint8(v) & ^uint8(inetECNMask) - e.mu.Unlock() + e.UnlockUser() return nil case tcpip.IPv6TrafficClassOption: - e.mu.Lock() + e.LockUser() // TODO(gvisor.dev/issue/995): ECN is not currently supported, // ignore the bits for now. e.sendTOS = uint8(v) & ^uint8(inetECNMask) - e.mu.Unlock() + e.UnlockUser() return nil case tcpip.TCPLingerTimeoutOption: - e.mu.Lock() + e.LockUser() if v < 0 { // Same as effectively disabling TCPLinger timeout. 
v = 0 @@ -1622,16 +1667,16 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { v = stkTCPLingerTimeout } e.tcpLingerTimeout = time.Duration(v) - e.mu.Unlock() + e.UnlockUser() return nil case tcpip.TCPDeferAcceptOption: - e.mu.Lock() + e.LockUser() if time.Duration(v) > MaxRTO { v = tcpip.TCPDeferAcceptOption(MaxRTO) } e.deferAccept = time.Duration(v) - e.mu.Unlock() + e.UnlockUser() return nil default: @@ -1641,8 +1686,8 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { // readyReceiveSize returns the number of bytes ready to be received. func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) { - e.mu.RLock() - defer e.mu.RUnlock() + e.LockUser() + defer e.UnlockUser() // The endpoint cannot be in listen state. if e.EndpointState() == StateListen { @@ -1664,9 +1709,9 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { return false, tcpip.ErrUnknownProtocolOption } - e.mu.Lock() + e.LockUser() v := e.v6only - e.mu.Unlock() + e.UnlockUser() return v, nil } @@ -1730,9 +1775,9 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { return nil case *tcpip.ReuseAddressOption: - e.mu.RLock() + e.LockUser() v := e.reuseAddr - e.mu.RUnlock() + e.UnlockUser() *o = 0 if v { @@ -1741,9 +1786,9 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { return nil case *tcpip.ReusePortOption: - e.mu.RLock() + e.LockUser() v := e.reusePort - e.mu.RUnlock() + e.UnlockUser() *o = 0 if v { @@ -1752,9 +1797,9 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { return nil case *tcpip.BindToDeviceOption: - e.mu.RLock() + e.LockUser() *o = tcpip.BindToDeviceOption(e.bindToDevice) - e.mu.RUnlock() + e.UnlockUser() return nil case *tcpip.QuickAckOption: @@ -1765,16 +1810,16 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { return nil case *tcpip.TTLOption: - e.mu.Lock() + e.LockUser() *o = tcpip.TTLOption(e.ttl) - e.mu.Unlock() + e.UnlockUser() return nil case *tcpip.TCPInfoOption: *o 
= tcpip.TCPInfoOption{} - e.mu.RLock() + e.LockUser() snd := e.snd - e.mu.RUnlock() + e.UnlockUser() if snd != nil { snd.rtt.Lock() o.RTT = snd.rtt.srtt @@ -1813,9 +1858,9 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { return nil case *tcpip.TCPUserTimeoutOption: - e.mu.Lock() + e.LockUser() *o = tcpip.TCPUserTimeoutOption(e.userTimeout) - e.mu.Unlock() + e.UnlockUser() return nil case *tcpip.OutOfBandInlineOption: @@ -1824,9 +1869,9 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { return nil case *tcpip.BroadcastOption: - e.mu.Lock() + e.LockUser() v := e.broadcast - e.mu.Unlock() + e.UnlockUser() *o = 0 if v { @@ -1835,33 +1880,33 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { return nil case *tcpip.CongestionControlOption: - e.mu.Lock() + e.LockUser() *o = e.cc - e.mu.Unlock() + e.UnlockUser() return nil case *tcpip.IPv4TOSOption: - e.mu.RLock() + e.LockUser() *o = tcpip.IPv4TOSOption(e.sendTOS) - e.mu.RUnlock() + e.UnlockUser() return nil case *tcpip.IPv6TrafficClassOption: - e.mu.RLock() + e.LockUser() *o = tcpip.IPv6TrafficClassOption(e.sendTOS) - e.mu.RUnlock() + e.UnlockUser() return nil case *tcpip.TCPLingerTimeoutOption: - e.mu.Lock() + e.LockUser() *o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout) - e.mu.Unlock() + e.UnlockUser() return nil case *tcpip.TCPDeferAcceptOption: - e.mu.Lock() + e.LockUser() *o = tcpip.TCPDeferAcceptOption(e.deferAccept) - e.mu.Unlock() + e.UnlockUser() return nil default: @@ -1901,8 +1946,8 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { // yet accepted by the app, they are restored without running the main goroutine // here. 
func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tcpip.Error { - e.mu.Lock() - defer e.mu.Unlock() + e.LockUser() + defer e.UnlockUser() connectingAddr := addr.Addr @@ -2071,9 +2116,13 @@ func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error { // Shutdown closes the read and/or write end of the endpoint connection to its // peer. func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { - e.mu.Lock() + e.LockUser() + defer e.UnlockUser() + return e.shutdownLocked(flags) +} + +func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) *tcpip.Error { e.shutdownFlags |= flags - finQueued := false switch { case e.EndpointState().connected(): // Close for read. @@ -2087,24 +2136,9 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { // If we're fully closed and we have unread data we need to abort // the connection with a RST. if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 { - e.mu.Unlock() - // Try to send an active reset immediately if the - // work mutex is available. - if e.workMu.TryLock() { - e.mu.Lock() - // We need to double check here to make - // sure worker has not transitioned the - // endpoint out of a connected state - // before trying to send a reset. - if e.EndpointState().connected() { - e.resetConnectionLocked(tcpip.ErrConnectionAborted) - e.notifyProtocolGoroutine(notifyTickleWorker) - } - e.mu.Unlock() - e.workMu.Unlock() - } else { - e.notifyProtocolGoroutine(notifyReset) - } + e.resetConnectionLocked(tcpip.ErrConnectionAborted) + // Wake up worker to terminate loop. + e.notifyProtocolGoroutine(notifyTickleWorker) return nil } } @@ -2116,42 +2150,32 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { // Already closed. e.sndBufMu.Unlock() if e.EndpointState() == StateTimeWait { - e.mu.Unlock() return tcpip.ErrNotConnected } - break + return nil } // Queue fin segment. 
s := newSegmentFromView(&e.route, e.ID, nil) e.sndQueue.PushBack(s) e.sndBufInQueue++ - finQueued = true // Mark endpoint as closed. e.sndClosed = true e.sndBufMu.Unlock() + e.handleClose() } + return nil case e.EndpointState() == StateListen: // Tell protocolListenLoop to stop. if flags&tcpip.ShutdownRead != 0 { e.notifyProtocolGoroutine(notifyClose) } + return nil + default: - e.mu.Unlock() return tcpip.ErrNotConnected } - e.mu.Unlock() - if finQueued { - if e.workMu.TryLock() { - e.handleClose() - e.workMu.Unlock() - } else { - // Tell protocol goroutine to close. - e.sndCloseWaker.Assert() - } - } - return nil } // Listen puts the endpoint in "listen" mode, which allows it to accept @@ -2166,8 +2190,8 @@ func (e *endpoint) Listen(backlog int) *tcpip.Error { } func (e *endpoint) listen(backlog int) *tcpip.Error { - e.mu.Lock() - defer e.mu.Unlock() + e.LockUser() + defer e.UnlockUser() // Allow the backlog to be adjusted if the endpoint is not shutting down. // When the endpoint shuts down, it sets workerCleanup to true, and from @@ -2229,7 +2253,6 @@ func (e *endpoint) listen(backlog int) *tcpip.Error { // startAcceptedLoop sets up required state and starts a goroutine with the // main loop for accepted connections. func (e *endpoint) startAcceptedLoop() { - e.mu.Lock() e.workerRunning = true e.mu.Unlock() wakerInitDone := make(chan struct{}) @@ -2240,8 +2263,8 @@ func (e *endpoint) startAcceptedLoop() { // Accept returns a new endpoint if a peer has established a connection // to an endpoint previously set to listen mode. func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { - e.mu.RLock() - defer e.mu.RUnlock() + e.LockUser() + defer e.UnlockUser() // Endpoint must be in listen state before it can accept connections. if e.EndpointState() != StateListen { @@ -2260,8 +2283,8 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { // Bind binds the endpoint to a specific local port and optionally address. 
func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) { - e.mu.Lock() - defer e.mu.Unlock() + e.LockUser() + defer e.UnlockUser() return e.bindLocked(addr) } @@ -2339,8 +2362,8 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) { // GetLocalAddress returns the address to which the endpoint is bound. func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { - e.mu.RLock() - defer e.mu.RUnlock() + e.LockUser() + defer e.UnlockUser() return tcpip.FullAddress{ Addr: e.ID.LocalAddress, @@ -2351,8 +2374,8 @@ func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { // GetRemoteAddress returns the address to which the endpoint is connected. func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { - e.mu.RLock() - defer e.mu.RUnlock() + e.LockUser() + defer e.UnlockUser() if !e.EndpointState().connected() { return tcpip.FullAddress{}, tcpip.ErrNotConnected @@ -2419,7 +2442,6 @@ func (e *endpoint) updateSndBufferUsage(v int) { // to be read, or when the connection is closed for receiving (in which case // s will be nil). func (e *endpoint) readyToRead(s *segment) { - e.mu.RLock() e.rcvListMu.Lock() if s != nil { s.incRef() @@ -2434,7 +2456,6 @@ func (e *endpoint) readyToRead(s *segment) { e.rcvClosed = true } e.rcvListMu.Unlock() - e.mu.RUnlock() e.waiterQueue.Notify(waiter.EventIn) } @@ -2578,9 +2599,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState { s.SegTime = time.Now() // Copy EndpointID. - e.mu.Lock() s.ID = stack.TCPEndpointID(e.ID) - e.mu.Unlock() // Copy endpoint rcv state. e.rcvListMu.Lock() @@ -2710,10 +2729,10 @@ func (e *endpoint) State() uint32 { // Info returns a copy of the endpoint info. func (e *endpoint) Info() tcpip.EndpointInfo { - e.mu.RLock() + e.LockUser() // Make a copy of the endpoint info. 
ret := e.EndpointInfo - e.mu.RUnlock() + e.UnlockUser() return &ret } @@ -2728,9 +2747,9 @@ func (e *endpoint) Wait() { e.waiterQueue.EventRegister(&waitEntry, waiter.EventHUp) defer e.waiterQueue.EventUnregister(&waitEntry) for { - e.mu.Lock() + e.LockUser() running := e.workerRunning - e.mu.Unlock() + e.UnlockUser() if !running { break } diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index 4a46f0ec5..9175de441 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -162,8 +162,8 @@ func (e *endpoint) loadState(state EndpointState) { connectingLoading.Add(1) } // Directly update the state here rather than using e.setEndpointState - // as the endpoint is still being loaded and the stack reference to increment - // metrics is not yet initialized. + // as the endpoint is still being loaded and the stack reference is not + // yet initialized. atomic.StoreUint32((*uint32)(&e.state), uint32(state)) } @@ -180,7 +180,6 @@ func (e *endpoint) afterLoad() { func (e *endpoint) Resume(s *stack.Stack) { e.stack = s e.segmentQueue.setLimit(MaxUnprocessedSegments) - e.workMu.Init() state := e.origEndpointState switch state { case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished: diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 73098d904..b0f918bb4 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -95,7 +95,7 @@ const ( ) type protocol struct { - mu sync.Mutex + mu sync.RWMutex sackEnabled bool delayEnabled bool sendBufferSize SendBufferSizeOption @@ -273,57 +273,57 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error { func (p *protocol) Option(option interface{}) *tcpip.Error { switch v := option.(type) { case *SACKEnabled: - p.mu.Lock() + p.mu.RLock() *v = SACKEnabled(p.sackEnabled) - p.mu.Unlock() + p.mu.RUnlock() return nil case *DelayEnabled: - 
p.mu.Lock() + p.mu.RLock() *v = DelayEnabled(p.delayEnabled) - p.mu.Unlock() + p.mu.RUnlock() return nil case *SendBufferSizeOption: - p.mu.Lock() + p.mu.RLock() *v = p.sendBufferSize - p.mu.Unlock() + p.mu.RUnlock() return nil case *ReceiveBufferSizeOption: - p.mu.Lock() + p.mu.RLock() *v = p.recvBufferSize - p.mu.Unlock() + p.mu.RUnlock() return nil case *tcpip.CongestionControlOption: - p.mu.Lock() + p.mu.RLock() *v = tcpip.CongestionControlOption(p.congestionControl) - p.mu.Unlock() + p.mu.RUnlock() return nil case *tcpip.AvailableCongestionControlOption: - p.mu.Lock() + p.mu.RLock() *v = tcpip.AvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " ")) - p.mu.Unlock() + p.mu.RUnlock() return nil case *tcpip.ModerateReceiveBufferOption: - p.mu.Lock() + p.mu.RLock() *v = tcpip.ModerateReceiveBufferOption(p.moderateReceiveBuffer) - p.mu.Unlock() + p.mu.RUnlock() return nil case *tcpip.TCPLingerTimeoutOption: - p.mu.Lock() + p.mu.RLock() *v = tcpip.TCPLingerTimeoutOption(p.tcpLingerTimeout) - p.mu.Unlock() + p.mu.RUnlock() return nil case *tcpip.TCPTimeWaitTimeoutOption: - p.mu.Lock() + p.mu.RLock() *v = tcpip.TCPTimeWaitTimeoutOption(p.tcpTimeWaitTimeout) - p.mu.Unlock() + p.mu.RUnlock() return nil default: diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index d80aff1b6..caf8977b3 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -168,7 +168,6 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum // We just received a FIN, our next state depends on whether we sent a // FIN already or not. 
- r.ep.mu.Lock() switch r.ep.EndpointState() { case StateEstablished: r.ep.setEndpointState(StateCloseWait) @@ -183,7 +182,6 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum case StateFinWait2: r.ep.setEndpointState(StateTimeWait) } - r.ep.mu.Unlock() // Flush out any pending segments, except the very first one if // it happens to be the one we're handling now because the @@ -208,7 +206,6 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum // Handle ACK (not FIN-ACK, which we handled above) during one of the // shutdown states. if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == r.ep.snd.sndNxt { - r.ep.mu.Lock() switch r.ep.EndpointState() { case StateFinWait1: r.ep.setEndpointState(StateFinWait2) @@ -222,7 +219,6 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum case StateLastAck: r.ep.transitionToStateCloseLocked() } - r.ep.mu.Unlock() } return true @@ -336,10 +332,8 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo // handleRcvdSegment handles TCP segments directed at the connection managed by // r as they arrive. It is called by the protocol main loop. func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) { - r.ep.mu.RLock() state := r.ep.EndpointState() closed := r.ep.closed - r.ep.mu.RUnlock() if state != StateEstablished { drop, err := r.handleRcvdSegmentClosing(s, state, closed) diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go index bd20a7ee9..48a257137 100644 --- a/pkg/tcpip/transport/tcp/segment_queue.go +++ b/pkg/tcpip/transport/tcp/segment_queue.go @@ -28,10 +28,16 @@ type segmentQueue struct { used int } +// emptyLocked determines if the queue is empty. +// Preconditions: q.mu must be held. +func (q *segmentQueue) emptyLocked() bool { + return q.used == 0 +} + // empty determines if the queue is empty. 
func (q *segmentQueue) empty() bool { q.mu.Lock() - r := q.used == 0 + r := q.emptyLocked() q.mu.Unlock() return r diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 657c3146e..17fed4ec5 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -455,9 +455,7 @@ func (s *sender) retransmitTimerExpired() bool { // Give up if we've waited more than a minute since the last resend or // if a user time out is set and we have exceeded the user specified // timeout since the first retransmission. - s.ep.mu.RLock() uto := s.ep.userTimeout - s.ep.mu.RUnlock() if s.firstRetransmittedSegXmitTime.IsZero() { // We store the original xmitTime of the segment that we are @@ -713,7 +711,6 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se default: s.ep.setEndpointState(StateFinWait1) } - } else { // We're sending a non-FIN segment. if seg.flags&header.TCPFlagFin != 0 { diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 5b2b16afa..39d36d2ba 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -2236,9 +2236,17 @@ func TestSegmentMerging(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) - // Prevent the endpoint from processing packets. - test.stop(c.EP) + // Send 10 1 byte segments to fill up InitialWindow but don't + // ACK. That should prevent anymore packets from going out. + for i := 0; i < 10; i++ { + view := buffer.NewViewFromBytes([]byte{0}) + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + t.Fatalf("Write #%d failed: %v", i+1, err) + } + } + // Now send the segments that should get merged as the congestion + // window is full and we won't be able to send any more packets. var allData []byte for i, data := range [][]byte{{1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} { allData = append(allData, data...) 
@@ -2248,8 +2256,29 @@ func TestSegmentMerging(t *testing.T) { } } - // Let the endpoint process the segments that we just sent. - test.resume(c.EP) + // Check that we get 10 packets of 1 byte each. + for i := 0; i < 10; i++ { + b := c.GetPacket() + checker.IPv4(t, b, + checker.PayloadLen(header.TCPMinimumSize+1), + checker.TCP( + checker.DstPort(context.TestPort), + checker.SeqNum(uint32(c.IRS)+uint32(i)+1), + checker.AckNum(790), + checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), + ), + ) + } + + // Acknowledge the data. + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort, + DstPort: c.Port, + Flags: header.TCPFlagAck, + SeqNum: 790, + AckNum: c.IRS.Add(1 + 10), // 10 for the 10 bytes of payload. + RcvWnd: 30000, + }) // Check that data is received. b := c.GetPacket() @@ -2257,7 +2286,7 @@ func TestSegmentMerging(t *testing.T) { checker.PayloadLen(len(allData)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), + checker.SeqNum(uint32(c.IRS)+11), checker.AckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), @@ -2273,7 +2302,7 @@ func TestSegmentMerging(t *testing.T) { DstPort: c.Port, Flags: header.TCPFlagAck, SeqNum: 790, - AckNum: c.IRS.Add(1 + seqnum.Size(len(allData))), + AckNum: c.IRS.Add(11 + seqnum.Size(len(allData))), RcvWnd: 30000, }) }) -- cgit v1.2.3 From 92b9069b67b927cef25a1490ebd142ad6d65690d Mon Sep 17 00:00:00 2001 From: Nayana Bidari Date: Fri, 20 Mar 2020 12:00:21 -0700 Subject: Support owner matching for iptables. This feature will match UID and GID of the packet creator, for locally generated packets. This match is only valid in the OUTPUT and POSTROUTING chains. Forwarded packets do not have any socket associated with them. Packets from kernel threads do have a socket, but usually no owner. 
--- pkg/abi/linux/netfilter.go | 41 ++++++++ pkg/abi/linux/netfilter_test.go | 1 + pkg/sentry/kernel/task.go | 12 +++ pkg/sentry/socket/netfilter/BUILD | 1 + pkg/sentry/socket/netfilter/netfilter.go | 7 +- pkg/sentry/socket/netfilter/owner_matcher.go | 128 ++++++++++++++++++++++++ pkg/sentry/socket/netstack/provider.go | 6 ++ pkg/tcpip/network/ipv4/ipv4.go | 15 +++ pkg/tcpip/stack/packet_buffer.go | 9 +- pkg/tcpip/stack/transport_test.go | 2 + pkg/tcpip/tcpip.go | 12 +++ pkg/tcpip/transport/icmp/endpoint.go | 12 ++- pkg/tcpip/transport/packet/endpoint.go | 2 + pkg/tcpip/transport/raw/endpoint.go | 9 ++ pkg/tcpip/transport/tcp/accept.go | 5 +- pkg/tcpip/transport/tcp/connect.go | 10 +- pkg/tcpip/transport/tcp/endpoint.go | 7 ++ pkg/tcpip/transport/tcp/forwarder.go | 2 +- pkg/tcpip/transport/tcp/protocol.go | 2 +- pkg/tcpip/transport/udp/endpoint.go | 12 ++- test/iptables/filter_output.go | 143 +++++++++++++++++++++++++++ test/iptables/iptables_test.go | 30 ++++++ 22 files changed, 451 insertions(+), 17 deletions(-) create mode 100644 pkg/sentry/socket/netfilter/owner_matcher.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go index 80dc09aa9..a8d4f9d69 100644 --- a/pkg/abi/linux/netfilter.go +++ b/pkg/abi/linux/netfilter.go @@ -509,3 +509,44 @@ const ( // Enable all flags. XT_UDP_INV_MASK = 0x03 ) + +// IPTOwnerInfo holds data for matching packets with owner. It corresponds +// to struct ipt_owner_info in libxt_owner.c of iptables binary. +type IPTOwnerInfo struct { + // UID is user id which created the packet. + UID uint32 + + // GID is group id which created the packet. + GID uint32 + + // PID is process id of the process which created the packet. + PID uint32 + + // SID is session id which created the packet. + SID uint32 + + // Comm is the command name which created the packet. + Comm [16]byte + + // Match is used to match UID/GID of the socket. See the + // XT_OWNER_* flags below. 
+ Match uint8 + + // Invert flips the meaning of Match field. + Invert uint8 +} + +// SizeOfIPTOwnerInfo is the size of an XTOwnerMatchInfo. +const SizeOfIPTOwnerInfo = 34 + +// Flags in IPTOwnerInfo.Match. Corresponding constants are in +// include/uapi/linux/netfilter/xt_owner.h. +const ( + // Match the UID of the packet. + XT_OWNER_UID = 1 << 0 + // Match the GID of the packet. + XT_OWNER_GID = 1 << 1 + // Match if the socket exists for the packet. Forwarded + // packets do not have an associated socket. + XT_OWNER_SOCKET = 1 << 2 +) diff --git a/pkg/abi/linux/netfilter_test.go b/pkg/abi/linux/netfilter_test.go index 21e237f92..565dd550e 100644 --- a/pkg/abi/linux/netfilter_test.go +++ b/pkg/abi/linux/netfilter_test.go @@ -29,6 +29,7 @@ func TestSizes(t *testing.T) { {IPTGetEntries{}, SizeOfIPTGetEntries}, {IPTGetinfo{}, SizeOfIPTGetinfo}, {IPTIP{}, SizeOfIPTIP}, + {IPTOwnerInfo{}, SizeOfIPTOwnerInfo}, {IPTReplace{}, SizeOfIPTReplace}, {XTCounters{}, SizeOfXTCounters}, {XTEntryMatch{}, SizeOfXTEntryMatch}, diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 8452ddf5b..d6546735e 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -863,3 +863,15 @@ func (t *Task) SetOOMScoreAdj(adj int32) error { atomic.StoreInt32(&t.tg.oomScoreAdj, adj) return nil } + +// UID returns t's uid. +// TODO(gvisor.dev/issue/170): This method is not namespaced yet. +func (t *Task) UID() uint32 { + return uint32(t.Credentials().EffectiveKUID) +} + +// GID returns t's gid. +// TODO(gvisor.dev/issue/170): This method is not namespaced yet. 
+func (t *Task) GID() uint32 { + return uint32(t.Credentials().EffectiveKGID) +} diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD index e801abeb8..721094bbf 100644 --- a/pkg/sentry/socket/netfilter/BUILD +++ b/pkg/sentry/socket/netfilter/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "extensions.go", "netfilter.go", + "owner_matcher.go", "targets.go", "tcp_matcher.go", "udp_matcher.go", diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index 55bcc3ace..878f81fd5 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -517,11 +517,10 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { } // TODO(gvisor.dev/issue/170): Support other chains. - // Since we only support modifying the INPUT chain and redirect for - // PREROUTING chain right now, make sure all other chains point to - // ACCEPT rules. + // Since we only support modifying the INPUT, PREROUTING and OUTPUT chain right now, + // make sure all other chains point to ACCEPT rules. for hook, ruleIdx := range table.BuiltinChains { - if hook != stack.Input && hook != stack.Prerouting { + if hook == stack.Forward || hook == stack.Postrouting { if _, ok := table.Rules[ruleIdx].Target.(stack.AcceptTarget); !ok { nflog("hook %d is unsupported.", hook) return syserr.ErrInvalidArgument diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go new file mode 100644 index 000000000..5949a7c29 --- /dev/null +++ b/pkg/sentry/socket/netfilter/owner_matcher.go @@ -0,0 +1,128 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/usermem" +) + +const matcherNameOwner = "owner" + +func init() { + registerMatchMaker(ownerMarshaler{}) +} + +// ownerMarshaler implements matchMaker for owner matching. +type ownerMarshaler struct{} + +// name implements matchMaker.name. +func (ownerMarshaler) name() string { + return matcherNameOwner +} + +// marshal implements matchMaker.marshal. +func (ownerMarshaler) marshal(mr stack.Matcher) []byte { + matcher := mr.(*OwnerMatcher) + iptOwnerInfo := linux.IPTOwnerInfo{ + UID: matcher.uid, + GID: matcher.gid, + } + + // Support for UID match. + // TODO(gvisor.dev/issue/170): Need to support gid match. + if matcher.matchUID { + iptOwnerInfo.Match = linux.XT_OWNER_UID + } else if matcher.matchGID { + panic("GID match is not supported.") + } else { + panic("UID match is not set.") + } + + buf := make([]byte, 0, linux.SizeOfIPTOwnerInfo) + return marshalEntryMatch(matcherNameOwner, binary.Marshal(buf, usermem.ByteOrder, iptOwnerInfo)) +} + +// unmarshal implements matchMaker.unmarshal. +func (ownerMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { + if len(buf) < linux.SizeOfIPTOwnerInfo { + return nil, fmt.Errorf("buf has insufficient size for owner match: %d", len(buf)) + } + + // For alignment reasons, the match's total size may + // exceed what's strictly necessary to hold matchData. 
+ var matchData linux.IPTOwnerInfo + binary.Unmarshal(buf[:linux.SizeOfIPTOwnerInfo], usermem.ByteOrder, &matchData) + nflog("parseMatchers: parsed IPTOwnerInfo: %+v", matchData) + + if matchData.Invert != 0 { + return nil, fmt.Errorf("invert flag is not supported for owner match") + } + + // Support for UID match. + // TODO(gvisor.dev/issue/170): Need to support gid match. + if matchData.Match&linux.XT_OWNER_UID != linux.XT_OWNER_UID { + return nil, fmt.Errorf("owner match is only supported for uid") + } + + // Check Flags. + var owner OwnerMatcher + owner.uid = matchData.UID + owner.gid = matchData.GID + owner.matchUID = true + + return &owner, nil +} + +type OwnerMatcher struct { + uid uint32 + gid uint32 + matchUID bool + matchGID bool + invert uint8 +} + +// Name implements Matcher.Name. +func (*OwnerMatcher) Name() string { + return matcherNameOwner +} + +// Match implements Matcher.Match. +func (om *OwnerMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) { + // Support only for OUTPUT chain. + // TODO(gvisor.dev/issue/170): Need to support for POSTROUTING chain also. + if hook != stack.Output { + return false, true + } + + // If the packet owner is not set, drop the packet. + // Support for uid match. + // TODO(gvisor.dev/issue/170): Need to support gid match. + if pkt.Owner == nil || !om.matchUID { + return false, true + } + + // TODO(gvisor.dev/issue/170): Need to add tests to verify + // drop rule when packet UID does not match owner matcher UID. 
+ if pkt.Owner.UID() != om.uid { + return false, false + } + + return true, false +} diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go index 5f181f017..eb090e79b 100644 --- a/pkg/sentry/socket/netstack/provider.go +++ b/pkg/sentry/socket/netstack/provider.go @@ -126,6 +126,12 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (* ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated) } else { ep, e = eps.Stack.NewEndpoint(transProto, p.netProto, wq) + + // Assign task to PacketOwner interface to get the UID and GID for + // iptables owner matching. + if e == nil { + ep.SetOwner(t) + } } if e != nil { return nil, syserr.TranslateNetstackError(e) diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index b3ee6000e..a7d9a8b25 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -244,6 +244,14 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params) pkt.NetworkHeader = buffer.View(ip) + // iptables filtering. All packets that reach here are locally + // generated. + ipt := e.stack.IPTables() + if ok := ipt.Check(stack.Output, pkt); !ok { + // iptables is telling us to drop the packet. + return nil + } + if r.Loop&stack.PacketLoop != 0 { // The inbound path expects the network header to still be in // the PacketBuffer's Data field. @@ -280,7 +288,14 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.Pac return len(pkts), nil } + // iptables filtering. All packets that reach here are locally + // generated. + ipt := e.stack.IPTables() for i := range pkts { + if ok := ipt.Check(stack.Output, pkts[i]); !ok { + // iptables is telling us to drop the packet. 
+ continue + } ip := e.addIPHeader(r, &pkts[i].Header, pkts[i].DataSize, params) pkts[i].NetworkHeader = buffer.View(ip) } diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go index 9505a4e92..9367de180 100644 --- a/pkg/tcpip/stack/packet_buffer.go +++ b/pkg/tcpip/stack/packet_buffer.go @@ -13,7 +13,10 @@ package stack -import "gvisor.dev/gvisor/pkg/tcpip/buffer" +import ( + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" +) // A PacketBuffer contains all the data of a network packet. // @@ -59,6 +62,10 @@ type PacketBuffer struct { // Hash is the transport layer hash of this packet. A value of zero // indicates no valid hash has been set. Hash uint32 + + // Owner is implemented by task to get the uid and gid. + // Only set for locally generated packets. + Owner tcpip.PacketOwner } // Clone makes a copy of pk. It clones the Data field, which creates a new diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index 8ca9ac3cf..3084e6593 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -56,6 +56,8 @@ func (f *fakeTransportEndpoint) Stats() tcpip.EndpointStats { return nil } +func (f *fakeTransportEndpoint) SetOwner(owner tcpip.PacketOwner) {} + func newFakeTransportEndpoint(s *stack.Stack, proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber, uniqueID uint64) tcpip.Endpoint { return &fakeTransportEndpoint{stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID} } diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 3dc5d87d6..2ef3271f1 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -336,6 +336,15 @@ type ControlMessages struct { PacketInfo IPPacketInfo } +// PacketOwner is used to get UID and GID of the packet. +type PacketOwner interface { + // UID returns UID of the packet. + UID() uint32 + + // GID returns GID of the packet. 
+ GID() uint32 +} + // Endpoint is the interface implemented by transport protocols (e.g., tcp, udp) // that exposes functionality like read, write, connect, etc. to users of the // networking stack. @@ -470,6 +479,9 @@ type Endpoint interface { // Stats returns a reference to the endpoint stats. Stats() EndpointStats + + // SetOwner sets the task owner to the endpoint owner. + SetOwner(owner PacketOwner) } // EndpointInfo is the interface implemented by each endpoint info struct. diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index 613b12ead..b007302fb 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -73,6 +73,9 @@ type endpoint struct { route stack.Route `state:"manual"` ttl uint8 stats tcpip.TransportEndpointStats `state:"nosave"` + + // owner is used to get uid and gid of the packet. + owner tcpip.PacketOwner } func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { @@ -133,6 +136,10 @@ func (e *endpoint) Close() { // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. func (e *endpoint) ModerateRecvBuf(copied int) {} +func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { + e.owner = owner +} + // IPTables implements tcpip.Endpoint.IPTables. 
func (e *endpoint) IPTables() (stack.IPTables, error) { return e.stack.IPTables(), nil @@ -321,7 +328,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c switch e.NetProto { case header.IPv4ProtocolNumber: - err = send4(route, e.ID.LocalPort, v, e.ttl) + err = send4(route, e.ID.LocalPort, v, e.ttl, e.owner) case header.IPv6ProtocolNumber: err = send6(route, e.ID.LocalPort, v, e.ttl) @@ -415,7 +422,7 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { } } -func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Error { +func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8, owner tcpip.PacketOwner) *tcpip.Error { if len(data) < header.ICMPv4MinimumSize { return tcpip.ErrInvalidEndpointState } @@ -444,6 +451,7 @@ func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err Header: hdr, Data: data.ToVectorisedView(), TransportHeader: buffer.View(icmpv4), + Owner: owner, }) } diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go index df49d0995..23158173d 100644 --- a/pkg/tcpip/transport/packet/endpoint.go +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -392,3 +392,5 @@ func (ep *endpoint) Info() tcpip.EndpointInfo { func (ep *endpoint) Stats() tcpip.EndpointStats { return &ep.stats } + +func (ep *endpoint) SetOwner(owner tcpip.PacketOwner) {} diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index 536dafd1e..337bc1c71 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -80,6 +80,9 @@ type endpoint struct { // Connect(), and is valid only when conneted is true. route stack.Route `state:"manual"` stats tcpip.TransportEndpointStats `state:"nosave"` + + // owner is used to get uid and gid of the packet. + owner tcpip.PacketOwner } // NewEndpoint returns a raw endpoint for the given protocols. 
@@ -159,6 +162,10 @@ func (e *endpoint) Close() { // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. func (e *endpoint) ModerateRecvBuf(copied int) {} +func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { + e.owner = owner +} + // IPTables implements tcpip.Endpoint.IPTables. func (e *endpoint) IPTables() (stack.IPTables, error) { return e.stack.IPTables(), nil @@ -348,10 +355,12 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64, } break } + hdr := buffer.NewPrependable(len(payloadBytes) + int(route.MaxHeaderLength())) if err := route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: e.TransProto, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{ Header: hdr, Data: buffer.View(payloadBytes).ToVectorisedView(), + Owner: e.owner, }); err != nil { return 0, nil, err } diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index 375ca21f6..7a9dea4ac 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -276,7 +276,7 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i // and then performs the TCP 3-way handshake. // // The new endpoint is returned with e.mu held. -func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) { +func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (*endpoint, *tcpip.Error) { // Create new endpoint. irs := s.sequenceNumber isn := generateSecureISN(s.id, l.stack.Seed()) @@ -284,6 +284,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head if err != nil { return nil, err } + ep.owner = owner // listenEP is nil when listenContext is used by tcp.Forwarder. 
deferAccept := time.Duration(0) @@ -414,7 +415,7 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header }() defer s.decRef() - n, err := ctx.createEndpointAndPerformHandshake(s, opts, &waiter.Queue{}) + n, err := ctx.createEndpointAndPerformHandshake(s, opts, &waiter.Queue{}, e.owner) if err != nil { e.stack.Stats().TCP.FailedConnectionAttempts.Increment() e.stats.FailedConnectionAttempts.Increment() diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 1d245c2c6..3239a5911 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -745,7 +745,7 @@ func (e *endpoint) sendSynTCP(r *stack.Route, tf tcpFields, opts header.TCPSynOp func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO) *tcpip.Error { tf.txHash = e.txHash - if err := sendTCP(r, tf, data, gso); err != nil { + if err := sendTCP(r, tf, data, gso, e.owner); err != nil { e.stats.SendErrors.SegmentSendToNetworkFailed.Increment() return err } @@ -787,7 +787,7 @@ func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso *sta } } -func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO) *tcpip.Error { +func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO, owner tcpip.PacketOwner) *tcpip.Error { optLen := len(tf.opts) if tf.rcvWnd > 0xffff { tf.rcvWnd = 0xffff @@ -816,6 +816,7 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso pkts[i].DataSize = packetSize pkts[i].Data = data pkts[i].Hash = tf.txHash + pkts[i].Owner = owner buildTCPHdr(r, tf, &pkts[i], gso) off += packetSize tf.seq = tf.seq.Add(seqnum.Size(packetSize)) @@ -833,14 +834,14 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso // sendTCP sends a TCP segment with the provided options via the provided // network endpoint and under the provided identity. 
-func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO) *tcpip.Error { +func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO, owner tcpip.PacketOwner) *tcpip.Error { optLen := len(tf.opts) if tf.rcvWnd > 0xffff { tf.rcvWnd = 0xffff } if r.Loop&stack.PacketLoop == 0 && gso != nil && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() { - return sendTCPBatch(r, tf, data, gso) + return sendTCPBatch(r, tf, data, gso, owner) } pkt := stack.PacketBuffer{ @@ -849,6 +850,7 @@ func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stac DataSize: data.Size(), Data: data, Hash: tf.txHash, + Owner: owner, } buildTCPHdr(r, tf, &pkt, gso) diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 1ebee0cfe..9b123e968 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -603,6 +603,9 @@ type endpoint struct { // txHash is the transport layer hash to be set on outbound packets // emitted by this endpoint. txHash uint32 + + // owner is used to get uid and gid of the packet. + owner tcpip.PacketOwner } // UniqueID implements stack.TransportEndpoint.UniqueID. @@ -1132,6 +1135,10 @@ func (e *endpoint) ModerateRecvBuf(copied int) { e.rcvListMu.Unlock() } +func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { + e.owner = owner +} + // IPTables implements tcpip.Endpoint.IPTables. 
func (e *endpoint) IPTables() (stack.IPTables, error) { return e.stack.IPTables(), nil diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go index a094471b8..808410c92 100644 --- a/pkg/tcpip/transport/tcp/forwarder.go +++ b/pkg/tcpip/transport/tcp/forwarder.go @@ -157,7 +157,7 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, TSVal: r.synOptions.TSVal, TSEcr: r.synOptions.TSEcr, SACKPermitted: r.synOptions.SACKPermitted, - }, queue) + }, queue, nil) if err != nil { return nil, err } diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 1377107ca..dce9a1652 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -199,7 +199,7 @@ func replyWithReset(s *segment) { seq: seq, ack: ack, rcvWnd: 0, - }, buffer.VectorisedView{}, nil /* gso */) + }, buffer.VectorisedView{}, nil /* gso */, nil /* PacketOwner */) } // SetOption implements stack.TransportProtocol.SetOption. diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index a3372ac58..120d3baa3 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -143,6 +143,9 @@ type endpoint struct { // TODO(b/142022063): Add ability to save and restore per endpoint stats. stats tcpip.TransportEndpointStats `state:"nosave"` + + // owner is used to get uid and gid of the packet. 
+ owner tcpip.PacketOwner } // +stateify savable @@ -484,7 +487,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c useDefaultTTL = false } - if err := sendUDP(route, buffer.View(v).ToVectorisedView(), e.ID.LocalPort, dstPort, ttl, useDefaultTTL, e.sendTOS); err != nil { + if err := sendUDP(route, buffer.View(v).ToVectorisedView(), e.ID.LocalPort, dstPort, ttl, useDefaultTTL, e.sendTOS, e.owner); err != nil { return 0, nil, err } return int64(len(v)), nil, nil @@ -886,7 +889,7 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { // sendUDP sends a UDP segment via the provided network endpoint and under the // provided identity. -func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort uint16, ttl uint8, useDefaultTTL bool, tos uint8) *tcpip.Error { +func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort uint16, ttl uint8, useDefaultTTL bool, tos uint8, owner tcpip.PacketOwner) *tcpip.Error { // Allocate a buffer for the UDP header. 
hdr := buffer.NewPrependable(header.UDPMinimumSize + int(r.MaxHeaderLength())) @@ -916,6 +919,7 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u Header: hdr, Data: data, TransportHeader: buffer.View(udp), + Owner: owner, }); err != nil { r.Stats().UDP.PacketSendErrors.Increment() return err @@ -1356,3 +1360,7 @@ func (*endpoint) Wait() {} func isBroadcastOrMulticast(a tcpip.Address) bool { return a == header.IPv4Broadcast || header.IsV4MulticastAddress(a) || header.IsV6MulticastAddress(a) } + +func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { + e.owner = owner +} diff --git a/test/iptables/filter_output.go b/test/iptables/filter_output.go index 4582d514c..f6d974b85 100644 --- a/test/iptables/filter_output.go +++ b/test/iptables/filter_output.go @@ -24,6 +24,11 @@ func init() { RegisterTestCase(FilterOutputDropTCPSrcPort{}) RegisterTestCase(FilterOutputDestination{}) RegisterTestCase(FilterOutputInvertDestination{}) + RegisterTestCase(FilterOutputAcceptTCPOwner{}) + RegisterTestCase(FilterOutputDropTCPOwner{}) + RegisterTestCase(FilterOutputAcceptUDPOwner{}) + RegisterTestCase(FilterOutputDropUDPOwner{}) + RegisterTestCase(FilterOutputOwnerFail{}) } // FilterOutputDropTCPDestPort tests that connections are not accepted on @@ -90,6 +95,144 @@ func (FilterOutputDropTCPSrcPort) LocalAction(ip net.IP) error { return nil } +// FilterOutputAcceptTCPOwner tests that TCP connections from uid owner are accepted. +type FilterOutputAcceptTCPOwner struct{} + +// Name implements TestCase.Name. +func (FilterOutputAcceptTCPOwner) Name() string { + return "FilterOutputAcceptTCPOwner" +} + +// ContainerAction implements TestCase.ContainerAction. +func (FilterOutputAcceptTCPOwner) ContainerAction(ip net.IP) error { + if err := filterTable("-A", "OUTPUT", "-p", "tcp", "-m", "owner", "--uid-owner", "root", "-j", "ACCEPT"); err != nil { + return err + } + + // Listen for TCP packets on accept port. 
+ if err := listenTCP(acceptPort, sendloopDuration); err != nil { + return fmt.Errorf("connection on port %d should be accepted, but got dropped", acceptPort) + } + + return nil +} + +// LocalAction implements TestCase.LocalAction. +func (FilterOutputAcceptTCPOwner) LocalAction(ip net.IP) error { + if err := connectTCP(ip, acceptPort, sendloopDuration); err != nil { + return fmt.Errorf("connection destined to port %d should be accepted, but got dropped", acceptPort) + } + + return nil +} + +// FilterOutputDropTCPOwner tests that TCP connections from uid owner are dropped. +type FilterOutputDropTCPOwner struct{} + +// Name implements TestCase.Name. +func (FilterOutputDropTCPOwner) Name() string { + return "FilterOutputDropTCPOwner" +} + +// ContainerAction implements TestCase.ContainerAction. +func (FilterOutputDropTCPOwner) ContainerAction(ip net.IP) error { + if err := filterTable("-A", "OUTPUT", "-p", "tcp", "-m", "owner", "--uid-owner", "root", "-j", "DROP"); err != nil { + return err + } + + // Listen for TCP packets on accept port. + if err := listenTCP(acceptPort, sendloopDuration); err == nil { + return fmt.Errorf("connection on port %d should be dropped, but got accepted", acceptPort) + } + + return nil +} + +// LocalAction implements TestCase.LocalAction. +func (FilterOutputDropTCPOwner) LocalAction(ip net.IP) error { + if err := connectTCP(ip, acceptPort, sendloopDuration); err == nil { + return fmt.Errorf("connection destined to port %d should be dropped, but got accepted", acceptPort) + } + + return nil +} + +// FilterOutputAcceptUDPOwner tests that UDP packets from uid owner are accepted. +type FilterOutputAcceptUDPOwner struct{} + +// Name implements TestCase.Name. +func (FilterOutputAcceptUDPOwner) Name() string { + return "FilterOutputAcceptUDPOwner" +} + +// ContainerAction implements TestCase.ContainerAction. 
+func (FilterOutputAcceptUDPOwner) ContainerAction(ip net.IP) error { + if err := filterTable("-A", "OUTPUT", "-p", "udp", "-m", "owner", "--uid-owner", "root", "-j", "ACCEPT"); err != nil { + return err + } + + // Send UDP packets on acceptPort. + return sendUDPLoop(ip, acceptPort, sendloopDuration) +} + +// LocalAction implements TestCase.LocalAction. +func (FilterOutputAcceptUDPOwner) LocalAction(ip net.IP) error { + // Listen for UDP packets on acceptPort. + return listenUDP(acceptPort, sendloopDuration) +} + +// FilterOutputDropUDPOwner tests that UDP packets from uid owner are dropped. +type FilterOutputDropUDPOwner struct{} + +// Name implements TestCase.Name. +func (FilterOutputDropUDPOwner) Name() string { + return "FilterOutputDropUDPOwner" +} + +// ContainerAction implements TestCase.ContainerAction. +func (FilterOutputDropUDPOwner) ContainerAction(ip net.IP) error { + if err := filterTable("-A", "OUTPUT", "-p", "udp", "-m", "owner", "--uid-owner", "root", "-j", "DROP"); err != nil { + return err + } + + // Send UDP packets on dropPort. + return sendUDPLoop(ip, dropPort, sendloopDuration) +} + +// LocalAction implements TestCase.LocalAction. +func (FilterOutputDropUDPOwner) LocalAction(ip net.IP) error { + // Listen for UDP packets on dropPort. + if err := listenUDP(dropPort, sendloopDuration); err == nil { + return fmt.Errorf("packets should not be received") + } + + return nil +} + +// FilterOutputOwnerFail tests that without uid/gid option, owner rule +// will fail. +type FilterOutputOwnerFail struct{} + +// Name implements TestCase.Name. +func (FilterOutputOwnerFail) Name() string { + return "FilterOutputOwnerFail" +} + +// ContainerAction implements TestCase.ContainerAction. +func (FilterOutputOwnerFail) ContainerAction(ip net.IP) error { + if err := filterTable("-A", "OUTPUT", "-p", "udp", "-m", "owner", "-j", "ACCEPT"); err == nil { + return fmt.Errorf("Invalid argument") + } + + return nil +} + +// LocalAction implements TestCase.LocalAction. 
+func (FilterOutputOwnerFail) LocalAction(ip net.IP) error { + // no-op. + return nil +} + // FilterOutputDestination tests that we can selectively allow packets to // certain destinations. type FilterOutputDestination struct{} diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go index 7f1f70606..493d69052 100644 --- a/test/iptables/iptables_test.go +++ b/test/iptables/iptables_test.go @@ -274,6 +274,36 @@ func TestFilterOutputDropTCPSrcPort(t *testing.T) { } } +func TestFilterOutputAcceptTCPOwner(t *testing.T) { + if err := singleTest(FilterOutputAcceptTCPOwner{}); err != nil { + t.Fatal(err) + } +} + +func TestFilterOutputDropTCPOwner(t *testing.T) { + if err := singleTest(FilterOutputDropTCPOwner{}); err != nil { + t.Fatal(err) + } +} + +func TestFilterOutputAcceptUDPOwner(t *testing.T) { + if err := singleTest(FilterOutputAcceptUDPOwner{}); err != nil { + t.Fatal(err) + } +} + +func TestFilterOutputDropUDPOwner(t *testing.T) { + if err := singleTest(FilterOutputDropUDPOwner{}); err != nil { + t.Fatal(err) + } +} + +func TestFilterOutputOwnerFail(t *testing.T) { + if err := singleTest(FilterOutputOwnerFail{}); err != nil { + t.Fatal(err) + } +} + func TestJumpSerialize(t *testing.T) { if err := singleTest(FilterInputSerializeJump{}); err != nil { t.Fatal(err) -- cgit v1.2.3 From 639d94f9f71b43e86320a6e9157c932f5d7936a7 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 31 Mar 2020 19:15:55 -0700 Subject: Add socket filesystem and global disconnected socket mount for VFS2. A socket mount where anonymous sockets will reside is added to the VirtualFilesystem. Socketfs is built on top of kernfs. Updates #1476, #1478, #1484, #1485. 
PiperOrigin-RevId: 304095251 --- pkg/sentry/fsimpl/sockfs/BUILD | 16 +++++++++ pkg/sentry/fsimpl/sockfs/sockfs.go | 73 ++++++++++++++++++++++++++++++++++++++ pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/kernel.go | 24 +++++++++++++ test/syscalls/linux/socket_unix.cc | 2 ++ 5 files changed, 116 insertions(+) create mode 100644 pkg/sentry/fsimpl/sockfs/BUILD create mode 100644 pkg/sentry/fsimpl/sockfs/sockfs.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD new file mode 100644 index 000000000..790d50e65 --- /dev/null +++ b/pkg/sentry/fsimpl/sockfs/BUILD @@ -0,0 +1,16 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "sockfs", + srcs = ["sockfs.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/context", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go new file mode 100644 index 000000000..c13511de2 --- /dev/null +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -0,0 +1,73 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sockfs provides a filesystem implementation for anonymous sockets. 
+package sockfs + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// NewFilesystem creates a new sockfs filesystem. +// +// Note that there should only ever be one instance of sockfs.Filesystem, +// backing a global socket mount. +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { + fs, _, err := filesystemType{}.GetFilesystem(nil, vfsObj, nil, "", vfs.GetFilesystemOptions{}) + if err != nil { + panic("failed to create sockfs filesystem") + } + return fs +} + +// filesystemType implements vfs.FilesystemType. +type filesystemType struct{} + +// GetFilesystem implements FilesystemType.GetFilesystem. +func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + fs := &filesystem{} + fs.Init(vfsObj, fsType) + return fs.VFSFilesystem(), nil, nil +} + +// Name implements FilesystemType.Name. +// +// Note that registering sockfs is unnecessary, except for the fact that it +// will not show up under /proc/filesystems as a result. This is a very minor +// discrepancy from Linux. +func (filesystemType) Name() string { + return "sockfs" +} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + kernfs.Filesystem +} + +// inode implements kernfs.Inode. +type inode struct { + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + kernfs.InodeAttrs + kernfs.InodeNoopRefCount +} + +// Open implements kernfs.Inode.Open. 
+func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + return nil, syserror.ENXIO +} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index beba29a09..bb7e3cbc3 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -169,6 +169,7 @@ go_library( "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 6feda8fa1..0a448b57c 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -50,6 +50,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/fsbridge" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -225,6 +226,11 @@ type Kernel struct { // by extMu. nextSocketEntry uint64 + // socketMount is a disconnected vfs.Mount, not included in k.vfs, + // representing a sockfs.filesystem. socketMount is used to back + // VirtualDentries representing anonymous sockets. + socketMount *vfs.Mount + // deviceRegistry is used to save/restore device.SimpleDevices. deviceRegistry struct{} `state:".(*device.Registry)"` @@ -348,6 +354,19 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} k.futexes = futex.NewManager() k.netlinkPorts = port.New() + if VFS2Enabled { + if err := k.vfs.Init(); err != nil { + return fmt.Errorf("failed to initialize VFS: %v", err) + } + fs := sockfs.NewFilesystem(&k.vfs) + // NewDisconnectedMount will take an additional reference on fs. 
+ defer fs.DecRef() + sm, err := k.vfs.NewDisconnectedMount(fs, nil, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to initialize socket mount: %v", err) + } + k.socketMount = sm + } return nil } @@ -1452,6 +1471,11 @@ func (k *Kernel) ListSockets() []*SocketEntry { return socks } +// SocketMount returns the global socket mount. +func (k *Kernel) SocketMount() *vfs.Mount { + return k.socketMount +} + // supervisorContext is a privileged context. type supervisorContext struct { context.NoopSleeper diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc index 4cf1f76f1..8bf663e8b 100644 --- a/test/syscalls/linux/socket_unix.cc +++ b/test/syscalls/linux/socket_unix.cc @@ -257,6 +257,8 @@ TEST_P(UnixSocketPairTest, ShutdownWrite) { TEST_P(UnixSocketPairTest, SocketReopenFromProcfs) { // TODO(b/122310852): We should be returning ENXIO and NOT EIO. + // TODO(github.dev/issue/1624): This should be resolved in VFS2. Verify + // that this is the case and delete the SKIP_IF once we delete VFS1. SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); -- cgit v1.2.3 From 5eb41c8fbabac090251fbfb43bd9c814124aa575 Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Tue, 11 Feb 2020 02:55:51 -0500 Subject: Arm64 signal#2: signal support in arch module SA_RESTORER is always used on Intel platform. But this flag is optional on other platforms. The vdso is enabled, so we can use the sigreturn trampolines the vdso provides instead on Arm platform. 
Signed-off-by: Bin Lu --- pkg/sentry/arch/signal_arm64.go | 30 ++++++++++++++++++++++++------ pkg/sentry/kernel/task_signals.go | 13 +++++++++++++ 2 files changed, 37 insertions(+), 6 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go index 4f4cc46a8..b57d6a17d 100644 --- a/pkg/sentry/arch/signal_arm64.go +++ b/pkg/sentry/arch/signal_arm64.go @@ -83,9 +83,12 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt if ucSize < 0 { panic("can't get size of UContext64") } - // st.Arch.Width() is for the restorer address. sizeof(siginfo) == 128. - frameSize := int(st.Arch.Width()) + ucSize + 128 - frameBottom := (sp-usermem.Addr(frameSize)) & ^usermem.Addr(15) - 8 + + // frameSize = ucSize + sizeof(siginfo). + // sizeof(siginfo) == 128. + // R30 stores the restorer address. + frameSize := ucSize + 128 + frameBottom := (sp - usermem.Addr(frameSize)) & ^usermem.Addr(15) sp = frameBottom + usermem.Addr(frameSize) st.Bottom = sp @@ -115,12 +118,27 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt c.Regs.Regs[0] = uint64(info.Signo) c.Regs.Regs[1] = uint64(infoAddr) c.Regs.Regs[2] = uint64(ucAddr) - + c.Regs.Regs[30] = uint64(act.Restorer) return nil } // SignalRestore implements Context.SignalRestore. -// Only used on intel. func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) { - return 0, SignalStack{}, nil + // Copy out the stack frame. + var uc UContext64 + if _, err := st.Pop(&uc); err != nil { + return 0, SignalStack{}, err + } + var info SignalInfo + if _, err := st.Pop(&info); err != nil { + return 0, SignalStack{}, err + } + + // Restore registers. 
+ c.Regs.Regs = uc.MContext.Regs + c.Regs.Pc = uc.MContext.Pc + c.Regs.Sp = uc.MContext.Sp + c.Regs.Pstate = uc.MContext.Pstate + + return uc.Sigset, uc.Stack, nil } diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 8802db142..0e74236c9 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -263,6 +263,19 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) if t.haveSavedSignalMask { mask = t.savedSignalMask } + + // Set up the restorer. + // x86-64 should always use SA_RESTORER, but this flag is optional on other platforms. + // Please see the linux code as reference: + // linux/arch/x86/kernel/signal.c:__setup_rt_frame() + // If SA_RESTORER is not configured, we can use the sigreturn trampolines + // the vdso provides instead. + // Please see the linux code as reference: + // linux/arch/arm64/kernel/signal.c:setup_return() + if act.Flags&linux.SA_RESTORER == 0 { + act.Restorer = t.MemoryManager().VDSOSigReturn() + } + if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil { return err } -- cgit v1.2.3 From 840980aeba0b5224b13bcaadf5785ac5305a5230 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Tue, 31 Mar 2020 22:54:50 -0700 Subject: Implement automated marshalling for slices of Marshallable types.
PiperOrigin-RevId: 304119255 --- pkg/sentry/kernel/rseq.go | 2 +- pkg/sentry/syscalls/linux/sys_stat.go | 6 +- pkg/sentry/syscalls/linux/vfs2/epoll.go | 4 +- pkg/sentry/syscalls/linux/vfs2/poll.go | 14 +- pkg/sentry/syscalls/linux/vfs2/setstat.go | 2 +- pkg/sentry/syscalls/linux/vfs2/stat.go | 23 +- tools/go_marshal/analysis/analysis_unsafe.go | 4 + tools/go_marshal/defs.bzl | 3 +- tools/go_marshal/gomarshal/generator.go | 130 ++++-- tools/go_marshal/gomarshal/generator_interfaces.go | 62 +++ .../generator_interfaces_array_newtype.go | 84 +--- .../generator_interfaces_primitive_newtype.go | 173 ++++--- .../gomarshal/generator_interfaces_struct.go | 308 +++++++++--- tools/go_marshal/gomarshal/generator_tests.go | 52 ++- tools/go_marshal/gomarshal/util.go | 21 +- tools/go_marshal/marshal/marshal.go | 103 ++++- tools/go_marshal/primitive/BUILD | 18 + tools/go_marshal/primitive/primitive.go | 175 +++++++ tools/go_marshal/test/BUILD | 14 + tools/go_marshal/test/benchmark_test.go | 42 ++ tools/go_marshal/test/external/external.go | 8 + tools/go_marshal/test/marshal_test.go | 515 +++++++++++++++++++++ tools/go_marshal/test/test.go | 36 +- 23 files changed, 1525 insertions(+), 274 deletions(-) create mode 100644 tools/go_marshal/primitive/BUILD create mode 100644 tools/go_marshal/primitive/primitive.go create mode 100644 tools/go_marshal/test/marshal_test.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index ded95f532..18416643b 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -304,7 +304,7 @@ func (t *Task) rseqAddrInterrupt() { } var cs linux.RSeqCriticalSection - if err := cs.CopyIn(t, critAddr); err != nil { + if _, err := cs.CopyIn(t, critAddr); err != nil { t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) diff --git a/pkg/sentry/syscalls/linux/sys_stat.go 
b/pkg/sentry/syscalls/linux/sys_stat.go index a11a87cd1..46ebf27a2 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -115,7 +115,8 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err return err } s := statFromAttrs(t, d.Inode.StableAttr, uattr) - return s.CopyOut(t, statAddr) + _, err = s.CopyOut(t, statAddr) + return err } // fstat implements fstat for the given *fs.File. @@ -125,7 +126,8 @@ func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error { return err } s := statFromAttrs(t, f.Dirent.Inode.StableAttr, uattr) - return s.CopyOut(t, statAddr) + _, err = s.CopyOut(t, statAddr) + return err } // Statx implements linux syscall statx(2). diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go index d6cb0e79a..5a938cee2 100644 --- a/pkg/sentry/syscalls/linux/vfs2/epoll.go +++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go @@ -101,14 +101,14 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc var event linux.EpollEvent switch op { case linux.EPOLL_CTL_ADD: - if err := event.CopyIn(t, eventAddr); err != nil { + if _, err := event.CopyIn(t, eventAddr); err != nil { return 0, nil, err } return 0, nil, ep.AddInterest(file, fd, event) case linux.EPOLL_CTL_DEL: return 0, nil, ep.DeleteInterest(file, fd) case linux.EPOLL_CTL_MOD: - if err := event.CopyIn(t, eventAddr); err != nil { + if _, err := event.CopyIn(t, eventAddr); err != nil { return 0, nil, err } return 0, nil, ep.ModifyInterest(file, fd, event) diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go index dbf4882da..ff1b25d7b 100644 --- a/pkg/sentry/syscalls/linux/vfs2/poll.go +++ b/pkg/sentry/syscalls/linux/vfs2/poll.go @@ -374,7 +374,8 @@ func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.D } remaining := timeoutRemaining(t, startNs, timeout) tsRemaining := 
linux.NsecToTimespec(remaining.Nanoseconds()) - return tsRemaining.CopyOut(t, timespecAddr) + _, err := tsRemaining.CopyOut(t, timespecAddr) + return err } // copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr. @@ -386,7 +387,8 @@ func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Du } remaining := timeoutRemaining(t, startNs, timeout) tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds()) - return tvRemaining.CopyOut(t, timevalAddr) + _, err := tvRemaining.CopyOut(t, timevalAddr) + return err } // pollRestartBlock encapsulates the state required to restart poll(2) via @@ -477,7 +479,7 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal timeout := time.Duration(-1) if timevalAddr != 0 { var timeval linux.Timeval - if err := timeval.CopyIn(t, timevalAddr); err != nil { + if _, err := timeval.CopyIn(t, timevalAddr); err != nil { return 0, nil, err } if timeval.Sec < 0 || timeval.Usec < 0 { @@ -519,7 +521,7 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width())) } var maskStruct sigSetWithSize - if err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil { + if _, err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil { return 0, nil, err } if err := setTempSignalSet(t, usermem.Addr(maskStruct.sigsetAddr), uint(maskStruct.sizeofSigset)); err != nil { @@ -554,7 +556,7 @@ func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.D timeout := time.Duration(-1) if timespecAddr != 0 { var timespec linux.Timespec - if err := timespec.CopyIn(t, timespecAddr); err != nil { + if _, err := timespec.CopyIn(t, timespecAddr); err != nil { return 0, err } if !timespec.Valid() { @@ -573,7 +575,7 @@ func setTempSignalSet(t *kernel.Task, maskAddr usermem.Addr, maskSize uint) erro return syserror.EINVAL } var mask linux.SignalSet - if err := mask.CopyIn(t, maskAddr); err != nil 
{ + if _, err := mask.CopyIn(t, maskAddr); err != nil { return err } mask &^= kernel.UnblockableSignals diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index 136453ccc..4e61f1452 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -226,7 +226,7 @@ func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall opts.Stat.Mtime.Nsec = linux.UTIME_NOW } else { var times linux.Utime - if err := times.CopyIn(t, timesAddr); err != nil { + if _, err := times.CopyIn(t, timesAddr); err != nil { return 0, nil, err } opts.Stat.Atime.Sec = times.Actime diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go index fdfe49243..bb1d5cac4 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat.go @@ -91,7 +91,8 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags } var stat linux.Stat convertStatxToUserStat(t, &statx, &stat) - return stat.CopyOut(t, statAddr) + _, err = stat.CopyOut(t, statAddr) + return err } start = dirfile.VirtualDentry() start.IncRef() @@ -111,7 +112,8 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags } var stat linux.Stat convertStatxToUserStat(t, &statx, &stat) - return stat.CopyOut(t, statAddr) + _, err = stat.CopyOut(t, statAddr) + return err } func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec { @@ -140,7 +142,8 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } var stat linux.Stat convertStatxToUserStat(t, &statx, &stat) - return 0, nil, stat.CopyOut(t, statAddr) + _, err = stat.CopyOut(t, statAddr) + return 0, nil, err } // Statx implements Linux syscall statx(2). 
@@ -199,7 +202,8 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err } userifyStatx(t, &statx) - return 0, nil, statx.CopyOut(t, statxAddr) + _, err = statx.CopyOut(t, statxAddr) + return 0, nil, err } start = dirfile.VirtualDentry() start.IncRef() @@ -218,7 +222,8 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err } userifyStatx(t, &statx) - return 0, nil, statx.CopyOut(t, statxAddr) + _, err = statx.CopyOut(t, statxAddr) + return 0, nil, err } func userifyStatx(t *kernel.Task, statx *linux.Statx) { @@ -359,8 +364,8 @@ func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - - return 0, nil, statfs.CopyOut(t, bufAddr) + _, err = statfs.CopyOut(t, bufAddr) + return 0, nil, err } // Fstatfs implements Linux syscall fstatfs(2). @@ -378,6 +383,6 @@ func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if err != nil { return 0, nil, err } - - return 0, nil, statfs.CopyOut(t, bufAddr) + _, err = statfs.CopyOut(t, bufAddr) + return 0, nil, err } diff --git a/tools/go_marshal/analysis/analysis_unsafe.go b/tools/go_marshal/analysis/analysis_unsafe.go index 9a9a4f298..cd55cf5cb 100644 --- a/tools/go_marshal/analysis/analysis_unsafe.go +++ b/tools/go_marshal/analysis/analysis_unsafe.go @@ -161,6 +161,10 @@ func AlignmentCheck(t *testing.T, typ reflect.Type) (ok bool, delta uint64) { if typ.NumField() > 0 && nextXOff != int(typ.Size()) { implicitPad := int(typ.Size()) - nextXOff f := typ.Field(typ.NumField() - 1) // Final field + if tag, ok := f.Tag.Lookup("marshal"); ok && tag == "unaligned" { + // Final field explicitly marked unaligned. 
+ break + } t.Fatalf("Suspect offset for field %s.%s at the end of %s, detected an implicit %d byte padding from offset %d to %d at the end of the struct; either add %d bytes of explict padding at end of the struct or tag the final field %s as `marshal:\"unaligned\"`.", typ.Name(), f.Name, typ.Name(), implicitPad, nextXOff, typ.Size(), implicitPad, f.Name) } diff --git a/tools/go_marshal/defs.bzl b/tools/go_marshal/defs.bzl index d79786a68..323e33882 100644 --- a/tools/go_marshal/defs.bzl +++ b/tools/go_marshal/defs.bzl @@ -53,9 +53,10 @@ go_marshal = rule( # marshal_deps are the dependencies requied by generated code. marshal_deps = [ - "//tools/go_marshal/marshal", + "//pkg/gohacks", "//pkg/safecopy", "//pkg/usermem", + "//tools/go_marshal/marshal", ] # marshal_test_deps are required by test targets. diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go index 82983804c..935a36b25 100644 --- a/tools/go_marshal/gomarshal/generator.go +++ b/tools/go_marshal/gomarshal/generator.go @@ -28,12 +28,6 @@ import ( "gvisor.dev/gvisor/tools/tags" ) -const ( - marshalImport = "gvisor.dev/gvisor/tools/go_marshal/marshal" - safecopyImport = "gvisor.dev/gvisor/pkg/safecopy" - usermemImport = "gvisor.dev/gvisor/pkg/usermem" -) - // List of identifiers we use in generated code that may conflict with a // similarly-named source identifier. Abort gracefully when we see these to // avoid potentially confusing compilation failures in generated code. @@ -44,8 +38,8 @@ const ( // All recievers are single letters, so we don't allow import aliases to be a // single letter. var badIdents = []string{ - "addr", "blk", "buf", "dst", "dsts", "err", "hdr", "idx", "inner", "len", - "ptr", "src", "srcs", "task", "val", + "addr", "blk", "buf", "dst", "dsts", "count", "err", "hdr", "idx", "inner", + "length", "limit", "ptr", "size", "src", "srcs", "task", "val", // All single-letter identifiers. 
} @@ -110,9 +104,10 @@ func NewGenerator(srcs []string, out, outTest, pkg string, imports []string) (*G g.imports.add("reflect") g.imports.add("runtime") g.imports.add("unsafe") - g.imports.add(marshalImport) - g.imports.add(safecopyImport) - g.imports.add(usermemImport) + g.imports.add("gvisor.dev/gvisor/pkg/gohacks") + g.imports.add("gvisor.dev/gvisor/pkg/safecopy") + g.imports.add("gvisor.dev/gvisor/pkg/usermem") + g.imports.add("gvisor.dev/gvisor/tools/go_marshal/marshal") return &g, nil } @@ -194,10 +189,73 @@ func (g *Generator) parse() ([]*ast.File, []*token.FileSet, error) { return files, fsets, nil } +// sliceAPI carries information about the '+marshal slice' directive. +type sliceAPI struct { + // Comment node in the AST containing the +marshal tag. + comment *ast.Comment + // Identifier fragment to use when naming generated functions for the slice + // API. + ident string + // Whether the generated functions should reference the newtype name, or the + // inner type name. Only meaningful on newtype declarations on primitives. + inner bool +} + +// marshallableType carries information about a type marked with the '+marshal' +// directive. +type marshallableType struct { + spec *ast.TypeSpec + slice *sliceAPI +} + +func newMarshallableType(fset *token.FileSet, tagLine *ast.Comment, spec *ast.TypeSpec) marshallableType { + mt := marshallableType{ + spec: spec, + slice: nil, + } + + var unhandledTags []string + + for _, tag := range strings.Fields(strings.TrimPrefix(tagLine.Text, "// +marshal")) { + if strings.HasPrefix(tag, "slice:") { + tokens := strings.Split(tag, ":") + if len(tokens) < 2 || len(tokens) > 3 { + abortAt(fset.Position(tagLine.Slash), fmt.Sprintf("+marshal directive has invalid 'slice' clause. Expecting format 'slice:[:inner]', got '%v'", tag)) + } + if len(tokens[1]) == 0 { + abortAt(fset.Position(tagLine.Slash), "+marshal slice directive has empty identifier argument. 
Expecting '+marshal slice:identifier'") + } + + sa := &sliceAPI{ + comment: tagLine, + ident: tokens[1], + } + mt.slice = sa + + if len(tokens) == 3 { + if tokens[2] != "inner" { + abortAt(fset.Position(tagLine.Slash), "+marshal slice directive has an invalid argument. Expecting '+marshal slice:[:inner]'") + } + sa.inner = true + } + + continue + } + + unhandledTags = append(unhandledTags, tag) + } + + if len(unhandledTags) > 0 { + abortAt(fset.Position(tagLine.Slash), fmt.Sprintf("+marshal directive contained the following unknown clauses: %v", strings.Join(unhandledTags, " "))) + } + + return mt +} + // collectMarshallableTypes walks the parsed AST and collects a list of type // declarations for which we need to generate the Marshallable interface. -func (g *Generator) collectMarshallableTypes(a *ast.File, f *token.FileSet) []*ast.TypeSpec { - var types []*ast.TypeSpec +func (g *Generator) collectMarshallableTypes(a *ast.File, f *token.FileSet) []marshallableType { + var types []marshallableType for _, decl := range a.Decls { gdecl, ok := decl.(*ast.GenDecl) // Type declaration? @@ -212,9 +270,11 @@ func (g *Generator) collectMarshallableTypes(a *ast.File, f *token.FileSet) []*a } // Does the comment contain a "+marshal" line? marked := false + var tagLine *ast.Comment for _, c := range gdecl.Doc.List { - if c.Text == "// +marshal" { + if strings.HasPrefix(c.Text, "// +marshal") { marked = true + tagLine = c break } } @@ -229,20 +289,17 @@ func (g *Generator) collectMarshallableTypes(a *ast.File, f *token.FileSet) []*a switch t.Type.(type) { case *ast.StructType: debugfAt(f.Position(t.Pos()), "Collected marshallable struct %s.\n", t.Name.Name) - types = append(types, t) - continue case *ast.Ident: // Newtype on primitive. debugfAt(f.Position(t.Pos()), "Collected marshallable newtype on primitive %s.\n", t.Name.Name) - types = append(types, t) - continue case *ast.ArrayType: // Newtype on array. 
debugfAt(f.Position(t.Pos()), "Collected marshallable newtype on array %s.\n", t.Name.Name) - types = append(types, t) - continue + default: + // A user specifically requested marshalling on this type, but we + // don't support it. + abortAt(f.Position(t.Pos()), fmt.Sprintf("Marshalling codegen was requested on type '%s', but go-marshal doesn't support this kind of declaration.\n", t.Name)) } - // A user specifically requested marshalling on this type, but we - // don't support it. - abortAt(f.Position(t.Pos()), fmt.Sprintf("Marshalling codegen was requested on type '%s', but go-marshal doesn't support this kind of declaration.\n", t.Name)) + types = append(types, newMarshallableType(f, tagLine, t)) + } } return types @@ -281,19 +338,28 @@ func (g *Generator) collectImports(a *ast.File, f *token.FileSet) map[string]imp } -func (g *Generator) generateOne(t *ast.TypeSpec, fset *token.FileSet) *interfaceGenerator { - i := newInterfaceGenerator(t, fset) - switch ty := t.Type.(type) { +func (g *Generator) generateOne(t marshallableType, fset *token.FileSet) *interfaceGenerator { + i := newInterfaceGenerator(t.spec, fset) + switch ty := t.spec.Type.(type) { case *ast.StructType: - i.validateStruct(t, ty) + i.validateStruct(t.spec, ty) i.emitMarshallableForStruct(ty) + if t.slice != nil { + i.emitMarshallableSliceForStruct(ty, t.slice) + } case *ast.Ident: i.validatePrimitiveNewtype(ty) i.emitMarshallableForPrimitiveNewtype(ty) + if t.slice != nil { + i.emitMarshallableSliceForPrimitiveNewtype(ty, t.slice) + } case *ast.ArrayType: - i.validateArrayNewtype(t.Name, ty) + i.validateArrayNewtype(t.spec.Name, ty) // After validate, we can safely call arrayLen. - i.emitMarshallableForArrayNewtype(t.Name, ty.Elt.(*ast.Ident), arrayLen(ty)) + i.emitMarshallableForArrayNewtype(t.spec.Name, ty.Elt.(*ast.Ident), arrayLen(ty)) + if t.slice != nil { + abortAt(fset.Position(t.slice.comment.Slash), fmt.Sprintf("Array type marked as '+marshal slice:...', but this is not supported. 
Perhaps fold one of the dimensions?")) + } default: // This should've been filtered out by collectMarshallabeTypes. panic(fmt.Sprintf("Unexpected type %+v", ty)) @@ -303,9 +369,9 @@ func (g *Generator) generateOne(t *ast.TypeSpec, fset *token.FileSet) *interface // generateOneTestSuite generates a test suite for the automatically generated // implementations type t. -func (g *Generator) generateOneTestSuite(t *ast.TypeSpec) *testGenerator { - i := newTestGenerator(t) - i.emitTests() +func (g *Generator) generateOneTestSuite(t marshallableType) *testGenerator { + i := newTestGenerator(t.spec) + i.emitTests(t.slice) return i } diff --git a/tools/go_marshal/gomarshal/generator_interfaces.go b/tools/go_marshal/gomarshal/generator_interfaces.go index 8babf61d2..8812c6878 100644 --- a/tools/go_marshal/gomarshal/generator_interfaces.go +++ b/tools/go_marshal/gomarshal/generator_interfaces.go @@ -163,3 +163,65 @@ func (g *interfaceGenerator) unmarshalScalar(accessor, typ, bufVar string) { g.recordPotentiallyNonPackedField(accessor) } } + +// emitCastToByteSlice unsafely casts an arbitrary type's underlying memory to a +// byte slice, bypassing escape analysis. The caller is responsible for ensuring +// srcPtr lives until they're done with dstVar, the runtime does not consider +// dstVar dependent on srcPtr due to the escape analysis bypass. +// +// srcPtr must be a pointer. +// +// This function internally uses the identifier "hdr", and cannot be used +// in a context where it is already bound.
+func (g *interfaceGenerator) emitCastToByteSlice(srcPtr, dstVar, lenExpr string) { + g.recordUsedImport("gohacks") + g.emit("// Construct a slice backed by dst's underlying memory.\n") + g.emit("var %s []byte\n", dstVar) + g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&%s))\n", dstVar) + g.emit("hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(%s)))\n", srcPtr) + g.emit("hdr.Len = %s\n", lenExpr) + g.emit("hdr.Cap = %s\n\n", lenExpr) +} + +// emitCastSliceToByteSlice unsafely casts a slice with elements of an arbitrary type +// to a byte slice. As part of the cast, the byte slice is made to look +// independent of the src slice by bypassing escape analysis. This means the +// byte slice can be used without causing the source to escape. The caller is +// responsible for ensuring srcPtr lives until they're done with dstVar, as the +// runtime no longer considers dstVar dependent on srcPtr and is free to GC it. +// +// srcPtr must be a pointer. +// +// This function internally uses the identifiers "ptr", "val" and "hdr", +// and cannot be used in a context where these identifiers are already bound. +func (g *interfaceGenerator) emitCastSliceToByteSlice(srcPtr, dstVar, lenExpr string) { + g.emitNoEscapeSliceDataPointer(srcPtr, "val") + + g.emit("// Construct a slice backed by dst's underlying memory.\n") + g.emit("var %s []byte\n", dstVar) + g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&%s))\n", dstVar) + g.emit("hdr.Data = uintptr(val)\n") + g.emit("hdr.Len = %s\n", lenExpr) + g.emit("hdr.Cap = %s\n\n", lenExpr) +} + +// emitNoEscapeSliceDataPointer unsafely casts a slice's data pointer to an +// unsafe.Pointer, bypassing escape analysis. The caller is responsible for +// ensuring srcPtr lives until they're done with dstVar, as the runtime no +// longer considers dstVar dependent on srcPtr and is free to GC it. +// +// srcPtr must be a pointer.
+// +// This function internally uses the identifier "ptr", and cannot be used in a +// context where this identifier is already bound. +func (g *interfaceGenerator) emitNoEscapeSliceDataPointer(srcPtr, dstVar string) { + g.recordUsedImport("gohacks") + g.emit("ptr := unsafe.Pointer(%s)\n", srcPtr) + g.emit("%s := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data))\n\n", dstVar) +} + +func (g *interfaceGenerator) emitKeepAlive(ptrVar string) { + g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", ptrVar) + g.emit("// must live until the use above.\n") + g.emit("runtime.KeepAlive(%s)\n", ptrVar) +} diff --git a/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go b/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go index da36d9305..5ba74a606 100644 --- a/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go +++ b/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go @@ -104,79 +104,43 @@ func (g *interfaceGenerator) emitMarshallableForArrayNewtype(n, elt *ast.Ident, }) g.emit("}\n\n") + g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n") + g.emit("func (%s *%s) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName()) + g.inIndent(func() { + g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) + + g.emit("length, err := task.CopyOutBytes(addr, buf[:limit])\n") + g.emitKeepAlive(g.r) + g.emit("return length, err\n") + }) + g.emit("}\n\n") + g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n") - g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName()) + g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { - // Fast serialization. - g.emit("// Bypass escape analysis on %s.
The no-op arithmetic operation on the\n", g.r) - g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r) - g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n") - g.emit("ptr := unsafe.Pointer(%s)\n", g.r) - g.emit("val := uintptr(ptr)\n") - g.emit("val = val^0\n\n") - - g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r) - g.emit("var buf []byte\n") - g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n") - g.emit("hdr.Data = val\n") - g.emit("hdr.Len = %s.SizeBytes()\n", g.r) - g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r) - - g.emit("_, err := task.CopyOutBytes(addr, buf)\n") - g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r) - g.emit("// must live until after the CopyOutBytes.\n") - g.emit("runtime.KeepAlive(%s)\n", g.r) - g.emit("return err\n") + g.emit("return %s.CopyOutN(task, addr, %s.SizeBytes())\n", g.r, g.r) }) g.emit("}\n\n") g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n") - g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName()) + g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { - g.emit("// Bypass escape analysis on %s. 
The no-op arithmetic operation on the\n", g.r) - g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r) - g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n") - g.emit("ptr := unsafe.Pointer(%s)\n", g.r) - g.emit("val := uintptr(ptr)\n") - g.emit("val = val^0\n\n") - - g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r) - g.emit("var buf []byte\n") - g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n") - g.emit("hdr.Data = val\n") - g.emit("hdr.Len = %s.SizeBytes()\n", g.r) - g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r) - - g.emit("_, err := task.CopyInBytes(addr, buf)\n") - g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r) - g.emit("// must live until after the CopyInBytes.\n") - g.emit("runtime.KeepAlive(%s)\n", g.r) - g.emit("return err\n") + g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) + + g.emit("length, err := task.CopyInBytes(addr, buf)\n") + g.emitKeepAlive(g.r) + g.emit("return length, err\n") }) g.emit("}\n\n") g.emit("// WriteTo implements io.WriterTo.WriteTo.\n") g.emit("func (%s *%s) WriteTo(w io.Writer) (int64, error) {\n", g.r, g.typeName()) g.inIndent(func() { - g.emit("// Bypass escape analysis on %s. 
The no-op arithmetic operation on the\n", g.r) - g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r) - g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n") - g.emit("ptr := unsafe.Pointer(%s)\n", g.r) - g.emit("val := uintptr(ptr)\n") - g.emit("val = val^0\n\n") - - g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r) - g.emit("var buf []byte\n") - g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n") - g.emit("hdr.Data = val\n") - g.emit("hdr.Len = %s.SizeBytes()\n", g.r) - g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r) - - g.emit("len, err := w.Write(buf)\n") - g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r) - g.emit("// must live until after the Write.\n") - g.emit("runtime.KeepAlive(%s)\n", g.r) - g.emit("return int64(len), err\n") + g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) + + g.emit("length, err := w.Write(buf)\n") + g.emitKeepAlive(g.r) + g.emit("return int64(length), err\n") }) g.emit("}\n\n") diff --git a/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go b/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go index 159397825..ef9bb903d 100644 --- a/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go +++ b/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go @@ -150,80 +150,133 @@ func (g *interfaceGenerator) emitMarshallableForPrimitiveNewtype(nt *ast.Ident) }) g.emit("}\n\n") + g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n") + g.emit("func (%s *%s) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName()) + g.inIndent(func() { + g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) + + g.emit("length, err := task.CopyOutBytes(addr, buf[:limit])\n") + g.emitKeepAlive(g.r) + g.emit("return length, err\n") + }) + g.emit("}\n\n") + g.emit("// CopyOut implements 
marshal.Marshallable.CopyOut.\n") - g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName()) + g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { - // Fast serialization. - g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r) - g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r) - g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n") - g.emit("ptr := unsafe.Pointer(%s)\n", g.r) - g.emit("val := uintptr(ptr)\n") - g.emit("val = val^0\n\n") - - g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r) - g.emit("var buf []byte\n") - g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n") - g.emit("hdr.Data = val\n") - g.emit("hdr.Len = %s.SizeBytes()\n", g.r) - g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r) - - g.emit("_, err := task.CopyOutBytes(addr, buf)\n") - g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r) - g.emit("// must live until after the CopyOutBytes.\n") - g.emit("runtime.KeepAlive(%s)\n", g.r) - g.emit("return err\n") + g.emit("return %s.CopyOutN(task, addr, %s.SizeBytes())\n", g.r, g.r) }) g.emit("}\n\n") g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n") - g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName()) + g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { - g.emit("// Bypass escape analysis on %s. 
The no-op arithmetic operation on the\n", g.r) - g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r) - g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n") - g.emit("ptr := unsafe.Pointer(%s)\n", g.r) - g.emit("val := uintptr(ptr)\n") - g.emit("val = val^0\n\n") - - g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r) - g.emit("var buf []byte\n") - g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n") - g.emit("hdr.Data = val\n") - g.emit("hdr.Len = %s.SizeBytes()\n", g.r) - g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r) - - g.emit("_, err := task.CopyInBytes(addr, buf)\n") - g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r) - g.emit("// must live until after the CopyInBytes.\n") - g.emit("runtime.KeepAlive(%s)\n", g.r) - g.emit("return err\n") + g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) + + g.emit("length, err := task.CopyInBytes(addr, buf)\n") + g.emitKeepAlive(g.r) + g.emit("return length, err\n") }) g.emit("}\n\n") g.emit("// WriteTo implements io.WriterTo.WriteTo.\n") g.emit("func (%s *%s) WriteTo(w io.Writer) (int64, error) {\n", g.r, g.typeName()) g.inIndent(func() { - g.emit("// Bypass escape analysis on %s. 
The no-op arithmetic operation on the\n", g.r) - g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r) - g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n") - g.emit("ptr := unsafe.Pointer(%s)\n", g.r) - g.emit("val := uintptr(ptr)\n") - g.emit("val = val^0\n\n") - - g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r) - g.emit("var buf []byte\n") - g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n") - g.emit("hdr.Data = val\n") - g.emit("hdr.Len = %s.SizeBytes()\n", g.r) - g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r) - - g.emit("len, err := w.Write(buf)\n") - g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r) - g.emit("// must live until after the Write.\n") - g.emit("runtime.KeepAlive(%s)\n", g.r) - g.emit("return int64(len), err\n") + g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) + + g.emit("length, err := w.Write(buf)\n") + g.emitKeepAlive(g.r) + g.emit("return int64(length), err\n") + + }) + g.emit("}\n\n") +} + +func (g *interfaceGenerator) emitMarshallableSliceForPrimitiveNewtype(nt *ast.Ident, slice *sliceAPI) { + g.recordUsedImport("marshal") + g.recordUsedImport("usermem") + g.recordUsedImport("reflect") + g.recordUsedImport("runtime") + g.recordUsedImport("unsafe") + + eltType := g.typeName() + if slice.inner { + eltType = nt.Name + } + + g.emit("// Copy%sIn copies in a slice of %s objects from the task's memory.\n", slice.ident, eltType) + g.emit("func Copy%sIn(task marshal.Task, addr usermem.Addr, dst []%s) (int, error) {\n", slice.ident, eltType) + g.inIndent(func() { + g.emit("count := len(dst)\n") + g.emit("if count == 0 {\n") + g.inIndent(func() { + g.emit("return 0, nil\n") + }) + g.emit("}\n") + g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName()) + + g.emitCastSliceToByteSlice("&dst", "buf", "size * count") + + g.emit("length, err := task.CopyInBytes(addr, buf)\n") + g.emitKeepAlive("dst") + 
g.emit("return length, err\n") + }) + g.emit("}\n\n") + + g.emit("// Copy%sOut copies a slice of %s objects to the task's memory.\n", slice.ident, eltType) + g.emit("func Copy%sOut(task marshal.Task, addr usermem.Addr, src []%s) (int, error) {\n", slice.ident, eltType) + g.inIndent(func() { + g.emit("count := len(src)\n") + g.emit("if count == 0 {\n") + g.inIndent(func() { + g.emit("return 0, nil\n") + }) + g.emit("}\n") + g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName()) + + g.emitCastSliceToByteSlice("&src", "buf", "size * count") + + g.emit("length, err := task.CopyOutBytes(addr, buf)\n") + g.emitKeepAlive("src") + g.emit("return length, err\n") + }) + g.emit("}\n\n") + + g.emit("// MarshalUnsafe%s is like %s.MarshalUnsafe, but for a []%s.\n", slice.ident, g.typeName(), g.typeName()) + g.emit("func MarshalUnsafe%s(src []%s, dst []byte) (int, error) {\n", slice.ident, g.typeName()) + g.inIndent(func() { + g.emit("count := len(src)\n") + g.emit("if count == 0 {\n") + g.inIndent(func() { + g.emit("return 0, nil\n") + }) + g.emit("}\n") + g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName()) + + g.emitNoEscapeSliceDataPointer("&src", "val") + + g.emit("length, err := safecopy.CopyIn(dst[:(size*count)], val)\n") + g.emitKeepAlive("src") + g.emit("return length, err\n") + }) + g.emit("}\n\n") + + g.emit("// UnmarshalUnsafe%s is like %s.UnmarshalUnsafe, but for a []%s.\n", slice.ident, g.typeName(), g.typeName()) + g.emit("func UnmarshalUnsafe%s(dst []%s, src []byte) (int, error) {\n", slice.ident, g.typeName()) + g.inIndent(func() { + g.emit("count := len(dst)\n") + g.emit("if count == 0 {\n") + g.inIndent(func() { + g.emit("return 0, nil\n") + }) + g.emit("}\n") + g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName()) + + g.emitNoEscapeSliceDataPointer("&dst", "val") + g.emit("length, err := safecopy.CopyOut(val, src[:(size*count)])\n") + g.emitKeepAlive("dst") + g.emit("return length, err\n") }) g.emit("}\n\n") } diff --git 
a/tools/go_marshal/gomarshal/generator_interfaces_struct.go b/tools/go_marshal/gomarshal/generator_interfaces_struct.go index e66a38b2e..bd57eae0e 100644 --- a/tools/go_marshal/gomarshal/generator_interfaces_struct.go +++ b/tools/go_marshal/gomarshal/generator_interfaces_struct.go @@ -72,20 +72,24 @@ func (g *interfaceGenerator) validateStruct(ts *ast.TypeSpec, st *ast.StructType }) } -func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) { - // Is g.t a packed struct without consideing field types? - thisPacked := true +func (g *interfaceGenerator) isStructPacked(st *ast.StructType) bool { + packed := true forEachStructField(st, func(f *ast.Field) { if f.Tag != nil { if f.Tag.Value == "`marshal:\"unaligned\"`" { - if thisPacked { + if packed { debugfAt(g.f.Position(g.t.Pos()), fmt.Sprintf("Marking type '%s' as not packed due to tag `marshal:\"unaligned\"`.\n", g.t.Name)) - thisPacked = false + packed = false } } } }) + return packed +} + +func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) { + thisPacked := g.isStructPacked(st) g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n") g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName()) @@ -302,17 +306,16 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) { }) g.emit("}\n\n") - g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n") + g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n") g.recordUsedImport("marshal") g.recordUsedImport("usermem") - g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName()) + g.emit("func (%s *%s) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { fallback := func() { g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName()) g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r) g.emit("%s.MarshalBytes(buf)\n", g.r) 
- g.emit("_, err := task.CopyOutBytes(addr, buf)\n") - g.emit("return err\n") + g.emit("return task.CopyOutBytes(addr, buf[:limit])\n") } if thisPacked { g.recordUsedImport("reflect") @@ -324,48 +327,39 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) { g.emit("}\n\n") } // Fast serialization. - g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r) - g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r) - g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n") - g.emit("ptr := unsafe.Pointer(%s)\n", g.r) - g.emit("val := uintptr(ptr)\n") - g.emit("val = val^0\n\n") - - g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r) - g.emit("var buf []byte\n") - g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n") - g.emit("hdr.Data = val\n") - g.emit("hdr.Len = %s.SizeBytes()\n", g.r) - g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r) - - g.emit("_, err := task.CopyOutBytes(addr, buf)\n") - g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r) - g.emit("// must live until after the CopyOutBytes.\n") - g.emit("runtime.KeepAlive(%s)\n", g.r) - g.emit("return err\n") + g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) + + g.emit("length, err := task.CopyOutBytes(addr, buf[:limit])\n") + g.emitKeepAlive(g.r) + g.emit("return length, err\n") } else { fallback() } }) g.emit("}\n\n") + g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n") + g.recordUsedImport("marshal") + g.recordUsedImport("usermem") + g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName()) + g.inIndent(func() { + g.emit("return %s.CopyOutN(task, addr, %s.SizeBytes())\n", g.r, g.r) + }) + g.emit("}\n\n") + g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n") g.recordUsedImport("marshal") g.recordUsedImport("usermem") - g.emit("func (%s *%s) CopyIn(task 
marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName()) + g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { fallback := func() { g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName()) g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r) - g.emit("_, err := task.CopyInBytes(addr, buf)\n") - g.emit("if err != nil {\n") - g.inIndent(func() { - g.emit("return err\n") - }) - g.emit("}\n") - + g.emit("length, err := task.CopyInBytes(addr, buf)\n") + g.emit("// Unmarshal unconditionally. If we had a short copy-in, this results in a\n") + g.emit("// partially unmarshalled struct.\n") g.emit("%s.UnmarshalBytes(buf)\n", g.r) - g.emit("return nil\n") + g.emit("return length, err\n") } if thisPacked { g.recordUsedImport("reflect") @@ -377,25 +371,11 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) { g.emit("}\n\n") } // Fast deserialization. - g.emit("// Bypass escape analysis on %s. 
The no-op arithmetic operation on the\n", g.r) - g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r) - g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n") - g.emit("ptr := unsafe.Pointer(%s)\n", g.r) - g.emit("val := uintptr(ptr)\n") - g.emit("val = val^0\n\n") - - g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r) - g.emit("var buf []byte\n") - g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n") - g.emit("hdr.Data = val\n") - g.emit("hdr.Len = %s.SizeBytes()\n", g.r) - g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r) - - g.emit("_, err := task.CopyInBytes(addr, buf)\n") - g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r) - g.emit("// must live until after the CopyInBytes.\n") - g.emit("runtime.KeepAlive(%s)\n", g.r) - g.emit("return err\n") + g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) + + g.emit("length, err := task.CopyInBytes(addr, buf)\n") + g.emitKeepAlive(g.r) + g.emit("return length, err\n") } else { fallback() } @@ -410,8 +390,8 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) { g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName()) g.emit("buf := make([]byte, %s.SizeBytes())\n", g.r) g.emit("%s.MarshalBytes(buf)\n", g.r) - g.emit("n, err := w.Write(buf)\n") - g.emit("return int64(n), err\n") + g.emit("length, err := w.Write(buf)\n") + g.emit("return int64(length), err\n") } if thisPacked { g.recordUsedImport("reflect") @@ -423,25 +403,199 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) { g.emit("}\n\n") } // Fast serialization. - g.emit("// Bypass escape analysis on %s. 
The no-op arithmetic operation on the\n", g.r) - g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r) - g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n") - g.emit("ptr := unsafe.Pointer(%s)\n", g.r) - g.emit("val := uintptr(ptr)\n") - g.emit("val = val^0\n\n") - - g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r) - g.emit("var buf []byte\n") - g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n") - g.emit("hdr.Data = val\n") - g.emit("hdr.Len = %s.SizeBytes()\n", g.r) - g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r) - - g.emit("len, err := w.Write(buf)\n") - g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r) - g.emit("// must live until after the Write.\n") - g.emit("runtime.KeepAlive(%s)\n", g.r) - g.emit("return int64(len), err\n") + g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) + + g.emit("length, err := w.Write(buf)\n") + g.emitKeepAlive(g.r) + g.emit("return int64(length), err\n") + } else { + fallback() + } + }) + g.emit("}\n\n") +} + +func (g *interfaceGenerator) emitMarshallableSliceForStruct(st *ast.StructType, slice *sliceAPI) { + thisPacked := g.isStructPacked(st) + + if slice.inner { + abortAt(g.f.Position(slice.comment.Slash), fmt.Sprintf("The ':inner' argument to '+marshal slice:%s:inner' is only applicable to newtypes on primitives. 
Remove it from this struct declaration.", slice.ident)) + } + + g.recordUsedImport("marshal") + g.recordUsedImport("usermem") + + g.emit("// Copy%sIn copies in a slice of %s objects from the task's memory.\n", slice.ident, g.typeName()) + g.emit("func Copy%sIn(task marshal.Task, addr usermem.Addr, dst []%s) (int, error) {\n", slice.ident, g.typeName()) + g.inIndent(func() { + g.emit("count := len(dst)\n") + g.emit("if count == 0 {\n") + g.inIndent(func() { + g.emit("return 0, nil\n") + }) + g.emit("}\n") + g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName()) + + fallback := func() { + g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName()) + g.emit("buf := task.CopyScratchBuffer(size * count)\n") + g.emit("length, err := task.CopyInBytes(addr, buf)\n\n") + + g.emit("// Unmarshal as much as possible, even on error. First handle full objects.\n") + g.emit("limit := length/size\n") + g.emit("for idx := 0; idx < limit; idx++ {\n") + g.inIndent(func() { + g.emit("dst[idx].UnmarshalBytes(buf[size*idx:size*(idx+1)])\n") + }) + g.emit("}\n\n") + + g.emit("// Handle any final partial object.\n") + g.emit("if length < size*count && length%size != 0 {\n") + g.inIndent(func() { + g.emit("idx := limit\n") + g.emit("dst[idx].UnmarshalBytes(buf[size*idx:size*(idx+1)])\n") + }) + g.emit("}\n\n") + + g.emit("return length, err\n") + } + if thisPacked { + g.recordUsedImport("reflect") + g.recordUsedImport("runtime") + g.recordUsedImport("unsafe") + if _, ok := g.areFieldsPackedExpression(); ok { + g.emit("if !dst[0].Packed() {\n") + g.inIndent(fallback) + g.emit("}\n\n") + } + // Fast deserialization. 
+ g.emitCastSliceToByteSlice("&dst", "buf", "size * count") + + g.emit("length, err := task.CopyInBytes(addr, buf)\n") + g.emitKeepAlive("dst") + g.emit("return length, err\n") + } else { + fallback() + } + }) + g.emit("}\n\n") + + g.emit("// Copy%sOut copies a slice of %s objects to the task's memory.\n", slice.ident, g.typeName()) + g.emit("func Copy%sOut(task marshal.Task, addr usermem.Addr, src []%s) (int, error) {\n", slice.ident, g.typeName()) + g.inIndent(func() { + g.emit("count := len(src)\n") + g.emit("if count == 0 {\n") + g.inIndent(func() { + g.emit("return 0, nil\n") + }) + g.emit("}\n") + g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName()) + + fallback := func() { + g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName()) + g.emit("buf := task.CopyScratchBuffer(size * count)\n") + g.emit("for idx := 0; idx < count; idx++ {\n") + g.inIndent(func() { + g.emit("src[idx].MarshalBytes(buf[size*idx:size*(idx+1)])\n") + }) + g.emit("}\n") + g.emit("return task.CopyOutBytes(addr, buf)\n") + } + if thisPacked { + g.recordUsedImport("reflect") + g.recordUsedImport("runtime") + g.recordUsedImport("unsafe") + if _, ok := g.areFieldsPackedExpression(); ok { + g.emit("if !src[0].Packed() {\n") + g.inIndent(fallback) + g.emit("}\n\n") + } + // Fast serialization. 
+ g.emitCastSliceToByteSlice("&src", "buf", "size * count") + + g.emit("length, err := task.CopyOutBytes(addr, buf)\n") + g.emitKeepAlive("src") + g.emit("return length, err\n") + } else { + fallback() + } + }) + g.emit("}\n\n") + + g.emit("// MarshalUnsafe%s is like %s.MarshalUnsafe, but for a []%s.\n", slice.ident, g.typeName(), g.typeName()) + g.emit("func MarshalUnsafe%s(src []%s, dst []byte) (int, error) {\n", slice.ident, g.typeName()) + g.inIndent(func() { + g.emit("count := len(src)\n") + g.emit("if count == 0 {\n") + g.inIndent(func() { + g.emit("return 0, nil\n") + }) + g.emit("}\n") + g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName()) + + fallback := func() { + g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName()) + g.emit("for idx := 0; idx < count; idx++ {\n") + g.inIndent(func() { + g.emit("src[idx].MarshalBytes(dst[size*idx:(size)*(idx+1)])\n") + }) + g.emit("}\n") + g.emit("return size * count, nil\n") + } + if thisPacked { + g.recordUsedImport("reflect") + g.recordUsedImport("runtime") + g.recordUsedImport("unsafe") + if _, ok := g.areFieldsPackedExpression(); ok { + g.emit("if !src[0].Packed() {\n") + g.inIndent(fallback) + g.emit("}\n\n") + } + g.emitNoEscapeSliceDataPointer("&src", "val") + + g.emit("length, err := safecopy.CopyIn(dst[:(size*count)], val)\n") + g.emitKeepAlive("src") + g.emit("return length, err\n") + } else { + fallback() + } + }) + g.emit("}\n\n") + + g.emit("// UnmarshalUnsafe%s is like %s.UnmarshalUnsafe, but for a []%s.\n", slice.ident, g.typeName(), g.typeName()) + g.emit("func UnmarshalUnsafe%s(dst []%s, src []byte) (int, error) {\n", slice.ident, g.typeName()) + g.inIndent(func() { + g.emit("count := len(dst)\n") + g.emit("if count == 0 {\n") + g.inIndent(func() { + g.emit("return 0, nil\n") + }) + g.emit("}\n") + g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName()) + + fallback := func() { + g.emit("// Type %s doesn't have a packed layout in memory, fall 
back to UnmarshalBytes.\n", g.typeName()) + g.emit("for idx := 0; idx < count; idx++ {\n") + g.inIndent(func() { + g.emit("dst[idx].UnmarshalBytes(src[size*idx:size*(idx+1)])\n") + }) + g.emit("}\n") + g.emit("return size * count, nil\n") + } + if thisPacked { + g.recordUsedImport("reflect") + g.recordUsedImport("runtime") + g.recordUsedImport("unsafe") + if _, ok := g.areFieldsPackedExpression(); ok { + g.emit("if !dst[0].Packed() {\n") + g.inIndent(fallback) + g.emit("}\n\n") + } + g.emitNoEscapeSliceDataPointer("&dst", "val") + + g.emit("length, err := safecopy.CopyOut(val, src[:(size*count)])\n") + g.emitKeepAlive("dst") + g.emit("return length, err\n") } else { fallback() } diff --git a/tools/go_marshal/gomarshal/generator_tests.go b/tools/go_marshal/gomarshal/generator_tests.go index fd992e44a..631295373 100644 --- a/tools/go_marshal/gomarshal/generator_tests.go +++ b/tools/go_marshal/gomarshal/generator_tests.go @@ -30,6 +30,11 @@ var standardImports = []string{ "gvisor.dev/gvisor/tools/go_marshal/analysis", } +var sliceAPIImports = []string{ + "encoding/binary", + "gvisor.dev/gvisor/pkg/usermem", +} + type testGenerator struct { sourceBuffer @@ -58,6 +63,11 @@ func newTestGenerator(t *ast.TypeSpec) *testGenerator { for _, i := range standardImports { g.imports.add(i).markUsed() } + // These imports are used if a type requests the slice API. Don't + // mark them as used by default. 
+ for _, i := range sliceAPIImports { + g.imports.add(i) + } return g } @@ -132,6 +142,42 @@ func (g *testGenerator) emitTestMarshalUnmarshalPreservesData() { }) } +func (g *testGenerator) emitTestMarshalUnmarshalSlicePreservesData(slice *sliceAPI) { + for _, name := range []string{"binary", "usermem"} { + if !g.imports.markUsed(name) { + panic(fmt.Sprintf("Generated test for '%s' referenced a non-existent import with local name '%s'", g.typeName(), name)) + } + } + + g.inTestFunction("TestSafeMarshalUnmarshalSlicePreservesData", func() { + g.emit("var x, y, yUnsafe [8]%s\n", g.typeName()) + g.emit("analysis.RandomizeValue(&x)\n\n") + g.emit("size := (*%s)(nil).SizeBytes() * len(x)\n", g.typeName()) + g.emit("buf := bytes.NewBuffer(make([]byte, size))\n") + g.emit("buf.Reset()\n") + g.emit("if err := binary.Write(buf, usermem.ByteOrder, x[:]); err != nil {\n") + g.inIndent(func() { + g.emit("t.Fatal(fmt.Sprintf(\"binary.Write failed: %v\", err))\n") + }) + g.emit("}\n") + g.emit("bufUnsafe := make([]byte, size)\n") + g.emit("MarshalUnsafe%s(x[:], bufUnsafe)\n\n", slice.ident) + + g.emit("UnmarshalUnsafe%s(y[:], buf.Bytes())\n", slice.ident) + g.emit("if !reflect.DeepEqual(x, y) {\n") + g.inIndent(func() { + g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across binary.Write/UnmarshalUnsafeSlice cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, y))\n") + }) + g.emit("}\n") + g.emit("UnmarshalUnsafe%s(yUnsafe[:], bufUnsafe)\n", slice.ident) + g.emit("if !reflect.DeepEqual(x, yUnsafe) {\n") + g.inIndent(func() { + g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalUnsafeSlice/UnmarshalUnsafeSlice cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, yUnsafe))\n") + }) + g.emit("}\n\n") + }) +} + func (g *testGenerator) emitTestWriteToUnmarshalPreservesData() { g.inTestFunction("TestWriteToUnmarshalPreservesData", func() { g.emit("var x, y, yUnsafe %s\n", g.typeName()) @@ -170,12 +216,16 @@ func (g *testGenerator) emitTestSizeBytesOnTypedNilPtr() { }) } -func (g *testGenerator) 
emitTests() { +func (g *testGenerator) emitTests(slice *sliceAPI) { g.emitTestNonZeroSize() g.emitTestSuspectAlignment() g.emitTestMarshalUnmarshalPreservesData() g.emitTestWriteToUnmarshalPreservesData() g.emitTestSizeBytesOnTypedNilPtr() + + if slice != nil { + g.emitTestMarshalUnmarshalSlicePreservesData(slice) + } } func (g *testGenerator) write(out io.Writer) error { diff --git a/tools/go_marshal/gomarshal/util.go b/tools/go_marshal/gomarshal/util.go index a0936e013..4cb22dd2d 100644 --- a/tools/go_marshal/gomarshal/util.go +++ b/tools/go_marshal/gomarshal/util.go @@ -344,22 +344,25 @@ func newImportTable() *importTable { // result in a panic. func (i *importTable) merge(other *importTable) { for name, im := range other.is { - if dup, ok := i.is[name]; ok && !dup.equivalent(im) { - panic(fmt.Sprintf("Found colliding import statements: ours: %+v, other's: %+v", dup, im)) + dup, ok := i.is[name] + if ok { + // When merging two imports, if either are marked used, the merged entry + // should also be marked used. + im.used = im.used || dup.used + + if !dup.equivalent(im) { + panic(fmt.Sprintf("Found colliding import statements: ours: %+v, other's: %+v", dup, im)) + } } - i.is[name] = im } } func (i *importTable) addStmt(s *importStmt) *importStmt { if old, ok := i.is[s.name]; ok && !old.equivalent(s) { - // A collision should always be between an import inserted by the - // go-marshal tool and an import from the original source file (assuming - // the original source file was valid). We could theoretically handle - // the collision by assigning a local name to our import. However, this - // would need to be plumbed throughout the generator. Given that - // collisions should be rare, simply panic on collision. + // We could theoretically handle the collision by assigning a local name + // to one of the imports. However, this is a non-trivial transformation. + // Given that collisions should be rare, simply panic on collision. 
panic(fmt.Sprintf("Import collision: old: %s as %v; new: %v as %v", old.path, old.name, s.path, s.name)) } i.is[s.name] = s diff --git a/tools/go_marshal/marshal/marshal.go b/tools/go_marshal/marshal/marshal.go index f129788e0..cb2166252 100644 --- a/tools/go_marshal/marshal/marshal.go +++ b/tools/go_marshal/marshal/marshal.go @@ -42,7 +42,11 @@ type Task interface { CopyInBytes(addr usermem.Addr, b []byte) (int, error) } -// Marshallable represents a type that can be marshalled to and from memory. +// Marshallable represents operations on a type that can be marshalled to and +// from memory. +// +// go-marshal automatically generates implementations for this interface for +// types marked as '+marshal'. type Marshallable interface { io.WriterTo @@ -54,12 +58,18 @@ type Marshallable interface { // likely make use of the type of these fields). SizeBytes() int - // MarshalBytes serializes a copy of a type to dst. dst must be at least - // SizeBytes() long. + // MarshalBytes serializes a copy of a type to dst. dst may be smaller than + // SizeBytes(), which results in a part of the struct being marshalled. Note + // that this may have unexpected results for non-packed types, as implicit + // padding needs to be taken into account when reasoning about how much of + // the type is serialized. MarshalBytes(dst []byte) - // UnmarshalBytes deserializes a type from src. src must be at least - // SizeBytes() long. + // UnmarshalBytes deserializes a type from src. src may be smaller than + // SizeBytes(), which results in a partially deserialized struct. Note that + // this may have unexpected results for non-packed types, as implicit + // padding needs to be taken into account when reasoning about how much of + // the type is deserialized. 
UnmarshalBytes(src []byte) // Packed returns true if the marshalled size of the type is the same as the @@ -67,13 +77,20 @@ type Marshallable interface { // starting at unaligned addresses (should always be true by default for ABI // structs, verified by automatically generated tests when using // go_marshal), and has no fields marked `marshal:"unaligned"`. + // + // Packed must return the same result for all possible values of the type + // implementing it. Violating this constraint implies the type doesn't have + // a static memory layout, and will lead to memory corruption. + // Go-marshal-generated code reuses the result of Packed for multiple values + // of the same type. Packed() bool // MarshalUnsafe serializes a type by bulk copying its in-memory // representation to the dst buffer. This is only safe to do when the type // has no implicit padding, see Marshallable.Packed. When Packed would // return false, MarshalUnsafe should fall back to the safer but slower - // MarshalBytes. + // MarshalBytes. dst may be smaller than SizeBytes(), see comment for + // MarshalBytes for implications. MarshalUnsafe(dst []byte) // UnmarshalUnsafe deserializes a type by directly copying to the underlying @@ -82,7 +99,8 @@ type Marshallable interface { // This allows much faster unmarshalling of types which have no implicit // padding, see Marshallable.Packed. When Packed would return false, // UnmarshalUnsafe should fall back to the safer but slower unmarshal - // mechanism implemented in UnmarshalBytes. + // mechanism implemented in UnmarshalBytes. src may be smaller than + // SizeBytes(), see comment for UnmarshalBytes for implications. UnmarshalUnsafe(src []byte) // CopyIn deserializes a Marshallable type from a task's memory. This may @@ -91,12 +109,79 @@ type Marshallable interface { // marshalled does not escape. The implementation should avoid creating // extra copies in memory by directly deserializing to the object's // underlying memory. 
- CopyIn(task Task, addr usermem.Addr) error + // + // If the copy-in from the task memory is only partially successful, CopyIn + // should still attempt to deserialize as much data as possible. See comment + // for UnmarshalBytes. + CopyIn(task Task, addr usermem.Addr) (int, error) // CopyOut serializes a Marshallable type to a task's memory. This may only // be called from a task goroutine. This is more efficient than calling // MarshalUnsafe on Marshallable.Packed types, as the type being serialized // does not escape. The implementation should avoid creating extra copies in // memory by directly serializing from the object's underlying memory. - CopyOut(task Task, addr usermem.Addr) error + // + // The copy-out to the task memory may be partially successful, in which + // case CopyOut returns how much data was serialized. See comment for + // MarshalBytes for implications. + CopyOut(task Task, addr usermem.Addr) (int, error) + + // CopyOutN is like CopyOut, but explicitly requests a partial + // copy-out. Note that this may yield unexpected results for non-packed + // types and the caller may only want to allow this for packed types. See + // comment on MarshalBytes. + // + // The limit must be less than or equal to SizeBytes(). + CopyOutN(task Task, addr usermem.Addr, limit int) (int, error) } + +// go-marshal generates additional functions for a type based on additional +// clauses to the +marshal directive. They are documented below. +// +// Slice API +// ========= +// +// Adding a "slice" clause to the +marshal directive for structs or newtypes on +// primitives like this: +// +// // +marshal slice:FooSlice +// type Foo struct { ... } +// +// Generates four additional functions for marshalling slices of Foos like this: +// +// // MarshalUnsafeFooSlice is like Foo.MarshalUnsafe, buf for a []Foo. It's +// // more efficient that repeatedly calling calling Foo.MarshalUnsafe over a +// // []Foo in a loop. 
+// func MarshalUnsafeFooSlice(src []Foo, dst []byte) (int, error) { ... } +// +// // UnmarshalUnsafeFooSlice is like Foo.UnmarshalUnsafe, but for a []Foo. It's +// // more efficient than repeatedly calling Foo.UnmarshalUnsafe over a +// // []Foo in a loop. +// func UnmarshalUnsafeFooSlice(dst []Foo, src []byte) (int, error) { ... } +// +// // CopyFooSliceIn copies in a slice of Foo objects from the task's memory. +// func CopyFooSliceIn(task marshal.Task, addr usermem.Addr, dst []Foo) (int, error) { ... } +// +// // CopyFooSliceOut copies out a slice of Foo objects to the task's memory. +// func CopyFooSliceOut(task marshal.Task, addr usermem.Addr, src []Foo) (int, error) { ... } +// +// The names of the functions are of the format "Copy%sIn" and "Copy%sOut", where +// %s is the first argument to the slice clause. This directive is not supported +// for newtypes on arrays. +// +// The slice clause also takes an optional second argument, which must be the +// value "inner": +// +// // +marshal slice:Int32Slice:inner +// type Int32 int32 +// +// This is only valid on newtypes on primitives, and causes the generated +// functions to accept slices of the inner type instead: +// +// func CopyInt32SliceIn(task marshal.Task, addr usermem.Addr, dst []int32) (int, error) { ... } +// +// Without "inner", they would instead be: +// +// func CopyInt32SliceIn(task marshal.Task, addr usermem.Addr, dst []Int32) (int, error) { ... } +// +// This may help avoid a cast depending on how the generated functions are used.
diff --git a/tools/go_marshal/primitive/BUILD b/tools/go_marshal/primitive/BUILD new file mode 100644 index 000000000..cc08ba63a --- /dev/null +++ b/tools/go_marshal/primitive/BUILD @@ -0,0 +1,18 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "primitive", + srcs = [ + "primitive.go", + ], + marshal = True, + visibility = [ + "//:sandbox", + ], + deps = [ + "//pkg/usermem", + "//tools/go_marshal/marshal", + ], +) diff --git a/tools/go_marshal/primitive/primitive.go b/tools/go_marshal/primitive/primitive.go new file mode 100644 index 000000000..ebcf130ae --- /dev/null +++ b/tools/go_marshal/primitive/primitive.go @@ -0,0 +1,175 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package primitive defines marshal.Marshallable implementations for primitive +// types. +package primitive + +import ( + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/tools/go_marshal/marshal" +) + +// Int16 is a marshal.Marshallable implementation for int16. +// +// +marshal slice:Int16Slice:inner +type Int16 int16 + +// Uint16 is a marshal.Marshallable implementation for uint16. +// +// +marshal slice:Uint16Slice:inner +type Uint16 uint16 + +// Int32 is a marshal.Marshallable implementation for int32. +// +// +marshal slice:Int32Slice:inner +type Int32 int32 + +// Uint32 is a marshal.Marshallable implementation for uint32. 
+// +// +marshal slice:Uint32Slice:inner +type Uint32 uint32 + +// Int64 is a marshal.Marshallable implementation for int64. +// +// +marshal slice:Int64Slice:inner +type Int64 int64 + +// Uint64 is a marshal.Marshallable implementation for uint64. +// +// +marshal slice:Uint64Slice:inner +type Uint64 uint64 + +// Below, we define some convenience functions for marshalling primitive types +// using the newtypes above, without requiring superfluous casts. + +// 16-bit integers + +// CopyInt16In is a convenient wrapper for copying in an int16 from the task's +// memory. +func CopyInt16In(task marshal.Task, addr usermem.Addr, dst *int16) (int, error) { + var buf Int16 + n, err := buf.CopyIn(task, addr) + if err != nil { + return n, err + } + *dst = int16(buf) + return n, nil +} + +// CopyInt16Out is a convenient wrapper for copying out an int16 to the task's +// memory. +func CopyInt16Out(task marshal.Task, addr usermem.Addr, src int16) (int, error) { + srcP := Int16(src) + return srcP.CopyOut(task, addr) +} + +// CopyUint16In is a convenient wrapper for copying in a uint16 from the task's +// memory. +func CopyUint16In(task marshal.Task, addr usermem.Addr, dst *uint16) (int, error) { + var buf Uint16 + n, err := buf.CopyIn(task, addr) + if err != nil { + return n, err + } + *dst = uint16(buf) + return n, nil +} + +// CopyUint16Out is a convenient wrapper for copying out a uint16 to the task's +// memory. +func CopyUint16Out(task marshal.Task, addr usermem.Addr, src uint16) (int, error) { + srcP := Uint16(src) + return srcP.CopyOut(task, addr) +} + +// 32-bit integers + +// CopyInt32In is a convenient wrapper for copying in an int32 from the task's +// memory. +func CopyInt32In(task marshal.Task, addr usermem.Addr, dst *int32) (int, error) { + var buf Int32 + n, err := buf.CopyIn(task, addr) + if err != nil { + return n, err + } + *dst = int32(buf) + return n, nil +} + +// CopyInt32Out is a convenient wrapper for copying out an int32 to the task's +// memory. 
+func CopyInt32Out(task marshal.Task, addr usermem.Addr, src int32) (int, error) { + srcP := Int32(src) + return srcP.CopyOut(task, addr) +} + +// CopyUint32In is a convenient wrapper for copying in a uint32 from the task's +// memory. +func CopyUint32In(task marshal.Task, addr usermem.Addr, dst *uint32) (int, error) { + var buf Uint32 + n, err := buf.CopyIn(task, addr) + if err != nil { + return n, err + } + *dst = uint32(buf) + return n, nil +} + +// CopyUint32Out is a convenient wrapper for copying out a uint32 to the task's +// memory. +func CopyUint32Out(task marshal.Task, addr usermem.Addr, src uint32) (int, error) { + srcP := Uint32(src) + return srcP.CopyOut(task, addr) +} + +// 64-bit integers + +// CopyInt64In is a convenient wrapper for copying in an int64 from the task's +// memory. +func CopyInt64In(task marshal.Task, addr usermem.Addr, dst *int64) (int, error) { + var buf Int64 + n, err := buf.CopyIn(task, addr) + if err != nil { + return n, err + } + *dst = int64(buf) + return n, nil +} + +// CopyInt64Out is a convenient wrapper for copying out an int64 to the task's +// memory. +func CopyInt64Out(task marshal.Task, addr usermem.Addr, src int64) (int, error) { + srcP := Int64(src) + return srcP.CopyOut(task, addr) +} + +// CopyUint64In is a convenient wrapper for copying in a uint64 from the task's +// memory. +func CopyUint64In(task marshal.Task, addr usermem.Addr, dst *uint64) (int, error) { + var buf Uint64 + n, err := buf.CopyIn(task, addr) + if err != nil { + return n, err + } + *dst = uint64(buf) + return n, nil +} + +// CopyUint64Out is a convenient wrapper for copying out a uint64 to the task's +// memory. 
+func CopyUint64Out(task marshal.Task, addr usermem.Addr, src uint64) (int, error) { + srcP := Uint64(src) + return srcP.CopyOut(task, addr) +} diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD index f27c5ce52..3b839799d 100644 --- a/tools/go_marshal/test/BUILD +++ b/tools/go_marshal/test/BUILD @@ -39,3 +39,17 @@ go_binary( "//tools/go_marshal/marshal", ], ) + +go_test( + name = "marshal_test", + size = "small", + srcs = ["marshal_test.go"], + deps = [ + ":test", + "//pkg/syserror", + "//pkg/usermem", + "//tools/go_marshal/analysis", + "//tools/go_marshal/marshal", + "@com_github_google_go-cmp//cmp:go_default_library", + ], +) diff --git a/tools/go_marshal/test/benchmark_test.go b/tools/go_marshal/test/benchmark_test.go index c79defe9e..224d308c7 100644 --- a/tools/go_marshal/test/benchmark_test.go +++ b/tools/go_marshal/test/benchmark_test.go @@ -176,3 +176,45 @@ func BenchmarkGoMarshalUnsafe(b *testing.B) { panic(fmt.Sprintf("Data corruption across marshal/unmarshal cycle:\nBefore: %+v\nAfter: %+v\n", s1, s2)) } } + +func BenchmarkBinarySlice(b *testing.B) { + var s1, s2 [64]test.Stat + analysis.RandomizeValue(&s1) + + size := binary.Size(s1) + + b.ResetTimer() + + for n := 0; n < b.N; n++ { + buf := make([]byte, 0, size) + buf = binary.Marshal(buf, usermem.ByteOrder, &s1) + binary.Unmarshal(buf, usermem.ByteOrder, &s2) + } + + b.StopTimer() + + // Sanity check, make sure the values were preserved. + if !reflect.DeepEqual(s1, s2) { + panic(fmt.Sprintf("Data corruption across marshal/unmarshal cycle:\nBefore: %+v\nAfter: %+v\n", s1, s2)) + } +} + +func BenchmarkGoMarshalUnsafeSlice(b *testing.B) { + var s1, s2 [64]test.Stat + analysis.RandomizeValue(&s1) + + b.ResetTimer() + + for n := 0; n < b.N; n++ { + buf := make([]byte, (*test.Stat)(nil).SizeBytes()*len(s1)) + test.MarshalUnsafeStatSlice(s1[:], buf) + test.UnmarshalUnsafeStatSlice(s2[:], buf) + } + + b.StopTimer() + + // Sanity check, make sure the values were preserved. 
+ if !reflect.DeepEqual(s1, s2) { + panic(fmt.Sprintf("Data corruption across marshal/unmarshal cycle:\nBefore: %+v\nAfter: %+v\n", s1, s2)) + } +} diff --git a/tools/go_marshal/test/external/external.go b/tools/go_marshal/test/external/external.go index 4be3722f3..26fe8e0c8 100644 --- a/tools/go_marshal/test/external/external.go +++ b/tools/go_marshal/test/external/external.go @@ -21,3 +21,11 @@ package external type External struct { j int64 } + +// NotPacked is an unaligned Marshallable type for use in testing. +// +// +marshal +type NotPacked struct { + a int32 + b byte `marshal:"unaligned"` +} diff --git a/tools/go_marshal/test/marshal_test.go b/tools/go_marshal/test/marshal_test.go new file mode 100644 index 000000000..16829ee45 --- /dev/null +++ b/tools/go_marshal/test/marshal_test.go @@ -0,0 +1,515 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package marshal_test contains manual tests for the marshal interface. These +// are intended to test behaviour not covered by the automatically generated +// tests. 
+package marshal_test + +import ( + "bytes" + "encoding/binary" + "fmt" + "reflect" + "runtime" + "testing" + "unsafe" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/tools/go_marshal/analysis" + "gvisor.dev/gvisor/tools/go_marshal/marshal" + "gvisor.dev/gvisor/tools/go_marshal/test" +) + +var simulatedErr error = syserror.EFAULT + +// mockTask implements marshal.Task. +type mockTask struct { + taskMem usermem.BytesIO +} + +// populate fills the task memory with the contents of val. +func (t *mockTask) populate(val interface{}) { + var buf bytes.Buffer + // Use binary.Write so we aren't testing go-marshal against its own + // potentially buggy implementation. + if err := binary.Write(&buf, usermem.ByteOrder, val); err != nil { + panic(err) + } + t.taskMem.Bytes = buf.Bytes() +} + +func (t *mockTask) setLimit(n int) { + if len(t.taskMem.Bytes) < n { + grown := make([]byte, n) + copy(grown, t.taskMem.Bytes) + t.taskMem.Bytes = grown + return + } + t.taskMem.Bytes = t.taskMem.Bytes[:n] +} + +// CopyScratchBuffer implements marshal.Task.CopyScratchBuffer. +func (t *mockTask) CopyScratchBuffer(size int) []byte { + return make([]byte, size) +} + +// CopyOutBytes implements marshal.Task.CopyOutBytes. The implementation +// completely ignores the target address and stores a copy of b in its +// internal buffer, overriding any previous contents. +func (t *mockTask) CopyOutBytes(_ usermem.Addr, b []byte) (int, error) { + return t.taskMem.CopyOut(nil, 0, b, usermem.IOOpts{}) +} + +// CopyInBytes implements marshal.Task.CopyInBytes. The implementation +// completely ignores the source address and always fills b from the beginning +// of its internal buffer. +func (t *mockTask) CopyInBytes(_ usermem.Addr, b []byte) (int, error) { + return t.taskMem.CopyIn(nil, 0, b, usermem.IOOpts{}) +} + +// unsafeMemory returns the underlying memory for m. The returned slice is only +// valid for the lifetime of m.
The garbage collector isn't aware that the +// returned slice is related to m, the caller must ensure m lives long enough. +func unsafeMemory(m marshal.Marshallable) []byte { + if !m.Packed() { + // We can't return a slice pointing to the underlying memory + // since the layout isn't packed. Allocate a temporary buffer + // and marshal instead. + var buf bytes.Buffer + if err := binary.Write(&buf, usermem.ByteOrder, m); err != nil { + panic(err) + } + return buf.Bytes() + } + + // reflect.ValueOf(m) + // .Elem() // Unwrap interface to inner concrete object + // .Addr() // Pointer value to object + // .Pointer() // Actual address from the pointer value + ptr := reflect.ValueOf(m).Elem().Addr().Pointer() + + size := m.SizeBytes() + + var mem []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&mem)) + hdr.Data = ptr + hdr.Len = size + hdr.Cap = size + + return mem +} + +// unsafeMemorySlice returns the underlying memory for m. The returned slice is +// only valid for the lifetime for m. The garbage collector isn't aware that the +// returned slice is related to m, the caller must ensure m lives long enough. +// +// Precondition: m must be a slice. +func unsafeMemorySlice(m interface{}, elt marshal.Marshallable) []byte { + kind := reflect.TypeOf(m).Kind() + if kind != reflect.Slice { + panic("unsafeMemorySlice called on non-slice") + } + + if !elt.Packed() { + // We can't return a slice pointing to the underlying memory + // since the layout isn't packed. Allocate a temporary buffer + // and marshal instead. + var buf bytes.Buffer + if err := binary.Write(&buf, usermem.ByteOrder, m); err != nil { + panic(err) + } + return buf.Bytes() + } + + v := reflect.ValueOf(m) + length := v.Len() * elt.SizeBytes() + + var mem []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&mem)) + hdr.Data = v.Pointer() // This is a pointer to the first elem for slices. 
+ hdr.Len = length + hdr.Cap = length + + return mem +} + +func isZeroes(buf []byte) bool { + for _, b := range buf { + if b != 0 { + return false + } + } + return true +} + +// compareMemory compares the first n bytes of two chunks of memory represented +// by expected and actual. +func compareMemory(t *testing.T, expected, actual []byte, n int) { + t.Logf("Expected (%d): %v (%d) + (%d) %v\n", len(expected), expected[:n], n, len(expected)-n, expected[n:]) + t.Logf("Actual (%d): %v (%d) + (%d) %v\n", len(actual), actual[:n], n, len(actual)-n, actual[n:]) + + if diff := cmp.Diff(expected[:n], actual[:n]); diff != "" { + t.Errorf("Memory buffers don't match:\n--- expected only\n+++ actual only\n%v", diff) + } +} + +// limitedCopyIn populates task memory with src, then unmarshals task memory to +// dst. The task signals an error at limit bytes during copy-in, which should +// result in a truncated unmarshalling. +func limitedCopyIn(t *testing.T, src, dst marshal.Marshallable, limit int) { + var task mockTask + task.populate(src) + task.setLimit(limit) + + n, err := dst.CopyIn(&task, usermem.Addr(0)) + if n != limit { + t.Errorf("CopyIn copied unexpected number of bytes, expected %d, got %d", limit, n) + } + if err != simulatedErr { + t.Errorf("CopyIn returned unexpected error, expected %v, got %v", simulatedErr, err) + } + + expectedMem := unsafeMemory(src) + defer runtime.KeepAlive(src) + actualMem := unsafeMemory(dst) + defer runtime.KeepAlive(dst) + + compareMemory(t, expectedMem, actualMem, n) + + // The bytes past the first n should be zero for actual, since actual was + // zero-initialized, and CopyIn shouldn't have touched those bytes. However + // we can only guarantee we didn't touch anything past the first n bytes if + // the layout is packed. + if dst.Packed() && !isZeroes(actualMem[n:]) { + t.Errorf("Expected the last %d bytes of copied in object to be zeroes, got %v\n", dst.SizeBytes()-n, actualMem) + } +} + +// limitedCopyOut marshals src to task memory.
 The task signals an error at +// limit bytes during copy-out, which should result in a truncated marshalling. +func limitedCopyOut(t *testing.T, src marshal.Marshallable, limit int) { + var task mockTask + task.setLimit(limit) + + n, err := src.CopyOut(&task, usermem.Addr(0)) + if n != limit { + t.Errorf("CopyOut copied unexpected number of bytes, expected %d, got %d", limit, n) + } + if err != simulatedErr { + t.Errorf("CopyOut returned unexpected error, expected %v, got %v", simulatedErr, err) + } + + expectedMem := unsafeMemory(src) + defer runtime.KeepAlive(src) + actualMem := task.taskMem.Bytes + + compareMemory(t, expectedMem, actualMem, n) +} + +// copyOutN marshals src to task memory, requesting the marshalling to be +// limited to limit bytes. +func copyOutN(t *testing.T, src marshal.Marshallable, limit int) { + var task mockTask + task.setLimit(limit) + + n, err := src.CopyOutN(&task, usermem.Addr(0), limit) + if err != nil { + t.Errorf("CopyOut returned unexpected error: %v", err) + } + if n != limit { + t.Errorf("CopyOut copied unexpected number of bytes, expected %d, got %d", limit, n) + } + + expectedMem := unsafeMemory(src) + defer runtime.KeepAlive(src) + actualMem := task.taskMem.Bytes + + t.Logf("Expected: %v + %v\n", expectedMem[:n], expectedMem[n:]) + t.Logf("Actual : %v + %v\n", actualMem[:n], actualMem[n:]) + + compareMemory(t, expectedMem, actualMem, n) +} + +// TestLimitedMarshalling verifies marshalling/unmarshalling succeeds when the +// underlying copy in/out operations partially succeed. +func TestLimitedMarshalling(t *testing.T) { + types := []reflect.Type{ + // Packed types. + reflect.TypeOf((*test.Type2)(nil)), + reflect.TypeOf((*test.Type3)(nil)), + reflect.TypeOf((*test.Timespec)(nil)), + reflect.TypeOf((*test.Stat)(nil)), + reflect.TypeOf((*test.InetAddr)(nil)), + reflect.TypeOf((*test.SignalSet)(nil)), + reflect.TypeOf((*test.SignalSetAlias)(nil)), + // Non-packed types.
+ reflect.TypeOf((*test.Type1)(nil)), + reflect.TypeOf((*test.Type4)(nil)), + reflect.TypeOf((*test.Type5)(nil)), + reflect.TypeOf((*test.Type6)(nil)), + reflect.TypeOf((*test.Type7)(nil)), + reflect.TypeOf((*test.Type8)(nil)), + } + + for _, tyPtr := range types { + // Remove one level of pointer-indirection from the type. We get this + // back when we pass the type to reflect.New. + ty := tyPtr.Elem() + + // Partial copy-in. + t.Run(fmt.Sprintf("PartialCopyIn_%v", ty), func(t *testing.T) { + expected := reflect.New(ty).Interface().(marshal.Marshallable) + actual := reflect.New(ty).Interface().(marshal.Marshallable) + analysis.RandomizeValue(expected) + + limitedCopyIn(t, expected, actual, expected.SizeBytes()/2) + }) + + // Partial copy-out. + t.Run(fmt.Sprintf("PartialCopyOut_%v", ty), func(t *testing.T) { + expected := reflect.New(ty).Interface().(marshal.Marshallable) + analysis.RandomizeValue(expected) + + limitedCopyOut(t, expected, expected.SizeBytes()/2) + }) + + // Explicitly request partial copy-out. + t.Run(fmt.Sprintf("PartialCopyOutN_%v", ty), func(t *testing.T) { + expected := reflect.New(ty).Interface().(marshal.Marshallable) + analysis.RandomizeValue(expected) + + copyOutN(t, expected, expected.SizeBytes()/2) + }) + } +} + +// TestLimitedSliceMarshalling verifies marshalling/unmarshalling of slices of +// marshallable types succeed when the underlying copy in/out operations +// partially succeed. +func TestLimitedSliceMarshalling(t *testing.T) { + types := []struct { + arrayPtrType reflect.Type + copySliceIn func(task marshal.Task, addr usermem.Addr, dstSlice interface{}) (int, error) + copySliceOut func(task marshal.Task, addr usermem.Addr, srcSlice interface{}) (int, error) + unsafeMemory func(arrPtr interface{}) []byte + }{ + // Packed types.
+ { + reflect.TypeOf((*[20]test.Stat)(nil)), + func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) { + slice := dst.(*[20]test.Stat)[:] + return test.CopyStatSliceIn(task, addr, slice) + }, + func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) { + slice := src.(*[20]test.Stat)[:] + return test.CopyStatSliceOut(task, addr, slice) + }, + func(a interface{}) []byte { + slice := a.(*[20]test.Stat)[:] + return unsafeMemorySlice(slice, &slice[0]) + }, + }, + { + reflect.TypeOf((*[1]test.Stat)(nil)), + func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) { + slice := dst.(*[1]test.Stat)[:] + return test.CopyStatSliceIn(task, addr, slice) + }, + func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) { + slice := src.(*[1]test.Stat)[:] + return test.CopyStatSliceOut(task, addr, slice) + }, + func(a interface{}) []byte { + slice := a.(*[1]test.Stat)[:] + return unsafeMemorySlice(slice, &slice[0]) + }, + }, + { + reflect.TypeOf((*[5]test.SignalSetAlias)(nil)), + func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) { + slice := dst.(*[5]test.SignalSetAlias)[:] + return test.CopySignalSetAliasSliceIn(task, addr, slice) + }, + func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) { + slice := src.(*[5]test.SignalSetAlias)[:] + return test.CopySignalSetAliasSliceOut(task, addr, slice) + }, + func(a interface{}) []byte { + slice := a.(*[5]test.SignalSetAlias)[:] + return unsafeMemorySlice(slice, &slice[0]) + }, + }, + // Non-packed types. 
+ { + reflect.TypeOf((*[20]test.Type1)(nil)), + func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) { + slice := dst.(*[20]test.Type1)[:] + return test.CopyType1SliceIn(task, addr, slice) + }, + func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) { + slice := src.(*[20]test.Type1)[:] + return test.CopyType1SliceOut(task, addr, slice) + }, + func(a interface{}) []byte { + slice := a.(*[20]test.Type1)[:] + return unsafeMemorySlice(slice, &slice[0]) + }, + }, + { + reflect.TypeOf((*[1]test.Type1)(nil)), + func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) { + slice := dst.(*[1]test.Type1)[:] + return test.CopyType1SliceIn(task, addr, slice) + }, + func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) { + slice := src.(*[1]test.Type1)[:] + return test.CopyType1SliceOut(task, addr, slice) + }, + func(a interface{}) []byte { + slice := a.(*[1]test.Type1)[:] + return unsafeMemorySlice(slice, &slice[0]) + }, + }, + { + reflect.TypeOf((*[7]test.Type8)(nil)), + func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) { + slice := dst.(*[7]test.Type8)[:] + return test.CopyType8SliceIn(task, addr, slice) + }, + func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) { + slice := src.(*[7]test.Type8)[:] + return test.CopyType8SliceOut(task, addr, slice) + }, + func(a interface{}) []byte { + slice := a.(*[7]test.Type8)[:] + return unsafeMemorySlice(slice, &slice[0]) + }, + }, + } + + for _, tt := range types { + // The body of this loop is generic over the type tt.arrayPtrType, with + // the help of reflection. To aid in readability, comments below show + // the equivalent go code assuming + // tt.arrayPtrType = typeof(*[20]test.Stat). + + // Equivalent: + // var x *[20]test.Stat + // arrayTy := reflect.TypeOf(*x) + arrayTy := tt.arrayPtrType.Elem() + + // Partial copy-in of slices. 
+ t.Run(fmt.Sprintf("PartialCopySliceIn_%v", arrayTy), func(t *testing.T) { + // Equivalent: + // var x [20]test.Stat + // length := len(x) + length := arrayTy.Len() + if length < 1 { + panic("Test type can't be zero-length array") + } + // Equivalent: + // elem := new(test.Stat).(marshal.Marshallable) + elem := reflect.New(arrayTy.Elem()).Interface().(marshal.Marshallable) + + // Equivalent: + // var expected, actual interface{} + // expected = new([20]test.Stat) + // actual = new([20]test.Stat) + expected := reflect.New(arrayTy).Interface() + actual := reflect.New(arrayTy).Interface() + + analysis.RandomizeValue(expected) + + limit := (length * elem.SizeBytes()) / 2 + // Also make sure the limit is partially inside one of the elements. + limit += elem.SizeBytes() / 2 + analysis.RandomizeValue(expected) + + var task mockTask + task.populate(expected) + task.setLimit(limit) + + n, err := tt.copySliceIn(&task, usermem.Addr(0), actual) + if n != limit { + t.Errorf("CopyIn copied unexpected number of bytes, expected %d, got %d", limit, n) + } + if n < length*elem.SizeBytes() && err != simulatedErr { + t.Errorf("CopyIn returned unexpected error, expected %v, got %v", simulatedErr, err) + } + + expectedMem := tt.unsafeMemory(expected) + defer runtime.KeepAlive(expected) + actualMem := tt.unsafeMemory(actual) + defer runtime.KeepAlive(actual) + + compareMemory(t, expectedMem, actualMem, n) + + // The last n bytes should be zero for actual, since actual was + // zero-initialized, and CopyIn shouldn't have touched those bytes. However + // we can only guarantee we didn't touch anything in the last n bytes if the + // layout is packed. + if elem.Packed() && !isZeroes(actualMem[n:]) { + t.Errorf("Expected the last %d bytes of copied in object to be zeroes, got %v\n", (elem.SizeBytes()*length)-n, actualMem) + } + }) + + // Partial copy-out of slices. 
+ t.Run(fmt.Sprintf("PartialCopySliceOut_%v", arrayTy), func(t *testing.T) { + // Equivalent: + // var x [20]test.Stat + // length := len(x) + length := arrayTy.Len() + if length < 1 { + panic("Test type can't be zero-length array") + } + // Equivalent: + // elem := new(test.Stat).(marshal.Marshallable) + elem := reflect.New(arrayTy.Elem()).Interface().(marshal.Marshallable) + + // Equivalent: + // var expected, actual interface{} + // expected = new([20]test.Stat) + // actual = new([20]test.Stat) + expected := reflect.New(arrayTy).Interface() + + analysis.RandomizeValue(expected) + + limit := (length * elem.SizeBytes()) / 2 + // Also make sure the limit is partially inside one of the elements. + limit += elem.SizeBytes() / 2 + analysis.RandomizeValue(expected) + + var task mockTask + task.populate(expected) + task.setLimit(limit) + + n, err := tt.copySliceOut(&task, usermem.Addr(0), expected) + if n != limit { + t.Errorf("CopyIn copied unexpected number of bytes, expected %d, got %d", limit, n) + } + if n < length*elem.SizeBytes() && err != simulatedErr { + t.Errorf("CopyIn returned unexpected error, expected %v, got %v", simulatedErr, err) + } + + expectedMem := tt.unsafeMemory(expected) + defer runtime.KeepAlive(expected) + actualMem := task.taskMem.Bytes + + compareMemory(t, expectedMem, actualMem, n) + }) + } +} diff --git a/tools/go_marshal/test/test.go b/tools/go_marshal/test/test.go index c829db6da..43df73545 100644 --- a/tools/go_marshal/test/test.go +++ b/tools/go_marshal/test/test.go @@ -23,7 +23,7 @@ import ( // Type1 is a test data type. // -// +marshal +// +marshal slice:Type1Slice type Type1 struct { a Type2 x, y int64 // Multiple field names. @@ -75,6 +75,34 @@ type Type5 struct { m int64 } +// Type6 is a test data type ends mid-word. +// +// +marshal +type Type6 struct { + a int64 + b int64 + // If c isn't marked unaligned, analysis fails (as it should, since + // the unsafe API corrupts Type7). 
+ c byte `marshal:"unaligned"` +} + +// Type7 is a test data type that contains a child struct that ends +// mid-word. +// +marshal +type Type7 struct { + x Type6 + y int64 +} + +// Type8 is a test data type which contains an external non-packed field. +// +// +marshal slice:Type8Slice +type Type8 struct { + a int64 + np ex.NotPacked + b int64 +} + // Timespec represents struct timespec in . // // +marshal @@ -85,7 +113,7 @@ type Timespec struct { // Stat represents struct stat. // -// +marshal +// +marshal slice:StatSlice type Stat struct { Dev uint64 Ino uint64 @@ -111,10 +139,10 @@ type InetAddr [4]byte // SignalSet is an example marshallable newtype on a primitive. // -// +marshal +// +marshal slice:SignalSetSlice:inner type SignalSet uint64 // SignalSetAlias is an example newtype on another marshallable type. // -// +marshal +// +marshal slice:SignalSetAliasSlice type SignalSetAlias SignalSet -- cgit v1.2.3 From a94309628ebbc2e6c4997890f1b966fa7a16be20 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 3 Apr 2020 13:39:45 -0700 Subject: Ensure EOF is handled propertly during splice. PiperOrigin-RevId: 304684417 --- pkg/sentry/kernel/pipe/pipe.go | 13 ++++++++++--- test/syscalls/linux/sendfile.cc | 28 ++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 725e9db7d..62c8691f1 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -255,7 +255,8 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) { // POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be // atomic, but requires no atomicity for writes larger than this. 
wanted := ops.left() - if avail := p.max - p.view.Size(); wanted > avail { + avail := p.max - p.view.Size() + if wanted > avail { if wanted <= p.atomicIOBytes { return 0, syserror.ErrWouldBlock } @@ -268,8 +269,14 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) { return done, err } - if wanted > done { - // Partial write due to full pipe. + if done < avail { + // Non-failure, but short write. + return done, nil + } + if done < wanted { + // Partial write due to full pipe. Note that this could also be + // the short write case above, we would expect a second call + // and the write to return zero bytes in this case. return done, syserror.ErrWouldBlock } diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc index 580ab5193..ebaafe47e 100644 --- a/test/syscalls/linux/sendfile.cc +++ b/test/syscalls/linux/sendfile.cc @@ -530,6 +530,34 @@ TEST(SendFileTest, SendToSpecialFile) { SyscallSucceedsWithValue(kSize & (~7))); } +TEST(SendFileTest, SendFileToPipe) { + // Create temp file. + constexpr char kData[] = ""; + constexpr int kDataSize = sizeof(kData) - 1; + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode)); + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY)); + + // Create a pipe for sending to a pipe. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor rfd(fds[0]); + const FileDescriptor wfd(fds[1]); + + // Expect to read up to the given size. + std::vector buf(kDataSize); + ScopedThread t([&]() { + absl::SleepFor(absl::Milliseconds(100)); + ASSERT_THAT(read(rfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kDataSize)); + }); + + // Send with twice the size of the file, which should hit EOF. 
+ EXPECT_THAT(sendfile(wfd.get(), inf.get(), nullptr, kDataSize * 2), + SyscallSucceedsWithValue(kDataSize)); +} + } // namespace } // namespace testing -- cgit v1.2.3 From 5818663ebe26857845685702d99db41c7aa2cf3d Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 3 Apr 2020 14:07:42 -0700 Subject: Add FileDescriptionImpl for Unix sockets. This change involves several steps: - Refactor the VFS1 unix socket implementation to share methods between VFS1 and VFS2 where possible. Re-implement the rest. - Override the default PRead, Read, PWrite, Write, Ioctl, Release methods in FileDescriptionDefaultImpl. - Add functions to create and initialize a new Dentry/Inode and FileDescription for a Unix socket file. Updates #1476 PiperOrigin-RevId: 304689796 --- pkg/sentry/fsimpl/sockfs/BUILD | 1 + pkg/sentry/fsimpl/sockfs/sockfs.go | 29 +++ pkg/sentry/kernel/BUILD | 1 + pkg/sentry/socket/netstack/netstack.go | 8 +- pkg/sentry/socket/unix/BUILD | 4 + pkg/sentry/socket/unix/unix.go | 89 ++++++--- pkg/sentry/socket/unix/unix_vfs2.go | 348 +++++++++++++++++++++++++++++++++ pkg/sentry/vfs/options.go | 5 + 8 files changed, 456 insertions(+), 29 deletions(-) create mode 100644 pkg/sentry/socket/unix/unix_vfs2.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD index 790d50e65..52084ddb5 100644 --- a/pkg/sentry/fsimpl/sockfs/BUILD +++ b/pkg/sentry/fsimpl/sockfs/BUILD @@ -7,6 +7,7 @@ go_library( srcs = ["sockfs.go"], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", "//pkg/context", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index c13511de2..3f7ad1d65 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -16,6 +16,7 @@ package sockfs import ( + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" 
"gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -60,6 +61,10 @@ type filesystem struct { } // inode implements kernfs.Inode. +// +// TODO(gvisor.dev/issue/1476): Add device numbers to this inode (which are +// not included in InodeAttrs) to store the numbers of the appropriate +// socket device. Override InodeAttrs.Stat() accordingly. type inode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink @@ -71,3 +76,27 @@ type inode struct { func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, syserror.ENXIO } + +// InitSocket initializes a socket FileDescription, with a corresponding +// Dentry in mnt. +// +// fd should be the FileDescription associated with socketImpl, i.e. its first +// field. mnt should be the global socket mount, Kernel.socketMount. +func InitSocket(socketImpl vfs.FileDescriptionImpl, fd *vfs.FileDescription, mnt *vfs.Mount, creds *auth.Credentials) error { + fsimpl := mnt.Filesystem().Impl() + fs := fsimpl.(*kernfs.Filesystem) + + // File mode matches net/socket.c:sock_alloc. 
+ filemode := linux.FileMode(linux.S_IFSOCK | 0600) + i := &inode{} + i.Init(creds, fs.NextIno(), filemode) + + d := &kernfs.Dentry{} + d.Init(i) + + opts := &vfs.FileDescriptionOptions{UseDentryMetadata: true} + if err := fd.Init(socketImpl, linux.O_RDWR, mnt, d.VFSDentry(), opts); err != nil { + return err + } + return nil +} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index bb7e3cbc3..e0ff58d8c 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -169,6 +169,7 @@ go_library( "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 06a5b53bc..5d0085462 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -940,7 +940,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us // GetSockOpt can be used to implement the linux syscall getsockopt(2) for // sockets backed by a commonEndpoint. -func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) { +func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) { switch level { case linux.SOL_SOCKET: return getSockOptSocket(t, s, ep, family, skType, name, outLen) @@ -966,7 +966,7 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, } // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. 
-func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) { +func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) { // TODO(b/124056281): Stop rejecting short optLen values in getsockopt. switch name { case linux.SO_ERROR: @@ -1541,7 +1541,7 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa // SetSockOpt can be used to implement the linux syscall setsockopt(2) for // sockets backed by a commonEndpoint. -func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error { +func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error { switch level { case linux.SOL_SOCKET: return setSockOptSocket(t, s, ep, name, optVal) @@ -1568,7 +1568,7 @@ func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, n } // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET. 
-func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { +func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error { switch name { case linux.SO_SNDBUF: if len(optVal) < sizeOfInt32 { diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 08743deba..de2cc4bdf 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -8,23 +8,27 @@ go_library( "device.go", "io.go", "unix.go", + "unix_vfs2.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/fspath", "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/socket/netstack", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 4d30aa714..7c64f30fa 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -33,6 +34,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -52,11 +54,8 @@ type SocketOperations struct { fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` - refs.AtomicRefCount - 
socket.SendReceiveTimeout - ep transport.Endpoint - stype linux.SockType + socketOpsCommon } // New creates a new unix socket. @@ -75,16 +74,29 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty } s := SocketOperations{ - ep: ep, - stype: stype, + socketOpsCommon: socketOpsCommon{ + ep: ep, + stype: stype, + }, } s.EnableLeakCheck("unix.SocketOperations") return fs.NewFile(ctx, d, flags, &s) } +// socketOpsCommon contains the socket operations common to VFS1 and VFS2. +// +// +stateify savable +type socketOpsCommon struct { + refs.AtomicRefCount + socket.SendReceiveTimeout + + ep transport.Endpoint + stype linux.SockType +} + // DecRef implements RefCounter.DecRef. -func (s *SocketOperations) DecRef() { +func (s *socketOpsCommon) DecRef() { s.DecRefWithDestructor(func() { s.ep.Close() }) @@ -97,7 +109,7 @@ func (s *SocketOperations) Release() { s.DecRef() } -func (s *SocketOperations) isPacket() bool { +func (s *socketOpsCommon) isPacket() bool { switch s.stype { case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET: return true @@ -110,7 +122,7 @@ func (s *SocketOperations) isPacket() bool { } // Endpoint extracts the transport.Endpoint. -func (s *SocketOperations) Endpoint() transport.Endpoint { +func (s *socketOpsCommon) Endpoint() transport.Endpoint { return s.ep } @@ -143,7 +155,7 @@ func extractPath(sockaddr []byte) (string, *syserr.Error) { // GetPeerName implements the linux syscall getpeername(2) for sockets backed by // a transport.Endpoint. 
-func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.ep.GetRemoteAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) @@ -155,7 +167,7 @@ func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, // GetSockName implements the linux syscall getsockname(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.ep.GetLocalAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) @@ -178,7 +190,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us // Listen implements the linux syscall listen(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { +func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { return s.ep.Listen(backlog) } @@ -310,6 +322,8 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } // Create the socket. + // + // TODO(gvisor.dev/issue/2324): Correctly set file permissions. 
childDir, err := d.Bind(t, t.FSContext().RootDirectory(), name, bep, fs.FilePermissions{User: fs.PermMask{Read: true}}) if err != nil { return syserr.ErrPortInUse @@ -345,6 +359,31 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, return ep, nil } + if kernel.VFS2Enabled { + p := fspath.Parse(path) + root := t.FSContext().RootDirectoryVFS2() + start := root + relPath := !p.Absolute + if relPath { + start = t.FSContext().WorkingDirectoryVFS2() + } + pop := vfs.PathOperation{ + Root: root, + Start: start, + Path: p, + FollowFinalSymlink: true, + } + ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop) + root.DecRef() + if relPath { + start.DecRef() + } + if e != nil { + return nil, syserr.FromError(e) + } + return ep, nil + } + // Find the node in the filesystem. root := t.FSContext().RootDirectory() cwd := t.FSContext().WorkingDirectory() @@ -363,12 +402,11 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, // No socket! return nil, syserr.ErrConnectionRefused } - return ep, nil } // Connect implements the linux syscall connect(2) for unix sockets. -func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { +func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { ep, err := extractEndpoint(t, sockaddr) if err != nil { return err @@ -379,7 +417,7 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo return s.ep.Connect(t, ep) } -// Writev implements fs.FileOperations.Write. +// Write implements fs.FileOperations.Write. 
func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { t := kernel.TaskFromContext(ctx) ctrl := control.New(t, s.ep, nil) @@ -399,7 +437,7 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO // SendMsg implements the linux syscall sendmsg(2) for unix sockets backed by // a transport.Endpoint. -func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { w := EndpointWriter{ Ctx: t, Endpoint: s.ep, @@ -453,27 +491,27 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] } // Passcred implements transport.Credentialer.Passcred. -func (s *SocketOperations) Passcred() bool { +func (s *socketOpsCommon) Passcred() bool { return s.ep.Passcred() } // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred. -func (s *SocketOperations) ConnectedPasscred() bool { +func (s *socketOpsCommon) ConnectedPasscred() bool { return s.ep.ConnectedPasscred() } // Readiness implements waiter.Waitable.Readiness. -func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { +func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { return s.ep.Readiness(mask) } // EventRegister implements waiter.Waitable.EventRegister. -func (s *SocketOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { s.ep.EventRegister(e, mask) } // EventUnregister implements waiter.Waitable.EventUnregister. 
-func (s *SocketOperations) EventUnregister(e *waiter.Entry) { +func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { s.ep.EventUnregister(e) } @@ -485,7 +523,7 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa // Shutdown implements the linux syscall shutdown(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { +func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { f, err := netstack.ConvertShutdown(how) if err != nil { return err @@ -511,7 +549,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { +func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 dontWait := flags&linux.MSG_DONTWAIT != 0 @@ -648,12 +686,12 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } // State implements socket.Socket.State. -func (s *SocketOperations) State() uint32 { +func (s *socketOpsCommon) State() uint32 { return s.ep.State() } // Type implements socket.Socket.Type. 
-func (s *SocketOperations) Type() (family int, skType linux.SockType, protocol int) { +func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { // Unix domain sockets always have a protocol of 0. return linux.AF_UNIX, s.stype, 0 } @@ -706,4 +744,5 @@ func (*provider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.F func init() { socket.RegisterProvider(linux.AF_UNIX, &provider{}) + socket.RegisterProviderVFS2(linux.AF_UNIX, &providerVFS2{}) } diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go new file mode 100644 index 000000000..ca1388e2c --- /dev/null +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -0,0 +1,348 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package unix + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket/control" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SocketVFS2 implements socket.SocketVFS2 (and by extension, +// vfs.FileDescriptionImpl) for Unix sockets. +type SocketVFS2 struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + socketOpsCommon +} + +// NewVFS2File creates and returns a new vfs.FileDescription for a unix socket. +func NewVFS2File(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) { + sock := NewFDImpl(ep, stype) + vfsfd := &sock.vfsfd + if err := sockfs.InitSocket(sock, vfsfd, t.Kernel().SocketMount(), t.Credentials()); err != nil { + return nil, syserr.FromError(err) + } + return vfsfd, nil +} + +// NewFDImpl creates and returns a new SocketVFS2. +func NewFDImpl(ep transport.Endpoint, stype linux.SockType) *SocketVFS2 { + // You can create AF_UNIX, SOCK_RAW sockets. They're the same as + // SOCK_DGRAM and don't require CAP_NET_RAW. + if stype == linux.SOCK_RAW { + stype = linux.SOCK_DGRAM + } + + return &SocketVFS2{ + socketOpsCommon: socketOpsCommon{ + ep: ep, + stype: stype, + }, + } +} + +// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by +// a transport.Endpoint. 
+func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { + return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen) +} + +// blockingAccept implements a blocking version of accept(2), that is, if no +// connections are ready to be accepted, it will block until one becomes ready. +func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) { + // Register for notifications. + e, ch := waiter.NewChannelEntry(nil) + s.socketOpsCommon.EventRegister(&e, waiter.EventIn) + defer s.socketOpsCommon.EventUnregister(&e) + + // Try to accept the connection; if it fails, then wait until we get a + // notification. + for { + if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock { + return ep, err + } + + if err := t.Block(ch); err != nil { + return nil, syserr.FromError(err) + } + } +} + +// Accept implements the linux syscall accept(2) for sockets backed by +// a transport.Endpoint. +func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { + // Issue the accept request to get the new endpoint. + ep, err := s.ep.Accept() + if err != nil { + if err != syserr.ErrWouldBlock || !blocking { + return 0, nil, 0, err + } + + var err *syserr.Error + ep, err = s.blockingAccept(t) + if err != nil { + return 0, nil, 0, err + } + } + + // We expect this to be a FileDescription here. + ns, err := NewVFS2File(t, ep, s.stype) + if err != nil { + return 0, nil, 0, err + } + defer ns.DecRef() + + if flags&linux.SOCK_NONBLOCK != 0 { + ns.SetStatusFlags(t, t.Credentials(), linux.SOCK_NONBLOCK) + } + + var addr linux.SockAddr + var addrLen uint32 + if peerRequested { + // Get address of the peer. 
+ var err *syserr.Error + addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t) + if err != nil { + return 0, nil, 0, err + } + } + + fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{ + CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, + }) + if e != nil { + return 0, nil, 0, syserr.FromError(e) + } + + // TODO: add vfs2 sockets to global table. + return fd, addr, addrLen, nil +} + +// Bind implements the linux syscall bind(2) for unix sockets. +func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { + p, e := extractPath(sockaddr) + if e != nil { + return e + } + + bep, ok := s.ep.(transport.BoundEndpoint) + if !ok { + // This socket can't be bound. + return syserr.ErrInvalidArgument + } + + return s.ep.Bind(tcpip.FullAddress{Addr: tcpip.Address(p)}, func() *syserr.Error { + // Is it abstract? + if p[0] == 0 { + if t.IsNetworkNamespaced() { + return syserr.ErrInvalidEndpointState + } + if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil { + // syserr.ErrPortInUse corresponds to EADDRINUSE. + return syserr.ErrPortInUse + } + } else { + path := fspath.Parse(p) + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + start := root + relPath := !path.Absolute + if relPath { + start = t.FSContext().WorkingDirectoryVFS2() + defer start.DecRef() + } + pop := vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + } + err := t.Kernel().VFS().MknodAt(t, t.Credentials(), &pop, &vfs.MknodOptions{ + // TODO(gvisor.dev/issue/2324): The file permissions should be taken + // from s and t.FSContext().Umask() (see net/unix/af_unix.c:unix_bind), + // but VFS1 just always uses 0400. Resolve this inconsistency. + Mode: linux.S_IFSOCK | 0400, + Endpoint: bep, + }) + if err == syserror.EEXIST { + return syserr.ErrAddressInUse + } + return syserr.FromError(err) + } + + return nil + }) +} + +// Ioctl implements vfs.FileDescriptionImpl. 
+func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return netstack.Ioctl(ctx, s.ep, uio, args) +} + +// PRead implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Read implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/1476): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + if dst.NumBytes() == 0 { + return 0, nil + } + return dst.CopyOutFrom(ctx, &EndpointReader{ + Ctx: ctx, + Endpoint: s.ep, + NumRights: 0, + Peek: false, + From: nil, + }) +} + +// PWrite implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/1476): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + t := kernel.TaskFromContext(ctx) + ctrl := control.New(t, s.ep, nil) + + if src.NumBytes() == 0 { + nInt, err := s.ep.SendMsg(ctx, [][]byte{}, ctrl, nil) + return int64(nInt), err.ToError() + } + + return src.CopyInTo(ctx, &EndpointWriter{ + Ctx: ctx, + Endpoint: s.ep, + Control: ctrl, + To: nil, + }) +} + +// Release implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Release() { + // Release only decrements a reference on s because s may be referenced in + // the abstract socket namespace. + s.DecRef() +} + +// Readiness implements waiter.Waitable.Readiness. 
+func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.socketOpsCommon.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.socketOpsCommon.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *SocketVFS2) EventUnregister(e *waiter.Entry) { + s.socketOpsCommon.EventUnregister(e) +} + +// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by +// a transport.Endpoint. +func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { + return netstack.SetSockOpt(t, s, s.ep, level, name, optVal) +} + +// providerVFS2 is a unix domain socket provider for VFS2. +type providerVFS2 struct{} + +func (*providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Check arguments. + if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { + return nil, syserr.ErrProtocolNotSupported + } + + // Create the endpoint and socket. + var ep transport.Endpoint + switch stype { + case linux.SOCK_DGRAM, linux.SOCK_RAW: + ep = transport.NewConnectionless(t) + case linux.SOCK_SEQPACKET, linux.SOCK_STREAM: + ep = transport.NewConnectioned(t, stype, t.Kernel()) + default: + return nil, syserr.ErrInvalidArgument + } + + f, err := NewVFS2File(t, ep, stype) + if err != nil { + ep.Close() + return nil, err + } + return f, nil +} + +// Pair creates a new pair of AF_UNIX connected sockets. +func (*providerVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + // Check arguments. 
+ if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { + return nil, nil, syserr.ErrProtocolNotSupported + } + + switch stype { + case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET, linux.SOCK_RAW: + // Ok + default: + return nil, nil, syserr.ErrInvalidArgument + } + + // Create the endpoints and sockets. + ep1, ep2 := transport.NewPair(t, stype, t.Kernel()) + s1, err := NewVFS2File(t, ep1, stype) + if err != nil { + ep1.Close() + ep2.Close() + return nil, nil, err + } + s2, err := NewVFS2File(t, ep2, stype) + if err != nil { + s1.DecRef() + ep2.Close() + return nil, nil, err + } + + return s1, s2, nil +} diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index 3e90dc4ed..2f04bf882 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -16,6 +16,7 @@ package vfs import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" ) // GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and @@ -44,6 +45,10 @@ type MknodOptions struct { // DevMinor are the major and minor device numbers for the created device. DevMajor uint32 DevMinor uint32 + + // Endpoint is the endpoint to bind to the created file, if a socket file is + // being created for bind(2) on a Unix domain socket. + Endpoint transport.BoundEndpoint } // MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC. -- cgit v1.2.3 From fc99a7ebf0c24b6f7b3cfd6351436373ed54548b Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Fri, 3 Apr 2020 18:34:48 -0700 Subject: Refactor software GSO code. Software GSO implementation currently has a complicated code path with implicit assumptions that all packets to WritePackets carry same Data and it does this to avoid allocations on the path etc. But this makes it hard to reuse the WritePackets API. This change breaks all such assumptions by introducing a new Vectorised View API ReadToVV which can be used to cleanly split a VV into multiple independent VVs. 
Further this change also makes packet buffers linkable to form an intrusive list. This allows us to get rid of the array of packet buffers that are passed in the WritePackets API call and replace it with a list of packet buffers. While this code does introduce some more allocations in the benchmarks it doesn't cause any degradation. Updates #231 PiperOrigin-RevId: 304731742 --- pkg/ilist/list.go | 13 ++- pkg/sentry/kernel/kernel.go | 22 +++-- pkg/tcpip/buffer/view.go | 53 +++++++++- pkg/tcpip/buffer/view_test.go | 137 ++++++++++++++++++++++++++ pkg/tcpip/link/channel/channel.go | 18 ++-- pkg/tcpip/link/fdbased/endpoint.go | 162 ++++++++++++++----------------- pkg/tcpip/link/loopback/loopback.go | 2 +- pkg/tcpip/link/muxed/injectable.go | 2 +- pkg/tcpip/link/sharedmem/sharedmem.go | 2 +- pkg/tcpip/link/sniffer/sniffer.go | 14 +-- pkg/tcpip/link/waitable/waitable.go | 4 +- pkg/tcpip/link/waitable/waitable_test.go | 6 +- pkg/tcpip/network/arp/arp.go | 2 +- pkg/tcpip/network/ip_test.go | 2 +- pkg/tcpip/network/ipv4/ipv4.go | 37 +++++-- pkg/tcpip/network/ipv6/icmp.go | 2 +- pkg/tcpip/network/ipv6/ipv6.go | 12 +-- pkg/tcpip/stack/BUILD | 14 ++- pkg/tcpip/stack/forwarder_test.go | 8 +- pkg/tcpip/stack/iptables.go | 17 ++++ pkg/tcpip/stack/ndp_test.go | 2 +- pkg/tcpip/stack/packet_buffer.go | 14 +-- pkg/tcpip/stack/packet_buffer_state.go | 27 ------ pkg/tcpip/stack/registration.go | 4 +- pkg/tcpip/stack/route.go | 19 ++-- pkg/tcpip/stack/stack_test.go | 2 +- pkg/tcpip/transport/tcp/connect.go | 47 +++++---- pkg/tcpip/transport/tcp/segment.go | 6 +- 28 files changed, 420 insertions(+), 230 deletions(-) delete mode 100644 pkg/tcpip/stack/packet_buffer_state.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/ilist/list.go b/pkg/ilist/list.go index 8f93e4d6d..0d07da3b1 100644 --- a/pkg/ilist/list.go +++ b/pkg/ilist/list.go @@ -86,12 +86,21 @@ func (l *List) Back() Element { return l.tail } +// Len returns the number of elements in the list. 
+// +// NOTE: This is an O(n) operation. +func (l *List) Len() (count int) { + for e := l.Front(); e != nil; e = e.Next() { + count++ + } + return count +} + // PushFront inserts the element e at the front of list l. func (l *List) PushFront(e Element) { linker := ElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) - if l.head != nil { ElementMapper{}.linkerFor(l.head).SetPrev(e) } else { @@ -106,7 +115,6 @@ func (l *List) PushBack(e Element) { linker := ElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) - if l.tail != nil { ElementMapper{}.linkerFor(l.tail).SetNext(e) } else { @@ -127,7 +135,6 @@ func (l *List) PushBackList(m *List) { l.tail = m.tail } - m.head = nil m.tail = nil } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 0a448b57c..2e6f42b92 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -564,15 +564,25 @@ func (ts *TaskSet) unregisterEpollWaiters() { ts.mu.RLock() defer ts.mu.RUnlock() + + // Tasks that belong to the same process could potentially point to the + // same FDTable. So we retain a map of processed ones to avoid + // processing the same FDTable multiple times. + processed := make(map[*FDTable]struct{}) for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. 
- if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - if e, ok := file.FileOperations.(*epoll.EventPoll); ok { - e.UnregisterEpollWaiters() - } - }) + if t.fdTable == nil { + continue + } + if _, ok := processed[t.fdTable]; ok { + continue } + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if e, ok := file.FileOperations.(*epoll.EventPoll); ok { + e.UnregisterEpollWaiters() + } + }) + processed[t.fdTable] = struct{}{} } } diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go index 8d42cd066..8ec5d5d5c 100644 --- a/pkg/tcpip/buffer/view.go +++ b/pkg/tcpip/buffer/view.go @@ -17,6 +17,7 @@ package buffer import ( "bytes" + "io" ) // View is a slice of a buffer, with convenience methods. @@ -89,6 +90,47 @@ func (vv *VectorisedView) TrimFront(count int) { } } +// Read implements io.Reader. +func (vv *VectorisedView) Read(v View) (copied int, err error) { + count := len(v) + for count > 0 && len(vv.views) > 0 { + if count < len(vv.views[0]) { + vv.size -= count + copy(v[copied:], vv.views[0][:count]) + vv.views[0].TrimFront(count) + copied += count + return copied, nil + } + count -= len(vv.views[0]) + copy(v[copied:], vv.views[0]) + copied += len(vv.views[0]) + vv.RemoveFirst() + } + if copied == 0 { + return 0, io.EOF + } + return copied, nil +} + +// ReadToVV reads up to n bytes from vv to dstVV and removes them from vv. It +// returns the number of bytes copied. +func (vv *VectorisedView) ReadToVV(dstVV *VectorisedView, count int) (copied int) { + for count > 0 && len(vv.views) > 0 { + if count < len(vv.views[0]) { + vv.size -= count + dstVV.AppendView(vv.views[0][:count]) + vv.views[0].TrimFront(count) + copied += count + return + } + count -= len(vv.views[0]) + dstVV.AppendView(vv.views[0]) + copied += len(vv.views[0]) + vv.RemoveFirst() + } + return copied +} + // CapLength irreversibly reduces the length of the vectorised view. 
func (vv *VectorisedView) CapLength(length int) { if length < 0 { @@ -116,12 +158,12 @@ func (vv *VectorisedView) CapLength(length int) { // Clone returns a clone of this VectorisedView. // If the buffer argument is large enough to contain all the Views of this VectorisedView, // the method will avoid allocations and use the buffer to store the Views of the clone. -func (vv VectorisedView) Clone(buffer []View) VectorisedView { +func (vv *VectorisedView) Clone(buffer []View) VectorisedView { return VectorisedView{views: append(buffer[:0], vv.views...), size: vv.size} } // First returns the first view of the vectorised view. -func (vv VectorisedView) First() View { +func (vv *VectorisedView) First() View { if len(vv.views) == 0 { return nil } @@ -134,11 +176,12 @@ func (vv *VectorisedView) RemoveFirst() { return } vv.size -= len(vv.views[0]) + vv.views[0] = nil vv.views = vv.views[1:] } // Size returns the size in bytes of the entire content stored in the vectorised view. -func (vv VectorisedView) Size() int { +func (vv *VectorisedView) Size() int { return vv.size } @@ -146,7 +189,7 @@ func (vv VectorisedView) Size() int { // // If the vectorised view contains a single view, that view will be returned // directly. -func (vv VectorisedView) ToView() View { +func (vv *VectorisedView) ToView() View { if len(vv.views) == 1 { return vv.views[0] } @@ -158,7 +201,7 @@ func (vv VectorisedView) ToView() View { } // Views returns the slice containing the all views. 
-func (vv VectorisedView) Views() []View { +func (vv *VectorisedView) Views() []View { return vv.views } diff --git a/pkg/tcpip/buffer/view_test.go b/pkg/tcpip/buffer/view_test.go index ebc3a17b7..106e1994c 100644 --- a/pkg/tcpip/buffer/view_test.go +++ b/pkg/tcpip/buffer/view_test.go @@ -233,3 +233,140 @@ func TestToClone(t *testing.T) { }) } } + +func TestVVReadToVV(t *testing.T) { + testCases := []struct { + comment string + vv VectorisedView + bytesToRead int + wantBytes string + leftVV VectorisedView + }{ + { + comment: "large VV, short read", + vv: vv(30, "012345678901234567890123456789"), + bytesToRead: 10, + wantBytes: "0123456789", + leftVV: vv(20, "01234567890123456789"), + }, + { + comment: "largeVV, multiple views, short read", + vv: vv(13, "123", "345", "567", "8910"), + bytesToRead: 6, + wantBytes: "123345", + leftVV: vv(7, "567", "8910"), + }, + { + comment: "smallVV (multiple views), large read", + vv: vv(3, "1", "2", "3"), + bytesToRead: 10, + wantBytes: "123", + leftVV: vv(0, ""), + }, + { + comment: "smallVV (single view), large read", + vv: vv(1, "1"), + bytesToRead: 10, + wantBytes: "1", + leftVV: vv(0, ""), + }, + { + comment: "emptyVV, large read", + vv: vv(0, ""), + bytesToRead: 10, + wantBytes: "", + leftVV: vv(0, ""), + }, + } + + for _, tc := range testCases { + t.Run(tc.comment, func(t *testing.T) { + var readTo VectorisedView + inSize := tc.vv.Size() + copied := tc.vv.ReadToVV(&readTo, tc.bytesToRead) + if got, want := copied, len(tc.wantBytes); got != want { + t.Errorf("incorrect number of bytes copied returned in ReadToVV got: %d, want: %d, tc: %+v", got, want, tc) + } + if got, want := string(readTo.ToView()), tc.wantBytes; got != want { + t.Errorf("unexpected content in readTo got: %s, want: %s", got, want) + } + if got, want := tc.vv.Size(), inSize-copied; got != want { + t.Errorf("test VV has incorrect size after reading got: %d, want: %d, tc.vv: %+v", got, want, tc.vv) + } + if got, want := string(tc.vv.ToView()), 
string(tc.leftVV.ToView()); got != want { + t.Errorf("unexpected data left in vv after read got: %+v, want: %+v", got, want) + } + }) + } +} + +func TestVVRead(t *testing.T) { + testCases := []struct { + comment string + vv VectorisedView + bytesToRead int + readBytes string + leftBytes string + wantError bool + }{ + { + comment: "large VV, short read", + vv: vv(30, "012345678901234567890123456789"), + bytesToRead: 10, + readBytes: "0123456789", + leftBytes: "01234567890123456789", + }, + { + comment: "largeVV, multiple buffers, short read", + vv: vv(13, "123", "345", "567", "8910"), + bytesToRead: 6, + readBytes: "123345", + leftBytes: "5678910", + }, + { + comment: "smallVV, large read", + vv: vv(3, "1", "2", "3"), + bytesToRead: 10, + readBytes: "123", + leftBytes: "", + }, + { + comment: "smallVV, large read", + vv: vv(1, "1"), + bytesToRead: 10, + readBytes: "1", + leftBytes: "", + }, + { + comment: "emptyVV, large read", + vv: vv(0, ""), + bytesToRead: 10, + readBytes: "", + wantError: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.comment, func(t *testing.T) { + readTo := NewView(tc.bytesToRead) + inSize := tc.vv.Size() + copied, err := tc.vv.Read(readTo) + if !tc.wantError && err != nil { + t.Fatalf("unexpected error in tc.vv.Read(..) 
= %s", err) + } + readTo = readTo[:copied] + if got, want := copied, len(tc.readBytes); got != want { + t.Errorf("incorrect number of bytes copied returned in ReadToVV got: %d, want: %d, tc.vv: %+v", got, want, tc.vv) + } + if got, want := string(readTo), tc.readBytes; got != want { + t.Errorf("unexpected data in readTo got: %s, want: %s", got, want) + } + if got, want := tc.vv.Size(), inSize-copied; got != want { + t.Errorf("test VV has incorrect size after reading got: %d, want: %d, tc.vv: %+v", got, want, tc.vv) + } + if got, want := string(tc.vv.ToView()), tc.leftBytes; got != want { + t.Errorf("vv has incorrect data after Read got: %s, want: %s", got, want) + } + }) + } +} diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go index a8d6653ce..b4a0ae53d 100644 --- a/pkg/tcpip/link/channel/channel.go +++ b/pkg/tcpip/link/channel/channel.go @@ -28,7 +28,7 @@ import ( // PacketInfo holds all the information about an outbound packet. type PacketInfo struct { - Pkt stack.PacketBuffer + Pkt *stack.PacketBuffer Proto tcpip.NetworkProtocolNumber GSO *stack.GSO Route stack.Route @@ -257,7 +257,7 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne route := r.Clone() route.Release() p := PacketInfo{ - Pkt: pkt, + Pkt: &pkt, Proto: protocol, GSO: gso, Route: route, @@ -269,21 +269,15 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne } // WritePackets stores outbound packets into the channel. -func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { +func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { // Clone r then release its resource so we only get the relevant fields from // stack.Route without holding a reference to a NIC's endpoint. 
route := r.Clone() route.Release() - payloadView := pkts[0].Data.ToView() n := 0 - for _, pkt := range pkts { - off := pkt.DataOffset - size := pkt.DataSize + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { p := PacketInfo{ - Pkt: stack.PacketBuffer{ - Header: pkt.Header, - Data: buffer.NewViewFromBytes(payloadView[off : off+size]).ToVectorisedView(), - }, + Pkt: pkt, Proto: protocol, GSO: gso, Route: route, @@ -301,7 +295,7 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.Pac // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { p := PacketInfo{ - Pkt: stack.PacketBuffer{Data: vv}, + Pkt: &stack.PacketBuffer{Data: vv}, Proto: 0, GSO: nil, } diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 3b3b6909b..7198742b7 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -441,118 +441,106 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne // WritePackets writes outbound packets to the file descriptor. If it is not // currently writable, the packet is dropped. -func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { - var ethHdrBuf []byte - // hdr + data - iovLen := 2 - if e.hdrSize > 0 { - // Add ethernet header if needed. - ethHdrBuf = make([]byte, header.EthernetMinimumSize) - eth := header.Ethernet(ethHdrBuf) - ethHdr := &header.EthernetFields{ - DstAddr: r.RemoteLinkAddress, - Type: protocol, - } - - // Preserve the src address if it's set in the route. - if r.LocalLinkAddress != "" { - ethHdr.SrcAddr = r.LocalLinkAddress - } else { - ethHdr.SrcAddr = e.addr - } - eth.Encode(ethHdr) - iovLen++ - } +// +// NOTE: This API uses sendmmsg to batch packets. 
As a result the underlying FD +// picked to write the packet out has to be the same for all packets in the +// list. In other words all packets in the batch should belong to the same +// flow. +func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + n := pkts.Len() - n := len(pkts) - - views := pkts[0].Data.Views() - /* - * Each boundary in views can add one more iovec. - * - * payload | | | | - * ----------------------------- - * packets | | | | | | | - * ----------------------------- - * iovecs | | | | | | | | | - */ - iovec := make([]syscall.Iovec, n*iovLen+len(views)-1) mmsgHdrs := make([]rawfile.MMsgHdr, n) + i := 0 + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + var ethHdrBuf []byte + iovLen := 0 + if e.hdrSize > 0 { + // Add ethernet header if needed. + ethHdrBuf = make([]byte, header.EthernetMinimumSize) + eth := header.Ethernet(ethHdrBuf) + ethHdr := &header.EthernetFields{ + DstAddr: r.RemoteLinkAddress, + Type: protocol, + } - iovecIdx := 0 - viewIdx := 0 - viewOff := 0 - off := 0 - nextOff := 0 - for i := range pkts { - // TODO(b/134618279): Different packets may have different data - // in the future. We should handle this. - if !viewsEqual(pkts[i].Data.Views(), views) { - panic("All packets in pkts should have the same Data.") + // Preserve the src address if it's set in the route. + if r.LocalLinkAddress != "" { + ethHdr.SrcAddr = r.LocalLinkAddress + } else { + ethHdr.SrcAddr = e.addr + } + eth.Encode(ethHdr) + iovLen++ } - prevIovecIdx := iovecIdx - mmsgHdr := &mmsgHdrs[i] - mmsgHdr.Msg.Iov = &iovec[iovecIdx] - packetSize := pkts[i].DataSize - hdr := &pkts[i].Header - - off = pkts[i].DataOffset - if off != nextOff { - // We stop in a different point last time. 
- size := packetSize - viewIdx = 0 - viewOff = 0 - for size > 0 { - if size >= len(views[viewIdx]) { - viewIdx++ - viewOff = 0 - size -= len(views[viewIdx]) - } else { - viewOff = size - size = 0 + var vnetHdrBuf []byte + vnetHdr := virtioNetHdr{} + if e.Capabilities()&stack.CapabilityHardwareGSO != 0 { + if gso != nil { + vnetHdr.hdrLen = uint16(pkt.Header.UsedLength()) + if gso.NeedsCsum { + vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM + vnetHdr.csumStart = header.EthernetMinimumSize + gso.L3HdrLen + vnetHdr.csumOffset = gso.CsumOffset + } + if gso.Type != stack.GSONone && uint16(pkt.Data.Size()) > gso.MSS { + switch gso.Type { + case stack.GSOTCPv4: + vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 + case stack.GSOTCPv6: + vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 + default: + panic(fmt.Sprintf("Unknown gso type: %v", gso.Type)) + } + vnetHdr.gsoSize = gso.MSS } } + vnetHdrBuf = vnetHdrToByteSlice(&vnetHdr) + iovLen++ } - nextOff = off + packetSize + iovecs := make([]syscall.Iovec, iovLen+1+len(pkt.Data.Views())) + mmsgHdr := &mmsgHdrs[i] + mmsgHdr.Msg.Iov = &iovecs[0] + iovecIdx := 0 + if vnetHdrBuf != nil { + v := &iovecs[iovecIdx] + v.Base = &vnetHdrBuf[0] + v.Len = uint64(len(vnetHdrBuf)) + iovecIdx++ + } if ethHdrBuf != nil { - v := &iovec[iovecIdx] + v := &iovecs[iovecIdx] v.Base = ðHdrBuf[0] v.Len = uint64(len(ethHdrBuf)) iovecIdx++ } - - v := &iovec[iovecIdx] + pktSize := uint64(0) + // Encode L3 Header + v := &iovecs[iovecIdx] + hdr := &pkt.Header hdrView := hdr.View() v.Base = &hdrView[0] v.Len = uint64(len(hdrView)) + pktSize += v.Len iovecIdx++ - for packetSize > 0 { - vec := &iovec[iovecIdx] + // Now encode the Transport Payload. 
+ pktViews := pkt.Data.Views() + for i := range pktViews { + vec := &iovecs[iovecIdx] iovecIdx++ - - v := views[viewIdx] - vec.Base = &v[viewOff] - s := len(v) - viewOff - if s <= packetSize { - viewIdx++ - viewOff = 0 - } else { - s = packetSize - viewOff += s - } - vec.Len = uint64(s) - packetSize -= s + vec.Base = &pktViews[i][0] + vec.Len = uint64(len(pktViews[i])) + pktSize += vec.Len } - - mmsgHdr.Msg.Iovlen = uint64(iovecIdx - prevIovecIdx) + mmsgHdr.Msg.Iovlen = uint64(iovecIdx) + i++ } packets := 0 for packets < n { - fd := e.fds[pkts[packets].Hash%uint32(len(e.fds))] + fd := e.fds[pkts.Front().Hash%uint32(len(e.fds))] sent, err := rawfile.NonBlockingSendMMsg(fd, mmsgHdrs) if err != nil { return packets, err diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go index 4039753b7..1e2255bfa 100644 --- a/pkg/tcpip/link/loopback/loopback.go +++ b/pkg/tcpip/link/loopback/loopback.go @@ -92,7 +92,7 @@ func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Netw } // WritePackets implements stack.LinkEndpoint.WritePackets. -func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []stack.PacketBuffer, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { +func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { panic("not implemented") } diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go index f5973066d..a5478ce17 100644 --- a/pkg/tcpip/link/muxed/injectable.go +++ b/pkg/tcpip/link/muxed/injectable.go @@ -87,7 +87,7 @@ func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, // WritePackets writes outbound packets to the appropriate // LinkInjectableEndpoint based on the RemoteAddress. HandleLocal only works if // r.RemoteAddress has a route registered in this endpoint. 
-func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { +func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { endpoint, ok := m.routes[r.RemoteAddress] if !ok { return 0, tcpip.ErrNoRoute diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go index 6461d0108..0796d717e 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem.go +++ b/pkg/tcpip/link/sharedmem/sharedmem.go @@ -214,7 +214,7 @@ func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.Netw } // WritePackets implements stack.LinkEndpoint.WritePackets. -func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { +func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { panic("not implemented") } diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index 0a6b8945c..062388f4d 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -200,7 +200,7 @@ func (e *endpoint) GSOMaxSize() uint32 { return 0 } -func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) { +func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil { logPacket("send", protocol, pkt.Header.View(), gso) } @@ -233,20 +233,16 @@ func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumb // higher-level protocols to write packets; it just logs the packet and // forwards the request to the lower endpoint. 
func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error { - e.dumpPacket(gso, protocol, pkt) + e.dumpPacket(gso, protocol, &pkt) return e.lower.WritePacket(r, gso, protocol, pkt) } // WritePackets implements the stack.LinkEndpoint interface. It is called by // higher-level protocols to write packets; it just logs the packet and // forwards the request to the lower endpoint. -func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { - view := pkts[0].Data.ToView() - for _, pkt := range pkts { - e.dumpPacket(gso, protocol, stack.PacketBuffer{ - Header: pkt.Header, - Data: view[pkt.DataOffset:][:pkt.DataSize].ToVectorisedView(), - }) +func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + e.dumpPacket(gso, protocol, pkt) } return e.lower.WritePackets(r, gso, pkts, protocol) } diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go index 52fe397bf..2b3741276 100644 --- a/pkg/tcpip/link/waitable/waitable.go +++ b/pkg/tcpip/link/waitable/waitable.go @@ -112,9 +112,9 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne // WritePackets implements stack.LinkEndpoint.WritePackets. It is called by // higher-level protocols to write packets. It only forwards packets to the // lower endpoint if Wait or WaitWrite haven't been called. 
-func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { +func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { if !e.writeGate.Enter() { - return len(pkts), nil + return pkts.Len(), nil } n, err := e.lower.WritePackets(r, gso, pkts, protocol) diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go index 88224e494..54eb5322b 100644 --- a/pkg/tcpip/link/waitable/waitable_test.go +++ b/pkg/tcpip/link/waitable/waitable_test.go @@ -71,9 +71,9 @@ func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcp } // WritePackets implements stack.LinkEndpoint.WritePackets. -func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { - e.writeCount += len(pkts) - return len(pkts), nil +func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + e.writeCount += pkts.Len() + return pkts.Len(), nil } func (e *countedEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error { diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index 255098372..7acbfa0a8 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -84,7 +84,7 @@ func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderPara } // WritePackets implements stack.NetworkEndpoint.WritePackets. 
-func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []stack.PacketBuffer, stack.NetworkHeaderParams) (int, *tcpip.Error) { +func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList, stack.NetworkHeaderParams) (int, *tcpip.Error) { return 0, tcpip.ErrNotSupported } diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index 4950d69fc..4c20301c6 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -172,7 +172,7 @@ func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Ne } // WritePackets implements stack.LinkEndpoint.WritePackets. -func (t *testObject) WritePackets(_ *stack.Route, _ *stack.GSO, pkt []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { +func (t *testObject) WritePackets(_ *stack.Route, _ *stack.GSO, pkt stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { panic("not implemented") } diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index a7d9a8b25..104aafbed 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -280,28 +280,47 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw } // WritePackets implements stack.NetworkEndpoint.WritePackets. -func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) { +func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, *tcpip.Error) { if r.Loop&stack.PacketLoop != 0 { panic("multiple packets in local loop") } if r.Loop&stack.PacketOut == 0 { - return len(pkts), nil + return pkts.Len(), nil + } + + for pkt := pkts.Front(); pkt != nil; { + ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params) + pkt.NetworkHeader = buffer.View(ip) + pkt = pkt.Next() } // iptables filtering. 
All packets that reach here are locally // generated. ipt := e.stack.IPTables() - for i := range pkts { - if ok := ipt.Check(stack.Output, pkts[i]); !ok { - // iptables is telling us to drop the packet. + dropped := ipt.CheckPackets(stack.Output, pkts) + if len(dropped) == 0 { + // Fast path: If no packets are to be dropped then we can just invoke the + // faster WritePackets API directly. + n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber) + r.Stats().IP.PacketsSent.IncrementBy(uint64(n)) + return n, err + } + + // Slow Path as we are dropping some packets in the batch degrade to + // emitting one packet at a time. + n := 0 + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + if _, ok := dropped[pkt]; ok { continue } - ip := e.addIPHeader(r, &pkts[i].Header, pkts[i].DataSize, params) - pkts[i].NetworkHeader = buffer.View(ip) + if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, *pkt); err != nil { + r.Stats().IP.PacketsSent.IncrementBy(uint64(n)) + return n, err + } + n++ } - n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber) r.Stats().IP.PacketsSent.IncrementBy(uint64(n)) - return n, err + return n, nil } // WriteHeaderIncludedPacket writes a packet already containing a network diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index 6d2d2c034..f91180aa3 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -79,7 +79,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P // Only the first view in vv is accounted for by h. To account for the // rest of vv, a shallow copy is made and the first view is removed. // This copy is used as extra payload during the checksum calculation. 
- payload := pkt.Data + payload := pkt.Data.Clone(nil) payload.RemoveFirst() if got, want := h.Checksum(), header.ICMPv6Checksum(h, iph.SourceAddress(), iph.DestinationAddress(), payload); got != want { received.Invalid.Increment() diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index b462b8604..a815b4d9b 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -143,19 +143,17 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw } // WritePackets implements stack.LinkEndpoint.WritePackets. -func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) { +func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, *tcpip.Error) { if r.Loop&stack.PacketLoop != 0 { panic("not implemented") } if r.Loop&stack.PacketOut == 0 { - return len(pkts), nil + return pkts.Len(), nil } - for i := range pkts { - hdr := &pkts[i].Header - size := pkts[i].DataSize - ip := e.addIPHeader(r, hdr, size, params) - pkts[i].NetworkHeader = buffer.View(ip) + for pb := pkts.Front(); pb != nil; pb = pb.Next() { + ip := e.addIPHeader(r, &pb.Header, pb.Data.Size(), params) + pb.NetworkHeader = buffer.View(ip) } n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber) diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD index 8d80e9cee..5e963a4af 100644 --- a/pkg/tcpip/stack/BUILD +++ b/pkg/tcpip/stack/BUILD @@ -15,6 +15,18 @@ go_template_instance( }, ) +go_template_instance( + name = "packet_buffer_list", + out = "packet_buffer_list.go", + package = "stack", + prefix = "PacketBuffer", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*PacketBuffer", + "Linker": "*PacketBuffer", + }, +) + go_library( name = "stack", srcs = [ @@ -29,7 +41,7 @@ go_library( "ndp.go", "nic.go", "packet_buffer.go", - "packet_buffer_state.go", + 
"packet_buffer_list.go", "rand.go", "registration.go", "route.go", diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go index c45c43d21..e9c652042 100644 --- a/pkg/tcpip/stack/forwarder_test.go +++ b/pkg/tcpip/stack/forwarder_test.go @@ -101,7 +101,7 @@ func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkH } // WritePackets implements LinkEndpoint.WritePackets. -func (f *fwdTestNetworkEndpoint) WritePackets(r *Route, gso *GSO, pkts []PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error) { +func (f *fwdTestNetworkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error) { panic("not implemented") } @@ -260,10 +260,10 @@ func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.Netw } // WritePackets stores outbound packets into the channel. -func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts []PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { +func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { n := 0 - for _, pkt := range pkts { - e.WritePacket(r, gso, protocol, pkt) + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + e.WritePacket(r, gso, protocol, *pkt) n++ } diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go index 37907ae24..6c0a4b24d 100644 --- a/pkg/tcpip/stack/iptables.go +++ b/pkg/tcpip/stack/iptables.go @@ -209,6 +209,23 @@ func (it *IPTables) Check(hook Hook, pkt PacketBuffer) bool { return true } +// CheckPackets runs pkts through the rules for hook and returns a map of packets that +// should not go forward. +// +// NOTE: unlike the Check API the returned map contains packets that should be +// dropped. 
+func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList) (drop map[*PacketBuffer]struct{}) { + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + if ok := it.Check(hook, *pkt); !ok { + if drop == nil { + drop = make(map[*PacketBuffer]struct{}) + } + drop[pkt] = struct{}{} + } + } + return drop +} + // Precondition: pkt.NetworkHeader is set. func (it *IPTables) checkChain(hook Hook, pkt PacketBuffer, table Table, ruleIdx int) chainVerdict { // Start from ruleIdx and walk the list of rules until a rule gives us diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index 598468bdd..27dc8baf9 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -468,7 +468,7 @@ func TestDADResolve(t *testing.T) { // As per RFC 4861 section 4.3, a possible option is the Source Link // Layer option, but this option MUST NOT be included when the source // address of the packet is the unspecified address. - checker.IPv6(t, p.Pkt.Header.View().ToVectorisedView().First(), + checker.IPv6(t, p.Pkt.Header.View(), checker.SrcAddr(header.IPv6Any), checker.DstAddr(snmc), checker.TTL(header.NDPHopLimit), diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go index 9367de180..dc125f25e 100644 --- a/pkg/tcpip/stack/packet_buffer.go +++ b/pkg/tcpip/stack/packet_buffer.go @@ -23,9 +23,11 @@ import ( // As a PacketBuffer traverses up the stack, it may be necessary to pass it to // multiple endpoints. Clone() should be called in such cases so that // modifications to the Data field do not affect other copies. -// -// +stateify savable type PacketBuffer struct { + // PacketBufferEntry is used to build an intrusive list of + // PacketBuffers. + PacketBufferEntry + // Data holds the payload of the packet. For inbound packets, it also // holds the headers, which are consumed as the packet moves up the // stack. Headers are guaranteed not to be split across views. 
@@ -34,14 +36,6 @@ type PacketBuffer struct { // or otherwise modified. Data buffer.VectorisedView - // DataOffset is used for GSO output. It is the offset into the Data - // field where the payload of this packet starts. - DataOffset int - - // DataSize is used for GSO output. It is the size of this packet's - // payload. - DataSize int - // Header holds the headers of outbound packets. As a packet is passed // down the stack, each layer adds to Header. Header buffer.Prependable diff --git a/pkg/tcpip/stack/packet_buffer_state.go b/pkg/tcpip/stack/packet_buffer_state.go deleted file mode 100644 index 0c6b7924c..000000000 --- a/pkg/tcpip/stack/packet_buffer_state.go +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package stack - -import "gvisor.dev/gvisor/pkg/tcpip/buffer" - -// beforeSave is invoked by stateify. -func (pk *PacketBuffer) beforeSave() { - // Non-Data fields may be slices of the Data field. This causes - // problems for SR, so during save we make each header independent. - pk.Header = pk.Header.DeepCopy() - pk.LinkHeader = append(buffer.View(nil), pk.LinkHeader...) - pk.NetworkHeader = append(buffer.View(nil), pk.NetworkHeader...) - pk.TransportHeader = append(buffer.View(nil), pk.TransportHeader...) 
-} diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index ac043b722..23ca9ee03 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -246,7 +246,7 @@ type NetworkEndpoint interface { // WritePackets writes packets to the given destination address and // protocol. pkts must not be zero length. - WritePackets(r *Route, gso *GSO, pkts []PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error) + WritePackets(r *Route, gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error) // WriteHeaderIncludedPacket writes a packet that includes a network // header to the given destination address. @@ -393,7 +393,7 @@ type LinkEndpoint interface { // Right now, WritePackets is used only when the software segmentation // offload is enabled. If it will be used for something else, it may // require to change syscall filters. - WritePackets(r *Route, gso *GSO, pkts []PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) + WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) // WriteRawPacket writes a packet directly to the link. The packet // should already have an ethernet header. diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index 9fbe8a411..a0e5e0300 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -168,23 +168,26 @@ func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt PacketBuff return err } -// WritePackets writes the set of packets through the given route. -func (r *Route) WritePackets(gso *GSO, pkts []PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error) { +// WritePackets writes a list of n packets through the given route and returns +// the number of packets written. 
+func (r *Route) WritePackets(gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error) { if !r.ref.isValidForOutgoing() { return 0, tcpip.ErrInvalidEndpointState } n, err := r.ref.ep.WritePackets(r, gso, pkts, params) if err != nil { - r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(len(pkts) - n)) + r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len() - n)) } r.ref.nic.stats.Tx.Packets.IncrementBy(uint64(n)) - payloadSize := 0 - for i := 0; i < n; i++ { - r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(pkts[i].Header.UsedLength())) - payloadSize += pkts[i].DataSize + + writtenBytes := 0 + for i, pb := 0, pkts.Front(); i < n && pb != nil; i, pb = i+1, pb.Next() { + writtenBytes += pb.Header.UsedLength() + writtenBytes += pb.Data.Size() } - r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(payloadSize)) + + r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(writtenBytes)) return n, err } diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index b8543b71e..3f8a2a095 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -153,7 +153,7 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params } // WritePackets implements stack.LinkEndpoint.WritePackets. 
-func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) { +func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, *tcpip.Error) { panic("not implemented") } diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 3239a5911..2ca3fb809 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -756,8 +756,7 @@ func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedV func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso *stack.GSO) { optLen := len(tf.opts) hdr := &pkt.Header - packetSize := pkt.DataSize - off := pkt.DataOffset + packetSize := pkt.Data.Size() // Initialize the header. tcp := header.TCP(hdr.Prepend(header.TCPMinimumSize + optLen)) pkt.TransportHeader = buffer.View(tcp) @@ -782,12 +781,18 @@ func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso *sta // header and data and get the right sum of the TCP packet. tcp.SetChecksum(xsum) } else if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 { - xsum = header.ChecksumVVWithOffset(pkt.Data, xsum, off, packetSize) + xsum = header.ChecksumVV(pkt.Data, xsum) tcp.SetChecksum(^tcp.CalculateChecksum(xsum)) } } func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO, owner tcpip.PacketOwner) *tcpip.Error { + // We need to shallow clone the VectorisedView here as ReadToView will + // split the VectorisedView and Trim underlying views as it splits. Not + // doing the clone here will cause the underlying views of data itself + // to be altered. 
+ data = data.Clone(nil) + optLen := len(tf.opts) if tf.rcvWnd > 0xffff { tf.rcvWnd = 0xffff @@ -796,31 +801,25 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso mss := int(gso.MSS) n := (data.Size() + mss - 1) / mss - // Allocate one big slice for all the headers. - hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen - buf := make([]byte, n*hdrSize) - pkts := make([]stack.PacketBuffer, n) - for i := range pkts { - pkts[i].Header = buffer.NewEmptyPrependableFromView(buf[i*hdrSize:][:hdrSize]) - } - size := data.Size() - off := 0 + hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen + var pkts stack.PacketBufferList for i := 0; i < n; i++ { packetSize := mss if packetSize > size { packetSize = size } size -= packetSize - pkts[i].DataOffset = off - pkts[i].DataSize = packetSize - pkts[i].Data = data - pkts[i].Hash = tf.txHash - pkts[i].Owner = owner - buildTCPHdr(r, tf, &pkts[i], gso) - off += packetSize + var pkt stack.PacketBuffer + pkt.Header = buffer.NewPrependable(hdrSize) + pkt.Hash = tf.txHash + pkt.Owner = owner + data.ReadToVV(&pkt.Data, packetSize) + buildTCPHdr(r, tf, &pkt, gso) tf.seq = tf.seq.Add(seqnum.Size(packetSize)) + pkts.PushBack(&pkt) } + if tf.ttl == 0 { tf.ttl = r.DefaultTTL() } @@ -845,12 +844,10 @@ func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stac } pkt := stack.PacketBuffer{ - Header: buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen), - DataOffset: 0, - DataSize: data.Size(), - Data: data, - Hash: tf.txHash, - Owner: owner, + Header: buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen), + Data: data, + Hash: tf.txHash, + Owner: owner, } buildTCPHdr(r, tf, &pkt, gso) diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index e6fe7985d..40461fd31 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -77,9 +77,11 @@ func 
newSegmentFromView(r *stack.Route, id stack.TransportEndpointID, v buffer.V id: id, route: r.Clone(), } - s.views[0] = v - s.data = buffer.NewVectorisedView(len(v), s.views[:1]) s.rcvdTime = time.Now() + if len(v) != 0 { + s.views[0] = v + s.data = buffer.NewVectorisedView(len(v), s.views[:1]) + } return s } -- cgit v1.2.3 From 24bee1c1813a691072cff5bad7a528690a99eb5e Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Sat, 4 Apr 2020 21:01:42 -0700 Subject: Record VFS2 sockets in global socket map. Updates #1476, #1478, #1484, #1485. PiperOrigin-RevId: 304845354 --- pkg/sentry/fsimpl/proc/BUILD | 1 - pkg/sentry/fsimpl/proc/task_net.go | 88 ++++++++++++++++++++++--------------- pkg/sentry/kernel/kernel.go | 30 +++++++++++-- pkg/sentry/socket/socket.go | 6 ++- pkg/sentry/socket/unix/unix_vfs2.go | 2 +- pkg/sentry/vfs/file_description.go | 6 +++ 6 files changed, 91 insertions(+), 42 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 8156984eb..17c1342b5 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -22,7 +22,6 @@ go_library( "//pkg/log", "//pkg/refs", "//pkg/safemem", - "//pkg/sentry/fs", "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/inet", diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 373a7b17d..6b2a77328 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -32,6 +31,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" 
"gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/usermem" @@ -206,22 +206,21 @@ var _ dynamicInode = (*netUnixData)(nil) func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString("Num RefCount Protocol Flags Type St Inode Path\n") for _, se := range n.kernel.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock) + s := se.SockVFS2 + if !s.TryIncRef() { + log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s) continue } - sfile := s.(*fs.File) - if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX { + if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX { s.DecRef() // Not a unix socket. continue } - sops := sfile.FileOperations.(*unix.SocketOperations) + sops := s.Impl().(*unix.SocketVFS2) addr, err := sops.Endpoint().GetLocalAddress() if err != nil { - log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err) + log.Warningf("Failed to retrieve socket name from %+v: %v", s, err) addr.Addr = "" } @@ -234,6 +233,15 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { } } + // Get inode number. + var ino uint64 + stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_INO}) + if statErr != nil || stat.Mask&linux.STATX_INO == 0 { + log.Warningf("Failed to retrieve ino for socket file: %v", statErr) + } else { + ino = stat.Ino + } + // In the socket entry below, the value for the 'Num' field requires // some consideration. Linux prints the address to the struct // unix_sock representing a socket in the kernel, but may redact the @@ -252,14 +260,14 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { // the definition of this struct changes over time. // // For now, we always redact this pointer. 
- fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d", + fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d", (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. - sfile.ReadRefs()-1, // RefCount, don't count our own ref. + s.Refs()-1, // RefCount, don't count our own ref. 0, // Protocol, always 0 for UDS. sockFlags, // Flags. sops.Endpoint().Type(), // Type. sops.State(), // State. - sfile.InodeID(), // Inode. + ino, // Inode. ) // Path @@ -341,15 +349,14 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, t := kernel.TaskFromContext(ctx) for _, se := range k.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID) + s := se.SockVFS2 + if !s.TryIncRef() { + log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s) continue } - sfile := s.(*fs.File) - sops, ok := sfile.FileOperations.(socket.Socket) + sops, ok := s.Impl().(socket.SocketVFS2) if !ok { - panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { s.DecRef() @@ -398,14 +405,15 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, // Unimplemented. fmt.Fprintf(buf, "%08X ", 0) + stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO}) + // Field: uid. 
- uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) - if err != nil { - log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) + if statErr != nil || stat.Mask&linux.STATX_UID == 0 { + log.Warningf("Failed to retrieve uid for socket file: %v", statErr) fmt.Fprintf(buf, "%5d ", 0) } else { creds := auth.CredentialsFromContext(ctx) - fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow())) + fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow())) } // Field: timeout; number of unanswered 0-window probes. @@ -413,11 +421,16 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, fmt.Fprintf(buf, "%8d ", 0) // Field: inode. - fmt.Fprintf(buf, "%8d ", sfile.InodeID()) + if statErr != nil || stat.Mask&linux.STATX_INO == 0 { + log.Warningf("Failed to retrieve inode for socket file: %v", statErr) + fmt.Fprintf(buf, "%8d ", 0) + } else { + fmt.Fprintf(buf, "%8d ", stat.Ino) + } // Field: refcount. Don't count the ref we obtain while deferencing // the weakref to this socket. - fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) + fmt.Fprintf(buf, "%d ", s.Refs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. 
@@ -499,15 +512,14 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { t := kernel.TaskFromContext(ctx) for _, se := range d.kernel.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID) + s := se.SockVFS2 + if !s.TryIncRef() { + log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s) continue } - sfile := s.(*fs.File) - sops, ok := sfile.FileOperations.(socket.Socket) + sops, ok := s.Impl().(socket.SocketVFS2) if !ok { - panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { s.DecRef() @@ -551,25 +563,31 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Field: retrnsmt. Always 0 for UDP. fmt.Fprintf(buf, "%08X ", 0) + stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO}) + // Field: uid. - uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) - if err != nil { - log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) + if statErr != nil || stat.Mask&linux.STATX_UID == 0 { + log.Warningf("Failed to retrieve uid for socket file: %v", statErr) fmt.Fprintf(buf, "%5d ", 0) } else { creds := auth.CredentialsFromContext(ctx) - fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow())) + fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow())) } // Field: timeout. Always 0 for UDP. fmt.Fprintf(buf, "%8d ", 0) // Field: inode. 
- fmt.Fprintf(buf, "%8d ", sfile.InodeID()) + if statErr != nil || stat.Mask&linux.STATX_INO == 0 { + log.Warningf("Failed to retrieve inode for socket file: %v", statErr) + fmt.Fprintf(buf, "%8d ", 0) + } else { + fmt.Fprintf(buf, "%8d ", stat.Ino) + } // Field: ref; reference count on the socket inode. Don't count the ref // we obtain while deferencing the weakref to this socket. - fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) + fmt.Fprintf(buf, "%d ", s.Refs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 2e6f42b92..ba8935a82 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -1445,9 +1445,10 @@ func (k *Kernel) SupervisorContext() context.Context { // +stateify savable type SocketEntry struct { socketEntry - k *Kernel - Sock *refs.WeakRef - ID uint64 // Socket table entry number. + k *Kernel + Sock *refs.WeakRef + SockVFS2 *vfs.FileDescription + ID uint64 // Socket table entry number. } // WeakRefGone implements refs.WeakRefUser.WeakRefGone. @@ -1470,7 +1471,30 @@ func (k *Kernel) RecordSocket(sock *fs.File) { k.extMu.Unlock() } +// RecordSocketVFS2 adds a VFS2 socket to the system-wide socket table for +// tracking. +// +// Precondition: Caller must hold a reference to sock. +// +// Note that the socket table will not hold a reference on the +// vfs.FileDescription, because we do not support weak refs on VFS2 files. +func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) { + k.extMu.Lock() + id := k.nextSocketEntry + k.nextSocketEntry++ + s := &SocketEntry{ + k: k, + ID: id, + SockVFS2: sock, + } + k.sockets.PushBack(s) + k.extMu.Unlock() +} + // ListSockets returns a snapshot of all sockets. +// +// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef() +// to get a reference on a socket in the table. 
func (k *Kernel) ListSockets() []*SocketEntry { k.extMu.Lock() var socks []*SocketEntry diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index b5ba4a56b..6580bd6e9 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -269,7 +269,7 @@ func NewVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (*v return nil, err } if s != nil { - // TODO: Add vfs2 sockets to global socket table. + t.Kernel().RecordSocketVFS2(s) return s, nil } } @@ -291,7 +291,9 @@ func PairVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (* return nil, nil, err } if s1 != nil && s2 != nil { - // TODO: Add vfs2 sockets to global socket table. + k := t.Kernel() + k.RecordSocketVFS2(s1) + k.RecordSocketVFS2(s2) return s1, s2, nil } } diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index ca1388e2c..3e54d49c4 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -141,7 +141,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block return 0, nil, 0, syserr.FromError(e) } - // TODO: add vfs2 sockets to global table. + t.Kernel().RecordSocketVFS2(ns) return fd, addr, addrLen, nil } diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 5df4bbf45..28e93a441 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -182,6 +182,12 @@ func (fd *FileDescription) DecRef() { } } +// Refs returns the current number of references. The returned count +// is inherently racy and is unsafe to use without external synchronization. +func (fd *FileDescription) Refs() int64 { + return atomic.LoadInt64(&fd.refs) +} + // Mount returns the mount on which fd was opened. It does not take a reference // on the returned Mount. 
func (fd *FileDescription) Mount() *Mount { -- cgit v1.2.3 From f332a864e8cc7799332838deffab37244ff8ffc7 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 6 Apr 2020 10:51:54 -0700 Subject: Port timerfd to VFS2. PiperOrigin-RevId: 305067208 --- pkg/sentry/kernel/kernel.go | 28 ++-- pkg/sentry/syscalls/linux/vfs2/BUILD | 1 + .../syscalls/linux/vfs2/linux64_override_amd64.go | 6 +- pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go | 123 ++++++++++++++++++ pkg/sentry/vfs/BUILD | 2 + pkg/sentry/vfs/file_description.go | 7 + pkg/sentry/vfs/timerfd.go | 142 +++++++++++++++++++++ 7 files changed, 295 insertions(+), 14 deletions(-) create mode 100644 pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go create mode 100644 pkg/sentry/vfs/timerfd.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index ba8935a82..de8a95854 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -1044,14 +1044,17 @@ func (k *Kernel) pauseTimeLocked() { // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { - // TODO(gvisor.dev/issue/1663): Add save support for VFS2. - if !VFS2Enabled { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + if VFS2Enabled { + if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok { + tfd.PauseTimer() + } + } else { if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.PauseTimer() } - }) - } + } + }) } } k.timekeeper.PauseUpdates() @@ -1076,15 +1079,18 @@ func (k *Kernel) resumeTimeLocked() { it.ResumeTimer() } } - // TODO(gvisor.dev/issue/1663): Add save support for VFS2. 
- if !VFS2Enabled { - if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + if VFS2Enabled { + if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok { + tfd.ResumeTimer() + } + } else { if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.ResumeTimer() } - }) - } + } + }) } } } diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 2eb210014..0004e60d9 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -25,6 +25,7 @@ go_library( "stat_amd64.go", "stat_arm64.go", "sync.go", + "sys_timerfd.go", "xattr.go", ], marshal = True, diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go index 7d220bc20..63febc2f7 100644 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go @@ -139,11 +139,11 @@ func Override(table map[uintptr]kernel.Syscall) { table[280] = syscalls.Supported("utimensat", Utimensat) table[281] = syscalls.Supported("epoll_pwait", EpollPwait) delete(table, 282) // signalfd - delete(table, 283) // timerfd_create + table[283] = syscalls.Supported("timerfd_create", TimerfdCreate) delete(table, 284) // eventfd delete(table, 285) // fallocate - delete(table, 286) // timerfd_settime - delete(table, 287) // timerfd_gettime + table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime) + table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime) delete(table, 288) // accept4 delete(table, 289) // signalfd4 delete(table, 290) // eventfd2 diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go b/pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go new file mode 100644 index 000000000..7938a5249 --- /dev/null +++ 
b/pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go @@ -0,0 +1,123 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// TimerfdCreate implements Linux syscall timerfd_create(2). +func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + clockID := args[0].Int() + flags := args[1].Int() + + if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 { + return 0, nil, syserror.EINVAL + } + + var fileFlags uint32 + if flags&linux.TFD_NONBLOCK != 0 { + fileFlags = linux.O_NONBLOCK + } + + var clock ktime.Clock + switch clockID { + case linux.CLOCK_REALTIME: + clock = t.Kernel().RealtimeClock() + case linux.CLOCK_MONOTONIC, linux.CLOCK_BOOTTIME: + clock = t.Kernel().MonotonicClock() + default: + return 0, nil, syserror.EINVAL + } + file, err := t.Kernel().VFS().NewTimerFD(clock, fileFlags) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.TFD_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil +} + +// TimerfdSettime implements Linux syscall timerfd_settime(2). 
+func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + flags := args[1].Int() + newValAddr := args[2].Pointer() + oldValAddr := args[3].Pointer() + + if flags&^(linux.TFD_TIMER_ABSTIME) != 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + tfd, ok := file.Impl().(*vfs.TimerFileDescription) + if !ok { + return 0, nil, syserror.EINVAL + } + + var newVal linux.Itimerspec + if _, err := t.CopyIn(newValAddr, &newVal); err != nil { + return 0, nil, err + } + newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tfd.Clock()) + if err != nil { + return 0, nil, err + } + tm, oldS := tfd.SetTime(newS) + if oldValAddr != 0 { + oldVal := ktime.ItimerspecFromSetting(tm, oldS) + if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil { + return 0, nil, err + } + } + return 0, nil, nil +} + +// TimerfdGettime implements Linux syscall timerfd_gettime(2). 
+func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + curValAddr := args[1].Pointer() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + tfd, ok := file.Impl().(*vfs.TimerFileDescription) + if !ok { + return 0, nil, syserror.EINVAL + } + + tm, s := tfd.GetTime() + curVal := ktime.ItimerspecFromSetting(tm, s) + _, err := t.CopyOut(curValAddr, &curVal) + return 0, nil, err +} diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index bf4d27c7d..9aeb83fb0 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -36,6 +36,7 @@ go_library( "pathname.go", "permissions.go", "resolving_path.go", + "timerfd.go", "vfs.go", ], visibility = ["//pkg/sentry:internal"], @@ -51,6 +52,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 28e93a441..20c545fca 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -91,6 +91,10 @@ type FileDescriptionOptions struct { // ESPIPE. DenyPWrite bool + // If InvalidWrite is true, calls to FileDescription.Write() return + // EINVAL. + InvalidWrite bool + // If UseDentryMetadata is true, calls to FileDescription methods that // interact with file and filesystem metadata (Stat, SetStat, StatFS, // Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling @@ -562,6 +566,9 @@ func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, o // Write is similar to PWrite, but does not specify an offset.
func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + if fd.opts.InvalidWrite { + return 0, syserror.EINVAL + } if !fd.writable { return 0, syserror.EBADF } diff --git a/pkg/sentry/vfs/timerfd.go b/pkg/sentry/vfs/timerfd.go new file mode 100644 index 000000000..42b880656 --- /dev/null +++ b/pkg/sentry/vfs/timerfd.go @@ -0,0 +1,142 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/context" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// TimerFileDescription implements FileDescriptionImpl for timer fds. It also +// implements ktime.TimerListener. +type TimerFileDescription struct { + vfsfd FileDescription + FileDescriptionDefaultImpl + DentryMetadataFileDescriptionImpl + + events waiter.Queue + timer *ktime.Timer + + // val is the number of timer expirations since the last successful + // call to PRead, or SetTime. val must be accessed using atomic memory + // operations. + val uint64 +} + +var _ FileDescriptionImpl = (*TimerFileDescription)(nil) +var _ ktime.TimerListener = (*TimerFileDescription)(nil) + +// NewTimerFD returns a new timer fd. 
+func (vfs *VirtualFilesystem) NewTimerFD(clock ktime.Clock, flags uint32) (*FileDescription, error) { + vd := vfs.NewAnonVirtualDentry("[timerfd]") + defer vd.DecRef() + tfd := &TimerFileDescription{} + tfd.timer = ktime.NewTimer(clock, tfd) + if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + InvalidWrite: true, + }); err != nil { + return nil, err + } + return &tfd.vfsfd, nil +} + +// Read implements FileDescriptionImpl.Read. +func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + const sizeofUint64 = 8 + if dst.NumBytes() < sizeofUint64 { + return 0, syserror.EINVAL + } + if val := atomic.SwapUint64(&tfd.val, 0); val != 0 { + var buf [sizeofUint64]byte + usermem.ByteOrder.PutUint64(buf[:], val) + if _, err := dst.CopyOut(ctx, buf[:]); err != nil { + // Linux does not undo consuming the number of + // expirations even if writing to userspace fails. + return 0, err + } + return sizeofUint64, nil + } + return 0, syserror.ErrWouldBlock +} + +// Clock returns the timer fd's Clock. +func (tfd *TimerFileDescription) Clock() ktime.Clock { + return tfd.timer.Clock() +} + +// GetTime returns the associated Timer's setting and the time at which it was +// observed. +func (tfd *TimerFileDescription) GetTime() (ktime.Time, ktime.Setting) { + return tfd.timer.Get() +} + +// SetTime atomically changes the associated Timer's setting, resets the number +// of expirations to 0, and returns the previous setting and the time at which +// it was observed. +func (tfd *TimerFileDescription) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) { + return tfd.timer.SwapAnd(s, func() { atomic.StoreUint64(&tfd.val, 0) }) +} + +// Readiness implements waiter.Waitable.Readiness. 
+func (tfd *TimerFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + var ready waiter.EventMask + if atomic.LoadUint64(&tfd.val) != 0 { + ready |= waiter.EventIn + } + return ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (tfd *TimerFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + tfd.events.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (tfd *TimerFileDescription) EventUnregister(e *waiter.Entry) { + tfd.events.EventUnregister(e) +} + +// PauseTimer pauses the associated Timer. +func (tfd *TimerFileDescription) PauseTimer() { + tfd.timer.Pause() +} + +// ResumeTimer resumes the associated Timer. +func (tfd *TimerFileDescription) ResumeTimer() { + tfd.timer.Resume() +} + +// Release implements FileDescriptionImpl.Release() +func (tfd *TimerFileDescription) Release() { + tfd.timer.Destroy() +} + +// Notify implements ktime.TimerListener.Notify. +func (tfd *TimerFileDescription) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { + atomic.AddUint64(&tfd.val, exp) + tfd.events.Notify(waiter.EventIn) + return ktime.Setting{}, false +} + +// Destroy implements ktime.TimerListener.Destroy. +func (tfd *TimerFileDescription) Destroy() {} -- cgit v1.2.3 From 94b793262d3c54b4c32fed83d2bd121069680d15 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 25 Mar 2020 16:55:02 -0700 Subject: Fix all copy locks violations. This required minor restructuring of how system call tables were saved and restored, but it makes way more sense this way. 
Updates #2243 --- pkg/log/glog.go | 6 +++--- pkg/log/json.go | 2 +- pkg/log/json_k8s.go | 4 ++-- pkg/log/log.go | 2 +- pkg/log/log_test.go | 6 +++--- pkg/sentry/contexttest/contexttest.go | 4 ++-- pkg/sentry/fs/host/socket_test.go | 6 +++--- pkg/sentry/fs/proc/sys_net.go | 4 ++-- pkg/sentry/kernel/syscalls.go | 33 ++++++++++++++++---------------- pkg/sentry/kernel/syscalls_state.go | 36 ++++++++++++++++++++++++++--------- pkg/sentry/kernel/task_context.go | 2 +- pkg/sentry/kernel/time/time.go | 10 +++++----- pkg/state/state.go | 5 +---- runsc/boot/compat.go | 2 +- runsc/main.go | 6 +++--- tools/go_stateify/main.go | 2 +- tools/nogo.json | 13 ------------- 17 files changed, 72 insertions(+), 71 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/log/glog.go b/pkg/log/glog.go index b4f7bb5a4..f57c4427b 100644 --- a/pkg/log/glog.go +++ b/pkg/log/glog.go @@ -25,7 +25,7 @@ import ( // GoogleEmitter is a wrapper that emits logs in a format compatible with // package github.com/golang/glog. type GoogleEmitter struct { - Writer + *Writer } // pid is used for the threadid component of the header. @@ -46,7 +46,7 @@ var pid = os.Getpid() // line The line number // msg The user-supplied message // -func (g *GoogleEmitter) Emit(depth int, level Level, timestamp time.Time, format string, args ...interface{}) { +func (g GoogleEmitter) Emit(depth int, level Level, timestamp time.Time, format string, args ...interface{}) { // Log level. prefix := byte('?') switch level { @@ -81,5 +81,5 @@ func (g *GoogleEmitter) Emit(depth int, level Level, timestamp time.Time, format message := fmt.Sprintf(format, args...) // Emit the formatted result. 
- fmt.Fprintf(&g.Writer, "%c%02d%02d %02d:%02d:%02d.%06d % 7d %s:%d] %s\n", prefix, int(month), day, hour, minute, second, microsecond, pid, file, line, message) + fmt.Fprintf(g.Writer, "%c%02d%02d %02d:%02d:%02d.%06d % 7d %s:%d] %s\n", prefix, int(month), day, hour, minute, second, microsecond, pid, file, line, message) } diff --git a/pkg/log/json.go b/pkg/log/json.go index 0943db1cc..bdf9d691e 100644 --- a/pkg/log/json.go +++ b/pkg/log/json.go @@ -58,7 +58,7 @@ func (lv *Level) UnmarshalJSON(b []byte) error { // JSONEmitter logs messages in json format. type JSONEmitter struct { - Writer + *Writer } // Emit implements Emitter.Emit. diff --git a/pkg/log/json_k8s.go b/pkg/log/json_k8s.go index 6c6fc8b6f..5883e95e1 100644 --- a/pkg/log/json_k8s.go +++ b/pkg/log/json_k8s.go @@ -29,11 +29,11 @@ type k8sJSONLog struct { // K8sJSONEmitter logs messages in json format that is compatible with // Kubernetes fluent configuration. type K8sJSONEmitter struct { - Writer + *Writer } // Emit implements Emitter.Emit. -func (e *K8sJSONEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...interface{}) { +func (e K8sJSONEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...interface{}) { j := k8sJSONLog{ Log: fmt.Sprintf(format, v...), Level: level, diff --git a/pkg/log/log.go b/pkg/log/log.go index a794da1aa..37e0605ad 100644 --- a/pkg/log/log.go +++ b/pkg/log/log.go @@ -374,5 +374,5 @@ func CopyStandardLogTo(l Level) error { func init() { // Store the initial value for the log. 
- log.Store(&BasicLogger{Level: Info, Emitter: &GoogleEmitter{Writer{Next: os.Stderr}}}) + log.Store(&BasicLogger{Level: Info, Emitter: GoogleEmitter{&Writer{Next: os.Stderr}}}) } diff --git a/pkg/log/log_test.go b/pkg/log/log_test.go index 402cc29ae..9ff18559b 100644 --- a/pkg/log/log_test.go +++ b/pkg/log/log_test.go @@ -52,7 +52,7 @@ func TestDropMessages(t *testing.T) { t.Fatalf("Write should have failed") } - fmt.Printf("writer: %+v\n", w) + fmt.Printf("writer: %#v\n", &w) tw.fail = false if _, err := w.Write([]byte("line 2\n")); err != nil { @@ -76,7 +76,7 @@ func TestDropMessages(t *testing.T) { func TestCaller(t *testing.T) { tw := &testWriter{} - e := &GoogleEmitter{Writer: Writer{Next: tw}} + e := GoogleEmitter{Writer: &Writer{Next: tw}} bl := &BasicLogger{ Emitter: e, Level: Debug, @@ -94,7 +94,7 @@ func BenchmarkGoogleLogging(b *testing.B) { tw := &testWriter{ limit: 1, // Only record one message. } - e := &GoogleEmitter{Writer: Writer{Next: tw}} + e := GoogleEmitter{Writer: &Writer{Next: tw}} bl := &BasicLogger{ Emitter: e, Level: Debug, diff --git a/pkg/sentry/contexttest/contexttest.go b/pkg/sentry/contexttest/contexttest.go index 031fc64ec..8e5658c7a 100644 --- a/pkg/sentry/contexttest/contexttest.go +++ b/pkg/sentry/contexttest/contexttest.go @@ -97,7 +97,7 @@ type hostClock struct { } // Now implements ktime.Clock.Now. 
-func (hostClock) Now() ktime.Time { +func (*hostClock) Now() ktime.Time { return ktime.FromNanoseconds(time.Now().UnixNano()) } @@ -127,7 +127,7 @@ func (t *TestContext) Value(key interface{}) interface{} { case uniqueid.CtxInotifyCookie: return atomic.AddUint32(&lastInotifyCookie, 1) case ktime.CtxRealtimeClock: - return hostClock{} + return &hostClock{} default: if val, ok := t.otherValues[key]; ok { return val diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index eb4afe520..affdbcacb 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -199,14 +199,14 @@ func TestListen(t *testing.T) { } func TestPasscred(t *testing.T) { - e := ConnectedEndpoint{} + e := &ConnectedEndpoint{} if got, want := e.Passcred(), false; got != want { t.Errorf("Got %#v.Passcred() = %t, want = %t", e, got, want) } } func TestGetLocalAddress(t *testing.T) { - e := ConnectedEndpoint{path: "foo"} + e := &ConnectedEndpoint{path: "foo"} want := tcpip.FullAddress{Addr: tcpip.Address("foo")} if got, err := e.GetLocalAddress(); err != nil || got != want { t.Errorf("Got %#v.GetLocalAddress() = %#v, %v, want = %#v, %v", e, got, err, want, nil) @@ -214,7 +214,7 @@ func TestGetLocalAddress(t *testing.T) { } func TestQueuedSize(t *testing.T) { - e := ConnectedEndpoint{} + e := &ConnectedEndpoint{} tests := []struct { name string f func() int64 diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index d4c4b533d..702fdd392 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -80,7 +80,7 @@ func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, dir } // Truncate implements fs.InodeOperations.Truncate. 
-func (tcpMemInode) Truncate(context.Context, *fs.Inode, int64) error { +func (*tcpMemInode) Truncate(context.Context, *fs.Inode, int64) error { return nil } @@ -196,7 +196,7 @@ func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *f } // Truncate implements fs.InodeOperations.Truncate. -func (tcpSack) Truncate(context.Context, *fs.Inode, int64) error { +func (*tcpSack) Truncate(context.Context, *fs.Inode, int64) error { return nil } diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 93c4fe969..c9a2321b8 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -218,56 +218,55 @@ type Stracer interface { SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error) } -// SyscallTable is a lookup table of system calls. Critically, a SyscallTable -// is *immutable*. In order to make supporting suspend and resume sane, they -// must be uniquely registered and may not change during operation. +// SyscallTable is a lookup table of system calls. // -// +stateify savable +// Note that a SyscallTable is not savable directly. Instead, they are saved as +// an OS/Arch pair and lookup happens again on restore. type SyscallTable struct { // OS is the operating system that this syscall table implements. - OS abi.OS `state:"wait"` + OS abi.OS // Arch is the architecture that this syscall table targets. - Arch arch.Arch `state:"wait"` + Arch arch.Arch // The OS version that this syscall table implements. - Version Version `state:"manual"` + Version Version // AuditNumber is a numeric constant that represents the syscall table. If // non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by // linux/audit.h. - AuditNumber uint32 `state:"manual"` + AuditNumber uint32 // Table is the collection of functions. - Table map[uintptr]Syscall `state:"manual"` + Table map[uintptr]Syscall // lookup is a fixed-size array that holds the syscalls (indexed by // their numbers). 
It is used for fast look ups. - lookup []SyscallFn `state:"manual"` + lookup []SyscallFn // Emulate is a collection of instruction addresses to emulate. The // keys are addresses, and the values are system call numbers. - Emulate map[usermem.Addr]uintptr `state:"manual"` + Emulate map[usermem.Addr]uintptr // The function to call in case of a missing system call. - Missing MissingFn `state:"manual"` + Missing MissingFn // Stracer traces this syscall table. - Stracer Stracer `state:"manual"` + Stracer Stracer // External is used to handle an external callback. - External func(*Kernel) `state:"manual"` + External func(*Kernel) // ExternalFilterBefore is called before External is called before the syscall is executed. // External is not called if it returns false. - ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"` + ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool // ExternalFilterAfter is called before External is called after the syscall is executed. // External is not called if it returns false. - ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"` + ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool // FeatureEnable stores the strace and one-shot enable bits. - FeatureEnable SyscallFlagsTable `state:"manual"` + FeatureEnable SyscallFlagsTable } // allSyscallTables contains all known tables. diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go index 00358326b..90f890495 100644 --- a/pkg/sentry/kernel/syscalls_state.go +++ b/pkg/sentry/kernel/syscalls_state.go @@ -14,16 +14,34 @@ package kernel -import "fmt" +import ( + "fmt" -// afterLoad is invoked by stateify. -func (s *SyscallTable) afterLoad() { - otherTable, ok := LookupSyscallTable(s.OS, s.Arch) - if !ok { - // Couldn't find a reference? 
- panic(fmt.Sprintf("syscall table not found for OS %v Arch %v", s.OS, s.Arch)) + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +// syscallTableInfo is used to reload the SyscallTable. +// +// +stateify savable +type syscallTableInfo struct { + OS abi.OS + Arch arch.Arch +} + +// saveSt saves the SyscallTable. +func (tc *TaskContext) saveSt() syscallTableInfo { + return syscallTableInfo{ + OS: tc.st.OS, + Arch: tc.st.Arch, } +} - // Copy the table. - *s = *otherTable +// loadSt loads the SyscallTable. +func (tc *TaskContext) loadSt(sti syscallTableInfo) { + st, ok := LookupSyscallTable(sti.OS, sti.Arch) + if !ok { + panic(fmt.Sprintf("syscall table not found for OS %v, Arch %v", sti.OS, sti.Arch)) + } + tc.st = st // Save the table reference. } diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 0158b1788..c115e8d1f 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -49,7 +49,7 @@ type TaskContext struct { fu *futex.Manager // st is the task's syscall table. - st *SyscallTable + st *SyscallTable `state:".(syscallTableInfo)"` } // release releases all resources held by the TaskContext. release is called by diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index 706de83ef..e959700f2 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -245,7 +245,7 @@ type Clock interface { type WallRateClock struct{} // WallTimeUntil implements Clock.WallTimeUntil. -func (WallRateClock) WallTimeUntil(t, now Time) time.Duration { +func (*WallRateClock) WallTimeUntil(t, now Time) time.Duration { return t.Sub(now) } @@ -254,16 +254,16 @@ func (WallRateClock) WallTimeUntil(t, now Time) time.Duration { type NoClockEvents struct{} // Readiness implements waiter.Waitable.Readiness. 
-func (NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask { +func (*NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask { return 0 } // EventRegister implements waiter.Waitable.EventRegister. -func (NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +func (*NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) { } // EventUnregister implements waiter.Waitable.EventUnregister. -func (NoClockEvents) EventUnregister(e *waiter.Entry) { +func (*NoClockEvents) EventUnregister(e *waiter.Entry) { } // ClockEventsQueue implements waiter.Waitable by wrapping waiter.Queue and @@ -273,7 +273,7 @@ type ClockEventsQueue struct { } // Readiness implements waiter.Waitable.Readiness. -func (ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask { +func (*ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask { return 0 } diff --git a/pkg/state/state.go b/pkg/state/state.go index dbe507ab4..03ae2dbb0 100644 --- a/pkg/state/state.go +++ b/pkg/state/state.go @@ -241,10 +241,7 @@ func Register(name string, instance interface{}, fns Fns) { // // This function is used by the stateify tool. func IsZeroValue(val interface{}) bool { - if val == nil { - return true - } - return reflect.DeepEqual(val, reflect.Zero(reflect.TypeOf(val)).Interface()) + return val == nil || reflect.ValueOf(val).Elem().IsZero() } // step captures one encoding / decoding step. 
On each step, there is up to one diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 8995d678e..b7cfb35bf 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -65,7 +65,7 @@ func newCompatEmitter(logFD int) (*compatEmitter, error) { if logFD > 0 { f := os.NewFile(uintptr(logFD), "user log file") - target := &log.MultiEmitter{c.sink, &log.K8sJSONEmitter{log.Writer{Next: f}}} + target := &log.MultiEmitter{c.sink, log.K8sJSONEmitter{&log.Writer{Next: f}}} c.sink = &log.BasicLogger{Level: log.Info, Emitter: target} } return c, nil diff --git a/runsc/main.go b/runsc/main.go index 62e184ec9..c1c78529c 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -342,11 +342,11 @@ func main() { func newEmitter(format string, logFile io.Writer) log.Emitter { switch format { case "text": - return &log.GoogleEmitter{log.Writer{Next: logFile}} + return log.GoogleEmitter{&log.Writer{Next: logFile}} case "json": - return &log.JSONEmitter{log.Writer{Next: logFile}} + return log.JSONEmitter{&log.Writer{Next: logFile}} case "json-k8s": - return &log.K8sJSONEmitter{log.Writer{Next: logFile}} + return log.K8sJSONEmitter{&log.Writer{Next: logFile}} } cmd.Fatalf("invalid log format %q, must be 'text', 'json', or 'json-k8s'", format) panic("unreachable") diff --git a/tools/go_stateify/main.go b/tools/go_stateify/main.go index 3437aa476..309ee9c21 100644 --- a/tools/go_stateify/main.go +++ b/tools/go_stateify/main.go @@ -206,7 +206,7 @@ func main() { initCalls = append(initCalls, fmt.Sprintf("%sRegister(\"%s.%s\", (*%s)(nil), state.Fns{Save: (*%s).save, Load: (*%s).load})", statePrefix, *fullPkg, name, name, name, name)) } emitZeroCheck := func(name string) { - fmt.Fprintf(outputFile, " if !%sIsZeroValue(x.%s) { m.Failf(\"%s is %%v, expected zero\", x.%s) }\n", statePrefix, name, name, name) + fmt.Fprintf(outputFile, " if !%sIsZeroValue(&x.%s) { m.Failf(\"%s is %%#v, expected zero\", &x.%s) }\n", statePrefix, name, name, name) } emitLoadValue := func(name, typName string) { 
fmt.Fprintf(outputFile, " m.LoadValue(\"%s\", new(%s), func(y interface{}) { x.load%s(y.(%s)) })\n", name, typName, camelCased(name), typName) diff --git a/tools/nogo.json b/tools/nogo.json index 83cb76b93..cc05ba027 100644 --- a/tools/nogo.json +++ b/tools/nogo.json @@ -9,19 +9,6 @@ "/external/": "allowed: not subject to unsafe naming rules" } }, - "copylocks": { - "exclude_files": { - ".*_state_autogen.go": "fix: m.Failf copies by value", - "/pkg/log/json.go": "fix: Emit passes lock by value: gvisor.dev/gvisor/pkg/log.JSONEmitter contains gvisor.dev/gvisor/pkg/log.Writer contains gvisor.dev/gvisor/pkg/sync.Mutex", - "/pkg/log/log_test.go": "fix: call of fmt.Printf copies lock value: gvisor.dev/gvisor/pkg/log.Writer contains gvisor.dev/gvisor/pkg/sync.Mutex", - "/pkg/sentry/fs/host/socket_test.go": "fix: call of t.Errorf copies lock value: gvisor.dev/gvisor/pkg/sentry/fs/host.ConnectedEndpoint contains gvisor.dev/gvisor/pkg/refs.AtomicRefCount contains gvisor.dev/gvisor/pkg/sync.Mutex", - "/pkg/sentry/fs/proc/sys_net.go": "fix: Truncate passes lock by value: gvisor.dev/gvisor/pkg/sentry/fs/proc.tcpMemInode contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.SimpleFileInode contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.InodeSimpleAttributes contains gvisor.dev/gvisor/pkg/sync.RWMutex", - "/pkg/sentry/fs/proc/sys_net.go": "fix: Truncate passes lock by value: gvisor.dev/gvisor/pkg/sentry/fs/proc.tcpSack contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.SimpleFileInode contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.InodeSimpleAttributes contains gvisor.dev/gvisor/pkg/sync.RWMutex", - "/pkg/sentry/fs/tty/slave.go": "fix: Truncate passes lock by value: gvisor.dev/gvisor/pkg/sentry/fs/tty.slaveInodeOperations contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.SimpleFileInode contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.InodeSimpleAttributes contains gvisor.dev/gvisor/pkg/sync.RWMutex", - "/pkg/sentry/kernel/time/time.go": "fix: Readiness passes lock by value: 
gvisor.dev/gvisor/pkg/sentry/kernel/time.ClockEventsQueue contains gvisor.dev/gvisor/pkg/waiter.Queue contains gvisor.dev/gvisor/pkg/sync.RWMutex", - "/pkg/sentry/kernel/syscalls_state.go": "fix: assignment copies lock value to *s: gvisor.dev/gvisor/pkg/sentry/kernel.SyscallTable contains gvisor.dev/gvisor/pkg/sentry/kernel.SyscallFlagsTable contains gvisor.dev/gvisor/pkg/sync.Mutex" - } - }, "lostcancel": { "exclude_files": { "/pkg/tcpip/network/arp/arp_test.go": "fix: the cancel function returned by context.WithTimeout should be called, not discarded, to avoid a context leak", -- cgit v1.2.3 From 6dd5a1f3fe55daa8510b1ee5e3a59219aad92af6 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 8 Apr 2020 17:56:55 -0700 Subject: Clean up TODOs PiperOrigin-RevId: 305592245 --- pkg/sentry/fs/tmpfs/fs.go | 3 --- pkg/sentry/fsimpl/kernfs/filesystem.go | 2 +- pkg/sentry/kernel/ptrace.go | 1 - pkg/sentry/vfs/filesystem.go | 2 +- pkg/sentry/vfs/mount.go | 12 ++++++------ pkg/sentry/vfs/mount_test.go | 2 +- runsc/cmd/gofer.go | 5 ++--- test/syscalls/linux/epoll.cc | 4 ---- test/syscalls/linux/file_base.h | 1 + test/syscalls/linux/pwrite64.cc | 9 +-------- test/syscalls/linux/tuntap.cc | 7 ++++--- test/syscalls/linux/write.cc | 10 ++-------- tools/go_generics/defs.bzl | 1 - 13 files changed, 19 insertions(+), 40 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index d5be56c3f..bc117ca6a 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -44,9 +44,6 @@ const ( // lookup. cacheRevalidate = "revalidate" - // TODO(edahlgren/mpratt): support a tmpfs size limit. - // size = "size" - // Permissions that exceed modeMask will be rejected. 
modeMask = 01777 diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 16a3c18ae..4433071aa 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -682,7 +682,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu if err != nil { return linux.Statfs{}, err } - // TODO: actually implement statfs + // TODO(gvisor.dev/issue/1193): actually implement statfs. return linux.Statfs{}, syserror.ENOSYS } diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 35ad97d5d..e23e796ef 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -184,7 +184,6 @@ func (t *Task) CanTrace(target *Task, attach bool) bool { if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 { return false } - // TODO: Yama LSM return true } diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index cd34782ff..bef1bd312 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -497,7 +497,7 @@ type FilesystemImpl interface { // Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl. PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error - // TODO: inotify_add_watch() + // TODO(gvisor.dev/issue/1479): inotify_add_watch() } // PrependPathAtVFSRootError is returned by implementations of diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 1b8ecc415..f06946103 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -233,9 +233,9 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia } vd.dentry.mu.Lock() } - // TODO: Linux requires that either both the mount point and the mount root - // are directories, or neither are, and returns ENOTDIR if this is not the - // case. 
+ // TODO(gvisor.dev/issue/1035): Linux requires that either both the mount + // point and the mount root are directories, or neither are, and returns + // ENOTDIR if this is not the case. mntns := vd.mount.ns mnt := newMount(vfs, fs, root, mntns, opts) vfs.mounts.seq.BeginWrite() @@ -274,9 +274,9 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti } } - // TODO(jamieliu): Linux special-cases umount of the caller's root, which - // we don't implement yet (we'll just fail it since the caller holds a - // reference on it). + // TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's + // root, which we don't implement yet (we'll just fail it since the caller + // holds a reference on it). vfs.mounts.seq.BeginWrite() if opts.Flags&linux.MNT_DETACH == 0 { diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go index 3b933468d..3335e4057 100644 --- a/pkg/sentry/vfs/mount_test.go +++ b/pkg/sentry/vfs/mount_test.go @@ -55,7 +55,7 @@ func TestMountTableInsertLookup(t *testing.T) { } } -// TODO: concurrent lookup/insertion/removal +// TODO(gvisor.dev/issue/1035): concurrent lookup/insertion/removal. // must be powers of 2 var benchNumMounts = []int{1 << 2, 1 << 5, 1 << 8} diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 02e5af3d3..28f0d54b9 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -272,9 +272,8 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error { root := spec.Root.Path if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { - // FIXME: runsc can't be re-executed without - // /proc, so we create a tmpfs mount, mount ./proc and ./root - // there, then move this mount to the root and after + // runsc can't be re-executed without /proc, so we create a tmpfs mount, + // mount ./proc and ./root there, then move this mount to the root and after // setCapsAndCallSelf, runsc will chroot into /root. 
// // We need a directory to construct a new root and we know that diff --git a/test/syscalls/linux/epoll.cc b/test/syscalls/linux/epoll.cc index a4f8f3cec..f57d38dc7 100644 --- a/test/syscalls/linux/epoll.cc +++ b/test/syscalls/linux/epoll.cc @@ -56,10 +56,6 @@ TEST(EpollTest, AllWritable) { struct epoll_event result[kFDsPerEpoll]; ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1), SyscallSucceedsWithValue(kFDsPerEpoll)); - // TODO(edahlgren): Why do some tests check epoll_event::data, and others - // don't? Does Linux actually guarantee that, in any of these test cases, - // epoll_wait will necessarily write out the epoll_events in the order that - // they were registered? for (int i = 0; i < kFDsPerEpoll; i++) { ASSERT_EQ(result[i].events, EPOLLOUT); } diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h index 25fdd7106..fb418e052 100644 --- a/test/syscalls/linux/file_base.h +++ b/test/syscalls/linux/file_base.h @@ -87,6 +87,7 @@ class FileTest : public ::testing::Test { ClosePipes(); } + protected: std::string test_file_name_; FileDescriptor test_file_fd_; diff --git a/test/syscalls/linux/pwrite64.cc b/test/syscalls/linux/pwrite64.cc index b48fe540d..c2f72e010 100644 --- a/test/syscalls/linux/pwrite64.cc +++ b/test/syscalls/linux/pwrite64.cc @@ -27,14 +27,7 @@ namespace testing { namespace { -// This test is currently very rudimentary. -// -// TODO(edahlgren): -// * bad buffer states (EFAULT). -// * bad fds (wrong permission, wrong type of file, EBADF). -// * check offset is not incremented. -// * check for EOF. -// * writing to pipes, symlinks, special files. +// TODO(gvisor.dev/issue/2370): This test is currently very rudimentary. 
class Pwrite64 : public ::testing::Test { void SetUp() override { name_ = NewTempAbsPath(); diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc index 53ad2dda3..3a8ba37eb 100644 --- a/test/syscalls/linux/tuntap.cc +++ b/test/syscalls/linux/tuntap.cc @@ -242,7 +242,7 @@ TEST_F(TuntapTest, InvalidReadWrite) { TEST_F(TuntapTest, WriteToDownDevice) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); - // FIXME: gVisor always creates enabled/up'd interfaces. + // FIXME(b/110961832): gVisor always creates enabled/up'd interfaces. SKIP_IF(IsRunningOnGvisor()); FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR)); @@ -280,10 +280,11 @@ PosixErrorOr OpenAndAttachTap( &addr, sizeof(addr))); if (!IsRunningOnGvisor()) { - // FIXME: gVisor doesn't support setting MAC address on interfaces yet. + // FIXME(b/110961832): gVisor doesn't support setting MAC address on + // interfaces yet. RETURN_IF_ERRNO(LinkSetMacAddr(link->index, kMacA, sizeof(kMacA))); - // FIXME: gVisor always creates enabled/up'd interfaces. + // FIXME(b/110961832): gVisor always creates enabled/up'd interfaces. RETURN_IF_ERRNO(LinkChangeFlags(link->index, IFF_UP, IFF_UP)); } diff --git a/test/syscalls/linux/write.cc b/test/syscalls/linux/write.cc index 9b219cfd6..39b5b2f56 100644 --- a/test/syscalls/linux/write.cc +++ b/test/syscalls/linux/write.cc @@ -31,14 +31,8 @@ namespace gvisor { namespace testing { namespace { -// This test is currently very rudimentary. -// -// TODO(edahlgren): -// * bad buffer states (EFAULT). -// * bad fds (wrong permission, wrong type of file, EBADF). -// * check offset is incremented. -// * check for EOF. -// * writing to pipes, symlinks, special files. + +// TODO(gvisor.dev/issue/2370): This test is currently very rudimentary. 
class WriteTest : public ::testing::Test { public: ssize_t WriteBytes(int fd, int bytes) { diff --git a/tools/go_generics/defs.bzl b/tools/go_generics/defs.bzl index c5be52ecd..8c9995fd4 100644 --- a/tools/go_generics/defs.bzl +++ b/tools/go_generics/defs.bzl @@ -105,7 +105,6 @@ def _go_template_instance_impl(ctx): executable = ctx.executable._tool, ) - # TODO: How can we get the dependencies out? return struct( files = depset([output]), ) -- cgit v1.2.3 From 7aa5caae71c29b0be9047a7c156a9daaa435ebb8 Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Wed, 11 Mar 2020 03:21:34 +0000 Subject: Enable syscall ptrace test on arm64. Signed-off-by: Haibo Xu Change-Id: I5bb8fa7d580d173b1438d6465e1adb442216c8fa --- pkg/sentry/arch/arch.go | 3 +++ pkg/sentry/arch/syscalls_amd64.go | 7 +++++++ pkg/sentry/arch/syscalls_arm64.go | 13 ++++++++++++- pkg/sentry/kernel/task_syscall.go | 14 ++++++++++++++ test/syscalls/linux/ptrace.cc | 31 ++++++++++++++++++++++++------- 5 files changed, 60 insertions(+), 8 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index 1d11cc472..a903d031c 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -88,6 +88,9 @@ type Context interface { // SyscallNo returns the syscall number. SyscallNo() uintptr + // SyscallSaveOrig save orignal register value. + SyscallSaveOrig() + // SyscallArgs returns the syscall arguments in an array. SyscallArgs() SyscallArguments diff --git a/pkg/sentry/arch/syscalls_amd64.go b/pkg/sentry/arch/syscalls_amd64.go index 8b4f23007..3859f41ee 100644 --- a/pkg/sentry/arch/syscalls_amd64.go +++ b/pkg/sentry/arch/syscalls_amd64.go @@ -18,6 +18,13 @@ package arch const restartSyscallNr = uintptr(219) +// SyscallSaveOrig save the value of the register which is clobbered in +// syscall handler(doSyscall()). +// +// Noop on x86. +func (c *context64) SyscallSaveOrig() { +} + // SyscallNo returns the syscall number according to the 64-bit convention. 
func (c *context64) SyscallNo() uintptr { return uintptr(c.Regs.Orig_rax) diff --git a/pkg/sentry/arch/syscalls_arm64.go b/pkg/sentry/arch/syscalls_arm64.go index dc13b6124..92d062513 100644 --- a/pkg/sentry/arch/syscalls_arm64.go +++ b/pkg/sentry/arch/syscalls_arm64.go @@ -18,6 +18,17 @@ package arch const restartSyscallNr = uintptr(128) +// SyscallSaveOrig save the value of the register R0 which is clobbered in +// syscall handler(doSyscall()). +// +// In linux, at the entry of the syscall handler(el0_svc_common()), value of R0 +// is saved to the pt_regs.orig_x0 in kernel code. But currently, the orig_x0 +// was not accessible to the user space application, so we have to do the same +// operation in the sentry code to save the R0 value into the App context. +func (c *context64) SyscallSaveOrig() { + c.OrigR0 = c.Regs.Regs[0] +} + // SyscallNo returns the syscall number according to the 64-bit convention. func (c *context64) SyscallNo() uintptr { return uintptr(c.Regs.Regs[8]) @@ -40,7 +51,7 @@ func (c *context64) SyscallNo() uintptr { // R30: the link register. func (c *context64) SyscallArgs() SyscallArguments { return SyscallArguments{ - SyscallArgument{Value: uintptr(c.Regs.Regs[0])}, + SyscallArgument{Value: uintptr(c.OrigR0)}, SyscallArgument{Value: uintptr(c.Regs.Regs[1])}, SyscallArgument{Value: uintptr(c.Regs.Regs[2])}, SyscallArgument{Value: uintptr(c.Regs.Regs[3])}, diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index d555d69a8..3d7a734ef 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -194,6 +194,19 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u // // The syscall path is very hot; avoid defer. func (t *Task) doSyscall() taskRunState { + // Save value of the register which is clobbered in the following + // t.Arch().SetReturn(-ENOSYS) operation. This is dedicated to arm64. 
+ // + // On x86, register rax was shared by syscall number and return + // value, and at the entry of the syscall handler, the rax was + // saved to regs.orig_rax which was exposed to user space. + // But on arm64, syscall number was passed through X8, and the X0 + // was shared by the first syscall argument and return value. The + // X0 was saved to regs.orig_x0 which was not exposed to user space. + // So we have to do the same operation here to save the X0 value + // into the task context. + t.Arch().SyscallSaveOrig() + sysno := t.Arch().SyscallNo() args := t.Arch().SyscallArgs() @@ -269,6 +282,7 @@ func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState { return (*runSyscallExit)(nil) } args := t.Arch().SyscallArgs() + return t.doSyscallInvoke(sysno, args) } diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc index cb828ff88..926690eb8 100644 --- a/test/syscalls/linux/ptrace.cc +++ b/test/syscalls/linux/ptrace.cc @@ -400,9 +400,11 @@ TEST(PtraceTest, GetRegSet) { // Read exactly the full register set. EXPECT_EQ(iov.iov_len, sizeof(regs)); -#ifdef __x86_64__ +#if defined(__x86_64__) // Child called kill(2), with SIGSTOP as arg 2. EXPECT_EQ(regs.rsi, SIGSTOP); +#elif defined(__aarch64__) + EXPECT_EQ(regs.regs[1], SIGSTOP); #endif // Suppress SIGSTOP and resume the child. 
@@ -752,15 +754,23 @@ TEST(PtraceTest, SyscallSucceeds()); EXPECT_TRUE(siginfo.si_code == SIGTRAP || siginfo.si_code == (SIGTRAP | 0x80)) << "si_code = " << siginfo.si_code; -#ifdef __x86_64__ + { struct user_regs_struct regs = {}; - ASSERT_THAT(ptrace(PTRACE_GETREGS, child_pid, 0, &regs), SyscallSucceeds()); + struct iovec iov; + iov.iov_base = &regs; + iov.iov_len = sizeof(regs); + EXPECT_THAT(ptrace(PTRACE_GETREGSET, child_pid, NT_PRSTATUS, &iov), + SyscallSucceeds()); +#if defined(__x86_64__) EXPECT_TRUE(regs.orig_rax == SYS_vfork || regs.orig_rax == SYS_clone) << "orig_rax = " << regs.orig_rax; EXPECT_EQ(grandchild_pid, regs.rax); - } +#elif defined(__aarch64__) + EXPECT_TRUE(regs.regs[8] == SYS_clone) << "regs[8] = " << regs.regs[8]; + EXPECT_EQ(grandchild_pid, regs.regs[0]); #endif // defined(__x86_64__) + } // After this point, the child will be making wait4 syscalls that will be // interrupted by saving, so saving is not permitted. Note that this is @@ -805,14 +815,21 @@ TEST(PtraceTest, SyscallSucceedsWithValue(child_pid)); EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80)) << " status " << status; -#ifdef __x86_64__ { struct user_regs_struct regs = {}; - ASSERT_THAT(ptrace(PTRACE_GETREGS, child_pid, 0, &regs), SyscallSucceeds()); + struct iovec iov; + iov.iov_base = &regs; + iov.iov_len = sizeof(regs); + EXPECT_THAT(ptrace(PTRACE_GETREGSET, child_pid, NT_PRSTATUS, &iov), + SyscallSucceeds()); +#if defined(__x86_64__) EXPECT_EQ(SYS_wait4, regs.orig_rax); EXPECT_EQ(grandchild_pid, regs.rax); - } +#elif defined(__aarch64__) + EXPECT_EQ(SYS_wait4, regs.regs[8]); + EXPECT_EQ(grandchild_pid, regs.regs[0]); #endif // defined(__x86_64__) + } // Detach from the child and wait for it to exit. 
ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds()); -- cgit v1.2.3 From 1798d6cbee3360b09d3736069e15fd746e863bd2 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 10 Apr 2020 11:17:59 -0700 Subject: Remove TODO from kernel.Stracer The dependency strace=>kernel grew over time. strace also depends on task's FD table and FSContext. It could be fixed with some interfaces the other way, but then we're trading an interface for another, and kernel.Stracer is likely cleaner. Closes #155 PiperOrigin-RevId: 305909678 --- pkg/sentry/kernel/syscalls.go | 3 --- pkg/sentry/strace/strace.go | 3 --- 2 files changed, 6 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index c9a2321b8..2e3565747 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -209,9 +209,6 @@ type Stracer interface { // SyscallEnter is called on syscall entry. // // The returned private data is passed to SyscallExit. - // - // TODO(gvisor.dev/issue/155): remove kernel imports from the strace - // package so that the type can be used directly. SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} // SyscallExit is called on syscall exit. diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 77655558e..b94c4fbf5 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -778,9 +778,6 @@ func (s SyscallMap) Name(sysno uintptr) string { // // N.B. This is not in an init function because we can't be sure all syscall // tables are registered with the kernel when init runs. -// -// TODO(gvisor.dev/issue/155): remove kernel package dependencies from this -// package and have the kernel package self-initialize all syscall tables. func Initialize() { for _, table := range kernel.SyscallTables() { // Is this known? 
-- cgit v1.2.3 From daf3322498b698518a3c8545ad05f790deb3848c Mon Sep 17 00:00:00 2001 From: Ian Lewis Date: Fri, 10 Apr 2020 20:31:07 -0700 Subject: Add logging message for noNewPrivileges OCI option. noNewPrivileges is ignored if set to false since gVisor assumes that PR_SET_NO_NEW_PRIVS is always enabled. PiperOrigin-RevId: 305991947 --- pkg/sentry/kernel/task_identity.go | 2 +- pkg/sentry/syscalls/linux/sys_prctl.go | 4 ++-- runsc/specutils/specutils.go | 6 ++++++ 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index ce3e6ef28..0325967e4 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -455,7 +455,7 @@ func (t *Task) SetKeepCaps(k bool) { t.creds.Store(creds) } -// updateCredsForExec updates t.creds to reflect an execve(). +// updateCredsForExecLocked updates t.creds to reflect an execve(). // // NOTE(b/30815691): We currently do not implement privileged executables // (set-user/group-ID bits and file capabilities). This allows us to make a lot diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 9c6728530..f92bf8096 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -161,8 +161,8 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if args[1].Int() != 1 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { return 0, nil, syserror.EINVAL } - // no_new_privs is assumed to always be set. See - // kernel.Task.updateCredsForExec. + // PR_SET_NO_NEW_PRIVS is assumed to always be set. + // See kernel.Task.updateCredsForExecLocked. 
return 0, nil, nil case linux.PR_GET_NO_NEW_PRIVS: diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 0f4a9cf6d..837d5e238 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -92,6 +92,12 @@ func ValidateSpec(spec *specs.Spec) error { log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile) } + // PR_SET_NO_NEW_PRIVS is assumed to always be set. + // See kernel.Task.updateCredsForExecLocked. + if !spec.Process.NoNewPrivileges { + log.Warningf("noNewPrivileges ignored. PR_SET_NO_NEW_PRIVS is assumed to always be set.") + } + // TODO(gvisor.dev/issue/510): Apply seccomp to application inside sandbox. if spec.Linux != nil && spec.Linux.Seccomp != nil { log.Warningf("Seccomp spec is being ignored") -- cgit v1.2.3 From 6a4d17a31dc209afbbca66e871a7c6dc299c167b Mon Sep 17 00:00:00 2001 From: Jon Budd Date: Mon, 13 Apr 2020 11:01:02 -0700 Subject: Remove obsolete TODOs for b/38173783 The comments in the ticket indicate that this behavior is fine and that the ticket should be closed, so we shouldn't need pointers to the ticket. PiperOrigin-RevId: 306266071 --- pkg/context/context.go | 4 ---- pkg/sentry/arch/stack.go | 3 --- pkg/sentry/fs/gofer/file_state.go | 1 - pkg/sentry/fs/gofer/handles.go | 1 - pkg/sentry/fs/gofer/inode_state.go | 1 - pkg/sentry/fs/gofer/session_state.go | 1 - pkg/sentry/fs/inode.go | 1 - pkg/sentry/kernel/shm/shm.go | 2 +- pkg/sentry/kernel/task_context.go | 1 - pkg/sentry/kernel/task_signals.go | 2 -- pkg/usermem/usermem.go | 3 --- 11 files changed, 1 insertion(+), 19 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/context/context.go b/pkg/context/context.go index 23e009ef3..5319b6d8d 100644 --- a/pkg/context/context.go +++ b/pkg/context/context.go @@ -127,10 +127,6 @@ func (logContext) Value(key interface{}) interface{} { var bgContext = &logContext{Logger: log.Log()} // Background returns an empty context using the default logger. 
-// -// Users should be wary of using a Background context. Please tag any use with -// FIXME(b/38173783) and a note to remove this use. -// // Generally, one should use the Task as their context when available, or avoid // having to use a context in places where a Task is unavailable. // diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go index 09bceabc9..1108fa0bd 100644 --- a/pkg/sentry/arch/stack.go +++ b/pkg/sentry/arch/stack.go @@ -97,7 +97,6 @@ func (s *Stack) Push(vals ...interface{}) (usermem.Addr, error) { if c < 0 { return 0, fmt.Errorf("bad binary.Size for %T", v) } - // TODO(b/38173783): Use a real context.Context. n, err := usermem.CopyObjectOut(context.Background(), s.IO, s.Bottom-usermem.Addr(c), norm, usermem.IOOpts{}) if err != nil || c != n { return 0, err @@ -121,11 +120,9 @@ func (s *Stack) Pop(vals ...interface{}) (usermem.Addr, error) { var err error if isVaddr { value := s.Arch.Native(uintptr(0)) - // TODO(b/38173783): Use a real context.Context. n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, value, usermem.IOOpts{}) *vaddr = usermem.Addr(s.Arch.Value(value)) } else { - // TODO(b/38173783): Use a real context.Context. n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, v, usermem.IOOpts{}) } if err != nil { diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go index ff96b28ba..edd6576aa 100644 --- a/pkg/sentry/fs/gofer/file_state.go +++ b/pkg/sentry/fs/gofer/file_state.go @@ -34,7 +34,6 @@ func (f *fileOperations) afterLoad() { flags := f.flags flags.Truncate = false - // TODO(b/38173783): Context is not plumbed to save/restore. 
f.handles, err = f.inodeOperations.fileState.getHandles(context.Background(), flags, f.inodeOperations.cachingInodeOps) if err != nil { return fmt.Errorf("failed to re-open handle: %v", err) diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index 9f7c3e89f..fc14249be 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -57,7 +57,6 @@ func (h *handles) DecRef() { } } } - // FIXME(b/38173783): Context is not plumbed here. if err := h.File.close(context.Background()); err != nil { log.Warningf("error closing p9 file: %v", err) } diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index 238f7804c..a3402e343 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -123,7 +123,6 @@ func (i *inodeFileState) afterLoad() { // beforeSave. return fmt.Errorf("failed to find path for inode number %d. Device %s contains %s", i.sattr.InodeID, i.s.connID, fs.InodeMappings(i.s.inodeMappings)) } - // TODO(b/38173783): Context is not plumbed to save/restore. ctx := &dummyClockContext{context.Background()} _, i.file, err = i.s.attach.walk(ctx, splitAbsolutePath(name)) diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 111da59f9..2d398b753 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -104,7 +104,6 @@ func (s *session) afterLoad() { // If private unix sockets are enabled, create and fill the session's endpoint // maps. if opts.privateunixsocket { - // TODO(b/38173783): Context is not plumbed to save/restore. ctx := &dummyClockContext{context.Background()} if err = s.restoreEndpointMaps(ctx); err != nil { diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index 73f89abcc..a34fbc946 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -102,7 +102,6 @@ func (i *Inode) DecRef() { // destroy releases the Inode and releases the msrc reference taken. 
func (i *Inode) destroy() { - // FIXME(b/38173783): Context is not plumbed here. ctx := context.Background() if err := i.WriteOut(ctx); err != nil { // FIXME(b/65209558): Mark as warning again once noatime is diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 208569057..f66cfcc7f 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -461,7 +461,7 @@ func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.A func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) { s.mu.Lock() defer s.mu.Unlock() - // TODO(b/38173783): RemoveMapping may be called during task exit, when ctx + // RemoveMapping may be called during task exit, when ctx // is context.Background. Gracefully handle missing clocks. Failing to // update the detach time in these cases is ok, since no one can observe the // omission. diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index c115e8d1f..9fa528384 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -58,7 +58,6 @@ func (tc *TaskContext) release() { // Nil out pointers so that if the task is saved after release, it doesn't // follow the pointers to possibly now-invalid objects. if tc.MemoryManager != nil { - // TODO(b/38173783) tc.MemoryManager.DecUsers(context.Background()) tc.MemoryManager = nil } diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 8802db142..6aa798346 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -513,8 +513,6 @@ func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool { if t.stop != nil { return false } - // - TODO(b/38173783): No special case for when t is also the sending task, - // because the identity of the sender is unknown. // - Do not choose tasks that have already been interrupted, as they may be // busy handling another signal. 
if len(t.interruptChan) != 0 { diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go index d2f4403b0..cd6a0ea6b 100644 --- a/pkg/usermem/usermem.go +++ b/pkg/usermem/usermem.go @@ -29,9 +29,6 @@ import ( ) // IO provides access to the contents of a virtual memory space. -// -// FIXME(b/38173783): Implementations of IO cannot expect ctx to contain any -// meaningful data. type IO interface { // CopyOut copies len(src) bytes from src to the memory mapped at addr. It // returns the number of bytes copied. If the number of bytes copied is < -- cgit v1.2.3 From 5d885d7fb21414d903d57ffe2b95bcc62c098d6a Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Mon, 13 Apr 2020 13:01:28 -0700 Subject: Port socket-related syscalls to VFS2. Note that most kinds of sockets are not yet supported in VFS2 (only Unix sockets are partially supported at the moment), so these syscalls will still generally fail. Enabling them allows us to begin running socket tests for VFS2 as more features are ported over. Updates #1476, #1478, #1484, #1485. PiperOrigin-RevId: 306292294 --- pkg/sentry/kernel/fd_table.go | 55 + pkg/sentry/kernel/task.go | 9 + pkg/sentry/syscalls/linux/sys_socket.go | 9 +- pkg/sentry/syscalls/linux/vfs2/BUILD | 6 + .../syscalls/linux/vfs2/linux64_override_amd64.go | 40 +- pkg/sentry/syscalls/linux/vfs2/socket.go | 1138 ++++++++++++++++++++ 6 files changed, 1238 insertions(+), 19 deletions(-) create mode 100644 pkg/sentry/syscalls/linux/vfs2/socket.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index d09d97825..ed40b5303 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -307,6 +307,61 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags return fds, nil } +// NewFDsVFS2 allocates new FDs guaranteed to be the lowest number available +// greater than or equal to the fd parameter. All files will share the set +// flags. 
Success is guaranteed to be all or none. +func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) { + if fd < 0 { + // Don't accept negative FDs. + return nil, syscall.EINVAL + } + + // Default limit. + end := int32(math.MaxInt32) + + // Ensure we don't get past the provided limit. + if limitSet := limits.FromContext(ctx); limitSet != nil { + lim := limitSet.Get(limits.NumberOfFiles) + if lim.Cur != limits.Infinity { + end = int32(lim.Cur) + } + if fd >= end { + return nil, syscall.EMFILE + } + } + + f.mu.Lock() + defer f.mu.Unlock() + + // From f.next to find available fd. + if fd < f.next { + fd = f.next + } + + // Install all entries. + for i := fd; i < end && len(fds) < len(files); i++ { + if d, _, _ := f.getVFS2(i); d == nil { + f.setVFS2(i, files[len(fds)], flags) // Set the descriptor. + fds = append(fds, i) // Record the file descriptor. + } + } + + // Failure? Unwind existing FDs. + if len(fds) < len(files) { + for _, i := range fds { + f.setVFS2(i, nil, FDFlags{}) // Zap entry. + } + return nil, syscall.EMFILE + } + + if fd == f.next { + // Update next search start position. + f.next = fds[len(fds)-1] + 1 + } + + return fds, nil +} + // NewFDVFS2 allocates a file descriptor greater than or equal to minfd for // the given file description. If it succeeds, it takes a reference on file. func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index d6546735e..e5d133d6c 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -777,6 +777,15 @@ func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error return t.fdTable.NewFDs(t, fd, files, flags) } +// NewFDsVFS2 is a convenience wrapper for t.FDTable().NewFDsVFS2. +// +// This automatically passes the task as the context. +// +// Precondition: same as FDTable. 
+func (t *Task) NewFDsVFS2(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) { + return t.fdTable.NewFDsVFS2(t, fd, files, flags) +} + // NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file. // // This automatically passes the task as the context. diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 2919228d0..61b2576ac 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -31,6 +31,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// LINT.IfChange + // minListenBacklog is the minimum reasonable backlog for listening sockets. const minListenBacklog = 8 @@ -244,7 +246,10 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Copy the file descriptors out. if _, err := t.CopyOut(socks, fds); err != nil { - // Note that we don't close files here; see pipe(2) also. + for _, fd := range fds { + _, file := t.FDTable().Remove(fd) + file.DecRef() + } return 0, nil, err } @@ -1128,3 +1133,5 @@ func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen) return n, nil, err } + +// LINT.ThenChange(./vfs2/socket.go) diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 0004e60d9..b32abfe59 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -21,6 +21,7 @@ go_library( "poll.go", "read_write.go", "setstat.go", + "socket.go", "stat.go", "stat_amd64.go", "stat_arm64.go", @@ -32,6 +33,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", + "//pkg/binary", "//pkg/bits", "//pkg/fspath", "//pkg/gohacks", @@ -43,10 +45,14 @@ go_library( "//pkg/sentry/limits", "//pkg/sentry/loader", "//pkg/sentry/memmap", + "//pkg/sentry/socket", + "//pkg/sentry/socket/control", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/syscalls", 
"//pkg/sentry/syscalls/linux", "//pkg/sentry/vfs", "//pkg/sync", + "//pkg/syserr", "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go index 63febc2f7..645e0bcb8 100644 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go @@ -44,21 +44,22 @@ func Override(table map[uintptr]kernel.Syscall) { table[32] = syscalls.Supported("dup", Dup) table[33] = syscalls.Supported("dup2", Dup2) delete(table, 40) // sendfile - delete(table, 41) // socket - delete(table, 42) // connect - delete(table, 43) // accept - delete(table, 44) // sendto - delete(table, 45) // recvfrom - delete(table, 46) // sendmsg - delete(table, 47) // recvmsg - delete(table, 48) // shutdown - delete(table, 49) // bind - delete(table, 50) // listen - delete(table, 51) // getsockname - delete(table, 52) // getpeername - delete(table, 53) // socketpair - delete(table, 54) // setsockopt - delete(table, 55) // getsockopt + // TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2. 
+ table[41] = syscalls.PartiallySupported("socket", Socket, "In process of porting socket syscalls to VFS2.", nil) + table[42] = syscalls.PartiallySupported("connect", Connect, "In process of porting socket syscalls to VFS2.", nil) + table[43] = syscalls.PartiallySupported("accept", Accept, "In process of porting socket syscalls to VFS2.", nil) + table[44] = syscalls.PartiallySupported("sendto", SendTo, "In process of porting socket syscalls to VFS2.", nil) + table[45] = syscalls.PartiallySupported("recvfrom", RecvFrom, "In process of porting socket syscalls to VFS2.", nil) + table[46] = syscalls.PartiallySupported("sendmsg", SendMsg, "In process of porting socket syscalls to VFS2.", nil) + table[47] = syscalls.PartiallySupported("recvmsg", RecvMsg, "In process of porting socket syscalls to VFS2.", nil) + table[48] = syscalls.PartiallySupported("shutdown", Shutdown, "In process of porting socket syscalls to VFS2.", nil) + table[49] = syscalls.PartiallySupported("bind", Bind, "In process of porting socket syscalls to VFS2.", nil) + table[50] = syscalls.PartiallySupported("listen", Listen, "In process of porting socket syscalls to VFS2.", nil) + table[51] = syscalls.PartiallySupported("getsockname", GetSockName, "In process of porting socket syscalls to VFS2.", nil) + table[52] = syscalls.PartiallySupported("getpeername", GetPeerName, "In process of porting socket syscalls to VFS2.", nil) + table[53] = syscalls.PartiallySupported("socketpair", SocketPair, "In process of porting socket syscalls to VFS2.", nil) + table[54] = syscalls.PartiallySupported("getsockopt", GetSockOpt, "In process of porting socket syscalls to VFS2.", nil) + table[55] = syscalls.PartiallySupported("setsockopt", SetSockOpt, "In process of porting socket syscalls to VFS2.", nil) table[59] = syscalls.Supported("execve", Execve) table[72] = syscalls.Supported("fcntl", Fcntl) delete(table, 73) // flock @@ -144,7 +145,8 @@ func Override(table map[uintptr]kernel.Syscall) { delete(table, 285) // 
fallocate table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime) table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime) - delete(table, 288) // accept4 + // TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2. + table[288] = syscalls.PartiallySupported("accept4", Accept4, "In process of porting socket syscalls to VFS2.", nil) delete(table, 289) // signalfd4 delete(table, 290) // eventfd2 table[291] = syscalls.Supported("epoll_create1", EpollCreate1) @@ -153,9 +155,11 @@ func Override(table map[uintptr]kernel.Syscall) { delete(table, 294) // inotify_init1 table[295] = syscalls.Supported("preadv", Preadv) table[296] = syscalls.Supported("pwritev", Pwritev) - delete(table, 299) // recvmmsg + // TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2. + table[299] = syscalls.PartiallySupported("recvmmsg", RecvMMsg, "In process of porting socket syscalls to VFS2.", nil) table[306] = syscalls.Supported("syncfs", Syncfs) - delete(table, 307) // sendmmsg + // TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2. + table[307] = syscalls.PartiallySupported("sendmmsg", SendMMsg, "In process of porting socket syscalls to VFS2.", nil) table[316] = syscalls.Supported("renameat2", Renameat2) delete(table, 319) // memfd_create table[322] = syscalls.Supported("execveat", Execveat) diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go new file mode 100644 index 000000000..79a4a7ada --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/socket.go @@ -0,0 +1,1138 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/control" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// minListenBacklog is the minimum reasonable backlog for listening sockets. +const minListenBacklog = 8 + +// maxListenBacklog is the maximum allowed backlog for listening sockets. +const maxListenBacklog = 1024 + +// maxAddrLen is the maximum socket address length we're willing to accept. +const maxAddrLen = 200 + +// maxOptLen is the maximum sockopt parameter length we're willing to accept. +const maxOptLen = 1024 * 8 + +// maxControlLen is the maximum length of the msghdr.msg_control buffer we're +// willing to accept. Note that this limit is smaller than Linux, which allows +// buffers up to INT_MAX. +const maxControlLen = 10 * 1024 * 1024 + +// nameLenOffset is the offset from the start of the MessageHeader64 struct to +// the NameLen field. +const nameLenOffset = 8 + +// controlLenOffset is the offset from the start of the MessageHeader64 struct +// to the ControlLen field. 
+const controlLenOffset = 40 + +// flagsOffset is the offset from the start of the MessageHeader64 struct +// to the Flags field. +const flagsOffset = 48 + +const sizeOfInt32 = 4 + +// messageHeader64Len is the length of a MessageHeader64 struct. +var messageHeader64Len = uint64(binary.Size(MessageHeader64{})) + +// multipleMessageHeader64Len is the length of a multipleMessageHeader64 struct. +var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{})) + +// baseRecvFlags are the flags that are accepted across recvmsg(2), +// recvmmsg(2), and recvfrom(2). +const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | linux.MSG_NOSIGNAL | linux.MSG_WAITALL | linux.MSG_TRUNC | linux.MSG_CTRUNC + +// MessageHeader64 is the 64-bit representation of the msghdr struct used in +// the recvmsg and sendmsg syscalls. +type MessageHeader64 struct { + // Name is the optional pointer to a network address buffer. + Name uint64 + + // NameLen is the length of the buffer pointed to by Name. + NameLen uint32 + _ uint32 + + // Iov is a pointer to an array of io vectors that describe the memory + // locations involved in the io operation. + Iov uint64 + + // IovLen is the length of the array pointed to by Iov. + IovLen uint64 + + // Control is the optional pointer to ancillary control data. + Control uint64 + + // ControlLen is the length of the data pointed to by Control. + ControlLen uint64 + + // Flags on the sent/received message. + Flags int32 + _ int32 +} + +// multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in +// the recvmmsg and sendmmsg syscalls. +type multipleMessageHeader64 struct { + msgHdr MessageHeader64 + msgLen uint32 + _ int32 +} + +// CopyInMessageHeader64 copies a message header from user to kernel memory. 
+func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error { + b := t.CopyScratchBuffer(52) + if _, err := t.CopyInBytes(addr, b); err != nil { + return err + } + + msg.Name = usermem.ByteOrder.Uint64(b[0:]) + msg.NameLen = usermem.ByteOrder.Uint32(b[8:]) + msg.Iov = usermem.ByteOrder.Uint64(b[16:]) + msg.IovLen = usermem.ByteOrder.Uint64(b[24:]) + msg.Control = usermem.ByteOrder.Uint64(b[32:]) + msg.ControlLen = usermem.ByteOrder.Uint64(b[40:]) + msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:])) + + return nil +} + +// CaptureAddress allocates memory for and copies a socket address structure +// from the untrusted address space range. +func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) { + if addrlen > maxAddrLen { + return nil, syserror.EINVAL + } + + addrBuf := make([]byte, addrlen) + if _, err := t.CopyInBytes(addr, addrBuf); err != nil { + return nil, err + } + + return addrBuf, nil +} + +// writeAddress writes a sockaddr structure and its length to an output buffer +// in the untrusted address space range. If the address is bigger than the +// buffer, it is truncated. +func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error { + // Get the buffer length. + var bufLen uint32 + if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil { + return err + } + + if int32(bufLen) < 0 { + return syserror.EINVAL + } + + // Write the length unconditionally. + if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil { + return err + } + + if addr == nil { + return nil + } + + if bufLen > addrLen { + bufLen = addrLen + } + + // Copy as much of the address as will fit in the buffer. 
+ encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr) + if bufLen > uint32(len(encodedAddr)) { + bufLen = uint32(len(encodedAddr)) + } + _, err := t.CopyOutBytes(addrPtr, encodedAddr[:int(bufLen)]) + return err +} + +// Socket implements the linux syscall socket(2). +func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + domain := int(args[0].Int()) + stype := args[1].Int() + protocol := int(args[2].Int()) + + // Check and initialize the flags. + if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { + return 0, nil, syserror.EINVAL + } + + // Create the new socket. + s, e := socket.NewVFS2(t, domain, linux.SockType(stype&0xf), protocol) + if e != nil { + return 0, nil, e.ToError() + } + defer s.DecRef() + + if err := s.SetStatusFlags(t, t.Credentials(), uint32(stype&linux.SOCK_NONBLOCK)); err != nil { + return 0, nil, err + } + + fd, err := t.NewFDFromVFS2(0, s, kernel.FDFlags{ + CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// SocketPair implements the linux syscall socketpair(2). +func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + domain := int(args[0].Int()) + stype := args[1].Int() + protocol := int(args[2].Int()) + addr := args[3].Pointer() + + // Check and initialize the flags. + if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { + return 0, nil, syserror.EINVAL + } + + // Create the socket pair. + s1, s2, e := socket.PairVFS2(t, domain, linux.SockType(stype&0xf), protocol) + if e != nil { + return 0, nil, e.ToError() + } + // Adding to the FD table will cause an extra reference to be acquired. 
+ defer s1.DecRef() + defer s2.DecRef() + + nonblocking := uint32(stype & linux.SOCK_NONBLOCK) + if err := s1.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil { + return 0, nil, err + } + if err := s2.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil { + return 0, nil, err + } + + // Create the FDs for the sockets. + flags := kernel.FDFlags{ + CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, + } + fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{s1, s2}, flags) + if err != nil { + return 0, nil, err + } + + if _, err := t.CopyOut(addr, fds); err != nil { + for _, fd := range fds { + _, file := t.FDTable().Remove(fd) + file.DecRef() + } + return 0, nil, err + } + + return 0, nil, nil +} + +// Connect implements the linux syscall connect(2). +func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Uint() + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Capture address and call syscall implementation. + a, err := CaptureAddress(t, addr, addrlen) + if err != nil { + return 0, nil, err + } + + blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0 + return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), kernel.ERESTARTSYS) +} + +// accept is the implementation of the accept syscall. It is called by accept +// and accept4 syscall handlers. +func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) { + // Check that no unsupported flags are passed in. + if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { + return 0, syserror.EINVAL + } + + // Get socket from the file descriptor. 
+ file := t.GetFileVFS2(fd) + if file == nil { + return 0, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, syserror.ENOTSOCK + } + + // Call the syscall implementation for this socket, then copy the + // output address if one is specified. + blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0 + + peerRequested := addrLen != 0 + nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking) + if e != nil { + return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + } + if peerRequested { + // NOTE(magi): Linux does not give you an error if it can't + // write the data back out so neither do we. + if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syserror.EINVAL { + return 0, err + } + } + return uintptr(nfd), nil +} + +// Accept4 implements the linux syscall accept4(2). +func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Pointer() + flags := int(args[3].Int()) + + n, err := accept(t, fd, addr, addrlen, flags) + return n, nil, err +} + +// Accept implements the linux syscall accept(2). +func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Pointer() + + n, err := accept(t, fd, addr, addrlen, 0) + return n, nil, err +} + +// Bind implements the linux syscall bind(2). +func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Uint() + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. 
+ s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Capture address and call syscall implementation. + a, err := CaptureAddress(t, addr, addrlen) + if err != nil { + return 0, nil, err + } + + return 0, nil, s.Bind(t, a).ToError() +} + +// Listen implements the linux syscall listen(2). +func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + backlog := args[1].Int() + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Per Linux, the backlog is silently capped to reasonable values. + if backlog <= 0 { + backlog = minListenBacklog + } + if backlog > maxListenBacklog { + backlog = maxListenBacklog + } + + return 0, nil, s.Listen(t, int(backlog)).ToError() +} + +// Shutdown implements the linux syscall shutdown(2). +func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + how := args[1].Int() + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Validate how, then call syscall implementation. + switch how { + case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR: + default: + return 0, nil, syserror.EINVAL + } + + return 0, nil, s.Shutdown(t, int(how)).ToError() +} + +// GetSockOpt implements the linux syscall getsockopt(2). 
+func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + level := args[1].Int() + name := args[2].Int() + optValAddr := args[3].Pointer() + optLenAddr := args[4].Pointer() + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Read the length. Reject negative values. + optLen := int32(0) + if _, err := t.CopyIn(optLenAddr, &optLen); err != nil { + return 0, nil, err + } + if optLen < 0 { + return 0, nil, syserror.EINVAL + } + + // Call syscall implementation then copy both value and value len out. + v, e := getSockOpt(t, s, int(level), int(name), optValAddr, int(optLen)) + if e != nil { + return 0, nil, e.ToError() + } + + vLen := int32(binary.Size(v)) + if _, err := t.CopyOut(optLenAddr, vLen); err != nil { + return 0, nil, err + } + + if v != nil { + if _, err := t.CopyOut(optValAddr, v); err != nil { + return 0, nil, err + } + } + + return 0, nil, nil +} + +// getSockOpt tries to handle common socket options, or dispatches to a specific +// socket implementation. +func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) { + if level == linux.SOL_SOCKET { + switch name { + case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL: + if len < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + } + + switch name { + case linux.SO_TYPE: + _, skType, _ := s.Type() + return int32(skType), nil + case linux.SO_DOMAIN: + family, _, _ := s.Type() + return int32(family), nil + case linux.SO_PROTOCOL: + _, _, protocol := s.Type() + return int32(protocol), nil + } + } + + return s.GetSockOpt(t, level, name, optValAddr, len) +} + +// SetSockOpt implements the linux syscall setsockopt(2). 
+// +// Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket. +func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + level := args[1].Int() + name := args[2].Int() + optValAddr := args[3].Pointer() + optLen := args[4].Int() + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + if optLen < 0 { + return 0, nil, syserror.EINVAL + } + if optLen > maxOptLen { + return 0, nil, syserror.EINVAL + } + buf := t.CopyScratchBuffer(int(optLen)) + if _, err := t.CopyIn(optValAddr, &buf); err != nil { + return 0, nil, err + } + + // Call syscall implementation. + if err := s.SetSockOpt(t, int(level), int(name), buf); err != nil { + return 0, nil, err.ToError() + } + + return 0, nil, nil +} + +// GetSockName implements the linux syscall getsockname(2). +func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Pointer() + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Get the socket name and copy it to the caller. + v, vl, err := s.GetSockName(t) + if err != nil { + return 0, nil, err.ToError() + } + + return 0, nil, writeAddress(t, v, vl, addr, addrlen) +} + +// GetPeerName implements the linux syscall getpeername(2). 
+func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Pointer() + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Get the socket peer name and copy it to the caller. + v, vl, err := s.GetPeerName(t) + if err != nil { + return 0, nil, err.ToError() + } + + return 0, nil, writeAddress(t, v, vl, addr, addrlen) +} + +// RecvMsg implements the linux syscall recvmsg(2). +func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + msgPtr := args[1].Pointer() + flags := args[2].Int() + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. + return 0, nil, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Reject flags that we don't handle yet. + if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { + return 0, nil, syserror.EINVAL + } + + if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { + flags |= linux.MSG_DONTWAIT + } + + var haveDeadline bool + var deadline ktime.Time + if dl := s.RecvTimeout(); dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT + } + + n, err := recvSingleMsg(t, s, msgPtr, flags, haveDeadline, deadline) + return n, nil, err +} + +// RecvMMsg implements the linux syscall recvmmsg(2). 
+func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + msgPtr := args[1].Pointer() + vlen := args[2].Uint() + flags := args[3].Int() + toPtr := args[4].Pointer() + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. + return 0, nil, syserror.EINVAL + } + + // Reject flags that we don't handle yet. + if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { + return 0, nil, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { + flags |= linux.MSG_DONTWAIT + } + + var haveDeadline bool + var deadline ktime.Time + if toPtr != 0 { + var ts linux.Timespec + if _, err := ts.CopyIn(t, toPtr); err != nil { + return 0, nil, err + } + if !ts.Valid() { + return 0, nil, syserror.EINVAL + } + deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration()) + haveDeadline = true + } + + if !haveDeadline { + if dl := s.RecvTimeout(); dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT + } + } + + var count uint32 + var err error + for i := uint64(0); i < uint64(vlen); i++ { + mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) + if !ok { + return 0, nil, syserror.EFAULT + } + var n uintptr + if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil { + break + } + + // Copy the received length to the caller. 
+ lp, ok := mp.AddLength(messageHeader64Len) + if !ok { + return 0, nil, syserror.EFAULT + } + if _, err = t.CopyOut(lp, uint32(n)); err != nil { + break + } + count++ + } + + if count == 0 { + return 0, nil, err + } + return uintptr(count), nil, nil +} + +func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) { + // Capture the message header and io vectors. + var msg MessageHeader64 + if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil { + return 0, err + } + + if msg.IovLen > linux.UIO_MAXIOV { + return 0, syserror.EMSGSIZE + } + dst, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + // FIXME(b/63594852): Pretend we have an empty error queue. + if flags&linux.MSG_ERRQUEUE != 0 { + return 0, syserror.EAGAIN + } + + // Fast path when no control message nor name buffers are provided. + if msg.ControlLen == 0 && msg.NameLen == 0 { + n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) + if err != nil { + return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS) + } + if !cms.Unix.Empty() { + mflags |= linux.MSG_CTRUNC + cms.Release() + } + + if int(msg.Flags) != mflags { + // Copy out the flags to the caller. 
+ if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil { + return 0, err + } + } + + return uintptr(n), nil + } + + if msg.ControlLen > maxControlLen { + return 0, syserror.ENOBUFS + } + n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) + if e != nil { + return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + } + defer cms.Release() + + controlData := make([]byte, 0, msg.ControlLen) + controlData = control.PackControlMessages(t, cms, controlData) + + if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() { + creds, _ := cms.Unix.Credentials.(control.SCMCredentials) + controlData, mflags = control.PackCredentials(t, creds, controlData, mflags) + } + + if cms.Unix.Rights != nil { + controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags) + } + + // Copy the address to the caller. + if msg.NameLen != 0 { + if err := writeAddress(t, sender, senderLen, usermem.Addr(msg.Name), usermem.Addr(msgPtr+nameLenOffset)); err != nil { + return 0, err + } + } + + // Copy the control data to the caller. + if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil { + return 0, err + } + if len(controlData) > 0 { + if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil { + return 0, err + } + } + + // Copy out the flags to the caller. + if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil { + return 0, err + } + + return uintptr(n), nil +} + +// recvFrom is the implementation of the recvfrom syscall. It is called by +// recvfrom and recv syscall handlers. +func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) { + if int(bufLen) < 0 { + return 0, syserror.EINVAL + } + + // Reject flags that we don't handle yet. 
+ if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 { + return 0, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, syserror.ENOTSOCK + } + + if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { + flags |= linux.MSG_DONTWAIT + } + + dst, err := t.SingleIOSequence(bufPtr, int(bufLen), usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + var haveDeadline bool + var deadline ktime.Time + if dl := s.RecvTimeout(); dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT + } + + n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) + cm.Release() + if e != nil { + return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + } + + // Copy the address to the caller. + if nameLenPtr != 0 { + if err := writeAddress(t, sender, senderLen, namePtr, nameLenPtr); err != nil { + return 0, err + } + } + + return uintptr(n), nil +} + +// RecvFrom implements the linux syscall recvfrom(2). +func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + bufPtr := args[1].Pointer() + bufLen := args[2].Uint64() + flags := args[3].Int() + namePtr := args[4].Pointer() + nameLenPtr := args[5].Pointer() + + n, err := recvFrom(t, fd, bufPtr, bufLen, flags, namePtr, nameLenPtr) + return n, nil, err +} + +// SendMsg implements the linux syscall sendmsg(2). +func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + msgPtr := args[1].Pointer() + flags := args[2].Int() + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. 
+ return 0, nil, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Reject flags that we don't handle yet. + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { + return 0, nil, syserror.EINVAL + } + + if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { + flags |= linux.MSG_DONTWAIT + } + + n, err := sendSingleMsg(t, s, file, msgPtr, flags) + return n, nil, err +} + +// SendMMsg implements the linux syscall sendmmsg(2). +func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + msgPtr := args[1].Pointer() + vlen := args[2].Uint() + flags := args[3].Int() + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. + return 0, nil, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Reject flags that we don't handle yet. + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { + return 0, nil, syserror.EINVAL + } + + if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { + flags |= linux.MSG_DONTWAIT + } + + var count uint32 + var err error + for i := uint64(0); i < uint64(vlen); i++ { + mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) + if !ok { + return 0, nil, syserror.EFAULT + } + var n uintptr + if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil { + break + } + + // Copy the received length to the caller. 
+ lp, ok := mp.AddLength(messageHeader64Len) + if !ok { + return 0, nil, syserror.EFAULT + } + if _, err = t.CopyOut(lp, uint32(n)); err != nil { + break + } + count++ + } + + if count == 0 { + return 0, nil, err + } + return uintptr(count), nil, nil +} + +func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescription, msgPtr usermem.Addr, flags int32) (uintptr, error) { + // Capture the message header. + var msg MessageHeader64 + if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil { + return 0, err + } + + var controlData []byte + if msg.ControlLen > 0 { + // Put an upper bound to prevent large allocations. + if msg.ControlLen > maxControlLen { + return 0, syserror.ENOBUFS + } + controlData = make([]byte, msg.ControlLen) + if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil { + return 0, err + } + } + + // Read the destination address if one is specified. + var to []byte + if msg.NameLen != 0 { + var err error + to, err = CaptureAddress(t, usermem.Addr(msg.Name), msg.NameLen) + if err != nil { + return 0, err + } + } + + // Read data then call the sendmsg implementation. + if msg.IovLen > linux.UIO_MAXIOV { + return 0, syserror.EMSGSIZE + } + src, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + controlMessages, err := control.Parse(t, s, controlData) + if err != nil { + return 0, err + } + + var haveDeadline bool + var deadline ktime.Time + if dl := s.SendTimeout(); dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT + } + + // Call the syscall implementation. 
+ n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) + err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file) + if err != nil { + controlMessages.Release() + } + return uintptr(n), err +} + +// sendTo is the implementation of the sendto syscall. It is called by sendto +// and send syscall handlers. +func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) { + bl := int(bufLen) + if bl < 0 { + return 0, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, syserror.ENOTSOCK + } + + if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { + flags |= linux.MSG_DONTWAIT + } + + // Read the destination address if one is specified. + var to []byte + var err error + if namePtr != 0 { + to, err = CaptureAddress(t, namePtr, nameLen) + if err != nil { + return 0, err + } + } + + src, err := t.SingleIOSequence(bufPtr, bl, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + var haveDeadline bool + var deadline ktime.Time + if dl := s.SendTimeout(); dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT + } + + // Call the syscall implementation. + n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)}) + return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file) +} + +// SendTo implements the linux syscall sendto(2). 
+func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + bufPtr := args[1].Pointer() + bufLen := args[2].Uint64() + flags := args[3].Int() + namePtr := args[4].Pointer() + nameLen := args[5].Uint() + + n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen) + return n, nil, err +} -- cgit v1.2.3 From 28399818fc1e5d294cc93ddd4a1ac7e31c375fbf Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 16 Apr 2020 11:48:14 -0700 Subject: Make ExtractErrno a function PiperOrigin-RevId: 306891171 --- pkg/sentry/kernel/task_run.go | 2 +- pkg/sentry/kernel/task_signals.go | 2 +- pkg/sentry/kernel/task_syscall.go | 12 ++++++------ pkg/sentry/strace/strace.go | 2 +- pkg/sentry/syscalls/linux/sys_aio.go | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 799cbcd93..2ba8d7e63 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -353,7 +353,7 @@ func (app *runApp) execute(t *Task) taskRunState { default: // What happened? Can't continue. 
t.Warningf("Unexpected SwitchToApp error: %v", err) - t.PrepareExit(ExitStatus{Code: t.ExtractErrno(err, -1)}) + t.PrepareExit(ExitStatus{Code: ExtractErrno(err, -1)}) return (*runExit)(nil) } } diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 6aa798346..f07de2089 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -174,7 +174,7 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS fallthrough case (sre == ERESTARTSYS && !act.IsRestart()): t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) - t.Arch().SetReturn(uintptr(-t.ExtractErrno(syserror.EINTR, -1))) + t.Arch().SetReturn(uintptr(-ExtractErrno(syserror.EINTR, -1))) default: t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) t.Arch().RestartSyscall() diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 3d7a734ef..c9db78e06 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -312,7 +312,7 @@ func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRu return ctrl.next } } else if err != nil { - t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno)))) + t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno)))) t.haveSyscallReturn = true } else { t.Arch().SetReturn(rval) @@ -431,7 +431,7 @@ func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, calle // A return is not emulated in this case. 
return (*runApp)(nil) } - t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno)))) + t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno)))) } t.Arch().SetIP(t.Arch().Value(caller)) t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width())) @@ -441,7 +441,7 @@ func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, calle // ExtractErrno extracts an integer error number from the error. // The syscall number is purely for context in the error case. Use -1 if // syscall number is unknown. -func (t *Task) ExtractErrno(err error, sysno int) int { +func ExtractErrno(err error, sysno int) int { switch err := err.(type) { case nil: return 0 @@ -455,11 +455,11 @@ func (t *Task) ExtractErrno(err error, sysno int) int { // handled (and the SIGBUS is delivered). return int(syscall.EFAULT) case *os.PathError: - return t.ExtractErrno(err.Err, sysno) + return ExtractErrno(err.Err, sysno) case *os.LinkError: - return t.ExtractErrno(err.Err, sysno) + return ExtractErrno(err.Err, sysno) case *os.SyscallError: - return t.ExtractErrno(err.Err, sysno) + return ExtractErrno(err.Err, sysno) default: if errno, ok := syserror.TranslateError(err); ok { return int(errno) diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index b94c4fbf5..68ca537c8 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -719,7 +719,7 @@ func (s SyscallMap) SyscallEnter(t *kernel.Task, sysno uintptr, args arch.Syscal // SyscallExit implements kernel.Stracer.SyscallExit. It logs the syscall // exit trace. 
func (s SyscallMap) SyscallExit(context interface{}, t *kernel.Task, sysno, rval uintptr, err error) { - errno := t.ExtractErrno(err, int(sysno)) + errno := kernel.ExtractErrno(err, int(sysno)) c := context.(*syscallContext) elapsed := time.Since(c.start) diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index 38cbeba5a..d781d6a04 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -290,7 +290,7 @@ func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioC // Update the result. if err != nil { err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file) - ev.Result = -int64(t.ExtractErrno(err, 0)) + ev.Result = -int64(kernel.ExtractErrno(err, 0)) } file.DecRef() -- cgit v1.2.3 From f03996c5e9803934226e4b3a10827501cb936ab9 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 16 Apr 2020 19:26:02 -0700 Subject: Implement pipe(2) and pipe2(2) for VFS2. 
Updates #1035 PiperOrigin-RevId: 306968644 --- pkg/sentry/fsimpl/pipefs/BUILD | 20 +++ pkg/sentry/fsimpl/pipefs/pipefs.go | 148 +++++++++++++++++++ pkg/sentry/fsimpl/tmpfs/filesystem.go | 2 +- pkg/sentry/fsimpl/tmpfs/named_pipe.go | 23 +-- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 2 +- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/kernel.go | 30 +++- pkg/sentry/kernel/pipe/vfs.go | 162 ++++++++++++--------- pkg/sentry/syscalls/linux/sys_pipe.go | 14 +- pkg/sentry/syscalls/linux/vfs2/BUILD | 3 + pkg/sentry/syscalls/linux/vfs2/fd.go | 17 +++ .../syscalls/linux/vfs2/linux64_override_amd64.go | 4 +- pkg/sentry/syscalls/linux/vfs2/pipe.go | 63 ++++++++ pkg/sentry/syscalls/linux/vfs2/read_write.go | 8 +- pkg/sentry/vfs/vfs.go | 2 +- test/syscalls/linux/pipe.cc | 2 + 16 files changed, 389 insertions(+), 112 deletions(-) create mode 100644 pkg/sentry/fsimpl/pipefs/BUILD create mode 100644 pkg/sentry/fsimpl/pipefs/pipefs.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/pipe.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD new file mode 100644 index 000000000..0d411606f --- /dev/null +++ b/pkg/sentry/fsimpl/pipefs/BUILD @@ -0,0 +1,20 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "pipefs", + srcs = ["pipefs.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", + "//pkg/sentry/kernel/time", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go new file mode 100644 index 000000000..faf3179bc --- /dev/null +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -0,0 +1,148 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pipefs provides the filesystem implementation backing +// Kernel.PipeMount. +package pipefs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +type filesystemType struct{} + +// Name implements vfs.FilesystemType.Name. +func (filesystemType) Name() string { + return "pipefs" +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + panic("pipefs.filesystemType.GetFilesystem should never be called") +} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + kernfs.Filesystem + + // TODO(gvisor.dev/issue/1193): + // + // - kernfs does not provide a way to implement statfs, from which we + // should indicate PIPEFS_MAGIC. + // + // - kernfs does not provide a way to override names for + // vfs.FilesystemImpl.PrependPath(); pipefs inodes should use synthetic + // name fmt.Sprintf("pipe:[%d]", inode.ino). +} + +// NewFilesystem sets up and returns a new vfs.Filesystem implemented by +// pipefs. 
+func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { + fs := &filesystem{} + fs.Init(vfsObj, filesystemType{}) + return fs.VFSFilesystem() +} + +// inode implements kernfs.Inode. +type inode struct { + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + kernfs.InodeNoopRefCount + + pipe *pipe.VFSPipe + + ino uint64 + uid auth.KUID + gid auth.KGID + // We use the creation timestamp for all of atime, mtime, and ctime. + ctime ktime.Time +} + +func newInode(ctx context.Context, fs *kernfs.Filesystem) *inode { + creds := auth.CredentialsFromContext(ctx) + return &inode{ + pipe: pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize), + ino: fs.NextIno(), + uid: creds.EffectiveKUID, + gid: creds.EffectiveKGID, + ctime: ktime.NowFromContext(ctx), + } +} + +const pipeMode = 0600 | linux.S_IFIFO + +// CheckPermissions implements kernfs.Inode.CheckPermissions. +func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, pipeMode, i.uid, i.gid) +} + +// Mode implements kernfs.Inode.Mode. +func (i *inode) Mode() linux.FileMode { + return pipeMode +} + +// Stat implements kernfs.Inode.Stat. +func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds()) + return linux.Statx{ + Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, + Blksize: usermem.PageSize, + Nlink: 1, + UID: uint32(i.uid), + GID: uint32(i.gid), + Mode: pipeMode, + Ino: i.ino, + Size: 0, + Blocks: 0, + Atime: ts, + Ctime: ts, + Mtime: ts, + // TODO(gvisor.dev/issue/1197): Device number. + }, nil +} + +// SetStat implements kernfs.Inode.SetStat. 
+func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + if opts.Stat.Mask == 0 { + return nil + } + return syserror.EPERM +} + +// Open implements kernfs.Inode.Open. +func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // FIXME(b/38173783): kernfs does not plumb Context here. + return i.pipe.Open(context.Background(), rp.Mount(), vfsd, opts.Flags) +} + +// NewConnectedPipeFDs returns a pair of FileDescriptions representing the read +// and write ends of a newly-created pipe, as for pipe(2) and pipe2(2). +// +// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). +func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, *vfs.FileDescription) { + fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) + inode := newInode(ctx, fs) + var d kernfs.Dentry + d.Init(inode) + defer d.DecRef() + return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags) +} diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index f4d50d64f..660f5a29b 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -392,7 +392,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open // Can't open symlinks without O_PATH (which is unimplemented). 
return nil, syserror.ELOOP case *namedPipe: - return newNamedPipeFD(ctx, impl, rp, &d.vfsd, opts.Flags) + return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags) case *deviceFile: return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts) case *socketFile: diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go index 2c5c739df..8d77b3fa8 100644 --- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -16,10 +16,8 @@ package tmpfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" - "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) @@ -33,27 +31,8 @@ type namedPipe struct { // * fs.mu must be locked. // * rp.Mount().CheckBeginWrite() has been called successfully. func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode { - file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)} + file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)} file.inode.init(file, fs, creds, linux.S_IFIFO|mode) file.inode.nlink = 1 // Only the parent has a link. return &file.inode } - -// namedPipeFD implements vfs.FileDescriptionImpl. Methods are implemented -// entirely via struct embedding. 
-type namedPipeFD struct { - fileDescription - - *pipe.VFSPipeFD -} - -func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { - var err error - var fd namedPipeFD - fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, vfsd, &fd.vfsfd, flags) - if err != nil { - return nil, err - } - fd.vfsfd.Init(&fd, flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}) - return &fd.vfsfd, nil -} diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 9fa8637d5..a59b24d45 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -357,6 +357,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu return err } i.mu.Lock() + defer i.mu.Unlock() var ( needsMtimeBump bool needsCtimeBump bool @@ -427,7 +428,6 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu atomic.StoreInt64(&i.ctime, now) } - i.mu.Unlock() return nil } diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index e0ff58d8c..e47af66d6 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -170,6 +170,7 @@ go_library( "//pkg/sentry/fs/timerfd", "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index de8a95854..fef60e636 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -50,6 +50,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/fsbridge" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -254,6 +255,10 @@ type Kernel struct { // VFS keeps the filesystem state used across the kernel. 
vfs vfs.VirtualFilesystem + // pipeMount is the Mount used for pipes created by the pipe() and pipe2() + // syscalls (as opposed to named pipes created by mknod()). + pipeMount *vfs.Mount + // If set to true, report address space activation waits as if the task is in // external wait so that the watchdog doesn't report the task stuck. SleepForAddressSpaceActivation bool @@ -354,19 +359,29 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} k.futexes = futex.NewManager() k.netlinkPorts = port.New() + if VFS2Enabled { if err := k.vfs.Init(); err != nil { return fmt.Errorf("failed to initialize VFS: %v", err) } - fs := sockfs.NewFilesystem(&k.vfs) - // NewDisconnectedMount will take an additional reference on fs. - defer fs.DecRef() - sm, err := k.vfs.NewDisconnectedMount(fs, nil, &vfs.MountOptions{}) + + pipeFilesystem := pipefs.NewFilesystem(&k.vfs) + defer pipeFilesystem.DecRef() + pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to create pipefs mount: %v", err) + } + k.pipeMount = pipeMount + + socketFilesystem := sockfs.NewFilesystem(&k.vfs) + defer socketFilesystem.DecRef() + socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to initialize socket mount: %v", err) } - k.socketMount = sm + k.socketMount = socketMount } + return nil } @@ -1613,3 +1628,8 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { func (k *Kernel) VFS() *vfs.VirtualFilesystem { return &k.vfs } + +// PipeMount returns the pipefs mount. 
+func (k *Kernel) PipeMount() *vfs.Mount { + return k.pipeMount +} diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index a5675bd70..b54f08a30 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -49,38 +49,42 @@ type VFSPipe struct { } // NewVFSPipe returns an initialized VFSPipe. -func NewVFSPipe(sizeBytes, atomicIOBytes int64) *VFSPipe { +func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe { var vp VFSPipe - initPipe(&vp.pipe, true /* isNamed */, sizeBytes, atomicIOBytes) + initPipe(&vp.pipe, isNamed, sizeBytes, atomicIOBytes) return &vp } -// NewVFSPipeFD opens a named pipe. Named pipes have special blocking semantics -// during open: +// ReaderWriterPair returns read-only and write-only FDs for vp. // -// "Normally, opening the FIFO blocks until the other end is opened also. A -// process can open a FIFO in nonblocking mode. In this case, opening for -// read-only will succeed even if no-one has opened on the write side yet, -// opening for write-only will fail with ENXIO (no such device or address) -// unless the other end has already been opened. Under Linux, opening a FIFO -// for read and write will succeed both in blocking and nonblocking mode. POSIX -// leaves this behavior undefined. This can be used to open a FIFO for writing -// while there are no readers available." - fifo(7) -func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { +// Preconditions: statusFlags should not contain an open access mode. +func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) { + return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags) +} + +// Open opens the pipe represented by vp. 
+func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, error) { vp.mu.Lock() defer vp.mu.Unlock() - readable := vfs.MayReadFileWithOpenFlags(flags) - writable := vfs.MayWriteFileWithOpenFlags(flags) + readable := vfs.MayReadFileWithOpenFlags(statusFlags) + writable := vfs.MayWriteFileWithOpenFlags(statusFlags) if !readable && !writable { return nil, syserror.EINVAL } - vfd, err := vp.open(vfsd, vfsfd, flags) - if err != nil { - return nil, err - } + fd := vp.newFD(mnt, vfsd, statusFlags) + // Named pipes have special blocking semantics during open: + // + // "Normally, opening the FIFO blocks until the other end is opened also. A + // process can open a FIFO in nonblocking mode. In this case, opening for + // read-only will succeed even if no-one has opened on the write side yet, + // opening for write-only will fail with ENXIO (no such device or address) + // unless the other end has already been opened. Under Linux, opening a + // FIFO for read and write will succeed both in blocking and nonblocking + // mode. POSIX leaves this behavior undefined. This can be used to open a + // FIFO for writing while there are no readers available." - fifo(7) switch { case readable && writable: // Pipes opened for read-write always succeed without blocking. @@ -89,23 +93,26 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vf case readable: newHandleLocked(&vp.rWakeup) - // If this pipe is being opened as nonblocking and there's no + // If this pipe is being opened as blocking and there's no // writer, we have to wait for a writer to open the other end. 
- if flags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) { + if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) { + fd.DecRef() return nil, syserror.EINTR } case writable: newHandleLocked(&vp.wWakeup) - if !vp.pipe.HasReaders() { - // Nonblocking, write-only opens fail with ENXIO when - // the read side isn't open yet. - if flags&linux.O_NONBLOCK != 0 { + if vp.pipe.isNamed && !vp.pipe.HasReaders() { + // Non-blocking, write-only opens fail with ENXIO when the read + // side isn't open yet. + if statusFlags&linux.O_NONBLOCK != 0 { + fd.DecRef() return nil, syserror.ENXIO } // Wait for a reader to open the other end. if !waitFor(&vp.mu, &vp.rWakeup, ctx) { + fd.DecRef() return nil, syserror.EINTR } } @@ -114,96 +121,93 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vf panic("invalid pipe flags: must be readable, writable, or both") } - return vfd, nil + return fd, nil } // Preconditions: vp.mu must be held. 
-func (vp *VFSPipe) open(vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { - var fd VFSPipeFD - fd.flags = flags - fd.readable = vfs.MayReadFileWithOpenFlags(flags) - fd.writable = vfs.MayWriteFileWithOpenFlags(flags) - fd.vfsfd = vfsfd - fd.pipe = &vp.pipe +func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *vfs.FileDescription { + fd := &VFSPipeFD{ + pipe: &vp.pipe, + } + fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }) switch { - case fd.readable && fd.writable: + case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable(): vp.pipe.rOpen() vp.pipe.wOpen() - case fd.readable: + case fd.vfsfd.IsReadable(): vp.pipe.rOpen() - case fd.writable: + case fd.vfsfd.IsWritable(): vp.pipe.wOpen() default: panic("invalid pipe flags: must be readable, writable, or both") } - return &fd, nil + return &fd.vfsfd } -// VFSPipeFD implements a subset of vfs.FileDescriptionImpl for pipes. It is -// expected that filesystesm will use this in a struct implementing -// vfs.FileDescriptionImpl. +// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. type VFSPipeFD struct { - pipe *Pipe - flags uint32 - readable bool - writable bool - vfsfd *vfs.FileDescription + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + pipe *Pipe } // Release implements vfs.FileDescriptionImpl.Release. 
func (fd *VFSPipeFD) Release() { var event waiter.EventMask - if fd.readable { + if fd.vfsfd.IsReadable() { fd.pipe.rClose() - event |= waiter.EventIn + event |= waiter.EventOut } - if fd.writable { + if fd.vfsfd.IsWritable() { fd.pipe.wClose() - event |= waiter.EventOut + event |= waiter.EventIn | waiter.EventHUp } if event == 0 { panic("invalid pipe flags: must be readable, writable, or both") } - if fd.writable { - fd.vfsfd.VirtualDentry().Mount().EndWrite() - } - fd.pipe.Notify(event) } -// OnClose implements vfs.FileDescriptionImpl.OnClose. -func (fd *VFSPipeFD) OnClose(_ context.Context) error { - return nil +// Readiness implements waiter.Waitable.Readiness. +func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask { + switch { + case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable(): + return fd.pipe.rwReadiness() + case fd.vfsfd.IsReadable(): + return fd.pipe.rReadiness() + case fd.vfsfd.IsWritable(): + return fd.pipe.wReadiness() + default: + panic("pipe FD is neither readable nor writable") + } } -// PRead implements vfs.FileDescriptionImpl.PRead. -func (fd *VFSPipeFD) PRead(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.ReadOptions) (int64, error) { - return 0, syserror.ESPIPE +// EventRegister implements waiter.Waitable.EventRegister. +func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.pipe.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (fd *VFSPipeFD) EventUnregister(e *waiter.Entry) { + fd.pipe.EventUnregister(e) } // Read implements vfs.FileDescriptionImpl.Read. func (fd *VFSPipeFD) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { - if !fd.readable { - return 0, syserror.EINVAL - } - return fd.pipe.Read(ctx, dst) } -// PWrite implements vfs.FileDescriptionImpl.PWrite. 
-func (fd *VFSPipeFD) PWrite(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.WriteOptions) (int64, error) { - return 0, syserror.ESPIPE -} - // Write implements vfs.FileDescriptionImpl.Write. func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { - if !fd.writable { - return 0, syserror.EINVAL - } - return fd.pipe.Write(ctx, src) } @@ -211,3 +215,17 @@ func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.Wr func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { return fd.pipe.Ioctl(ctx, uio, args) } + +// PipeSize implements fcntl(F_GETPIPE_SZ). +func (fd *VFSPipeFD) PipeSize() int64 { + // Inline Pipe.FifoSize() rather than calling it with nil Context and + // fs.File and ignoring the returned error (which is always nil). + fd.pipe.mu.Lock() + defer fd.pipe.mu.Unlock() + return fd.pipe.max +} + +// SetPipeSize implements fcntl(F_SETPIPE_SZ). +func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) { + return fd.pipe.SetFifoSize(size) +} diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 798344042..43c510930 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -24,6 +24,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// LINT.IfChange + // pipe2 implements the actual system call with flags. func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 { @@ -45,10 +47,12 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { } if _, err := t.CopyOut(addr, fds); err != nil { - // The files are not closed in this case, the exact semantics - // of this error case are not well defined, but they could have - // already been observed by user space. 
- return 0, syserror.EFAULT + for _, fd := range fds { + if file, _ := t.FDTable().Remove(fd); file != nil { + file.DecRef() + } + } + return 0, err } return 0, nil } @@ -69,3 +73,5 @@ func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall n, err := pipe2(t, addr, flags) return n, nil, err } + +// LINT.ThenChange(vfs2/pipe.go) diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index b32abfe59..6ff2d84d2 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -18,6 +18,7 @@ go_library( "linux64_override_arm64.go", "mmap.go", "path.go", + "pipe.go", "poll.go", "read_write.go", "setstat.go", @@ -39,8 +40,10 @@ go_library( "//pkg/gohacks", "//pkg/sentry/arch", "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/loader", diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 3afcea665..8181d80f4 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -18,6 +18,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/syserror" ) @@ -140,6 +141,22 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(file.StatusFlags()), nil, nil case linux.F_SETFL: return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint()) + case linux.F_SETPIPE_SZ: + pipefile, ok := file.Impl().(*pipe.VFSPipeFD) + if !ok { + return 0, nil, syserror.EBADF + } + n, err := pipefile.SetPipeSize(int64(args[2].Int())) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil + case linux.F_GETPIPE_SZ: + pipefile, ok := 
file.Impl().(*pipe.VFSPipeFD) + if !ok { + return 0, nil, syserror.EBADF + } + return uintptr(pipefile.PipeSize()), nil, nil default: // TODO(gvisor.dev/issue/1623): Everything else is not yet supported. return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go index 645e0bcb8..21eb98444 100644 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go @@ -39,7 +39,7 @@ func Override(table map[uintptr]kernel.Syscall) { table[19] = syscalls.Supported("readv", Readv) table[20] = syscalls.Supported("writev", Writev) table[21] = syscalls.Supported("access", Access) - delete(table, 22) // pipe + table[22] = syscalls.Supported("pipe", Pipe) table[23] = syscalls.Supported("select", Select) table[32] = syscalls.Supported("dup", Dup) table[33] = syscalls.Supported("dup2", Dup2) @@ -151,7 +151,7 @@ func Override(table map[uintptr]kernel.Syscall) { delete(table, 290) // eventfd2 table[291] = syscalls.Supported("epoll_create1", EpollCreate1) table[292] = syscalls.Supported("dup3", Dup3) - delete(table, 293) // pipe2 + table[293] = syscalls.Supported("pipe2", Pipe2) delete(table, 294) // inotify_init1 table[295] = syscalls.Supported("preadv", Preadv) table[296] = syscalls.Supported("pwritev", Pwritev) diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go new file mode 100644 index 000000000..4a01e4209 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go @@ -0,0 +1,63 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Pipe implements Linux syscall pipe(2). +func Pipe(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + return 0, nil, pipe2(t, addr, 0) +} + +// Pipe2 implements Linux syscall pipe2(2). +func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Int() + return 0, nil, pipe2(t, addr, flags) +} + +func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error { + if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 { + return syserror.EINVAL + } + r, w := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK)) + defer r.DecRef() + defer w.DecRef() + + fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{r, w}, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) + if err != nil { + return err + } + if _, err := t.CopyOut(addr, fds); err != nil { + for _, fd := range fds { + if _, file := t.FDTable().Remove(fd); file != nil { + file.DecRef() + } + } + return err + } + return nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go index 898b190fd..6c6998f45 100644 --- a/pkg/sentry/syscalls/linux/vfs2/read_write.go 
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -103,7 +103,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt // Issue the request and break out if it completes with anything other than // "would block". - n, err := file.Read(t, dst, opts) + n, err = file.Read(t, dst, opts) total += n if err != syserror.ErrWouldBlock { break @@ -248,7 +248,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of // Issue the request and break out if it completes with anything other than // "would block". - n, err := file.PRead(t, dst, offset+total, opts) + n, err = file.PRead(t, dst, offset+total, opts) total += n if err != syserror.ErrWouldBlock { break @@ -335,7 +335,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op // Issue the request and break out if it completes with anything other than // "would block". - n, err := file.Write(t, src, opts) + n, err = file.Write(t, src, opts) total += n if err != syserror.ErrWouldBlock { break @@ -480,7 +480,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o // Issue the request and break out if it completes with anything other than // "would block". 
- n, err := file.PWrite(t, src, offset+total, opts) + n, err = file.PWrite(t, src, offset+total, opts) total += n if err != syserror.ErrWouldBlock { break diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 053c6e1d1..cb5bbd781 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -335,7 +335,7 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia rp := vfs.getResolvingPath(creds, pop) for { err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts) - if err != nil { + if err == nil { vfs.putResolvingPath(rp) return nil } diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc index d8e19e910..67228b66b 100644 --- a/test/syscalls/linux/pipe.cc +++ b/test/syscalls/linux/pipe.cc @@ -265,6 +265,8 @@ TEST_P(PipeTest, OffsetCalls) { SyscallFailsWithErrno(ESPIPE)); struct iovec iov; + iov.iov_base = &buf; + iov.iov_len = sizeof(buf); EXPECT_THAT(preadv(wfd_.get(), &iov, 1, 0), SyscallFailsWithErrno(ESPIPE)); EXPECT_THAT(pwritev(rfd_.get(), &iov, 1, 0), SyscallFailsWithErrno(ESPIPE)); } -- cgit v1.2.3 From 12bde95635ac266aab8087b4705372bb177638f3 Mon Sep 17 00:00:00 2001 From: Zach Koopmans Date: Fri, 17 Apr 2020 10:38:04 -0700 Subject: Get /bin/true to run on VFS2 Included: - loader_test.go RunTest and TestStartSignal VFS2 - container_test.go TestAppExitStatus on VFS2 - experimental flag added to runsc to turn on VFS2 Note: shared mounts are not yet supported. 
PiperOrigin-RevId: 307070753 --- pkg/sentry/kernel/syscalls.go | 7 + runsc/boot/BUILD | 11 ++ runsc/boot/config.go | 5 + runsc/boot/fds.go | 33 ++++ runsc/boot/fs.go | 9 +- runsc/boot/loader.go | 31 +++- runsc/boot/loader_amd64.go | 5 +- runsc/boot/loader_arm64.go | 5 +- runsc/boot/loader_test.go | 37 ++++- runsc/boot/user.go | 64 ++++++++ runsc/boot/vfs.go | 310 ++++++++++++++++++++++++++++++++++++++ runsc/container/container_test.go | 14 +- runsc/main.go | 3 + 13 files changed, 513 insertions(+), 21 deletions(-) create mode 100644 runsc/boot/vfs.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 2e3565747..84156d5a1 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -326,6 +326,13 @@ func RegisterSyscallTable(s *SyscallTable) { allSyscallTables = append(allSyscallTables, s) } +// FlushSyscallTablesTestOnly flushes the syscall tables for tests. Used for +// parameterized VFSv2 tests. +// TODO(gvisor.dev/issue/1624): Remove when VFS1 is no longer supported. +func FlushSyscallTablesTestOnly() { + allSyscallTables = nil +} + // Lookup returns the syscall implementation, if one exists.
func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn { if sysno < uintptr(len(s.lookup)) { diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 23f42382f..5451f1eba 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -21,6 +21,7 @@ go_library( "network.go", "strace.go", "user.go", + "vfs.go", ], visibility = [ "//runsc:__subpackages__", @@ -33,6 +34,7 @@ go_library( "//pkg/control/server", "//pkg/cpuid", "//pkg/eventchannel", + "//pkg/fspath", "//pkg/log", "//pkg/memutil", "//pkg/rand", @@ -40,6 +42,7 @@ go_library( "//pkg/sentry/arch", "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/control", + "//pkg/sentry/devices/memdev", "//pkg/sentry/fs", "//pkg/sentry/fs/dev", "//pkg/sentry/fs/gofer", @@ -49,6 +52,12 @@ go_library( "//pkg/sentry/fs/sys", "//pkg/sentry/fs/tmpfs", "//pkg/sentry/fs/tty", + "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/fsimpl/gofer", + "//pkg/sentry/fsimpl/host", + "//pkg/sentry/fsimpl/proc", + "//pkg/sentry/fsimpl/sys", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel:uncaught_signal_go_proto", @@ -71,6 +80,7 @@ go_library( "//pkg/sentry/time", "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", "//pkg/sentry/usage", + "//pkg/sentry/vfs", "//pkg/sentry/watchdog", "//pkg/sync", "//pkg/syserror", @@ -114,6 +124,7 @@ go_test( "//pkg/p9", "//pkg/sentry/contexttest", "//pkg/sentry/fs", + "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sync", "//pkg/unet", diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 7ea5bfade..715a19112 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -305,5 +305,10 @@ func (c *Config) ToFlags() []string { if len(c.TestOnlyTestNameEnv) != 0 { f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv) } + + if c.VFS2 { + f = append(f, "--vfs2=true") + } + return f } diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 5314b0f2a..7e49f6f9f 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -20,6 +20,7 @@ import 
( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" + vfshost "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/kernel" ) @@ -31,6 +32,10 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) } + if kernel.VFS2Enabled { + return createFDTableVFS2(ctx, console, stdioFDs) + } + k := kernel.KernelFromContext(ctx) fdTable := k.NewFDTable() defer fdTable.DecRef() @@ -78,3 +83,31 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F fdTable.IncRef() return fdTable, nil } + +func createFDTableVFS2(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) { + k := kernel.KernelFromContext(ctx) + fdTable := k.NewFDTable() + defer fdTable.DecRef() + + hostMount, err := vfshost.NewMount(k.VFS()) + if err != nil { + return nil, fmt.Errorf("creating host mount: %w", err) + } + + for appFD, hostFD := range stdioFDs { + // TODO(gvisor.dev/issue/1482): Add TTY support. 
+ appFile, err := vfshost.ImportFD(hostMount, hostFD, false) + if err != nil { + return nil, err + } + + if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { + appFile.DecRef() + return nil, err + } + appFile.DecRef() + } + + fdTable.IncRef() + return fdTable, nil +} diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 82cc612d2..98cce60af 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -278,6 +278,9 @@ func subtargets(root string, mnts []specs.Mount) []string { } func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { + if conf.VFS2 { + return setupContainerVFS2(ctx, conf, mntr, procArgs) + } mns, err := mntr.setupFS(conf, procArgs) if err != nil { return err @@ -573,6 +576,9 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin // should be mounted (e.g. a volume shared between containers). It must be // called for the root container only. func (c *containerMounter) processHints(conf *Config) error { + if conf.VFS2 { + return nil + } ctx := c.k.SupervisorContext() for _, hint := range c.hints.mounts { // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a @@ -781,9 +787,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly default: - // TODO(nlacasse): Support all the mount types and make this a fatal error. - // Most applications will "just work" without them, so this is a warning - // for now. 
log.Warningf("ignoring unknown filesystem type %q", m.Type) } return fsName, opts, useOverlay, nil diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 654441f65..cf1f47bc7 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -26,7 +26,6 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" @@ -73,6 +72,8 @@ import ( _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" ) +var syscallTable *kernel.SyscallTable + // Loader keeps state needed to start the kernel and run the container.. type Loader struct { // k is the kernel. @@ -195,13 +196,14 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("setting up memory usage: %v", err) } - if args.Conf.VFS2 { - st, ok := kernel.LookupSyscallTable(abi.Linux, arch.Host) - if ok { - vfs2.Override(st.Table) - } + // Patch the syscall table. + kernel.VFS2Enabled = args.Conf.VFS2 + if kernel.VFS2Enabled { + vfs2.Override(syscallTable.Table) } + kernel.RegisterSyscallTable(syscallTable) + // Create kernel and platform. p, err := createPlatform(args.Conf, args.Device) if err != nil { @@ -392,11 +394,16 @@ func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel. return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err) } + wd := spec.Process.Cwd + if wd == "" { + wd = "/" + } + // Create the process arguments. procArgs := kernel.CreateProcessArgs{ Argv: spec.Process.Args, Envv: spec.Process.Env, - WorkingDirectory: spec.Process.Cwd, // Defaults to '/' if empty. + WorkingDirectory: wd, Credentials: creds, Umask: 0022, Limits: ls, @@ -541,7 +548,15 @@ func (l *Loader) run() error { } // Add the HOME enviroment variable if it is not already set. 
- envv, err := maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) + var envv []string + if kernel.VFS2Enabled { + envv, err = maybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2, + l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) + + } else { + envv, err = maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, + l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) + } if err != nil { return err } diff --git a/runsc/boot/loader_amd64.go b/runsc/boot/loader_amd64.go index b9669f2ac..78df86611 100644 --- a/runsc/boot/loader_amd64.go +++ b/runsc/boot/loader_amd64.go @@ -17,11 +17,10 @@ package boot import ( - "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" ) func init() { - // Register the global syscall table. - kernel.RegisterSyscallTable(linux.AMD64) + // Set the global syscall table. + syscallTable = linux.AMD64 } diff --git a/runsc/boot/loader_arm64.go b/runsc/boot/loader_arm64.go index cf64d28c8..250785010 100644 --- a/runsc/boot/loader_arm64.go +++ b/runsc/boot/loader_arm64.go @@ -17,11 +17,10 @@ package boot import ( - "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" ) func init() { - // Register the global syscall table. - kernel.RegisterSyscallTable(linux.ARM64) + // Set the global syscall table. 
+ syscallTable = linux.ARM64 } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index c9a75b76d..e7c71734f 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -30,6 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/runsc/fsgofer" @@ -66,6 +67,11 @@ func testSpec() *specs.Spec { } } +func resetSyscallTable() { + kernel.VFS2Enabled = false + kernel.FlushSyscallTablesTestOnly() +} + // startGofer starts a new gofer routine serving 'root' path. It returns the // sandbox side of the connection, and a function that when called will stop the // gofer. @@ -101,7 +107,7 @@ func startGofer(root string) (int, func(), error) { return sandboxEnd, cleanup, nil } -func createLoader() (*Loader, func(), error) { +func createLoader(vfsEnabled bool) (*Loader, func(), error) { fd, err := server.CreateSocket(ControlSocketAddr(fmt.Sprintf("%010d", rand.Int())[:10])) if err != nil { return nil, nil, err @@ -109,6 +115,8 @@ func createLoader() (*Loader, func(), error) { conf := testConfig() spec := testSpec() + conf.VFS2 = vfsEnabled + sandEnd, cleanup, err := startGofer(spec.Root.Path) if err != nil { return nil, nil, err @@ -142,10 +150,22 @@ func createLoader() (*Loader, func(), error) { // TestRun runs a simple application in a sandbox and checks that it succeeds. func TestRun(t *testing.T) { - l, cleanup, err := createLoader() + defer resetSyscallTable() + doRun(t, false) +} + +// TestRunVFS2 runs TestRun in VFSv2. 
+func TestRunVFS2(t *testing.T) { + defer resetSyscallTable() + doRun(t, true) +} + +func doRun(t *testing.T, vfsEnabled bool) { + l, cleanup, err := createLoader(vfsEnabled) if err != nil { t.Fatalf("error creating loader: %v", err) } + defer l.Destroy() defer cleanup() @@ -179,7 +199,18 @@ func TestRun(t *testing.T) { // TestStartSignal tests that the controller Start message will cause // WaitForStartSignal to return. func TestStartSignal(t *testing.T) { - l, cleanup, err := createLoader() + defer resetSyscallTable() + doStartSignal(t, false) +} + +// TestStartSignalVFS2 does TestStartSignal with VFS2. +func TestStartSignalVFS2(t *testing.T) { + defer resetSyscallTable() + doStartSignal(t, true) +} + +func doStartSignal(t *testing.T, vfsEnabled bool) { + l, cleanup, err := createLoader(vfsEnabled) if err != nil { t.Fatalf("error creating loader: %v", err) } diff --git a/runsc/boot/user.go b/runsc/boot/user.go index f0aa52135..332e4fce5 100644 --- a/runsc/boot/user.go +++ b/runsc/boot/user.go @@ -23,8 +23,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) @@ -84,6 +86,48 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K File: f, } + return findHomeInPasswd(uint32(uid), r, defaultHome) +} + +type fileReaderVFS2 struct { + ctx context.Context + fd *vfs.FileDescription +} + +func (r *fileReaderVFS2) Read(buf []byte) (int, error) { + n, err := r.fd.Read(r.ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) + return int(n), err +} + +func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth.KUID) (string, error) { + const defaultHome = "/" + + root := mns.Root() + defer root.DecRef() + + creds := auth.CredentialsFromContext(ctx) + + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: 
fspath.Parse("/etc/passwd"), + } + + opts := &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + } + + fd, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, target, opts) + if err != nil { + return defaultHome, nil + } + defer fd.DecRef() + + r := &fileReaderVFS2{ + ctx: ctx, + fd: fd, + } + homeDir, err := findHomeInPasswd(uint32(uid), r, defaultHome) if err != nil { return "", err @@ -111,6 +155,26 @@ func maybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth. if err != nil { return nil, fmt.Errorf("error reading exec user: %v", err) } + + return append(envv, "HOME="+homeDir), nil +} + +func maybeAddExecUserHomeVFS2(ctx context.Context, vmns *vfs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) { + // Check if the envv already contains HOME. + for _, env := range envv { + if strings.HasPrefix(env, "HOME=") { + // We have it. Return the original slice unmodified. + return envv, nil + } + } + + // Read /etc/passwd for the user's HOME directory and set the HOME + // environment variable as required by POSIX if it is not overridden by + // the user. + homeDir, err := getExecUserHomeVFS2(ctx, vmns, uid) + if err != nil { + return nil, fmt.Errorf("error reading exec user: %v", err) + } return append(envv, "HOME="+homeDir), nil } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go new file mode 100644 index 000000000..82083c57d --- /dev/null +++ b/runsc/boot/vfs.go @@ -0,0 +1,310 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "path" + "strconv" + "strings" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/devices/memdev" + "gvisor.dev/gvisor/pkg/sentry/fs" + devtmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + goferimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" + procimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" + sysimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" + tmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/syserror" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error { + + vfsObj.MustRegisterFilesystemType(rootFsName, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, + }) + + vfsObj.MustRegisterFilesystemType(bind, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, + }) + + vfsObj.MustRegisterFilesystemType(devpts, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + + vfsObj.MustRegisterFilesystemType(devtmpfs, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(proc, &procimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(sysfs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(tmpfs, 
&tmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(nonefs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + + // Setup files in devtmpfs. + if err := memdev.Register(vfsObj); err != nil { + return fmt.Errorf("registering memdev: %w", err) + } + a, err := devtmpfsimpl.NewAccessor(ctx, vfsObj, creds, devtmpfsimpl.Name) + if err != nil { + return fmt.Errorf("creating devtmpfs accessor: %w", err) + } + defer a.Release() + + if err := a.UserspaceInit(ctx); err != nil { + return fmt.Errorf("initializing userspace: %w", err) + } + if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil { + return fmt.Errorf("creating devtmpfs files: %w", err) + } + return nil +} + +func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { + if err := mntr.k.VFS().Init(); err != nil { + return fmt.Errorf("failed to initialize VFS: %w", err) + } + mns, err := mntr.setupVFS2(ctx, conf, procArgs) + if err != nil { + return fmt.Errorf("failed to setupFS: %w", err) + } + procArgs.MountNamespaceVFS2 = mns + return setExecutablePathVFS2(ctx, procArgs) +} + +func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessArgs) error { + + exe := procArgs.Argv[0] + + // Absolute paths can be used directly. + if path.IsAbs(exe) { + procArgs.Filename = exe + return nil + } + + // Paths with '/' in them should be joined to the working directory, or + // to the root if working directory is not set. + if strings.IndexByte(exe, '/') > 0 { + + if !path.IsAbs(procArgs.WorkingDirectory) { + return fmt.Errorf("working directory %q must be absolute", procArgs.WorkingDirectory) + } + + procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe) + return nil + } + + // Paths with a '/' are relative to the CWD. 
+ if strings.IndexByte(exe, '/') > 0 { + procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe) + return nil + } + + // Otherwise, We must lookup the name in the paths, starting from the + // root directory. + root := procArgs.MountNamespaceVFS2.Root() + defer root.DecRef() + + paths := fs.GetPath(procArgs.Envv) + creds := procArgs.Credentials + + for _, p := range paths { + + binPath := path.Join(p, exe) + + pop := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(binPath), + FollowFinalSymlink: true, + } + + opts := &vfs.OpenOptions{ + FileExec: true, + Flags: linux.O_RDONLY, + } + + dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts) + if err == syserror.ENOENT || err == syserror.EACCES { + // Didn't find it here. + continue + } + if err != nil { + return err + } + dentry.DecRef() + + procArgs.Filename = binPath + return nil + } + + return fmt.Errorf("executable %q not found in $PATH=%q", exe, strings.Join(paths, ":")) +} + +func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { + log.Infof("Configuring container's file system with VFS2") + + // Create context with root credentials to mount the filesystem (the current + // user may not be privileged enough). 
+ rootProcArgs := *procArgs + rootProcArgs.WorkingDirectory = "/" + rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace) + rootProcArgs.Umask = 0022 + rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals + rootCtx := procArgs.NewContext(c.k) + + creds := procArgs.Credentials + if err := registerFilesystems(rootCtx, c.k.VFS(), creds); err != nil { + return nil, fmt.Errorf("register filesystems: %w", err) + } + + fd := c.fds.remove() + + opts := strings.Join(p9MountOptionsVFS2(fd, conf.FileAccess), ",") + + log.Infof("Mounting root over 9P, ioFD: %d", fd) + mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", rootFsName, &vfs.GetFilesystemOptions{Data: opts}) + if err != nil { + return nil, fmt.Errorf("setting up mountnamespace: %w", err) + } + + rootProcArgs.MountNamespaceVFS2 = mns + + // Mount submounts. + if err := c.mountSubmountsVFS2(rootCtx, conf, mns, creds); err != nil { + return nil, fmt.Errorf("mounting submounts vfs2: %w", err) + } + + return mns, nil +} + +func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { + + for _, submount := range c.mounts { + log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) + if err := c.mountSubmountVFS2(ctx, conf, mns, creds, &submount); err != nil { + return err + } + } + + // TODO(gvisor.dev/issue/1487): implement mountTmp from fs.go. + + return c.checkDispenser() +} + +// TODO(gvisor.dev/issue/1487): Implement submount options similar to the VFS1 version. 
+func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *specs.Mount) error { + root := mns.Root() + defer root.DecRef() + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(submount.Destination), + } + + _, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, *submount) + if err != nil { + return fmt.Errorf("mountOptions failed: %w", err) + } + + opts := &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + Data: strings.Join(options, ","), + }, + InternalMount: true, + } + + // All writes go to upper, be paranoid and make lower readonly. + opts.ReadOnly = useOverlay + + if err := c.k.VFS().MountAt(ctx, creds, "", target, submount.Type, opts); err != nil { + return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) + } + log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts) + return nil +} + +// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values +// used for mounts. +func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m specs.Mount) (string, []string, bool, error) { + var ( + fsName string + opts []string + useOverlay bool + ) + + switch m.Type { + case devpts, devtmpfs, proc, sysfs: + fsName = m.Type + case nonefs: + fsName = sysfs + case tmpfs: + fsName = m.Type + + var err error + opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...) + if err != nil { + return "", nil, false, err + } + + case bind: + fd := c.fds.remove() + fsName = "9p" + opts = p9MountOptionsVFS2(fd, c.getMountAccessType(m)) + // If configured, add overlay to all writable mounts. 
+ useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly + + default: + log.Warningf("ignoring unknown filesystem type %q", m.Type) + } + return fsName, opts, useOverlay, nil +} + +// p9MountOptions creates a slice of options for a p9 mount. +// TODO(gvisor.dev/issue/1200): Remove this version in favor of the one in +// fs.go when privateunixsocket lands. +func p9MountOptionsVFS2(fd int, fa FileAccessType) []string { + opts := []string{ + "trans=fd", + "rfdno=" + strconv.Itoa(fd), + "wfdno=" + strconv.Itoa(fd), + } + if fa == FileAccessShared { + opts = append(opts, "cache=remote_revalidating") + } + return opts +} diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 442e80ac0..24f9ecc35 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -521,9 +521,21 @@ func TestExePath(t *testing.T) { // Test the we can retrieve the application exit status from the container. func TestAppExitStatus(t *testing.T) { + conf := testutil.TestConfig() + conf.VFS2 = false + doAppExitStatus(t, conf) +} + +// This is TestAppExitStatus for VFSv2. +func TestAppExitStatusVFS2(t *testing.T) { + conf := testutil.TestConfig() + conf.VFS2 = true + doAppExitStatus(t, conf) +} + +func doAppExitStatus(t *testing.T, conf *boot.Config) { // First container will succeed. succSpec := testutil.NewSpecWithArgs("true") - conf := testutil.TestConfig() rootDir, bundleDir, err := testutil.SetupContainer(succSpec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) diff --git a/runsc/main.go b/runsc/main.go index c1c78529c..9d52f3006 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -84,6 +84,7 @@ var ( rootless = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. 
Sandbox and Gofer processes may run with same privileges as current user.") referenceLeakMode = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.") cpuNumFromQuota = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)") + vfs2Enabled = flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.") // Test flags, not to be used outside tests, ever. testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") @@ -230,6 +231,7 @@ func main() { ReferenceLeakMode: refsLeakMode, OverlayfsStaleRead: *overlayfsStaleRead, CPUNumFromQuota: *cpuNumFromQuota, + VFS2: *vfs2Enabled, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, TestOnlyTestNameEnv: *testOnlyTestNameEnv, @@ -313,6 +315,7 @@ func main() { log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay) log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets) log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls) + log.Infof("\t\tVFS2 enabled: %v", conf.VFS2) log.Infof("***************************") if *testOnlyAllowRunAsCurrentUserWithoutChroot { -- cgit v1.2.3 From 0c586946ea26610b87c4ff7bda783a5a9ca11ec0 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 22 Apr 2020 17:48:59 -0700 Subject: Specify a memory file in platform.New(). 
PiperOrigin-RevId: 307941984 --- pkg/abi/linux/BUILD | 1 + pkg/abi/linux/arch_amd64.go | 23 +++++++++++++++++++++++ pkg/abi/linux/seccomp.go | 7 +++++++ pkg/flipcall/packet_window_allocator.go | 4 ++-- pkg/seccomp/seccomp_unsafe.go | 9 +-------- pkg/sentry/kernel/task_run.go | 1 + pkg/sentry/platform/kvm/context.go | 3 +++ pkg/sentry/platform/kvm/kvm.go | 5 +++++ pkg/sentry/platform/platform.go | 21 +++++++++++++++++++++ pkg/sentry/platform/ptrace/ptrace.go | 13 +++++++++++++ pkg/sentry/platform/ptrace/subprocess.go | 2 +- runsc/cmd/BUILD | 2 +- runsc/cmd/boot.go | 9 +++++++-- runsc/sandbox/sandbox.go | 10 +++++++--- tools/nogo/config.go | 3 +++ 15 files changed, 96 insertions(+), 17 deletions(-) create mode 100644 pkg/abi/linux/arch_amd64.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 322d1ccc4..59b0e138a 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -10,6 +10,7 @@ go_library( name = "linux", srcs = [ "aio.go", + "arch_amd64.go", "audit.go", "bpf.go", "capability.go", diff --git a/pkg/abi/linux/arch_amd64.go b/pkg/abi/linux/arch_amd64.go new file mode 100644 index 000000000..0be31e755 --- /dev/null +++ b/pkg/abi/linux/arch_amd64.go @@ -0,0 +1,23 @@ +// Copyright 2020 The gVisor Authors. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package linux + +// Start and end addresses of the vsyscall page. 
+const ( + VSyscallStartAddr uint64 = 0xffffffffff600000 + VSyscallEndAddr uint64 = 0xffffffffff601000 +) diff --git a/pkg/abi/linux/seccomp.go b/pkg/abi/linux/seccomp.go index 4eeb5cd7a..d0607e256 100644 --- a/pkg/abi/linux/seccomp.go +++ b/pkg/abi/linux/seccomp.go @@ -63,3 +63,10 @@ func (a BPFAction) String() string { func (a BPFAction) Data() uint16 { return uint16(a & SECCOMP_RET_DATA) } + +// SockFprog is sock_fprog taken from <linux/filter.h>. +type SockFprog struct { + Len uint16 + pad [6]byte + Filter *BPFInstruction +} diff --git a/pkg/flipcall/packet_window_allocator.go b/pkg/flipcall/packet_window_allocator.go index ccb918fab..af9cc3d21 100644 --- a/pkg/flipcall/packet_window_allocator.go +++ b/pkg/flipcall/packet_window_allocator.go @@ -134,7 +134,7 @@ func (pwa *PacketWindowAllocator) Allocate(size int) (PacketWindowDescriptor, er start := pwa.nextAlloc pwa.nextAlloc = end return PacketWindowDescriptor{ - FD: pwa.fd, + FD: pwa.FD(), Offset: start, Length: size, }, nil @@ -158,7 +158,7 @@ func (pwa *PacketWindowAllocator) ensureFileSize(min int64) error { } newSize = newNewSize } - if err := syscall.Ftruncate(pwa.fd, newSize); err != nil { + if err := syscall.Ftruncate(pwa.FD(), newSize); err != nil { return fmt.Errorf("ftruncate failed: %v", err) } pwa.fileSize = newSize diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go index be328db12..f7e986589 100644 --- a/pkg/seccomp/seccomp_unsafe.go +++ b/pkg/seccomp/seccomp_unsafe.go @@ -21,13 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" ) -// sockFprog is sock_fprog taken from <linux/filter.h>. -type sockFprog struct { - Len uint16 - pad [6]byte - Filter *linux.BPFInstruction -} - // SetFilter installs the given BPF program. // // This is safe to call from an afterFork context.
@@ -39,7 +32,7 @@ func SetFilter(instrs []linux.BPFInstruction) syscall.Errno { return errno } - sockProg := sockFprog{ + sockProg := linux.SockFprog{ Len: uint16(len(instrs)), Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])), } diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 2ba8d7e63..d654dd997 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -96,6 +96,7 @@ func (t *Task) run(threadID uintptr) { t.tg.liveGoroutines.Done() t.tg.pidns.owner.liveGoroutines.Done() t.tg.pidns.owner.runningGoroutines.Done() + t.p.Release() // Keep argument alive because stack trace for dead variables may not be correct. runtime.KeepAlive(threadID) diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index c769ac7b4..6507121ea 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -85,3 +85,6 @@ func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*a func (c *context) Interrupt() { c.interrupt.NotifyInterrupt() } + +// Release implements platform.Context.Release(). +func (c *context) Release() {} diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index a9b4af43e..ae813e24e 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -191,6 +191,11 @@ func (*constructor) OpenDevice() (*os.File, error) { return OpenDevice() } +// Requirements implements platform.Constructor.Requirements(). +func (*constructor) Requirements() platform.Requirements { + return platform.Requirements{} +} + func init() { platform.Register("kvm", &constructor{}) } diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index 2ca696382..171513f3f 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -148,6 +148,9 @@ type Context interface { // Interrupt interrupts a concurrent call to Switch(), causing it to return // ErrContextInterrupt. 
Interrupt() + + // Release() releases any resources associated with this context. + Release() } var ( @@ -353,10 +356,28 @@ func (fr FileRange) String() string { return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End) } +// Requirements is used to specify platform specific requirements. +type Requirements struct { + // RequiresCurrentPIDNS indicates that the sandbox has to be started in the + // current pid namespace. + RequiresCurrentPIDNS bool + // RequiresCapSysPtrace indicates that the sandbox has to be started with + // the CAP_SYS_PTRACE capability. + RequiresCapSysPtrace bool +} + // Constructor represents a platform type. type Constructor interface { + // New returns a new platform instance. + // + // Arguments: + // + // * deviceFile - the device file (e.g. /dev/kvm for the KVM platform). New(deviceFile *os.File) (Platform, error) OpenDevice() (*os.File, error) + + // Requirements returns platform specific requirements. + Requirements() Requirements } // platforms contains all available platform types. diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 03adb624b..08d055e05 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -177,6 +177,9 @@ func (c *context) Interrupt() { c.interrupt.NotifyInterrupt() } +// Release implements platform.Context.Release(). +func (c *context) Release() {} + // PTrace represents a collection of ptrace subprocesses. type PTrace struct { platform.MMapMinAddr @@ -248,6 +251,16 @@ func (*constructor) OpenDevice() (*os.File, error) { return nil, nil } +// Requirements implements platform.Constructor.Requirements(). +func (*constructor) Requirements() platform.Requirements { + // TODO(b/75837838): Also set a new PID namespace so that we limit + // access to other host processes. 
+ return platform.Requirements{ + RequiresCapSysPtrace: true, + RequiresCurrentPIDNS: true, + } +} + func init() { platform.Register("ptrace", &constructor{}) } diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index a644609ef..773ddb1ed 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -332,7 +332,7 @@ func (t *thread) unexpectedStubExit() { msg, err := t.getEventMessage() status := syscall.WaitStatus(msg) if status.Signaled() && status.Signal() == syscall.SIGKILL { - // SIGKILL can be only sent by an user or OOM-killer. In both + // SIGKILL can be only sent by a user or OOM-killer. In both // these cases, we don't need to panic. There is no reasons to // think that something wrong in gVisor. log.Warningf("The ptrace stub process %v has been killed by SIGKILL.", t.tgid) diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index d0bb4613a..4900fbe16 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -44,13 +44,13 @@ go_library( "//pkg/sentry/control", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sentry/platform", "//pkg/state", "//pkg/state/statefile", "//pkg/sync", "//pkg/unet", "//pkg/urpc", "//runsc/boot", - "//runsc/boot/platforms", "//runsc/console", "//runsc/container", "//runsc/flag", diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 0938944a6..4c2ac6ff0 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -25,8 +25,8 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/runsc/boot" - "gvisor.dev/gvisor/runsc/boot/platforms" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/specutils" ) @@ -183,7 +183,12 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if caps == nil { caps = &specs.LinuxCapabilities{} } - if conf.Platform == platforms.Ptrace { + + gPlatform, err := 
platform.Lookup(conf.Platform) + if err != nil { + Fatalf("loading platform: %v", err) + } + if gPlatform.Requirements().RequiresCapSysPtrace { // Ptrace platform requires extra capabilities. const c = "CAP_SYS_PTRACE" caps.Bounding = append(caps.Bounding, c) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index e82bcef6f..e4ec16e2f 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -446,9 +446,13 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF nextFD++ } - // If the platform needs a device FD we must pass it in. - if deviceFile, err := deviceFileForPlatform(conf.Platform); err != nil { + gPlatform, err := platform.Lookup(conf.Platform) + if err != nil { return err + } + + if deviceFile, err := gPlatform.OpenDevice(); err != nil { + return fmt.Errorf("opening device file for platform %q: %v", gPlatform, err) } else if deviceFile != nil { defer deviceFile.Close() cmd.ExtraFiles = append(cmd.ExtraFiles, deviceFile) @@ -539,7 +543,7 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF {Type: specs.UTSNamespace}, } - if conf.Platform == platforms.Ptrace { + if gPlatform.Requirements().RequiresCurrentPIDNS { // TODO(b/75837838): Also set a new PID namespace so that we limit // access to other host processes. log.Infof("Sandbox will be started in the current PID namespace") diff --git a/tools/nogo/config.go b/tools/nogo/config.go index 0c4b7dd40..6958fca69 100644 --- a/tools/nogo/config.go +++ b/tools/nogo/config.go @@ -103,6 +103,9 @@ var analyzerConfig = map[*analysis.Analyzer]matcher{ "pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go", // Special case. "pkg/sentry/platform/safecopy/safecopy_unsafe.go", // Special case. "pkg/sentry/vfs/mount_unsafe.go", // Special case. + "pkg/sentry/platform/systrap/stub_unsafe.go", // Special case. + "pkg/sentry/platform/systrap/switchto_google_unsafe.go", // Special case. 
+ "pkg/sentry/platform/systrap/sysmsg_thread_unsafe.go", // Special case. ), ), unusedresult.Analyzer: alwaysMatches(), -- cgit v1.2.3 From 93dd47146185ec7004f514e23bad9f225f55efb1 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Thu, 23 Apr 2020 15:47:59 -0700 Subject: Enable automated marshalling for epoll events. Ensure we use the correct architecture-specific definition of epoll event, and use go-marshal for serialization. PiperOrigin-RevId: 308145677 --- pkg/abi/linux/epoll_amd64.go | 4 ++- pkg/abi/linux/epoll_arm64.go | 4 ++- pkg/sentry/kernel/epoll/BUILD | 1 + pkg/sentry/kernel/epoll/epoll.go | 20 +++--------- pkg/sentry/syscalls/epoll.go | 3 +- pkg/sentry/syscalls/linux/sys_epoll.go | 27 ++-------------- pkg/sentry/syscalls/linux/vfs2/BUILD | 1 - pkg/sentry/syscalls/linux/vfs2/epoll.go | 7 ++-- pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go | 44 -------------------------- 9 files changed, 20 insertions(+), 91 deletions(-) delete mode 100644 pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/abi/linux/epoll_amd64.go b/pkg/abi/linux/epoll_amd64.go index 34ff18009..7e74b1143 100644 --- a/pkg/abi/linux/epoll_amd64.go +++ b/pkg/abi/linux/epoll_amd64.go @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build amd64 + package linux // EpollEvent is equivalent to struct epoll_event from epoll(2). // -// +marshal +// +marshal slice:EpollEventSlice type EpollEvent struct { Events uint32 // Linux makes struct epoll_event::data a __u64. We represent it as diff --git a/pkg/abi/linux/epoll_arm64.go b/pkg/abi/linux/epoll_arm64.go index f86c35329..a35939cc9 100644 --- a/pkg/abi/linux/epoll_arm64.go +++ b/pkg/abi/linux/epoll_arm64.go @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+// +build arm64 + package linux // EpollEvent is equivalent to struct epoll_event from epoll(2). // -// +marshal +// +marshal slice:EpollEventSlice type EpollEvent struct { Events uint32 // Linux makes struct epoll_event a __u64, necessitating 4 bytes of padding diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index dedf0fa15..75eedd5a2 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -24,6 +24,7 @@ go_library( ], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", "//pkg/context", "//pkg/refs", "//pkg/sentry/fs", diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 592650923..3d78cd48f 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -20,6 +20,7 @@ import ( "fmt" "syscall" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -30,19 +31,6 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// Event describes the event mask that was observed and the user data to be -// returned when one of the events occurs. It has this format to match the linux -// format to avoid extra copying/allocation when writing events to userspace. -type Event struct { - // Events is the event mask containing the set of events that have been - // observed on an entry. - Events uint32 - - // Data is an opaque 64-bit value provided by the caller when adding the - // entry, and returned to the caller when the entry reports an event. - Data [2]int32 -} - // EntryFlags is a bitmask that holds an entry's flags. type EntryFlags int @@ -227,9 +215,9 @@ func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask { } // ReadEvents returns up to max available events. 
-func (e *EventPoll) ReadEvents(max int) []Event { +func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent { var local pollEntryList - var ret []Event + var ret []linux.EpollEvent e.listsMu.Lock() @@ -251,7 +239,7 @@ func (e *EventPoll) ReadEvents(max int) []Event { } // Add event to the array that will be returned to caller. - ret = append(ret, Event{ + ret = append(ret, linux.EpollEvent{ Events: uint32(ready), Data: entry.userData, }) diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go index 87dcad18b..d9fb808c0 100644 --- a/pkg/sentry/syscalls/epoll.go +++ b/pkg/sentry/syscalls/epoll.go @@ -17,6 +17,7 @@ package syscalls import ( "time" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -118,7 +119,7 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { } // WaitEpoll implements the epoll_wait(2) linux syscall. -func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]epoll.Event, error) { +func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEvent, error) { // Get epoll from the file descriptor. 
epollfile := t.GetFile(fd) if epollfile == nil { diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index 3ab93fbde..51bf205cf 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" "gvisor.dev/gvisor/pkg/sentry/syscalls" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -72,7 +71,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc var data [2]int32 if op != linux.EPOLL_CTL_DEL { var e linux.EpollEvent - if _, err := t.CopyIn(eventAddr, &e); err != nil { + if _, err := e.CopyIn(t, eventAddr); err != nil { return 0, nil, err } @@ -105,28 +104,6 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } } -// copyOutEvents copies epoll events from the kernel to user memory. -func copyOutEvents(t *kernel.Task, addr usermem.Addr, e []epoll.Event) error { - const itemLen = 12 - buffLen := len(e) * itemLen - if _, ok := addr.AddLength(uint64(buffLen)); !ok { - return syserror.EFAULT - } - - b := t.CopyScratchBuffer(buffLen) - for i := range e { - usermem.ByteOrder.PutUint32(b[i*itemLen:], e[i].Events) - usermem.ByteOrder.PutUint32(b[i*itemLen+4:], uint32(e[i].Data[0])) - usermem.ByteOrder.PutUint32(b[i*itemLen+8:], uint32(e[i].Data[1])) - } - - if _, err := t.CopyOutBytes(addr, b); err != nil { - return err - } - - return nil -} - // EpollWait implements the epoll_wait(2) linux syscall. 
func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { epfd := args[0].Int() @@ -140,7 +117,7 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } if len(r) != 0 { - if err := copyOutEvents(t, eventsAddr, r); err != nil { + if _, err := linux.CopyEpollEventSliceOut(t, eventsAddr, r); err != nil { return 0, nil, err } } diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 6ff2d84d2..f6fb0f219 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -6,7 +6,6 @@ go_library( name = "vfs2", srcs = [ "epoll.go", - "epoll_unsafe.go", "execve.go", "fd.go", "filesystem.go", diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go index 5a938cee2..34c90ae3e 100644 --- a/pkg/sentry/syscalls/linux/vfs2/epoll.go +++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go @@ -28,6 +28,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +var sizeofEpollEvent = (*linux.EpollEvent)(nil).SizeBytes() + // EpollCreate1 implements Linux syscall epoll_create1(2). func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() @@ -124,7 +126,7 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys maxEvents := int(args[2].Int()) timeout := int(args[3].Int()) - const _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS + var _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS { return 0, nil, syserror.EINVAL } @@ -157,7 +159,8 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys maxEvents -= n if n != 0 { // Copy what we read out. 
- copiedEvents, err := copyOutEvents(t, eventsAddr, events[:n]) + copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events[:n]) + copiedEvents := copiedBytes / sizeofEpollEvent // rounded down eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent) total += copiedEvents if err != nil { diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go b/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go deleted file mode 100644 index 825f325bf..000000000 --- a/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs2 - -import ( - "reflect" - "runtime" - "unsafe" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/gohacks" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/usermem" -) - -const sizeofEpollEvent = int(unsafe.Sizeof(linux.EpollEvent{})) - -func copyOutEvents(t *kernel.Task, addr usermem.Addr, events []linux.EpollEvent) (int, error) { - if len(events) == 0 { - return 0, nil - } - // Cast events to a byte slice for copying. 
- var eventBytes []byte - eventBytesHdr := (*reflect.SliceHeader)(unsafe.Pointer(&eventBytes)) - eventBytesHdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(&events[0]))) - eventBytesHdr.Len = len(events) * sizeofEpollEvent - eventBytesHdr.Cap = len(events) * sizeofEpollEvent - copiedBytes, err := t.CopyOutBytes(addr, eventBytes) - runtime.KeepAlive(events) - copiedEvents := copiedBytes / sizeofEpollEvent // rounded down - return copiedEvents, err -} -- cgit v1.2.3 From f01f2132d8d3e551579cba9a1b942b4b70d83f21 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Thu, 23 Apr 2020 18:18:54 -0700 Subject: Enable automated marshalling for mempolicy syscalls. PiperOrigin-RevId: 308170679 --- pkg/abi/linux/mm.go | 17 +++++++++++------ pkg/sentry/kernel/task.go | 2 +- pkg/sentry/kernel/task_sched.go | 4 ++-- pkg/sentry/mm/mm.go | 3 ++- pkg/sentry/mm/syscalls.go | 4 ++-- pkg/sentry/syscalls/linux/sys_mempolicy.go | 18 +++++++++--------- 6 files changed, 27 insertions(+), 21 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go index cd043dac3..07cc1895e 100644 --- a/pkg/abi/linux/mm.go +++ b/pkg/abi/linux/mm.go @@ -90,14 +90,19 @@ const ( MS_SYNC = 1 << 2 ) +// NumaPolicy is the NUMA memory policy for a memory range. See numa(7). +// +// +marshal +type NumaPolicy int32 + // Policies for get_mempolicy(2)/set_mempolicy(2). const ( - MPOL_DEFAULT = 0 - MPOL_PREFERRED = 1 - MPOL_BIND = 2 - MPOL_INTERLEAVE = 3 - MPOL_LOCAL = 4 - MPOL_MAX = 5 + MPOL_DEFAULT NumaPolicy = 0 + MPOL_PREFERRED NumaPolicy = 1 + MPOL_BIND NumaPolicy = 2 + MPOL_INTERLEAVE NumaPolicy = 3 + MPOL_LOCAL NumaPolicy = 4 + MPOL_MAX NumaPolicy = 5 ) // Flags for get_mempolicy(2). diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index e5d133d6c..f48247c94 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -484,7 +484,7 @@ type Task struct { // bit. // // numaPolicy and numaNodeMask are protected by mu. 
- numaPolicy int32 + numaPolicy linux.NumaPolicy numaNodeMask uint64 // netns is the task's network namespace. netns is never nil. diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 8b148db35..09366b60c 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -653,14 +653,14 @@ func (t *Task) SetNiceness(n int) { } // NumaPolicy returns t's current numa policy. -func (t *Task) NumaPolicy() (policy int32, nodeMask uint64) { +func (t *Task) NumaPolicy() (policy linux.NumaPolicy, nodeMask uint64) { t.mu.Lock() defer t.mu.Unlock() return t.numaPolicy, t.numaNodeMask } // SetNumaPolicy sets t's numa policy. -func (t *Task) SetNumaPolicy(policy int32, nodeMask uint64) { +func (t *Task) SetNumaPolicy(policy linux.NumaPolicy, nodeMask uint64) { t.mu.Lock() defer t.mu.Unlock() t.numaPolicy = policy diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 34d3bde7a..6db7c3d40 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -35,6 +35,7 @@ package mm import ( + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsbridge" @@ -286,7 +287,7 @@ type vma struct { mlockMode memmap.MLockMode // numaPolicy is the NUMA policy for this vma set by mbind(). - numaPolicy int32 + numaPolicy linux.NumaPolicy // numaNodemask is the NUMA nodemask for this vma set by mbind(). numaNodemask uint64 diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index c5dfa5972..3f496aa9f 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -974,7 +974,7 @@ func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error } // NumaPolicy implements the semantics of Linux's get_mempolicy(MPOL_F_ADDR). 
-func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (int32, uint64, error) { +func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (linux.NumaPolicy, uint64, error) { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() vseg := mm.vmas.FindSegment(addr) @@ -986,7 +986,7 @@ func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (int32, uint64, error) { } // SetNumaPolicy implements the semantics of Linux's mbind(). -func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy int32, nodemask uint64) error { +func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy linux.NumaPolicy, nodemask uint64) error { if !addr.IsPageAligned() { return syserror.EINVAL } diff --git a/pkg/sentry/syscalls/linux/sys_mempolicy.go b/pkg/sentry/syscalls/linux/sys_mempolicy.go index ac934dc6f..9b4a5c3f1 100644 --- a/pkg/sentry/syscalls/linux/sys_mempolicy.go +++ b/pkg/sentry/syscalls/linux/sys_mempolicy.go @@ -162,10 +162,10 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. if err != nil { return 0, nil, err } - policy = 0 // maxNodes == 1 + policy = linux.MPOL_DEFAULT // maxNodes == 1 } if mode != 0 { - if _, err := t.CopyOut(mode, policy); err != nil { + if _, err := policy.CopyOut(t, mode); err != nil { return 0, nil, err } } @@ -199,10 +199,10 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. if policy&^linux.MPOL_MODE_FLAGS != linux.MPOL_INTERLEAVE { return 0, nil, syserror.EINVAL } - policy = 0 // maxNodes == 1 + policy = linux.MPOL_DEFAULT // maxNodes == 1 } if mode != 0 { - if _, err := t.CopyOut(mode, policy); err != nil { + if _, err := policy.CopyOut(t, mode); err != nil { return 0, nil, err } } @@ -216,7 +216,7 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. // SetMempolicy implements the syscall set_mempolicy(2). 
func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - modeWithFlags := args[0].Int() + modeWithFlags := linux.NumaPolicy(args[0].Int()) nodemask := args[1].Pointer() maxnode := args[2].Uint() @@ -233,7 +233,7 @@ func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].Uint64() - mode := args[2].Int() + mode := linux.NumaPolicy(args[2].Int()) nodemask := args[3].Pointer() maxnode := args[4].Uint() flags := args[5].Uint() @@ -258,9 +258,9 @@ func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err } -func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags int32, nodemask usermem.Addr, maxnode uint32) (int32, uint64, error) { - flags := modeWithFlags & linux.MPOL_MODE_FLAGS - mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS +func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags linux.NumaPolicy, nodemask usermem.Addr, maxnode uint32) (linux.NumaPolicy, uint64, error) { + flags := linux.NumaPolicy(modeWithFlags & linux.MPOL_MODE_FLAGS) + mode := linux.NumaPolicy(modeWithFlags &^ linux.MPOL_MODE_FLAGS) if flags == linux.MPOL_MODE_FLAGS { // Can't specify both mode flags simultaneously. return 0, 0, syserror.EINVAL -- cgit v1.2.3 From 1b88c63b3e6b330c8399bf92f148cc80374bee18 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 24 Apr 2020 10:02:22 -0700 Subject: Move hostfs mount to Kernel struct. This is needed to set up host fds passed through a Unix socket. Note that the host package depends on kernel, so we cannot set up the hostfs mount directly in Kernel.Init as we do for sockfs and pipefs. Also, adjust sockfs to make its setup look more like hostfs's and pipefs's. 
PiperOrigin-RevId: 308274053 --- pkg/sentry/fsimpl/host/host.go | 16 +++++++-------- pkg/sentry/fsimpl/sockfs/sockfs.go | 26 ++++++++++------------- pkg/sentry/kernel/kernel.go | 42 ++++++++++++++++++++++++++++---------- runsc/boot/fds.go | 7 +------ runsc/boot/loader.go | 13 ++++++++++++ 5 files changed, 64 insertions(+), 40 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 7847e3cc2..a26b13067 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -42,7 +42,7 @@ type filesystemType struct{} // GetFilesystem implements FilesystemType.GetFilesystem. func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { - panic("cannot instaniate a host filesystem") + panic("host.filesystemType.GetFilesystem should never be called") } // Name implements FilesystemType.Name. @@ -55,14 +55,14 @@ type filesystem struct { kernfs.Filesystem } -// NewMount returns a new disconnected mount in vfsObj that may be passed to ImportFD. -func NewMount(vfsObj *vfs.VirtualFilesystem) (*vfs.Mount, error) { +// NewFilesystem sets up and returns a new hostfs filesystem. +// +// Note that there should only ever be one instance of host.filesystem, +// a global mount for host fds. +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { fs := &filesystem{} - fs.Init(vfsObj, &filesystemType{}) - vfsfs := fs.VFSFilesystem() - // NewDisconnectedMount will take an additional reference on vfsfs. - defer vfsfs.DecRef() - return vfsObj.NewDisconnectedMount(vfsfs, nil, &vfs.MountOptions{}) + fs.Init(vfsObj, filesystemType{}) + return fs.VFSFilesystem() } // ImportFD sets up and returns a vfs.FileDescription from a donated fd. 
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index 3f7ad1d65..632cfde88 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -24,26 +24,12 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -// NewFilesystem creates a new sockfs filesystem. -// -// Note that there should only ever be one instance of sockfs.Filesystem, -// backing a global socket mount. -func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { - fs, _, err := filesystemType{}.GetFilesystem(nil, vfsObj, nil, "", vfs.GetFilesystemOptions{}) - if err != nil { - panic("failed to create sockfs filesystem") - } - return fs -} - // filesystemType implements vfs.FilesystemType. type filesystemType struct{} // GetFilesystem implements FilesystemType.GetFilesystem. func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { - fs := &filesystem{} - fs.Init(vfsObj, fsType) - return fs.VFSFilesystem(), nil, nil + panic("sockfs.filesystemType.GetFilesystem should never be called") } // Name implements FilesystemType.Name. @@ -60,6 +46,16 @@ type filesystem struct { kernfs.Filesystem } +// NewFilesystem sets up and returns a new sockfs filesystem. +// +// Note that there should only ever be one instance of sockfs.Filesystem, +// backing a global socket mount. +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { + fs := &filesystem{} + fs.Init(vfsObj, filesystemType{}) + return fs.VFSFilesystem() +} + // inode implements kernfs.Inode. // // TODO(gvisor.dev/issue/1476): Add device numbers to this inode (which are diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index fef60e636..c91b9dce2 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -227,11 +227,6 @@ type Kernel struct { // by extMu. 
nextSocketEntry uint64 - // socketMount is a disconnected vfs.Mount, not included in k.vfs, - // representing a sockfs.filesystem. socketMount is used to back - // VirtualDentries representing anonymous sockets. - socketMount *vfs.Mount - // deviceRegistry is used to save/restore device.SimpleDevices. deviceRegistry struct{} `state:".(*device.Registry)"` @@ -255,10 +250,22 @@ type Kernel struct { // VFS keeps the filesystem state used across the kernel. vfs vfs.VirtualFilesystem + // hostMount is the Mount used for file descriptors that were imported + // from the host. + hostMount *vfs.Mount + // pipeMount is the Mount used for pipes created by the pipe() and pipe2() // syscalls (as opposed to named pipes created by mknod()). pipeMount *vfs.Mount + // socketMount is the Mount used for sockets created by the socket() and + // socketpair() syscalls. There are several cases where a socket dentry will + // not be contained in socketMount: + // 1. Socket files created by mknod() + // 2. Socket fds imported from the host (Kernel.hostMount is used for these) + // 3. Socket files created by binding Unix sockets to a file path + socketMount *vfs.Mount + // If set to true, report address space activation waits as if the task is in // external wait so that the watchdog doesn't report the task stuck. SleepForAddressSpaceActivation bool @@ -377,7 +384,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { defer socketFilesystem.DecRef() socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) if err != nil { - return fmt.Errorf("failed to initialize socket mount: %v", err) + return fmt.Errorf("failed to create sockfs mount: %v", err) } k.socketMount = socketMount } @@ -1526,11 +1533,6 @@ func (k *Kernel) ListSockets() []*SocketEntry { return socks } -// SocketMount returns the global socket mount. -func (k *Kernel) SocketMount() *vfs.Mount { - return k.socketMount -} - // supervisorContext is a privileged context. 
type supervisorContext struct { context.NoopSleeper @@ -1629,7 +1631,25 @@ func (k *Kernel) VFS() *vfs.VirtualFilesystem { return &k.vfs } +// SetHostMount sets the hostfs mount. +func (k *Kernel) SetHostMount(mnt *vfs.Mount) { + if k.hostMount != nil { + panic("Kernel.hostMount cannot be set more than once") + } + k.hostMount = mnt +} + +// HostMount returns the hostfs mount. +func (k *Kernel) HostMount() *vfs.Mount { + return k.hostMount +} + // PipeMount returns the pipefs mount. func (k *Kernel) PipeMount() *vfs.Mount { return k.pipeMount } + +// SocketMount returns the sockfs mount. +func (k *Kernel) SocketMount() *vfs.Mount { + return k.socketMount +} diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 7e49f6f9f..0cbd63857 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -89,14 +89,9 @@ func createFDTableVFS2(ctx context.Context, console bool, stdioFDs []int) (*kern fdTable := k.NewFDTable() defer fdTable.DecRef() - hostMount, err := vfshost.NewMount(k.VFS()) - if err != nil { - return nil, fmt.Errorf("creating host mount: %w", err) - } - for appFD, hostFD := range stdioFDs { // TODO(gvisor.dev/issue/1482): Add TTY support. 
- appFile, err := vfshost.ImportFD(hostMount, hostFD, false) + appFile, err := vfshost.ImportFD(k.HostMount(), hostFD, false) if err != nil { return nil, err } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 096b0e9f0..3f41d8357 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -36,6 +36,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/fs/user" + vfs2host "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -46,6 +47,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2" "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" @@ -329,6 +331,17 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("creating pod mount hints: %v", err) } + if kernel.VFS2Enabled { + // Set up host mount that will be used for imported fds. + hostFilesystem := vfs2host.NewFilesystem(k.VFS()) + defer hostFilesystem.DecRef() + hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to create hostfs mount: %v", err) + } + k.SetHostMount(hostMount) + } + // Make host FDs stable between invocations. Host FDs must map to the exact // same number when the sandbox is restored. Otherwise the wrong FD will be // used. -- cgit v1.2.3 From 3c67754663f424f2ebbc0ff2a4c80e30618d5355 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Sat, 25 Apr 2020 23:54:56 -0700 Subject: Enable automated marshalling for signals and the arch package. 
PiperOrigin-RevId: 308472331 --- pkg/abi/linux/BUILD | 2 + pkg/abi/linux/ptrace_amd64.go | 52 ++++++++++++++++++++++ pkg/abi/linux/ptrace_arm64.go | 29 ++++++++++++ pkg/sentry/arch/BUILD | 4 +- pkg/sentry/arch/arch_aarch64.go | 26 ++++++----- pkg/sentry/arch/arch_amd64.go | 13 +++--- pkg/sentry/arch/arch_arm64.go | 2 + pkg/sentry/arch/arch_state_aarch64.go | 38 ---------------- pkg/sentry/arch/arch_state_x86.go | 42 ----------------- pkg/sentry/arch/arch_x86.go | 25 ++++++----- pkg/sentry/arch/arch_x86_impl.go | 4 +- pkg/sentry/arch/signal.go | 3 ++ pkg/sentry/arch/signal_act.go | 4 ++ pkg/sentry/arch/signal_stack.go | 3 ++ pkg/sentry/kernel/task_signals.go | 8 ++-- pkg/sentry/platform/kvm/kvm_arm64.go | 5 +-- pkg/sentry/platform/kvm/kvm_test.go | 36 +++++++-------- pkg/sentry/platform/kvm/testutil/BUILD | 1 + pkg/sentry/platform/kvm/testutil/testutil_amd64.go | 17 +++---- pkg/sentry/platform/kvm/testutil/testutil_arm64.go | 13 +++--- pkg/sentry/platform/ptrace/ptrace_amd64.go | 7 ++- pkg/sentry/platform/ptrace/ptrace_arm64.go | 5 +-- pkg/sentry/platform/ptrace/ptrace_unsafe.go | 4 +- pkg/sentry/platform/ptrace/subprocess.go | 8 ++-- pkg/sentry/platform/ptrace/subprocess_amd64.go | 16 +++---- pkg/sentry/platform/ptrace/subprocess_arm64.go | 16 +++---- pkg/sentry/platform/ring0/BUILD | 1 + pkg/sentry/platform/ring0/defs.go | 9 ++-- pkg/sentry/platform/ring0/entry_amd64.go | 6 +-- pkg/sentry/platform/ring0/gen_offsets/BUILD | 1 + pkg/sentry/platform/ring0/offsets_amd64.go | 5 ++- pkg/sentry/platform/ring0/offsets_arm64.go | 5 ++- pkg/sentry/syscalls/linux/sys_signal.go | 10 ++--- 33 files changed, 224 insertions(+), 196 deletions(-) create mode 100644 pkg/abi/linux/ptrace_amd64.go create mode 100644 pkg/abi/linux/ptrace_arm64.go delete mode 100644 pkg/sentry/arch/arch_state_aarch64.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 59b0e138a..114b516e2 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -44,6 
+44,8 @@ go_library( "poll.go", "prctl.go", "ptrace.go", + "ptrace_amd64.go", + "ptrace_arm64.go", "rseq.go", "rusage.go", "sched.go", diff --git a/pkg/abi/linux/ptrace_amd64.go b/pkg/abi/linux/ptrace_amd64.go new file mode 100644 index 000000000..ed3881e27 --- /dev/null +++ b/pkg/abi/linux/ptrace_amd64.go @@ -0,0 +1,52 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package linux + +// PtraceRegs is the set of CPU registers exposed by ptrace. Source: +// syscall.PtraceRegs. +// +// +marshal +// +stateify savable +type PtraceRegs struct { + R15 uint64 + R14 uint64 + R13 uint64 + R12 uint64 + Rbp uint64 + Rbx uint64 + R11 uint64 + R10 uint64 + R9 uint64 + R8 uint64 + Rax uint64 + Rcx uint64 + Rdx uint64 + Rsi uint64 + Rdi uint64 + Orig_rax uint64 + Rip uint64 + Cs uint64 + Eflags uint64 + Rsp uint64 + Ss uint64 + Fs_base uint64 + Gs_base uint64 + Ds uint64 + Es uint64 + Fs uint64 + Gs uint64 +} diff --git a/pkg/abi/linux/ptrace_arm64.go b/pkg/abi/linux/ptrace_arm64.go new file mode 100644 index 000000000..6147738b3 --- /dev/null +++ b/pkg/abi/linux/ptrace_arm64.go @@ -0,0 +1,29 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package linux + +// PtraceRegs is the set of CPU registers exposed by ptrace. Source: +// syscall.PtraceRegs. +// +// +marshal +// +stateify savable +type PtraceRegs struct { + Regs [31]uint64 + Sp uint64 + Pc uint64 + Pstate uint64 +} diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index e27f21e5e..901e0f320 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -11,7 +11,6 @@ go_library( "arch_amd64.go", "arch_amd64.s", "arch_arm64.go", - "arch_state_aarch64.go", "arch_state_x86.go", "arch_x86.go", "arch_x86_impl.go", @@ -26,11 +25,11 @@ go_library( "syscalls_amd64.go", "syscalls_arm64.go", ], + marshal = True, visibility = ["//:sandbox"], deps = [ ":registers_go_proto", "//pkg/abi/linux", - "//pkg/binary", "//pkg/context", "//pkg/cpuid", "//pkg/log", @@ -38,6 +37,7 @@ go_library( "//pkg/sync", "//pkg/syserror", "//pkg/usermem", + "//tools/go_marshal/marshal", ], ) diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go index c29e1b841..529980267 100644 --- a/pkg/sentry/arch/arch_aarch64.go +++ b/pkg/sentry/arch/arch_aarch64.go @@ -17,18 +17,20 @@ package arch import ( + "encoding/binary" "fmt" "io" - "syscall" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) +// Registers represents the CPU registers for this architecture. 
+type Registers = linux.PtraceRegs + const ( // SyscallWidth is the width of insturctions. SyscallWidth = 4 @@ -90,7 +92,7 @@ func NewFloatingPointData() *FloatingPointData { // file ensures it's only built on aarch64). type State struct { // The system registers. - Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"` + Regs Registers // Our floating point state. aarch64FPState `state:"wait"` @@ -226,25 +228,27 @@ func (s *State) RegisterMap() (map[string]uintptr, error) { // PtraceGetRegs implements Context.PtraceGetRegs. func (s *State) PtraceGetRegs(dst io.Writer) (int, error) { - return dst.Write(binary.Marshal(nil, usermem.ByteOrder, s.ptraceGetRegs())) + regs := s.ptraceGetRegs() + n, err := regs.WriteTo(dst) + return int(n), err } -func (s *State) ptraceGetRegs() syscall.PtraceRegs { +func (s *State) ptraceGetRegs() Registers { return s.Regs } -var ptraceRegsSize = int(binary.Size(syscall.PtraceRegs{})) +var registersSize = (*Registers)(nil).SizeBytes() // PtraceSetRegs implements Context.PtraceSetRegs. func (s *State) PtraceSetRegs(src io.Reader) (int, error) { - var regs syscall.PtraceRegs - buf := make([]byte, ptraceRegsSize) + var regs Registers + buf := make([]byte, registersSize) if _, err := io.ReadFull(src, buf); err != nil { return 0, err } - binary.Unmarshal(buf, usermem.ByteOrder, &regs) + regs.UnmarshalUnsafe(buf) s.Regs = regs - return ptraceRegsSize, nil + return registersSize, nil } // PtraceGetFPRegs implements Context.PtraceGetFPRegs. 
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 85d6acc0f..3b3a0a272 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -22,7 +22,6 @@ import ( "math/rand" "syscall" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/usermem" @@ -301,8 +300,10 @@ func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) { // PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and // u_debugreg, returning 0 or silently no-oping for other fields // respectively. - if addr < uintptr(ptraceRegsSize) { - buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs()) + if addr < uintptr(registersSize) { + regs := c.ptraceGetRegs() + buf := make([]byte, regs.SizeBytes()) + regs.MarshalUnsafe(buf) return c.Native(uintptr(usermem.ByteOrder.Uint64(buf[addr:]))), nil } // Note: x86 debug registers are missing. @@ -314,8 +315,10 @@ func (c *context64) PtracePokeUser(addr, data uintptr) error { if addr&7 != 0 || addr >= userStructSize { return syscall.EIO } - if addr < uintptr(ptraceRegsSize) { - buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs()) + if addr < uintptr(registersSize) { + regs := c.ptraceGetRegs() + buf := make([]byte, regs.SizeBytes()) + regs.MarshalUnsafe(buf) usermem.ByteOrder.PutUint64(buf[addr:], uint64(data)) _, err := c.PtraceSetRegs(bytes.NewBuffer(buf)) return err diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go index db99c5acb..ada7ac7b8 100644 --- a/pkg/sentry/arch/arch_arm64.go +++ b/pkg/sentry/arch/arch_arm64.go @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+// +build arm64 + package arch import ( diff --git a/pkg/sentry/arch/arch_state_aarch64.go b/pkg/sentry/arch/arch_state_aarch64.go deleted file mode 100644 index 0136a85ad..000000000 --- a/pkg/sentry/arch/arch_state_aarch64.go +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package arch - -import ( - "syscall" -) - -type syscallPtraceRegs struct { - Regs [31]uint64 - Sp uint64 - Pc uint64 - Pstate uint64 -} - -// saveRegs is invoked by stateify. -func (s *State) saveRegs() syscallPtraceRegs { - return syscallPtraceRegs(s.Regs) -} - -// loadRegs is invoked by stateify. -func (s *State) loadRegs(r syscallPtraceRegs) { - s.Regs = syscall.PtraceRegs(r) -} diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index aa31169e0..19ce99d25 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -18,7 +18,6 @@ package arch import ( "fmt" - "syscall" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/usermem" @@ -90,44 +89,3 @@ func (s *State) afterLoadFPState() { // Copy to the new, aligned location. 
copy(s.x86FPState, old) } - -// +stateify savable -type syscallPtraceRegs struct { - R15 uint64 - R14 uint64 - R13 uint64 - R12 uint64 - Rbp uint64 - Rbx uint64 - R11 uint64 - R10 uint64 - R9 uint64 - R8 uint64 - Rax uint64 - Rcx uint64 - Rdx uint64 - Rsi uint64 - Rdi uint64 - Orig_rax uint64 - Rip uint64 - Cs uint64 - Eflags uint64 - Rsp uint64 - Ss uint64 - Fs_base uint64 - Gs_base uint64 - Ds uint64 - Es uint64 - Fs uint64 - Gs uint64 -} - -// saveRegs is invoked by stateify. -func (s *State) saveRegs() syscallPtraceRegs { - return syscallPtraceRegs(s.Regs) -} - -// loadRegs is invoked by stateify. -func (s *State) loadRegs(r syscallPtraceRegs) { - s.Regs = syscall.PtraceRegs(r) -} diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index 7fc4c0473..dc458b37f 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -21,7 +21,7 @@ import ( "io" "syscall" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" @@ -30,6 +30,9 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// Registers represents the CPU registers for this architecture. +type Registers = linux.PtraceRegs + // System-related constants for x86. const ( // SyscallWidth is the width of syscall, sysenter, and int 80 insturctions. @@ -267,10 +270,12 @@ func (s *State) RegisterMap() (map[string]uintptr, error) { // PtraceGetRegs implements Context.PtraceGetRegs. func (s *State) PtraceGetRegs(dst io.Writer) (int, error) { - return dst.Write(binary.Marshal(nil, usermem.ByteOrder, s.ptraceGetRegs())) + regs := s.ptraceGetRegs() + n, err := regs.WriteTo(dst) + return int(n), err } -func (s *State) ptraceGetRegs() syscall.PtraceRegs { +func (s *State) ptraceGetRegs() Registers { regs := s.Regs // These may not be initialized. 
if regs.Cs == 0 || regs.Ss == 0 || regs.Eflags == 0 { @@ -306,16 +311,16 @@ func (s *State) ptraceGetRegs() syscall.PtraceRegs { return regs } -var ptraceRegsSize = int(binary.Size(syscall.PtraceRegs{})) +var registersSize = (*Registers)(nil).SizeBytes() // PtraceSetRegs implements Context.PtraceSetRegs. func (s *State) PtraceSetRegs(src io.Reader) (int, error) { - var regs syscall.PtraceRegs - buf := make([]byte, ptraceRegsSize) + var regs Registers + buf := make([]byte, registersSize) if _, err := io.ReadFull(src, buf); err != nil { return 0, err } - binary.Unmarshal(buf, usermem.ByteOrder, &regs) + regs.UnmarshalUnsafe(buf) // Truncate segment registers to 16 bits. regs.Cs = uint64(uint16(regs.Cs)) regs.Ds = uint64(uint16(regs.Ds)) @@ -369,7 +374,7 @@ func (s *State) PtraceSetRegs(src io.Reader) (int, error) { } regs.Eflags = (s.Regs.Eflags &^ eflagsPtraceMutable) | (regs.Eflags & eflagsPtraceMutable) s.Regs = regs - return ptraceRegsSize, nil + return registersSize, nil } // isUserSegmentSelector returns true if the given segment selector specifies a @@ -538,7 +543,7 @@ const ( func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) { switch regset { case _NT_PRSTATUS: - if maxlen < ptraceRegsSize { + if maxlen < registersSize { return 0, syserror.EFAULT } return s.PtraceGetRegs(dst) @@ -558,7 +563,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) { switch regset { case _NT_PRSTATUS: - if maxlen < ptraceRegsSize { + if maxlen < registersSize { return 0, syserror.EFAULT } return s.PtraceSetRegs(src) diff --git a/pkg/sentry/arch/arch_x86_impl.go b/pkg/sentry/arch/arch_x86_impl.go index 3edf40764..0c73fcbfb 100644 --- a/pkg/sentry/arch/arch_x86_impl.go +++ b/pkg/sentry/arch/arch_x86_impl.go @@ -17,8 +17,6 @@ package arch import ( - "syscall" - "gvisor.dev/gvisor/pkg/cpuid" ) @@ -28,7 +26,7 @@ import ( // +stateify 
savable type State struct { // The system registers. - Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"` + Regs Registers // Our floating point state. x86FPState `state:"wait"` diff --git a/pkg/sentry/arch/signal.go b/pkg/sentry/arch/signal.go index 8b03d0187..c9fb55d00 100644 --- a/pkg/sentry/arch/signal.go +++ b/pkg/sentry/arch/signal.go @@ -22,6 +22,7 @@ import ( // SignalAct represents the action that should be taken when a signal is // delivered, and is equivalent to struct sigaction. // +// +marshal // +stateify savable type SignalAct struct { Handler uint64 @@ -43,6 +44,7 @@ func (s *SignalAct) DeserializeTo(other *SignalAct) { // SignalStack represents information about a user stack, and is equivalent to // stack_t. // +// +marshal // +stateify savable type SignalStack struct { Addr uint64 @@ -64,6 +66,7 @@ func (s *SignalStack) DeserializeTo(other *SignalStack) { // SignalInfo represents information about a signal being delivered, and is // equivalent to struct siginfo in linux kernel(linux/include/uapi/asm-generic/siginfo.h). // +// +marshal // +stateify savable type SignalInfo struct { Signo int32 // Signal number diff --git a/pkg/sentry/arch/signal_act.go b/pkg/sentry/arch/signal_act.go index f9ca2e74e..32173aa20 100644 --- a/pkg/sentry/arch/signal_act.go +++ b/pkg/sentry/arch/signal_act.go @@ -14,6 +14,8 @@ package arch +import "gvisor.dev/gvisor/tools/go_marshal/marshal" + // Special values for SignalAct.Handler. const ( // SignalActDefault is SIG_DFL and specifies that the default behavior for @@ -71,6 +73,8 @@ func (s SignalAct) HasRestorer() bool { // NativeSignalAct is a type that is equivalent to struct sigaction in the // guest architecture. type NativeSignalAct interface { + marshal.Marshallable + // SerializeFrom copies the data in the host SignalAct s into this object. 
SerializeFrom(s *SignalAct) diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go index e58f055c7..0fa738a1d 100644 --- a/pkg/sentry/arch/signal_stack.go +++ b/pkg/sentry/arch/signal_stack.go @@ -18,6 +18,7 @@ package arch import ( "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/tools/go_marshal/marshal" ) const ( @@ -55,6 +56,8 @@ func (s *SignalStack) Contains(sp usermem.Addr) bool { // NativeSignalStack is a type that is equivalent to stack_t in the guest // architecture. type NativeSignalStack interface { + marshal.Marshallable + // SerializeFrom copies the data in the host SignalStack s into this // object. SerializeFrom(s *SignalStack) diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 7d25e98f7..79766cafe 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -716,7 +716,7 @@ func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (a func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error { n := t.Arch().NewSignalAct() n.SerializeFrom(s) - _, err := t.CopyOut(addr, n) + _, err := n.CopyOut(t, addr) return err } @@ -725,7 +725,7 @@ func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error { func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) { n := t.Arch().NewSignalAct() var s arch.SignalAct - if _, err := t.CopyIn(addr, n); err != nil { + if _, err := n.CopyIn(t, addr); err != nil { return s, err } n.DeserializeTo(&s) @@ -737,7 +737,7 @@ func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) { func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error { n := t.Arch().NewSignalStack() n.SerializeFrom(s) - _, err := t.CopyOut(addr, n) + _, err := n.CopyOut(t, addr) return err } @@ -746,7 +746,7 @@ func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error func (t *Task) CopyInSignalStack(addr usermem.Addr) 
(arch.SignalStack, error) { n := t.Arch().NewSignalStack() var s arch.SignalStack - if _, err := t.CopyIn(addr, n); err != nil { + if _, err := n.CopyIn(t, addr); err != nil { return s, err } n.DeserializeTo(&s) diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go index 716198712..29d457a7e 100644 --- a/pkg/sentry/platform/kvm/kvm_arm64.go +++ b/pkg/sentry/platform/kvm/kvm_arm64.go @@ -17,8 +17,7 @@ package kvm import ( - "syscall" - + "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" ) @@ -37,7 +36,7 @@ type userFpsimdState struct { } type userRegs struct { - Regs syscall.PtraceRegs + Regs arch.Registers sp_el1 uint64 elr_el1 uint64 spsr [KVM_NR_SPSR]uint64 diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index c42752d50..6c8f4fa28 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -117,10 +117,10 @@ func TestKernelFloatingPoint(t *testing.T) { }) } -func applicationTest(t testHarness, useHostMappings bool, target func(), fn func(*vCPU, *syscall.PtraceRegs, *pagetables.PageTables) bool) { +func applicationTest(t testHarness, useHostMappings bool, target func(), fn func(*vCPU, *arch.Registers, *pagetables.PageTables) bool) { // Initialize registers & page tables. 
var ( - regs syscall.PtraceRegs + regs arch.Registers pt *pagetables.PageTables ) testutil.SetTestTarget(&regs, target) @@ -154,7 +154,7 @@ func applicationTest(t testHarness, useHostMappings bool, target func(), fn func } func TestApplicationSyscall(t *testing.T) { - applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -168,7 +168,7 @@ func TestApplicationSyscall(t *testing.T) { } return false }) - applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -184,7 +184,7 @@ func TestApplicationSyscall(t *testing.T) { } func TestApplicationFault(t *testing.T) { - applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, nil) // Cause fault. var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ @@ -199,7 +199,7 @@ func TestApplicationFault(t *testing.T) { } return false }) - applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, nil) // Cause fault. 
var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ @@ -216,7 +216,7 @@ func TestApplicationFault(t *testing.T) { } func TestRegistersSyscall(t *testing.T) { - applicationTest(t, true, testutil.TwiddleRegsSyscall, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.TwiddleRegsSyscall, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTestRegs(regs) // Fill values for all registers. for { var si arch.SignalInfo @@ -239,7 +239,7 @@ func TestRegistersSyscall(t *testing.T) { } func TestRegistersFault(t *testing.T) { - applicationTest(t, true, testutil.TwiddleRegsFault, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.TwiddleRegsFault, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTestRegs(regs) // Fill values for all registers. for { var si arch.SignalInfo @@ -263,7 +263,7 @@ func TestRegistersFault(t *testing.T) { } func TestSegments(t *testing.T) { - applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTestSegments(regs) for { var si arch.SignalInfo @@ -287,7 +287,7 @@ func TestSegments(t *testing.T) { } func TestBounce(t *testing.T) { - applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { go func() { time.Sleep(time.Millisecond) c.BounceToKernel() @@ -302,7 +302,7 @@ func TestBounce(t *testing.T) { } return false }) - applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.SpinLoop, func(c 
*vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { go func() { time.Sleep(time.Millisecond) c.BounceToKernel() @@ -321,7 +321,7 @@ func TestBounce(t *testing.T) { } func TestBounceStress(t *testing.T) { - applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { randomSleep := func() { // O(hundreds of microseconds) is appropriate to ensure // different overlaps and different schedules. @@ -357,7 +357,7 @@ func TestBounceStress(t *testing.T) { func TestInvalidate(t *testing.T) { var data uintptr // Used below. - applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, &data) // Read legitimate value. for { var si arch.SignalInfo @@ -398,7 +398,7 @@ func IsFault(err error, si *arch.SignalInfo) bool { } func TestEmptyAddressSpace(t *testing.T) { - applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -412,7 +412,7 @@ func TestEmptyAddressSpace(t *testing.T) { } return false }) - applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -471,7 +471,7 @@ func BenchmarkApplicationSyscall(b *testing.B) { i int // Iteration includes machine.Get() / 
machine.Put(). a int // Count for ErrContextInterrupt. ) - applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -493,7 +493,7 @@ func BenchmarkApplicationSyscall(b *testing.B) { func BenchmarkKernelSyscall(b *testing.B) { // Note that the target passed here is irrelevant, we never execute SwitchToUser. - applicationTest(b, true, testutil.Getpid, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + applicationTest(b, true, testutil.Getpid, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { // iteration does not include machine.Get() / machine.Put(). for i := 0; i < b.N; i++ { testutil.Getpid() @@ -508,7 +508,7 @@ func BenchmarkWorldSwitchToUserRoundtrip(b *testing.B) { i int a int ) - applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, diff --git a/pkg/sentry/platform/kvm/testutil/BUILD b/pkg/sentry/platform/kvm/testutil/BUILD index f7605df8a..f7feb8683 100644 --- a/pkg/sentry/platform/kvm/testutil/BUILD +++ b/pkg/sentry/platform/kvm/testutil/BUILD @@ -13,4 +13,5 @@ go_library( "testutil_arm64.s", ], visibility = ["//pkg/sentry/platform/kvm:__pkg__"], + deps = ["//pkg/sentry/arch"], ) diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go index 4c108abbf..8048eedec 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go @@ -18,19 +18,20 @@ package testutil 
import ( "reflect" - "syscall" + + "gvisor.dev/gvisor/pkg/sentry/arch" ) // TwiddleSegments reads segments into known registers. func TwiddleSegments() // SetTestTarget sets the rip appropriately. -func SetTestTarget(regs *syscall.PtraceRegs, fn func()) { +func SetTestTarget(regs *arch.Registers, fn func()) { regs.Rip = uint64(reflect.ValueOf(fn).Pointer()) } // SetTouchTarget sets rax appropriately. -func SetTouchTarget(regs *syscall.PtraceRegs, target *uintptr) { +func SetTouchTarget(regs *arch.Registers, target *uintptr) { if target != nil { regs.Rax = uint64(reflect.ValueOf(target).Pointer()) } else { @@ -39,12 +40,12 @@ func SetTouchTarget(regs *syscall.PtraceRegs, target *uintptr) { } // RewindSyscall rewinds a syscall RIP. -func RewindSyscall(regs *syscall.PtraceRegs) { +func RewindSyscall(regs *arch.Registers) { regs.Rip -= 2 } // SetTestRegs initializes registers to known values. -func SetTestRegs(regs *syscall.PtraceRegs) { +func SetTestRegs(regs *arch.Registers) { regs.R15 = 0x15 regs.R14 = 0x14 regs.R13 = 0x13 @@ -64,7 +65,7 @@ func SetTestRegs(regs *syscall.PtraceRegs) { } // CheckTestRegs checks that registers were twiddled per TwiddleRegs. -func CheckTestRegs(regs *syscall.PtraceRegs, full bool) (err error) { +func CheckTestRegs(regs *arch.Registers, full bool) (err error) { if need := ^uint64(0x15); regs.R15 != need { err = addRegisterMismatch(err, "R15", regs.R15, need) } @@ -121,13 +122,13 @@ var fsData uint64 = 0x55 var gsData uint64 = 0x85 // SetTestSegments initializes segments to known values. -func SetTestSegments(regs *syscall.PtraceRegs) { +func SetTestSegments(regs *arch.Registers) { regs.Fs_base = uint64(reflect.ValueOf(&fsData).Pointer()) regs.Gs_base = uint64(reflect.ValueOf(&gsData).Pointer()) } // CheckTestSegments checks that registers were twiddled per TwiddleSegments. 
-func CheckTestSegments(regs *syscall.PtraceRegs) (err error) { +func CheckTestSegments(regs *arch.Registers) (err error) { if regs.Rax != fsData { err = addRegisterMismatch(err, "Rax", regs.Rax, fsData) } diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go index 40b2e4acc..ca902c8c1 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go @@ -19,16 +19,17 @@ package testutil import ( "fmt" "reflect" - "syscall" + + "gvisor.dev/gvisor/pkg/sentry/arch" ) // SetTestTarget sets the rip appropriately. -func SetTestTarget(regs *syscall.PtraceRegs, fn func()) { +func SetTestTarget(regs *arch.Registers, fn func()) { regs.Pc = uint64(reflect.ValueOf(fn).Pointer()) } // SetTouchTarget sets rax appropriately. -func SetTouchTarget(regs *syscall.PtraceRegs, target *uintptr) { +func SetTouchTarget(regs *arch.Registers, target *uintptr) { if target != nil { regs.Regs[8] = uint64(reflect.ValueOf(target).Pointer()) } else { @@ -37,19 +38,19 @@ func SetTouchTarget(regs *syscall.PtraceRegs, target *uintptr) { } // RewindSyscall rewinds a syscall RIP. -func RewindSyscall(regs *syscall.PtraceRegs) { +func RewindSyscall(regs *arch.Registers) { regs.Pc -= 4 } // SetTestRegs initializes registers to known values. -func SetTestRegs(regs *syscall.PtraceRegs) { +func SetTestRegs(regs *arch.Registers) { for i := 0; i <= 30; i++ { regs.Regs[i] = uint64(i) + 1 } } // CheckTestRegs checks that registers were twiddled per TwiddleRegs. 
-func CheckTestRegs(regs *syscall.PtraceRegs, full bool) (err error) { +func CheckTestRegs(regs *arch.Registers, full bool) (err error) { for i := 0; i <= 30; i++ { if need := ^uint64(i + 1); regs.Regs[i] != need { err = addRegisterMismatch(err, fmt.Sprintf("R%d", i), regs.Regs[i], need) diff --git a/pkg/sentry/platform/ptrace/ptrace_amd64.go b/pkg/sentry/platform/ptrace/ptrace_amd64.go index 24fc5dc62..3b9a870a5 100644 --- a/pkg/sentry/platform/ptrace/ptrace_amd64.go +++ b/pkg/sentry/platform/ptrace/ptrace_amd64.go @@ -15,9 +15,8 @@ package ptrace import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" ) // fpRegSet returns the GETREGSET/SETREGSET register set type to be used. @@ -28,12 +27,12 @@ func fpRegSet(useXsave bool) uintptr { return linux.NT_PRFPREG } -func stackPointer(r *syscall.PtraceRegs) uintptr { +func stackPointer(r *arch.Registers) uintptr { return uintptr(r.Rsp) } // x86 use the fs_base register to store the TLS pointer which can be -// get/set in "func (t *thread) get/setRegs(regs *syscall.PtraceRegs)". +// get/set in "func (t *thread) get/setRegs(regs *arch.Registers)". // So both of the get/setTLS() operations are noop here. // getTLS gets the thread local storage register. diff --git a/pkg/sentry/platform/ptrace/ptrace_arm64.go b/pkg/sentry/platform/ptrace/ptrace_arm64.go index 4db28c534..5c869926a 100644 --- a/pkg/sentry/platform/ptrace/ptrace_arm64.go +++ b/pkg/sentry/platform/ptrace/ptrace_arm64.go @@ -15,9 +15,8 @@ package ptrace import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" ) // fpRegSet returns the GETREGSET/SETREGSET register set type to be used. 
@@ -25,6 +24,6 @@ func fpRegSet(_ bool) uintptr { return linux.NT_PRFPREG } -func stackPointer(r *syscall.PtraceRegs) uintptr { +func stackPointer(r *arch.Registers) uintptr { return uintptr(r.Sp) } diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go index 6c0ed7b3e..8b72d24e8 100644 --- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -24,7 +24,7 @@ import ( ) // getRegs gets the general purpose register set. -func (t *thread) getRegs(regs *syscall.PtraceRegs) error { +func (t *thread) getRegs(regs *arch.Registers) error { iovec := syscall.Iovec{ Base: (*byte)(unsafe.Pointer(regs)), Len: uint64(unsafe.Sizeof(*regs)), @@ -43,7 +43,7 @@ func (t *thread) getRegs(regs *syscall.PtraceRegs) error { } // setRegs sets the general purpose register set. -func (t *thread) setRegs(regs *syscall.PtraceRegs) error { +func (t *thread) setRegs(regs *arch.Registers) error { iovec := syscall.Iovec{ Base: (*byte)(unsafe.Pointer(regs)), Len: uint64(unsafe.Sizeof(*regs)), diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 773ddb1ed..2389423b0 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -63,7 +63,7 @@ type thread struct { // initRegs are the initial registers for the first thread. // // These are used for the register set for system calls. - initRegs syscall.PtraceRegs + initRegs arch.Registers } // threadPool is a collection of threads. @@ -317,7 +317,7 @@ const ( ) func (t *thread) dumpAndPanic(message string) { - var regs syscall.PtraceRegs + var regs arch.Registers message += "\n" if err := t.getRegs(&regs); err == nil { message += dumpRegs(&regs) @@ -423,7 +423,7 @@ func (t *thread) init() { // This is _not_ for use by application system calls, rather it is for use when // a system call must be injected into the remote context (e.g. mmap, munmap).
// Note that clones are handled separately. -func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) { +func (t *thread) syscall(regs *arch.Registers) (uintptr, error) { // Set registers. if err := t.setRegs(regs); err != nil { panic(fmt.Sprintf("ptrace set regs failed: %v", err)) @@ -461,7 +461,7 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) { // syscallIgnoreInterrupt ignores interrupts on the system call thread and // restarts the syscall if the kernel indicates that should happen. func (t *thread) syscallIgnoreInterrupt( - initRegs *syscall.PtraceRegs, + initRegs *arch.Registers, sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) { for { diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index cd74945e7..84b699f0d 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -41,7 +41,7 @@ const ( // resetSysemuRegs sets up emulation registers. // // This should be called prior to calling sysemu. -func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) { +func (t *thread) resetSysemuRegs(regs *arch.Registers) { regs.Cs = t.initRegs.Cs regs.Ss = t.initRegs.Ss regs.Ds = t.initRegs.Ds @@ -53,7 +53,7 @@ func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) { // createSyscallRegs sets up syscall registers. // // This should be called to generate registers for a system call. -func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs { +func createSyscallRegs(initRegs *arch.Registers, sysno uintptr, args ...arch.SyscallArgument) arch.Registers { // Copy initial registers. regs := *initRegs @@ -82,18 +82,18 @@ func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch } // isSingleStepping determines if the registers indicate single-stepping. 
-func isSingleStepping(regs *syscall.PtraceRegs) bool { +func isSingleStepping(regs *arch.Registers) bool { return (regs.Eflags & arch.X86TrapFlag) != 0 } // updateSyscallRegs updates registers after finishing sysemu. -func updateSyscallRegs(regs *syscall.PtraceRegs) { +func updateSyscallRegs(regs *arch.Registers) { // Ptrace puts -ENOSYS in rax on syscall-enter-stop. regs.Rax = regs.Orig_rax } // syscallReturnValue extracts a sensible return from registers. -func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) { +func syscallReturnValue(regs *arch.Registers) (uintptr, error) { rval := int64(regs.Rax) if rval < 0 { return 0, syscall.Errno(-rval) @@ -101,7 +101,7 @@ func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) { return uintptr(rval), nil } -func dumpRegs(regs *syscall.PtraceRegs) string { +func dumpRegs(regs *arch.Registers) string { var m strings.Builder fmt.Fprintf(&m, "Registers:\n") @@ -143,7 +143,7 @@ func (t *thread) adjustInitRegsRip() { } // Pass the expected PPID to the child via R15 when creating stub process. -func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) { +func initChildProcessPPID(initregs *arch.Registers, ppid int32) { initregs.R15 = uint64(ppid) // Rbx has to be set to 1 when creating stub process. initregs.Rbx = 1 @@ -156,7 +156,7 @@ func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) { // // Note that this should only be called after verifying that the signalInfo has // been generated by the kernel. 
-func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) { +func patchSignalInfo(regs *arch.Registers, signalInfo *arch.SignalInfo) { if linux.Signal(signalInfo.Signo) == linux.SIGSYS { signalInfo.Signo = int32(linux.SIGSEGV) diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go index 7f5c393f0..bd618fae8 100644 --- a/pkg/sentry/platform/ptrace/subprocess_arm64.go +++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go @@ -41,13 +41,13 @@ const ( // resetSysemuRegs sets up emulation registers. // // This should be called prior to calling sysemu. -func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) { +func (t *thread) resetSysemuRegs(regs *arch.Registers) { } // createSyscallRegs sets up syscall registers. // // This should be called to generate registers for a system call. -func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs { +func createSyscallRegs(initRegs *arch.Registers, sysno uintptr, args ...arch.SyscallArgument) arch.Registers { // Copy initial registers (Pc, Sp, etc.). regs := *initRegs @@ -78,7 +78,7 @@ func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch } // isSingleStepping determines if the registers indicate single-stepping. -func isSingleStepping(regs *syscall.PtraceRegs) bool { +func isSingleStepping(regs *arch.Registers) bool { // Refer to the ARM SDM D2.12.3: software step state machine // return (regs.Pstate.SS == 1) && (MDSCR_EL1.SS == 1). // @@ -89,13 +89,13 @@ func isSingleStepping(regs *syscall.PtraceRegs) bool { } // updateSyscallRegs updates registers after finishing sysemu. -func updateSyscallRegs(regs *syscall.PtraceRegs) { +func updateSyscallRegs(regs *arch.Registers) { // No special work is necessary. return } // syscallReturnValue extracts a sensible return from registers. 
-func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) { +func syscallReturnValue(regs *arch.Registers) (uintptr, error) { rval := int64(regs.Regs[0]) if rval < 0 { return 0, syscall.Errno(-rval) @@ -103,7 +103,7 @@ func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) { return uintptr(rval), nil } -func dumpRegs(regs *syscall.PtraceRegs) string { +func dumpRegs(regs *arch.Registers) string { var m strings.Builder fmt.Fprintf(&m, "Registers:\n") @@ -125,7 +125,7 @@ func (t *thread) adjustInitRegsRip() { } // Pass the expected PPID to the child via X7 when creating stub process -func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) { +func initChildProcessPPID(initregs *arch.Registers, ppid int32) { initregs.Regs[7] = uint64(ppid) // R9 has to be set to 1 when creating stub process. initregs.Regs[9] = 1 @@ -138,7 +138,7 @@ func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) { // // Note that this should only be called after verifying that the signalInfo has // been generated by the kernel. 
-func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) { +func patchSignalInfo(regs *arch.Registers, signalInfo *arch.SignalInfo) { if linux.Signal(signalInfo.Signo) == linux.SIGSYS { signalInfo.Signo = int32(linux.SIGSEGV) diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD index b69520030..679b287c3 100644 --- a/pkg/sentry/platform/ring0/BUILD +++ b/pkg/sentry/platform/ring0/BUILD @@ -79,6 +79,7 @@ go_library( deps = [ "//pkg/cpuid", "//pkg/safecopy", + "//pkg/sentry/arch", "//pkg/sentry/platform/ring0/pagetables", "//pkg/usermem", ], diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go index 86fd5ed58..e6daf24df 100644 --- a/pkg/sentry/platform/ring0/defs.go +++ b/pkg/sentry/platform/ring0/defs.go @@ -15,8 +15,7 @@ package ring0 import ( - "syscall" - + "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" ) @@ -72,7 +71,7 @@ type CPU struct { // registers is a set of registers; these may be used on kernel system // calls and exceptions via the Registers function. - registers syscall.PtraceRegs + registers arch.Registers // hooks are kernel hooks. hooks Hooks @@ -83,14 +82,14 @@ type CPU struct { // This is explicitly safe to call during KernelException and KernelSyscall. // //go:nosplit -func (c *CPU) Registers() *syscall.PtraceRegs { +func (c *CPU) Registers() *arch.Registers { return &c.registers } // SwitchOpts are passed to the Switch function. type SwitchOpts struct { // Registers are the user register state. - Registers *syscall.PtraceRegs + Registers *arch.Registers // FloatingPointState is a byte pointer where floating point state is // saved and restored. 
diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go index a5ce67885..7fa43c2f5 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.go +++ b/pkg/sentry/platform/ring0/entry_amd64.go @@ -17,7 +17,7 @@ package ring0 import ( - "syscall" + "gvisor.dev/gvisor/pkg/sentry/arch" ) // This is an assembly function. @@ -41,7 +41,7 @@ func swapgs() // The return code is the vector that interrupted execution. // // See stubs.go for a note regarding the frame size of this function. -func sysret(*CPU, *syscall.PtraceRegs) Vector +func sysret(*CPU, *arch.Registers) Vector // "iret is the cadillac of CPL switching." // @@ -50,7 +50,7 @@ func sysret(*CPU, *syscall.PtraceRegs) Vector // iret is nearly identical to sysret, except an iret is used to fully restore // all user state. This must be called in cases where all registers need to be // restored. -func iret(*CPU, *syscall.PtraceRegs) Vector +func iret(*CPU, *arch.Registers) Vector // exception is the generic exception entry. // diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD index 4cae10459..549f3d228 100644 --- a/pkg/sentry/platform/ring0/gen_offsets/BUILD +++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD @@ -27,6 +27,7 @@ go_binary( visibility = ["//pkg/sentry/platform/ring0:__pkg__"], deps = [ "//pkg/cpuid", + "//pkg/sentry/arch", "//pkg/sentry/platform/ring0/pagetables", "//pkg/usermem", ], diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go index 85cc3fdad..b8ab120a0 100644 --- a/pkg/sentry/platform/ring0/offsets_amd64.go +++ b/pkg/sentry/platform/ring0/offsets_amd64.go @@ -20,7 +20,8 @@ import ( "fmt" "io" "reflect" - "syscall" + + "gvisor.dev/gvisor/pkg/sentry/arch" ) // Emit prints architecture-specific offsets. 
@@ -64,7 +65,7 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define SyscallInt80 0x%02x\n", SyscallInt80) fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall) - p := &syscall.PtraceRegs{} + p := &arch.Registers{} fmt.Fprintf(w, "\n// Ptrace registers.\n") fmt.Fprintf(w, "#define PTRACE_R15 0x%02x\n", reflect.ValueOf(&p.R15).Pointer()-reflect.ValueOf(p).Pointer()) fmt.Fprintf(w, "#define PTRACE_R14 0x%02x\n", reflect.ValueOf(&p.R14).Pointer()-reflect.ValueOf(p).Pointer()) diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go index 057fb5c69..f3de962f0 100644 --- a/pkg/sentry/platform/ring0/offsets_arm64.go +++ b/pkg/sentry/platform/ring0/offsets_arm64.go @@ -20,7 +20,8 @@ import ( "fmt" "io" "reflect" - "syscall" + + "gvisor.dev/gvisor/pkg/sentry/arch" ) // Emit prints architecture-specific offsets. @@ -87,7 +88,7 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall) fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException) - p := &syscall.PtraceRegs{} + p := &arch.Registers{} fmt.Fprintf(w, "\n// Ptrace registers.\n") fmt.Fprintf(w, "#define PTRACE_R0 0x%02x\n", reflect.ValueOf(&p.Regs[0]).Pointer()-reflect.ValueOf(p).Pointer()) fmt.Fprintf(w, "#define PTRACE_R1 0x%02x\n", reflect.ValueOf(&p.Regs[1]).Pointer()-reflect.ValueOf(p).Pointer()) diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index 7e1747a0c..582d37e03 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -355,7 +355,7 @@ func Pause(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall func RtSigpending(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() pending := t.PendingSignals() - _, err := t.CopyOut(addr, pending) + _, err := pending.CopyOut(t, addr) return 0, nil, err } @@ -392,7 +392,7 @@ func RtSigtimedwait(t *kernel.Task, args 
arch.SyscallArguments) (uintptr, *kerne if siginfo != 0 { si.FixSignalCodeForUser() - if _, err := t.CopyOut(siginfo, si); err != nil { + if _, err := si.CopyOut(t, siginfo); err != nil { return 0, nil, err } } @@ -411,7 +411,7 @@ func RtSigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne // same way), and that the code is in the allowed set. This same logic // appears below in RtSigtgqueueinfo and should be kept in sync. var info arch.SignalInfo - if _, err := t.CopyIn(infoAddr, &info); err != nil { + if _, err := info.CopyIn(t, infoAddr); err != nil { return 0, nil, err } info.Signo = int32(sig) @@ -455,7 +455,7 @@ func RtTgsigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ker // Copy in the info. See RtSigqueueinfo above. var info arch.SignalInfo - if _, err := t.CopyIn(infoAddr, &info); err != nil { + if _, err := info.CopyIn(t, infoAddr); err != nil { return 0, nil, err } info.Signo = int32(sig) @@ -485,7 +485,7 @@ func RtSigsuspend(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. // Copy in the signal mask. var mask linux.SignalSet - if _, err := t.CopyIn(sigset, &mask); err != nil { + if _, err := mask.CopyIn(t, sigset); err != nil { return 0, nil, err } mask &^= kernel.UnblockableSignals -- cgit v1.2.3 From 292f3f99b73fb901ffdd3ad8ac682718e1e8960a Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 27 Apr 2020 07:37:45 -0700 Subject: Don't leak vfs.MountNamespace reference if kernel.TaskSet.NewTask fails. 
PiperOrigin-RevId: 308617610 --- pkg/sentry/kernel/task_start.go | 3 +++ 1 file changed, 3 insertions(+) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index a5035bb7f..8485fb4b6 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -104,6 +104,9 @@ func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { cfg.TaskContext.release() cfg.FSContext.DecRef() cfg.FDTable.DecRef() + if cfg.MountNamespaceVFS2 != nil { + cfg.MountNamespaceVFS2.DecRef() + } return nil, err } return t, nil -- cgit v1.2.3 From 2c986870e35f967c88ebc1b7df7b576aad2c46d4 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 4 May 2020 10:39:36 -0700 Subject: Fix flaky monotonic time. This change ensures that even platforms with some TSC issues (e.g. KVM), can get reliable monotonic time by applying a lower bound on each read. PiperOrigin-RevId: 309773801 --- pkg/sentry/kernel/timekeeper.go | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index dc99301de..da0ea7bb5 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -16,6 +16,7 @@ package kernel import ( "fmt" + "sync/atomic" "time" "gvisor.dev/gvisor/pkg/log" @@ -48,6 +49,9 @@ type Timekeeper struct { // It is set only once, by SetClocks. monotonicOffset int64 `state:"nosave"` + // monotonicLowerBound is the lowerBound for monotonic time. + monotonicLowerBound int64 `state:"nosave"` + // restored, if non-nil, indicates that this Timekeeper was restored // from a state file. The clocks are not set until restored is closed.
restored chan struct{} `state:"nosave"` @@ -271,6 +275,21 @@ func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error) { now, err := t.clocks.GetTime(c) if err == nil && c == sentrytime.Monotonic { now += t.monotonicOffset + for { + // It's possible that the clock is shaky. This may be due to + // platform issues, e.g. the KVM platform relies on the guest + // TSC and host TSC, which may not be perfectly in sync. To + // work around this issue, ensure that the monotonic time is + // always bounded by the last time read. + oldLowerBound := atomic.LoadInt64(&t.monotonicLowerBound) + if now < oldLowerBound { + now = oldLowerBound + break + } + if atomic.CompareAndSwapInt64(&t.monotonicLowerBound, oldLowerBound, now) { + break + } + } } return now, err } -- cgit v1.2.3 From 279f1eb7abb28966ef633fa61418bffad4a716b0 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 6 May 2020 14:12:08 -0700 Subject: Fix runsc syscall documentation generation. We can register any number of tables with any number of architectures, and need not limit the definitions to the architecture in question. This allows runsc to generate documentation for all architectures simultaneously. Similarly, this simplifies the VFSv2 patching process. 
PiperOrigin-RevId: 310224827 --- pkg/sentry/kernel/syscalls.go | 58 +- pkg/sentry/syscalls/linux/BUILD | 2 - pkg/sentry/syscalls/linux/linux64.go | 709 +++++++++++++++++++++ pkg/sentry/syscalls/linux/linux64_amd64.go | 406 ------------ pkg/sentry/syscalls/linux/linux64_arm64.go | 340 ---------- pkg/sentry/syscalls/linux/vfs2/BUILD | 4 +- pkg/sentry/syscalls/linux/vfs2/linux64.go | 16 - .../syscalls/linux/vfs2/linux64_override_amd64.go | 165 ----- .../syscalls/linux/vfs2/linux64_override_arm64.go | 27 - pkg/sentry/syscalls/linux/vfs2/vfs2.go | 172 +++++ runsc/boot/BUILD | 4 - runsc/boot/loader.go | 12 +- runsc/boot/loader_amd64.go | 26 - runsc/boot/loader_arm64.go | 26 - runsc/boot/loader_test.go | 12 - runsc/cmd/syscalls.go | 23 +- 16 files changed, 930 insertions(+), 1072 deletions(-) delete mode 100644 pkg/sentry/syscalls/linux/linux64_amd64.go delete mode 100644 pkg/sentry/syscalls/linux/linux64_arm64.go delete mode 100644 pkg/sentry/syscalls/linux/vfs2/linux64.go delete mode 100644 pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go delete mode 100644 pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/vfs2.go delete mode 100644 runsc/boot/loader_amd64.go delete mode 100644 runsc/boot/loader_arm64.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 84156d5a1..413111faf 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -29,7 +29,7 @@ import ( // // The types below create fast lookup slices for all syscalls. This maximum // serves as a sanity check that we don't allocate huge slices for a very large -// syscall. +// syscall. This is checked during registration. const maxSyscallNum = 2000 // SyscallSupportLevel is a syscall support levels. @@ -266,6 +266,16 @@ type SyscallTable struct { FeatureEnable SyscallFlagsTable } +// MaxSysno returns the largest system call number. 
+func (s *SyscallTable) MaxSysno() (max uintptr) { + for num := range s.Table { + if num > max { + max = num + } + } + return max +} + // allSyscallTables contains all known tables. var allSyscallTables []*SyscallTable @@ -286,6 +296,20 @@ func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) { // RegisterSyscallTable registers a new syscall table for use by a Kernel. func RegisterSyscallTable(s *SyscallTable) { + if max := s.MaxSysno(); max > maxSyscallNum { + panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max)) + } + if _, ok := LookupSyscallTable(s.OS, s.Arch); ok { + panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch)) + } + allSyscallTables = append(allSyscallTables, s) + s.Init() +} + +// Init initializes the system call table. +// +// This should normally be called only during registration. +func (s *SyscallTable) Init() { if s.Table == nil { // Ensure non-nil lookup table. s.Table = make(map[uintptr]Syscall) @@ -295,42 +319,16 @@ func RegisterSyscallTable(s *SyscallTable) { s.Emulate = make(map[usermem.Addr]uintptr) } - var max uintptr - for num := range s.Table { - if num > max { - max = num - } - } - - if max > maxSyscallNum { - panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max)) - } - - s.lookup = make([]SyscallFn, max+1) + max := s.MaxSysno() // Checked during RegisterSyscallTable. // Initialize the fast-lookup table. + s.lookup = make([]SyscallFn, max+1) for num, sc := range s.Table { s.lookup[num] = sc.Fn } + // Initialize all features. s.FeatureEnable.init(s.Table, max) - - if _, ok := LookupSyscallTable(s.OS, s.Arch); ok { - panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch)) - } - - // Save a reference to this table. - // - // This is required for a Kernel to find the table and for save/restore - // operations below. 
- allSyscallTables = append(allSyscallTables, s) -} - -// FlushSyscallTablesTestOnly flushes the syscall tables for tests. Used for -// parameterized VFSv2 tests. -// TODO(gvisor.dv/issue/1624): Remove when VFS1 is no longer supported. -func FlushSyscallTablesTestOnly() { - allSyscallTables = nil } // Lookup returns the syscall implementation, if one exists. diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 0d24fd3c4..245e8fe1e 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -8,8 +8,6 @@ go_library( "error.go", "flags.go", "linux64.go", - "linux64_amd64.go", - "linux64_arm64.go", "sigset.go", "sys_aio.go", "sys_capability.go", diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 68589a377..ea4f9b1a7 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -15,6 +15,16 @@ // Package linux provides syscall tables for amd64 Linux. package linux +import ( + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + const ( // LinuxSysname is the OS name advertised by gVisor. LinuxSysname = "Linux" @@ -25,3 +35,702 @@ const ( // LinuxVersion is the version info advertised by gVisor. LinuxVersion = "#1 SMP Sun Jan 10 15:06:54 PST 2016" ) + +// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall +// numbers from Linux 4.4. +var AMD64 = &kernel.SyscallTable{ + OS: abi.Linux, + Arch: arch.AMD64, + Version: kernel.Version{ + // Version 4.4 is chosen as a stable, longterm version of Linux, which + // guides the interface provided by this syscall table. The build + // version is that for a clean build with default kernel config, at 5 + // minutes after v4.4 was tagged. 
+ Sysname: LinuxSysname, + Release: LinuxRelease, + Version: LinuxVersion, + }, + AuditNumber: linux.AUDIT_ARCH_X86_64, + Table: map[uintptr]kernel.Syscall{ + 0: syscalls.Supported("read", Read), + 1: syscalls.Supported("write", Write), + 2: syscalls.PartiallySupported("open", Open, "Options O_DIRECT, O_NOATIME, O_PATH, O_TMPFILE, O_SYNC are not supported.", nil), + 3: syscalls.Supported("close", Close), + 4: syscalls.Supported("stat", Stat), + 5: syscalls.Supported("fstat", Fstat), + 6: syscalls.Supported("lstat", Lstat), + 7: syscalls.Supported("poll", Poll), + 8: syscalls.Supported("lseek", Lseek), + 9: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil), + 10: syscalls.Supported("mprotect", Mprotect), + 11: syscalls.Supported("munmap", Munmap), + 12: syscalls.Supported("brk", Brk), + 13: syscalls.Supported("rt_sigaction", RtSigaction), + 14: syscalls.Supported("rt_sigprocmask", RtSigprocmask), + 15: syscalls.Supported("rt_sigreturn", RtSigreturn), + 16: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil), + 17: syscalls.Supported("pread64", Pread64), + 18: syscalls.Supported("pwrite64", Pwrite64), + 19: syscalls.Supported("readv", Readv), + 20: syscalls.Supported("writev", Writev), + 21: syscalls.Supported("access", Access), + 22: syscalls.Supported("pipe", Pipe), + 23: syscalls.Supported("select", Select), + 24: syscalls.Supported("sched_yield", SchedYield), + 25: syscalls.Supported("mremap", Mremap), + 26: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil), + 27: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. 
Reports all mapped pages are resident.", nil), + 28: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), + 29: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), + 30: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil), + 31: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), + 32: syscalls.Supported("dup", Dup), + 33: syscalls.Supported("dup2", Dup2), + 34: syscalls.Supported("pause", Pause), + 35: syscalls.Supported("nanosleep", Nanosleep), + 36: syscalls.Supported("getitimer", Getitimer), + 37: syscalls.Supported("alarm", Alarm), + 38: syscalls.Supported("setitimer", Setitimer), + 39: syscalls.Supported("getpid", Getpid), + 40: syscalls.Supported("sendfile", Sendfile), + 41: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil), + 42: syscalls.Supported("connect", Connect), + 43: syscalls.Supported("accept", Accept), + 44: syscalls.Supported("sendto", SendTo), + 45: syscalls.Supported("recvfrom", RecvFrom), + 46: syscalls.Supported("sendmsg", SendMsg), + 47: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil), + 48: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil), + 49: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil), + 50: syscalls.Supported("listen", Listen), + 51: syscalls.Supported("getsockname", GetSockName), + 52: syscalls.Supported("getpeername", GetPeerName), + 53: syscalls.Supported("socketpair", SocketPair), + 54: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil), + 55: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options 
are supported.", nil), + 56: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), + 57: syscalls.Supported("fork", Fork), + 58: syscalls.Supported("vfork", Vfork), + 59: syscalls.Supported("execve", Execve), + 60: syscalls.Supported("exit", Exit), + 61: syscalls.Supported("wait4", Wait4), + 62: syscalls.Supported("kill", Kill), + 63: syscalls.Supported("uname", Uname), + 64: syscalls.Supported("semget", Semget), + 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), + 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), + 67: syscalls.Supported("shmdt", Shmdt), + 68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 70: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 71: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), + 73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), + 74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), + 75: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil), + 76: syscalls.Supported("truncate", Truncate), + 77: syscalls.Supported("ftruncate", Ftruncate), + 78: syscalls.Supported("getdents", Getdents), + 79: syscalls.Supported("getcwd", Getcwd), + 80: syscalls.Supported("chdir", Chdir), + 81: syscalls.Supported("fchdir", Fchdir), + 82: syscalls.Supported("rename", Rename), + 83: 
syscalls.Supported("mkdir", Mkdir), + 84: syscalls.Supported("rmdir", Rmdir), + 85: syscalls.Supported("creat", Creat), + 86: syscalls.Supported("link", Link), + 87: syscalls.Supported("unlink", Unlink), + 88: syscalls.Supported("symlink", Symlink), + 89: syscalls.Supported("readlink", Readlink), + 90: syscalls.Supported("chmod", Chmod), + 91: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil), + 92: syscalls.Supported("chown", Chown), + 93: syscalls.Supported("fchown", Fchown), + 94: syscalls.Supported("lchown", Lchown), + 95: syscalls.Supported("umask", Umask), + 96: syscalls.Supported("gettimeofday", Gettimeofday), + 97: syscalls.Supported("getrlimit", Getrlimit), + 98: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil), + 99: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), + 100: syscalls.Supported("times", Times), + 101: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil), + 102: syscalls.Supported("getuid", Getuid), + 103: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil), + 104: syscalls.Supported("getgid", Getgid), + 105: syscalls.Supported("setuid", Setuid), + 106: syscalls.Supported("setgid", Setgid), + 107: syscalls.Supported("geteuid", Geteuid), + 108: syscalls.Supported("getegid", Getegid), + 109: syscalls.Supported("setpgid", Setpgid), + 110: syscalls.Supported("getppid", Getppid), + 111: syscalls.Supported("getpgrp", Getpgrp), + 112: syscalls.Supported("setsid", Setsid), + 113: syscalls.Supported("setreuid", Setreuid), + 114: syscalls.Supported("setregid", Setregid), + 115: syscalls.Supported("getgroups", Getgroups), + 116: 
syscalls.Supported("setgroups", Setgroups), + 117: syscalls.Supported("setresuid", Setresuid), + 118: syscalls.Supported("getresuid", Getresuid), + 119: syscalls.Supported("setresgid", Setresgid), + 120: syscalls.Supported("getresgid", Getresgid), + 121: syscalls.Supported("getpgid", Getpgid), + 122: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 123: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 124: syscalls.Supported("getsid", Getsid), + 125: syscalls.Supported("capget", Capget), + 126: syscalls.Supported("capset", Capset), + 127: syscalls.Supported("rt_sigpending", RtSigpending), + 128: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait), + 129: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo), + 130: syscalls.Supported("rt_sigsuspend", RtSigsuspend), + 131: syscalls.Supported("sigaltstack", Sigaltstack), + 132: syscalls.Supported("utime", Utime), + 133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. 
Only regular file and FIFO creation are supported.", nil), + 134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil), + 135: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), + 136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil), + 137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), + 138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), + 139: syscalls.ErrorWithEvent("sysfs", syserror.ENOSYS, "", []string{"gvisor.dev/issue/165"}), + 140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), + 141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), + 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), + 143: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil), + 144: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil), + 145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), + 146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), + 147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), + 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), + 149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. 
The sandbox lacks appropriate permissions.", nil), + 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), + 154: syscalls.Error("modify_ldt", syserror.EPERM, "", nil), + 155: syscalls.Error("pivot_root", syserror.EPERM, "", nil), + 156: syscalls.Error("sysctl", syserror.EPERM, "Deprecated. Use /proc/sys instead.", nil), + 157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), + 158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil), + 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), + 160: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil), + 161: syscalls.Supported("chroot", Chroot), + 162: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil), + 163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), + 164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), + 165: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), + 166: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), + 167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), + 168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), + 169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), + 170: syscalls.Supported("sethostname", Sethostname), + 171: syscalls.Supported("setdomainname", Setdomainname), + 172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil), + 173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil), + 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil), + 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), + 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), + 177: syscalls.Error("get_kernel_syms", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), + 178: 
syscalls.Error("query_module", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), + 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations + 180: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), + 181: syscalls.Error("getpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), + 182: syscalls.Error("putpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), + 183: syscalls.Error("afs_syscall", syserror.ENOSYS, "Not implemented in Linux.", nil), + 184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil), + 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil), + 186: syscalls.Supported("gettid", Gettid), + 187: syscalls.Supported("readahead", Readahead), + 188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), + 189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil), + 190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil), + 191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), + 192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), + 193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), + 194: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil), + 195: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil), + 196: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil), + 197: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil), + 198: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil), + 199: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil), + 200: syscalls.Supported("tkill", Tkill), + 
201: syscalls.Supported("time", Time), + 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), + 203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), + 204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), + 205: syscalls.Error("set_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. 
User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 211: syscalls.Error("get_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), + 213: syscalls.Supported("epoll_create", EpollCreate), + 214: syscalls.ErrorWithEvent("epoll_ctl_old", syserror.ENOSYS, "Deprecated.", nil), + 215: syscalls.ErrorWithEvent("epoll_wait_old", syserror.ENOSYS, "Deprecated.", nil), + 216: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), + 217: syscalls.Supported("getdents64", Getdents64), + 218: syscalls.Supported("set_tid_address", SetTidAddress), + 219: syscalls.Supported("restart_syscall", RestartSyscall), + 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), + 221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), + 222: syscalls.Supported("timer_create", TimerCreate), + 223: syscalls.Supported("timer_settime", TimerSettime), + 224: syscalls.Supported("timer_gettime", TimerGettime), + 225: syscalls.Supported("timer_getoverrun", TimerGetoverrun), + 226: syscalls.Supported("timer_delete", TimerDelete), + 227: syscalls.Supported("clock_settime", ClockSettime), + 228: syscalls.Supported("clock_gettime", ClockGettime), + 229: syscalls.Supported("clock_getres", ClockGetres), + 230: syscalls.Supported("clock_nanosleep", ClockNanosleep), + 231: syscalls.Supported("exit_group", ExitGroup), + 232: syscalls.Supported("epoll_wait", EpollWait), + 233: syscalls.Supported("epoll_ctl", EpollCtl), + 234: syscalls.Supported("tgkill", Tgkill), + 235: syscalls.Supported("utimes", Utimes), + 236: syscalls.Error("vserver", syserror.ENOSYS, "Not implemented by Linux", nil), + 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. 
Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), + 238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), + 239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), + 240: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 241: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 242: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 243: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 244: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), + 247: syscalls.Supported("waitid", Waitid), + 248: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), + 249: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), + 250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), + 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil), + 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), + 255: 
syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), + 256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), + 257: syscalls.Supported("openat", Openat), + 258: syscalls.Supported("mkdirat", Mkdirat), + 259: syscalls.Supported("mknodat", Mknodat), + 260: syscalls.Supported("fchownat", Fchownat), + 261: syscalls.Supported("futimesat", Futimesat), + 262: syscalls.Supported("fstatat", Fstatat), + 263: syscalls.Supported("unlinkat", Unlinkat), + 264: syscalls.Supported("renameat", Renameat), + 265: syscalls.Supported("linkat", Linkat), + 266: syscalls.Supported("symlinkat", Symlinkat), + 267: syscalls.Supported("readlinkat", Readlinkat), + 268: syscalls.Supported("fchmodat", Fchmodat), + 269: syscalls.Supported("faccessat", Faccessat), + 270: syscalls.Supported("pselect", Pselect), + 271: syscalls.Supported("ppoll", Ppoll), + 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. 
Network namespaces supported but must be empty.", nil), + 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 275: syscalls.Supported("splice", Splice), + 276: syscalls.Supported("tee", Tee), + 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), + 278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) + 280: syscalls.Supported("utimensat", Utimensat), + 281: syscalls.Supported("epoll_pwait", EpollPwait), + 282: syscalls.PartiallySupported("signalfd", Signalfd, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), + 283: syscalls.Supported("timerfd_create", TimerfdCreate), + 284: syscalls.Supported("eventfd", Eventfd), + 285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), + 286: syscalls.Supported("timerfd_settime", TimerfdSettime), + 287: syscalls.Supported("timerfd_gettime", TimerfdGettime), + 288: syscalls.Supported("accept4", Accept4), + 289: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), + 290: syscalls.Supported("eventfd2", Eventfd2), + 291: syscalls.Supported("epoll_create1", EpollCreate1), + 292: syscalls.Supported("dup3", Dup3), + 293: syscalls.Supported("pipe2", Pipe2), + 294: syscalls.Supported("inotify_init1", InotifyInit1), + 295: syscalls.Supported("preadv", Preadv), + 296: syscalls.Supported("pwritev", Pwritev), + 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), + 298: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), + 299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are 
supported.", nil), + 300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 302: syscalls.Supported("prlimit64", Prlimit64), + 303: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 304: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), + 306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), + 307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), + 308: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) + 309: syscalls.Supported("getcpu", Getcpu), + 310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 311: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), + 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), + 314: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 315: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 316: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) + 317: syscalls.Supported("seccomp", Seccomp), + 318: syscalls.Supported("getrandom", GetRandom), + 319: syscalls.Supported("memfd_create", MemfdCreate), + 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil), + 321: 
syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), + 322: syscalls.Supported("execveat", Execveat), + 323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) + 324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) + 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + + // Syscalls implemented after 325 are "backports" from versions + // of Linux after 4.4. + 326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), + 327: syscalls.Supported("preadv2", Preadv2), + 328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), + 329: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), + 330: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), + 331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), + 332: syscalls.Supported("statx", Statx), + 333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), + 334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), + + // Linux skips ahead to syscall 424 to sync numbers between arches. 
+ 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), + 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), + 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), + 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), + 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), + 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), + 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), + 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), + 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), + 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), + 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), + 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), + }, + Emulate: map[usermem.Addr]uintptr{ + 0xffffffffff600000: 96, // vsyscall gettimeofday(2) + 0xffffffffff600400: 201, // vsyscall time(2) + 0xffffffffff600800: 309, // vsyscall getcpu(2) + }, + Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { + t.Kernel().EmitUnimplementedEvent(t) + return 0, syserror.ENOSYS + }, +} + +// ARM64 is a table of Linux arm64 syscall API with the corresponding syscall +// numbers from Linux 4.4. +var ARM64 = &kernel.SyscallTable{ + OS: abi.Linux, + Arch: arch.ARM64, + Version: kernel.Version{ + Sysname: LinuxSysname, + Release: LinuxRelease, + Version: LinuxVersion, + }, + AuditNumber: linux.AUDIT_ARCH_AARCH64, + Table: map[uintptr]kernel.Syscall{ + 0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. 
User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 5: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), + 6: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil), + 7: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil), + 8: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), + 9: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), + 10: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), + 11: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil), + 12: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil), + 13: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil), + 14: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil), + 15: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil), + 16: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil), + 17: syscalls.Supported("getcwd", Getcwd), + 18: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), + 19: syscalls.Supported("eventfd2", Eventfd2), + 20: syscalls.Supported("epoll_create1", EpollCreate1), + 21: syscalls.Supported("epoll_ctl", 
EpollCtl), + 22: syscalls.Supported("epoll_pwait", EpollPwait), + 23: syscalls.Supported("dup", Dup), + 24: syscalls.Supported("dup3", Dup3), + 25: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), + 26: syscalls.Supported("inotify_init1", InotifyInit1), + 27: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), + 28: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), + 29: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil), + 30: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 31: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 32: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), + 33: syscalls.Supported("mknodat", Mknodat), + 34: syscalls.Supported("mkdirat", Mkdirat), + 35: syscalls.Supported("unlinkat", Unlinkat), + 36: syscalls.Supported("symlinkat", Symlinkat), + 37: syscalls.Supported("linkat", Linkat), + 38: syscalls.Supported("renameat", Renameat), + 39: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), + 40: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), + 41: syscalls.Error("pivot_root", syserror.EPERM, "", nil), + 42: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), + 43: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), + 44: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), + 45: syscalls.Supported("truncate", Truncate), + 46: syscalls.Supported("ftruncate", 
Ftruncate), + 47: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), + 48: syscalls.Supported("faccessat", Faccessat), + 49: syscalls.Supported("chdir", Chdir), + 50: syscalls.Supported("fchdir", Fchdir), + 51: syscalls.Supported("chroot", Chroot), + 52: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil), + 53: syscalls.Supported("fchmodat", Fchmodat), + 54: syscalls.Supported("fchownat", Fchownat), + 55: syscalls.Supported("fchown", Fchown), + 56: syscalls.Supported("openat", Openat), + 57: syscalls.Supported("close", Close), + 58: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), + 59: syscalls.Supported("pipe2", Pipe2), + 60: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations + 61: syscalls.Supported("getdents64", Getdents64), + 62: syscalls.Supported("lseek", Lseek), + 63: syscalls.Supported("read", Read), + 64: syscalls.Supported("write", Write), + 65: syscalls.Supported("readv", Readv), + 66: syscalls.Supported("writev", Writev), + 67: syscalls.Supported("pread64", Pread64), + 68: syscalls.Supported("pwrite64", Pwrite64), + 69: syscalls.Supported("preadv", Preadv), + 70: syscalls.Supported("pwritev", Pwritev), + 71: syscalls.Supported("sendfile", Sendfile), + 72: syscalls.Supported("pselect", Pselect), + 73: syscalls.Supported("ppoll", Ppoll), + 74: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), + 75: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 76: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 77: syscalls.Supported("tee", Tee), + 78: syscalls.Supported("readlinkat", Readlinkat), + 79: syscalls.Supported("fstatat", Fstatat), + 80: syscalls.Supported("fstat", Fstat), + 81: 
syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil), + 82: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), + 83: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil), + 84: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), + 85: syscalls.Supported("timerfd_create", TimerfdCreate), + 86: syscalls.Supported("timerfd_settime", TimerfdSettime), + 87: syscalls.Supported("timerfd_gettime", TimerfdGettime), + 88: syscalls.Supported("utimensat", Utimensat), + 89: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), + 90: syscalls.Supported("capget", Capget), + 91: syscalls.Supported("capset", Capset), + 92: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), + 93: syscalls.Supported("exit", Exit), + 94: syscalls.Supported("exit_group", ExitGroup), + 95: syscalls.Supported("waitid", Waitid), + 96: syscalls.Supported("set_tid_address", SetTidAddress), + 97: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. 
Network namespaces supported but must be empty.", nil), + 98: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), + 99: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 100: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 101: syscalls.Supported("nanosleep", Nanosleep), + 102: syscalls.Supported("getitimer", Getitimer), + 103: syscalls.Supported("setitimer", Setitimer), + 104: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), + 105: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), + 106: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), + 107: syscalls.Supported("timer_create", TimerCreate), + 108: syscalls.Supported("timer_gettime", TimerGettime), + 109: syscalls.Supported("timer_getoverrun", TimerGetoverrun), + 110: syscalls.Supported("timer_settime", TimerSettime), + 111: syscalls.Supported("timer_delete", TimerDelete), + 112: syscalls.Supported("clock_settime", ClockSettime), + 113: syscalls.Supported("clock_gettime", ClockGettime), + 114: syscalls.Supported("clock_getres", ClockGetres), + 115: syscalls.Supported("clock_nanosleep", ClockNanosleep), + 116: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil), + 117: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil), + 118: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), + 119: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil), + 120: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), + 121: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil), + 122: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), + 123: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub 
implementation.", nil), + 124: syscalls.Supported("sched_yield", SchedYield), + 125: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), + 126: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), + 127: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), + 128: syscalls.Supported("restart_syscall", RestartSyscall), + 129: syscalls.Supported("kill", Kill), + 130: syscalls.Supported("tkill", Tkill), + 131: syscalls.Supported("tgkill", Tgkill), + 132: syscalls.Supported("sigaltstack", Sigaltstack), + 133: syscalls.Supported("rt_sigsuspend", RtSigsuspend), + 134: syscalls.Supported("rt_sigaction", RtSigaction), + 135: syscalls.Supported("rt_sigprocmask", RtSigprocmask), + 136: syscalls.Supported("rt_sigpending", RtSigpending), + 137: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait), + 138: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo), + 139: syscalls.Supported("rt_sigreturn", RtSigreturn), + 140: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), + 141: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), + 142: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), + 143: syscalls.Supported("setregid", Setregid), + 144: syscalls.Supported("setgid", Setgid), + 145: syscalls.Supported("setreuid", Setreuid), + 146: syscalls.Supported("setuid", Setuid), + 147: syscalls.Supported("setresuid", Setresuid), + 148: syscalls.Supported("getresuid", Getresuid), + 149: syscalls.Supported("setresgid", Setresgid), + 150: syscalls.Supported("getresgid", Getresgid), + 151: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 152: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 153: syscalls.Supported("times", Times), + 154: 
syscalls.Supported("setpgid", Setpgid), + 155: syscalls.Supported("getpgid", Getpgid), + 156: syscalls.Supported("getsid", Getsid), + 157: syscalls.Supported("setsid", Setsid), + 158: syscalls.Supported("getgroups", Getgroups), + 159: syscalls.Supported("setgroups", Setgroups), + 160: syscalls.Supported("uname", Uname), + 161: syscalls.Supported("sethostname", Sethostname), + 162: syscalls.Supported("setdomainname", Setdomainname), + 163: syscalls.Supported("getrlimit", Getrlimit), + 164: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil), + 165: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil), + 166: syscalls.Supported("umask", Umask), + 167: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), + 168: syscalls.Supported("getcpu", Getcpu), + 169: syscalls.Supported("gettimeofday", Gettimeofday), + 170: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), + 171: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), + 172: syscalls.Supported("getpid", Getpid), + 173: syscalls.Supported("getppid", Getppid), + 174: syscalls.Supported("getuid", Getuid), + 175: syscalls.Supported("geteuid", Geteuid), + 176: syscalls.Supported("getgid", Getgid), + 177: syscalls.Supported("getegid", Getegid), + 178: syscalls.Supported("gettid", Gettid), + 179: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), + 180: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 181: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 182: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 
183: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 184: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 185: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 186: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 187: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 190: syscalls.Supported("semget", Semget), + 191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), + 192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), + 193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), + 194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), + 195: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), + 196: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil), + 197: syscalls.Supported("shmdt", Shmdt), + 198: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. 
Limited support for SOCK_RAW.", nil), + 199: syscalls.Supported("socketpair", SocketPair), + 200: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil), + 201: syscalls.Supported("listen", Listen), + 202: syscalls.Supported("accept", Accept), + 203: syscalls.Supported("connect", Connect), + 204: syscalls.Supported("getsockname", GetSockName), + 205: syscalls.Supported("getpeername", GetPeerName), + 206: syscalls.Supported("sendto", SendTo), + 207: syscalls.Supported("recvfrom", RecvFrom), + 208: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil), + 209: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil), + 210: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil), + 211: syscalls.Supported("sendmsg", SendMsg), + 212: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil), + 213: syscalls.Supported("readahead", Readahead), + 214: syscalls.Supported("brk", Brk), + 215: syscalls.Supported("munmap", Munmap), + 216: syscalls.Supported("mremap", Mremap), + 217: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), + 218: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), + 219: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), + 220: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), + 221: syscalls.Supported("execve", Execve), + 222: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. 
Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil), + 223: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), + 224: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), + 225: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), + 226: syscalls.Supported("mprotect", Mprotect), + 227: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil), + 228: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 229: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 230: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 231: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 232: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil), + 233: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), + 234: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), + 235: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. 
Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), + 236: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), + 237: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), + 238: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), + 239: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) + 240: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), + 241: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), + 242: syscalls.Supported("accept4", Accept4), + 243: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), + 260: syscalls.Supported("wait4", Wait4), + 261: syscalls.Supported("prlimit64", Prlimit64), + 262: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 263: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 264: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 265: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 266: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), + 267: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), + 268: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) + 269: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), + 270: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 271: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", 
[]string{"gvisor.dev/issue/158"}), + 272: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), + 273: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), + 274: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 275: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 276: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) + 277: syscalls.Supported("seccomp", Seccomp), + 278: syscalls.Supported("getrandom", GetRandom), + 279: syscalls.Supported("memfd_create", MemfdCreate), + 280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), + 281: syscalls.Supported("execveat", Execveat), + 282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) + 283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) + 284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + + // Syscalls after 284 are "backports" from versions of Linux after 4.4. 
+ 285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), + 286: syscalls.Supported("preadv2", Preadv2), + 287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), + 288: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), + 289: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), + 290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), + 291: syscalls.Supported("statx", Statx), + 292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), + 293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), + + // Linux skips ahead to syscall 424 to sync numbers between arches. + 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), + 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), + 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), + 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), + 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), + 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), + 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), + 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), + 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), + 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), + 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), + 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), + }, + Emulate: map[usermem.Addr]uintptr{}, + Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { + t.Kernel().EmitUnimplementedEvent(t) + return 0, syserror.ENOSYS + }, +} + +func init() { + kernel.RegisterSyscallTable(AMD64) + kernel.RegisterSyscallTable(ARM64) +} diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go 
deleted file mode 100644 index 79066ad2a..000000000 --- a/pkg/sentry/syscalls/linux/linux64_amd64.go +++ /dev/null @@ -1,406 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package linux - -import ( - "gvisor.dev/gvisor/pkg/abi" - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/syscalls" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall -// numbers from Linux 4.4. -var AMD64 = &kernel.SyscallTable{ - OS: abi.Linux, - Arch: arch.AMD64, - Version: kernel.Version{ - // Version 4.4 is chosen as a stable, longterm version of Linux, which - // guides the interface provided by this syscall table. The build - // version is that for a clean build with default kernel config, at 5 - // minutes after v4.4 was tagged. 
- Sysname: LinuxSysname, - Release: LinuxRelease, - Version: LinuxVersion, - }, - AuditNumber: linux.AUDIT_ARCH_X86_64, - Table: map[uintptr]kernel.Syscall{ - 0: syscalls.Supported("read", Read), - 1: syscalls.Supported("write", Write), - 2: syscalls.PartiallySupported("open", Open, "Options O_DIRECT, O_NOATIME, O_PATH, O_TMPFILE, O_SYNC are not supported.", nil), - 3: syscalls.Supported("close", Close), - 4: syscalls.Supported("stat", Stat), - 5: syscalls.Supported("fstat", Fstat), - 6: syscalls.Supported("lstat", Lstat), - 7: syscalls.Supported("poll", Poll), - 8: syscalls.Supported("lseek", Lseek), - 9: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil), - 10: syscalls.Supported("mprotect", Mprotect), - 11: syscalls.Supported("munmap", Munmap), - 12: syscalls.Supported("brk", Brk), - 13: syscalls.Supported("rt_sigaction", RtSigaction), - 14: syscalls.Supported("rt_sigprocmask", RtSigprocmask), - 15: syscalls.Supported("rt_sigreturn", RtSigreturn), - 16: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil), - 17: syscalls.Supported("pread64", Pread64), - 18: syscalls.Supported("pwrite64", Pwrite64), - 19: syscalls.Supported("readv", Readv), - 20: syscalls.Supported("writev", Writev), - 21: syscalls.Supported("access", Access), - 22: syscalls.Supported("pipe", Pipe), - 23: syscalls.Supported("select", Select), - 24: syscalls.Supported("sched_yield", SchedYield), - 25: syscalls.Supported("mremap", Mremap), - 26: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil), - 27: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. 
Reports all mapped pages are resident.", nil), - 28: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), - 29: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), - 30: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil), - 31: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), - 32: syscalls.Supported("dup", Dup), - 33: syscalls.Supported("dup2", Dup2), - 34: syscalls.Supported("pause", Pause), - 35: syscalls.Supported("nanosleep", Nanosleep), - 36: syscalls.Supported("getitimer", Getitimer), - 37: syscalls.Supported("alarm", Alarm), - 38: syscalls.Supported("setitimer", Setitimer), - 39: syscalls.Supported("getpid", Getpid), - 40: syscalls.Supported("sendfile", Sendfile), - 41: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil), - 42: syscalls.Supported("connect", Connect), - 43: syscalls.Supported("accept", Accept), - 44: syscalls.Supported("sendto", SendTo), - 45: syscalls.Supported("recvfrom", RecvFrom), - 46: syscalls.Supported("sendmsg", SendMsg), - 47: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil), - 48: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil), - 49: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil), - 50: syscalls.Supported("listen", Listen), - 51: syscalls.Supported("getsockname", GetSockName), - 52: syscalls.Supported("getpeername", GetPeerName), - 53: syscalls.Supported("socketpair", SocketPair), - 54: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil), - 55: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options 
are supported.", nil), - 56: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), - 57: syscalls.Supported("fork", Fork), - 58: syscalls.Supported("vfork", Vfork), - 59: syscalls.Supported("execve", Execve), - 60: syscalls.Supported("exit", Exit), - 61: syscalls.Supported("wait4", Wait4), - 62: syscalls.Supported("kill", Kill), - 63: syscalls.Supported("uname", Uname), - 64: syscalls.Supported("semget", Semget), - 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), - 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), - 67: syscalls.Supported("shmdt", Shmdt), - 68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 70: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 71: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), - 73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), - 74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), - 75: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil), - 76: syscalls.Supported("truncate", Truncate), - 77: syscalls.Supported("ftruncate", Ftruncate), - 78: syscalls.Supported("getdents", Getdents), - 79: syscalls.Supported("getcwd", Getcwd), - 80: syscalls.Supported("chdir", Chdir), - 81: syscalls.Supported("fchdir", Fchdir), - 82: syscalls.Supported("rename", Rename), - 83: 
syscalls.Supported("mkdir", Mkdir), - 84: syscalls.Supported("rmdir", Rmdir), - 85: syscalls.Supported("creat", Creat), - 86: syscalls.Supported("link", Link), - 87: syscalls.Supported("unlink", Unlink), - 88: syscalls.Supported("symlink", Symlink), - 89: syscalls.Supported("readlink", Readlink), - 90: syscalls.Supported("chmod", Chmod), - 91: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil), - 92: syscalls.Supported("chown", Chown), - 93: syscalls.Supported("fchown", Fchown), - 94: syscalls.Supported("lchown", Lchown), - 95: syscalls.Supported("umask", Umask), - 96: syscalls.Supported("gettimeofday", Gettimeofday), - 97: syscalls.Supported("getrlimit", Getrlimit), - 98: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil), - 99: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), - 100: syscalls.Supported("times", Times), - 101: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil), - 102: syscalls.Supported("getuid", Getuid), - 103: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil), - 104: syscalls.Supported("getgid", Getgid), - 105: syscalls.Supported("setuid", Setuid), - 106: syscalls.Supported("setgid", Setgid), - 107: syscalls.Supported("geteuid", Geteuid), - 108: syscalls.Supported("getegid", Getegid), - 109: syscalls.Supported("setpgid", Setpgid), - 110: syscalls.Supported("getppid", Getppid), - 111: syscalls.Supported("getpgrp", Getpgrp), - 112: syscalls.Supported("setsid", Setsid), - 113: syscalls.Supported("setreuid", Setreuid), - 114: syscalls.Supported("setregid", Setregid), - 115: syscalls.Supported("getgroups", Getgroups), - 116: 
syscalls.Supported("setgroups", Setgroups), - 117: syscalls.Supported("setresuid", Setresuid), - 118: syscalls.Supported("getresuid", Getresuid), - 119: syscalls.Supported("setresgid", Setresgid), - 120: syscalls.Supported("getresgid", Getresgid), - 121: syscalls.Supported("getpgid", Getpgid), - 122: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 123: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 124: syscalls.Supported("getsid", Getsid), - 125: syscalls.Supported("capget", Capget), - 126: syscalls.Supported("capset", Capset), - 127: syscalls.Supported("rt_sigpending", RtSigpending), - 128: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait), - 129: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo), - 130: syscalls.Supported("rt_sigsuspend", RtSigsuspend), - 131: syscalls.Supported("sigaltstack", Sigaltstack), - 132: syscalls.Supported("utime", Utime), - 133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. 
Only regular file and FIFO creation are supported.", nil), - 134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil), - 135: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), - 136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil), - 137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), - 138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), - 139: syscalls.ErrorWithEvent("sysfs", syserror.ENOSYS, "", []string{"gvisor.dev/issue/165"}), - 140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), - 141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), - 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), - 143: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil), - 144: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil), - 145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), - 146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), - 147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), - 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), - 149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. 
The sandbox lacks appropriate permissions.", nil), - 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), - 154: syscalls.Error("modify_ldt", syserror.EPERM, "", nil), - 155: syscalls.Error("pivot_root", syserror.EPERM, "", nil), - 156: syscalls.Error("sysctl", syserror.EPERM, "Deprecated. Use /proc/sys instead.", nil), - 157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), - 158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil), - 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), - 160: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil), - 161: syscalls.Supported("chroot", Chroot), - 162: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil), - 163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), - 164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), - 165: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), - 166: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), - 167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), - 168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), - 169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), - 170: syscalls.Supported("sethostname", Sethostname), - 171: syscalls.Supported("setdomainname", Setdomainname), - 172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil), - 173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil), - 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil), - 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), - 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), - 177: syscalls.Error("get_kernel_syms", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), - 178: 
syscalls.Error("query_module", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), - 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations - 180: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), - 181: syscalls.Error("getpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), - 182: syscalls.Error("putpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), - 183: syscalls.Error("afs_syscall", syserror.ENOSYS, "Not implemented in Linux.", nil), - 184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil), - 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil), - 186: syscalls.Supported("gettid", Gettid), - 187: syscalls.Supported("readahead", Readahead), - 188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), - 189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil), - 190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil), - 191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), - 192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), - 193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), - 194: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil), - 195: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil), - 196: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil), - 197: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil), - 198: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil), - 199: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil), - 200: syscalls.Supported("tkill", Tkill), - 
201: syscalls.Supported("time", Time), - 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), - 203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), - 204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), - 205: syscalls.Error("set_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), - 206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. 
User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 211: syscalls.Error("get_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), - 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), - 213: syscalls.Supported("epoll_create", EpollCreate), - 214: syscalls.ErrorWithEvent("epoll_ctl_old", syserror.ENOSYS, "Deprecated.", nil), - 215: syscalls.ErrorWithEvent("epoll_wait_old", syserror.ENOSYS, "Deprecated.", nil), - 216: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), - 217: syscalls.Supported("getdents64", Getdents64), - 218: syscalls.Supported("set_tid_address", SetTidAddress), - 219: syscalls.Supported("restart_syscall", RestartSyscall), - 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), - 221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), - 222: syscalls.Supported("timer_create", TimerCreate), - 223: syscalls.Supported("timer_settime", TimerSettime), - 224: syscalls.Supported("timer_gettime", TimerGettime), - 225: syscalls.Supported("timer_getoverrun", TimerGetoverrun), - 226: syscalls.Supported("timer_delete", TimerDelete), - 227: syscalls.Supported("clock_settime", ClockSettime), - 228: syscalls.Supported("clock_gettime", ClockGettime), - 229: syscalls.Supported("clock_getres", ClockGetres), - 230: syscalls.Supported("clock_nanosleep", ClockNanosleep), - 231: syscalls.Supported("exit_group", ExitGroup), - 232: syscalls.Supported("epoll_wait", EpollWait), - 233: syscalls.Supported("epoll_ctl", EpollCtl), - 234: syscalls.Supported("tgkill", Tgkill), - 235: syscalls.Supported("utimes", Utimes), - 236: syscalls.Error("vserver", syserror.ENOSYS, "Not implemented by Linux", nil), - 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. 
Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), - 238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), - 239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), - 240: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 241: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 242: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 243: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 244: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), - 247: syscalls.Supported("waitid", Waitid), - 248: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), - 249: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), - 250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), - 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) - 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) - 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil), - 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), - 255: 
syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), - 256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), - 257: syscalls.Supported("openat", Openat), - 258: syscalls.Supported("mkdirat", Mkdirat), - 259: syscalls.Supported("mknodat", Mknodat), - 260: syscalls.Supported("fchownat", Fchownat), - 261: syscalls.Supported("futimesat", Futimesat), - 262: syscalls.Supported("fstatat", Fstatat), - 263: syscalls.Supported("unlinkat", Unlinkat), - 264: syscalls.Supported("renameat", Renameat), - 265: syscalls.Supported("linkat", Linkat), - 266: syscalls.Supported("symlinkat", Symlinkat), - 267: syscalls.Supported("readlinkat", Readlinkat), - 268: syscalls.Supported("fchmodat", Fchmodat), - 269: syscalls.Supported("faccessat", Faccessat), - 270: syscalls.Supported("pselect", Pselect), - 271: syscalls.Supported("ppoll", Ppoll), - 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. 
Network namespaces supported but must be empty.", nil), - 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), - 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), - 275: syscalls.Supported("splice", Splice), - 276: syscalls.Supported("tee", Tee), - 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), - 278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) - 280: syscalls.Supported("utimensat", Utimensat), - 281: syscalls.Supported("epoll_pwait", EpollPwait), - 282: syscalls.PartiallySupported("signalfd", Signalfd, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), - 283: syscalls.Supported("timerfd_create", TimerfdCreate), - 284: syscalls.Supported("eventfd", Eventfd), - 285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), - 286: syscalls.Supported("timerfd_settime", TimerfdSettime), - 287: syscalls.Supported("timerfd_gettime", TimerfdGettime), - 288: syscalls.Supported("accept4", Accept4), - 289: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), - 290: syscalls.Supported("eventfd2", Eventfd2), - 291: syscalls.Supported("epoll_create1", EpollCreate1), - 292: syscalls.Supported("dup3", Dup3), - 293: syscalls.Supported("pipe2", Pipe2), - 294: syscalls.Supported("inotify_init1", InotifyInit1), - 295: syscalls.Supported("preadv", Preadv), - 296: syscalls.Supported("pwritev", Pwritev), - 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), - 298: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), - 299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are 
supported.", nil), - 300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 302: syscalls.Supported("prlimit64", Prlimit64), - 303: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), - 304: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), - 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), - 306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), - 307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), - 308: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) - 309: syscalls.Supported("getcpu", Getcpu), - 310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), - 311: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), - 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), - 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), - 314: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 315: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 316: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) - 317: syscalls.Supported("seccomp", Seccomp), - 318: syscalls.Supported("getrandom", GetRandom), - 319: syscalls.Supported("memfd_create", MemfdCreate), - 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil), - 321: 
syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), - 322: syscalls.Supported("execveat", Execveat), - 323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) - 324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) - 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - - // Syscalls implemented after 325 are "backports" from versions - // of Linux after 4.4. - 326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), - 327: syscalls.Supported("preadv2", Preadv2), - 328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), - 329: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), - 330: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), - 331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), - 332: syscalls.Supported("statx", Statx), - 333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), - 334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), - - // Linux skips ahead to syscall 424 to sync numbers between arches. 
- 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), - 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), - 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), - 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), - 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), - 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), - 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), - 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), - 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), - 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), - 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), - 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), - }, - - Emulate: map[usermem.Addr]uintptr{ - 0xffffffffff600000: 96, // vsyscall gettimeofday(2) - 0xffffffffff600400: 201, // vsyscall time(2) - 0xffffffffff600800: 309, // vsyscall getcpu(2) - }, - Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { - t.Kernel().EmitUnimplementedEvent(t) - return 0, syserror.ENOSYS - }, -} diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go deleted file mode 100644 index 7421619de..000000000 --- a/pkg/sentry/syscalls/linux/linux64_arm64.go +++ /dev/null @@ -1,340 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package linux - -import ( - "gvisor.dev/gvisor/pkg/abi" - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/syscalls" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -// ARM64 is a table of Linux arm64 syscall API with the corresponding syscall -// numbers from Linux 4.4. -var ARM64 = &kernel.SyscallTable{ - OS: abi.Linux, - Arch: arch.ARM64, - Version: kernel.Version{ - Sysname: LinuxSysname, - Release: LinuxRelease, - Version: LinuxVersion, - }, - AuditNumber: linux.AUDIT_ARCH_AARCH64, - Table: map[uintptr]kernel.Syscall{ - 0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. 
User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 5: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), - 6: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil), - 7: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil), - 8: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), - 9: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), - 10: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), - 11: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil), - 12: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil), - 13: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil), - 14: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil), - 15: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil), - 16: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil), - 17: syscalls.Supported("getcwd", Getcwd), - 18: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), - 19: syscalls.Supported("eventfd2", Eventfd2), - 20: syscalls.Supported("epoll_create1", EpollCreate1), - 21: syscalls.Supported("epoll_ctl", EpollCtl), - 22: syscalls.Supported("epoll_pwait", EpollPwait), - 23: syscalls.Supported("dup", Dup), - 24: syscalls.Supported("dup3", Dup3), - 25: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), - 26: syscalls.Supported("inotify_init1", InotifyInit1), - 27: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), - 28: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside 
the sandbox.", nil), - 29: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil), - 30: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) - 31: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) - 32: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), - 33: syscalls.Supported("mknodat", Mknodat), - 34: syscalls.Supported("mkdirat", Mkdirat), - 35: syscalls.Supported("unlinkat", Unlinkat), - 36: syscalls.Supported("symlinkat", Symlinkat), - 37: syscalls.Supported("linkat", Linkat), - 38: syscalls.Supported("renameat", Renameat), - 39: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), - 40: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), - 41: syscalls.Error("pivot_root", syserror.EPERM, "", nil), - 42: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), - 43: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), - 44: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), - 45: syscalls.Supported("truncate", Truncate), - 46: syscalls.Supported("ftruncate", Ftruncate), - 47: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), - 48: syscalls.Supported("faccessat", Faccessat), - 49: syscalls.Supported("chdir", Chdir), - 50: syscalls.Supported("fchdir", Fchdir), - 51: syscalls.Supported("chroot", Chroot), - 52: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil), - 53: syscalls.Supported("fchmodat", Fchmodat), - 54: syscalls.Supported("fchownat", Fchownat), - 55: syscalls.Supported("fchown", Fchown), - 56: 
syscalls.Supported("openat", Openat), - 57: syscalls.Supported("close", Close), - 58: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), - 59: syscalls.Supported("pipe2", Pipe2), - 60: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations - 61: syscalls.Supported("getdents64", Getdents64), - 62: syscalls.Supported("lseek", Lseek), - 63: syscalls.Supported("read", Read), - 64: syscalls.Supported("write", Write), - 65: syscalls.Supported("readv", Readv), - 66: syscalls.Supported("writev", Writev), - 67: syscalls.Supported("pread64", Pread64), - 68: syscalls.Supported("pwrite64", Pwrite64), - 69: syscalls.Supported("preadv", Preadv), - 70: syscalls.Supported("pwritev", Pwritev), - 71: syscalls.Supported("sendfile", Sendfile), - 72: syscalls.Supported("pselect", Pselect), - 73: syscalls.Supported("ppoll", Ppoll), - 74: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), - 75: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 76: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 77: syscalls.Supported("tee", Tee), - 78: syscalls.Supported("readlinkat", Readlinkat), - 79: syscalls.Supported("fstatat", Fstatat), - 80: syscalls.Supported("fstat", Fstat), - 81: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil), - 82: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), - 83: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil), - 84: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), - 85: syscalls.Supported("timerfd_create", TimerfdCreate), - 86: syscalls.Supported("timerfd_settime", 
TimerfdSettime), - 87: syscalls.Supported("timerfd_gettime", TimerfdGettime), - 88: syscalls.Supported("utimensat", Utimensat), - 89: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), - 90: syscalls.Supported("capget", Capget), - 91: syscalls.Supported("capset", Capset), - 92: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), - 93: syscalls.Supported("exit", Exit), - 94: syscalls.Supported("exit_group", ExitGroup), - 95: syscalls.Supported("waitid", Waitid), - 96: syscalls.Supported("set_tid_address", SetTidAddress), - 97: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), - 98: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), - 99: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), - 100: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), - 101: syscalls.Supported("nanosleep", Nanosleep), - 102: syscalls.Supported("getitimer", Getitimer), - 103: syscalls.Supported("setitimer", Setitimer), - 104: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), - 105: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), - 106: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), - 107: syscalls.Supported("timer_create", TimerCreate), - 108: syscalls.Supported("timer_gettime", TimerGettime), - 109: syscalls.Supported("timer_getoverrun", TimerGetoverrun), - 110: syscalls.Supported("timer_settime", TimerSettime), - 111: syscalls.Supported("timer_delete", TimerDelete), - 112: syscalls.Supported("clock_settime", ClockSettime), - 113: syscalls.Supported("clock_gettime", ClockGettime), - 114: syscalls.Supported("clock_getres", ClockGetres), - 115: syscalls.Supported("clock_nanosleep", ClockNanosleep), - 116: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil), - 117: 
syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil), - 118: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), - 119: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil), - 120: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), - 121: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil), - 122: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), - 123: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), - 124: syscalls.Supported("sched_yield", SchedYield), - 125: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), - 126: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), - 127: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), - 128: syscalls.Supported("restart_syscall", RestartSyscall), - 129: syscalls.Supported("kill", Kill), - 130: syscalls.Supported("tkill", Tkill), - 131: syscalls.Supported("tgkill", Tgkill), - 132: syscalls.Supported("sigaltstack", Sigaltstack), - 133: syscalls.Supported("rt_sigsuspend", RtSigsuspend), - 134: syscalls.Supported("rt_sigaction", RtSigaction), - 135: syscalls.Supported("rt_sigprocmask", RtSigprocmask), - 136: syscalls.Supported("rt_sigpending", RtSigpending), - 137: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait), - 138: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo), - 139: syscalls.Supported("rt_sigreturn", RtSigreturn), - 140: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), - 141: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), - 142: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), - 143: 
syscalls.Supported("setregid", Setregid), - 144: syscalls.Supported("setgid", Setgid), - 145: syscalls.Supported("setreuid", Setreuid), - 146: syscalls.Supported("setuid", Setuid), - 147: syscalls.Supported("setresuid", Setresuid), - 148: syscalls.Supported("getresuid", Getresuid), - 149: syscalls.Supported("setresgid", Setresgid), - 150: syscalls.Supported("getresgid", Getresgid), - 151: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 152: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 153: syscalls.Supported("times", Times), - 154: syscalls.Supported("setpgid", Setpgid), - 155: syscalls.Supported("getpgid", Getpgid), - 156: syscalls.Supported("getsid", Getsid), - 157: syscalls.Supported("setsid", Setsid), - 158: syscalls.Supported("getgroups", Getgroups), - 159: syscalls.Supported("setgroups", Setgroups), - 160: syscalls.Supported("uname", Uname), - 161: syscalls.Supported("sethostname", Sethostname), - 162: syscalls.Supported("setdomainname", Setdomainname), - 163: syscalls.Supported("getrlimit", Getrlimit), - 164: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil), - 165: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. 
Fields ru_utime and ru_stime have low precision.", nil), - 166: syscalls.Supported("umask", Umask), - 167: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), - 168: syscalls.Supported("getcpu", Getcpu), - 169: syscalls.Supported("gettimeofday", Gettimeofday), - 170: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), - 171: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), - 172: syscalls.Supported("getpid", Getpid), - 173: syscalls.Supported("getppid", Getppid), - 174: syscalls.Supported("getuid", Getuid), - 175: syscalls.Supported("geteuid", Geteuid), - 176: syscalls.Supported("getgid", Getgid), - 177: syscalls.Supported("getegid", Getegid), - 178: syscalls.Supported("gettid", Gettid), - 179: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), - 180: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 181: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 182: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 183: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 184: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 185: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 186: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 187: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 189: 
syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 190: syscalls.Supported("semget", Semget), - 191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), - 192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), - 193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), - 194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), - 195: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), - 196: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil), - 197: syscalls.Supported("shmdt", Shmdt), - 198: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil), - 199: syscalls.Supported("socketpair", SocketPair), - 200: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil), - 201: syscalls.Supported("listen", Listen), - 202: syscalls.Supported("accept", Accept), - 203: syscalls.Supported("connect", Connect), - 204: syscalls.Supported("getsockname", GetSockName), - 205: syscalls.Supported("getpeername", GetPeerName), - 206: syscalls.Supported("sendto", SendTo), - 207: syscalls.Supported("recvfrom", RecvFrom), - 208: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil), - 209: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil), - 210: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil), - 211: syscalls.Supported("sendmsg", SendMsg), - 212: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil), - 213: 
syscalls.Supported("readahead", Readahead), - 214: syscalls.Supported("brk", Brk), - 215: syscalls.Supported("munmap", Munmap), - 216: syscalls.Supported("mremap", Mremap), - 217: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), - 218: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), - 219: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), - 220: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), - 221: syscalls.Supported("execve", Execve), - 222: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil), - 223: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), - 224: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), - 225: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), - 226: syscalls.Supported("mprotect", Mprotect), - 227: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil), - 228: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 229: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 230: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 231: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 232: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. 
Reports all mapped pages are resident.", nil), - 233: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), - 234: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), - 235: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), - 236: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), - 237: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), - 238: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), - 239: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) - 240: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), - 241: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), - 242: syscalls.Supported("accept4", Accept4), - 243: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), - 260: syscalls.Supported("wait4", Wait4), - 261: syscalls.Supported("prlimit64", Prlimit64), - 262: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 263: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 264: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), - 265: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), - 266: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), - 267: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), - 268: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem 
support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) - 269: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), - 270: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), - 271: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), - 272: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), - 273: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), - 274: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 275: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 276: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) - 277: syscalls.Supported("seccomp", Seccomp), - 278: syscalls.Supported("getrandom", GetRandom), - 279: syscalls.Supported("memfd_create", MemfdCreate), - 280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), - 281: syscalls.Supported("execveat", Execveat), - 282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) - 283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) - 284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - - // Syscalls after 284 are "backports" from versions of Linux after 4.4. 
- 285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), - 286: syscalls.Supported("preadv2", Preadv2), - 287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), - 288: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), - 289: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), - 290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), - 291: syscalls.Supported("statx", Statx), - 292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), - 293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), - - // Linux skips ahead to syscall 424 to sync numbers between arches. - 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), - 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), - 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), - 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), - 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), - 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), - 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), - 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), - 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), - 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), - 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), - 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), - }, - Emulate: map[usermem.Addr]uintptr{}, - - Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { - t.Kernel().EmitUnimplementedEvent(t) - return 0, syserror.ENOSYS - }, -} diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index ffca627d4..4c7b8f819 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ 
b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -13,9 +13,6 @@ go_library( "fscontext.go", "getdents.go", "ioctl.go", - "linux64.go", - "linux64_override_amd64.go", - "linux64_override_arm64.go", "mmap.go", "path.go", "pipe.go", @@ -28,6 +25,7 @@ go_library( "stat_arm64.go", "sync.go", "timerfd.go", + "vfs2.go", "xattr.go", ], marshal = True, diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64.go b/pkg/sentry/syscalls/linux/vfs2/linux64.go deleted file mode 100644 index 19ee36081..000000000 --- a/pkg/sentry/syscalls/linux/vfs2/linux64.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package vfs2 provides syscall implementations that use VFS2. -package vfs2 diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go deleted file mode 100644 index 47c5d18e7..000000000 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 - -package vfs2 - -import ( - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/syscalls" -) - -// Override syscall table to add syscalls implementations from this package. -func Override(table map[uintptr]kernel.Syscall) { - table[0] = syscalls.Supported("read", Read) - table[1] = syscalls.Supported("write", Write) - table[2] = syscalls.Supported("open", Open) - table[3] = syscalls.Supported("close", Close) - table[4] = syscalls.Supported("stat", Stat) - table[5] = syscalls.Supported("fstat", Fstat) - table[6] = syscalls.Supported("lstat", Lstat) - table[7] = syscalls.Supported("poll", Poll) - table[8] = syscalls.Supported("lseek", Lseek) - table[9] = syscalls.Supported("mmap", Mmap) - table[16] = syscalls.Supported("ioctl", Ioctl) - table[17] = syscalls.Supported("pread64", Pread64) - table[18] = syscalls.Supported("pwrite64", Pwrite64) - table[19] = syscalls.Supported("readv", Readv) - table[20] = syscalls.Supported("writev", Writev) - table[21] = syscalls.Supported("access", Access) - table[22] = syscalls.Supported("pipe", Pipe) - table[23] = syscalls.Supported("select", Select) - table[32] = syscalls.Supported("dup", Dup) - table[33] = syscalls.Supported("dup2", Dup2) - delete(table, 40) // sendfile - table[41] = syscalls.Supported("socket", Socket) - table[42] = syscalls.Supported("connect", Connect) - table[43] = syscalls.Supported("accept", Accept) - table[44] = syscalls.Supported("sendto", SendTo) - table[45] = syscalls.Supported("recvfrom", RecvFrom) - table[46] = syscalls.Supported("sendmsg", 
SendMsg) - table[47] = syscalls.Supported("recvmsg", RecvMsg) - table[48] = syscalls.Supported("shutdown", Shutdown) - table[49] = syscalls.Supported("bind", Bind) - table[50] = syscalls.Supported("listen", Listen) - table[51] = syscalls.Supported("getsockname", GetSockName) - table[52] = syscalls.Supported("getpeername", GetPeerName) - table[53] = syscalls.Supported("socketpair", SocketPair) - table[54] = syscalls.Supported("setsockopt", SetSockOpt) - table[55] = syscalls.Supported("getsockopt", GetSockOpt) - table[59] = syscalls.Supported("execve", Execve) - table[72] = syscalls.Supported("fcntl", Fcntl) - delete(table, 73) // flock - table[74] = syscalls.Supported("fsync", Fsync) - table[75] = syscalls.Supported("fdatasync", Fdatasync) - table[76] = syscalls.Supported("truncate", Truncate) - table[77] = syscalls.Supported("ftruncate", Ftruncate) - table[78] = syscalls.Supported("getdents", Getdents) - table[79] = syscalls.Supported("getcwd", Getcwd) - table[80] = syscalls.Supported("chdir", Chdir) - table[81] = syscalls.Supported("fchdir", Fchdir) - table[82] = syscalls.Supported("rename", Rename) - table[83] = syscalls.Supported("mkdir", Mkdir) - table[84] = syscalls.Supported("rmdir", Rmdir) - table[85] = syscalls.Supported("creat", Creat) - table[86] = syscalls.Supported("link", Link) - table[87] = syscalls.Supported("unlink", Unlink) - table[88] = syscalls.Supported("symlink", Symlink) - table[89] = syscalls.Supported("readlink", Readlink) - table[90] = syscalls.Supported("chmod", Chmod) - table[91] = syscalls.Supported("fchmod", Fchmod) - table[92] = syscalls.Supported("chown", Chown) - table[93] = syscalls.Supported("fchown", Fchown) - table[94] = syscalls.Supported("lchown", Lchown) - table[132] = syscalls.Supported("utime", Utime) - table[133] = syscalls.Supported("mknod", Mknod) - table[137] = syscalls.Supported("statfs", Statfs) - table[138] = syscalls.Supported("fstatfs", Fstatfs) - table[161] = syscalls.Supported("chroot", Chroot) - table[162] = 
syscalls.Supported("sync", Sync) - delete(table, 165) // mount - delete(table, 166) // umount2 - delete(table, 187) // readahead - table[188] = syscalls.Supported("setxattr", Setxattr) - table[189] = syscalls.Supported("lsetxattr", Lsetxattr) - table[190] = syscalls.Supported("fsetxattr", Fsetxattr) - table[191] = syscalls.Supported("getxattr", Getxattr) - table[192] = syscalls.Supported("lgetxattr", Lgetxattr) - table[193] = syscalls.Supported("fgetxattr", Fgetxattr) - table[194] = syscalls.Supported("listxattr", Listxattr) - table[195] = syscalls.Supported("llistxattr", Llistxattr) - table[196] = syscalls.Supported("flistxattr", Flistxattr) - table[197] = syscalls.Supported("removexattr", Removexattr) - table[198] = syscalls.Supported("lremovexattr", Lremovexattr) - table[199] = syscalls.Supported("fremovexattr", Fremovexattr) - delete(table, 206) // io_setup - delete(table, 207) // io_destroy - delete(table, 208) // io_getevents - delete(table, 209) // io_submit - delete(table, 210) // io_cancel - table[213] = syscalls.Supported("epoll_create", EpollCreate) - table[217] = syscalls.Supported("getdents64", Getdents64) - delete(table, 221) // fdavise64 - table[232] = syscalls.Supported("epoll_wait", EpollWait) - table[233] = syscalls.Supported("epoll_ctl", EpollCtl) - table[235] = syscalls.Supported("utimes", Utimes) - delete(table, 253) // inotify_init - delete(table, 254) // inotify_add_watch - delete(table, 255) // inotify_rm_watch - table[257] = syscalls.Supported("openat", Openat) - table[258] = syscalls.Supported("mkdirat", Mkdirat) - table[259] = syscalls.Supported("mknodat", Mknodat) - table[260] = syscalls.Supported("fchownat", Fchownat) - table[261] = syscalls.Supported("futimens", Futimens) - table[262] = syscalls.Supported("newfstatat", Newfstatat) - table[263] = syscalls.Supported("unlinkat", Unlinkat) - table[264] = syscalls.Supported("renameat", Renameat) - table[265] = syscalls.Supported("linkat", Linkat) - table[266] = 
syscalls.Supported("symlinkat", Symlinkat) - table[267] = syscalls.Supported("readlinkat", Readlinkat) - table[268] = syscalls.Supported("fchmodat", Fchmodat) - table[269] = syscalls.Supported("faccessat", Faccessat) - table[270] = syscalls.Supported("pselect", Pselect) - table[271] = syscalls.Supported("ppoll", Ppoll) - delete(table, 275) // splice - delete(table, 276) // tee - table[277] = syscalls.Supported("sync_file_range", SyncFileRange) - table[280] = syscalls.Supported("utimensat", Utimensat) - table[281] = syscalls.Supported("epoll_pwait", EpollPwait) - delete(table, 282) // signalfd - table[283] = syscalls.Supported("timerfd_create", TimerfdCreate) - table[284] = syscalls.Supported("eventfd", Eventfd) - delete(table, 285) // fallocate - table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime) - table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime) - table[288] = syscalls.Supported("accept4", Accept4) - delete(table, 289) // signalfd4 - table[290] = syscalls.Supported("eventfd2", Eventfd2) - table[291] = syscalls.Supported("epoll_create1", EpollCreate1) - table[292] = syscalls.Supported("dup3", Dup3) - table[293] = syscalls.Supported("pipe2", Pipe2) - delete(table, 294) // inotify_init1 - table[295] = syscalls.Supported("preadv", Preadv) - table[296] = syscalls.Supported("pwritev", Pwritev) - table[299] = syscalls.Supported("recvmmsg", RecvMMsg) - table[306] = syscalls.Supported("syncfs", Syncfs) - table[307] = syscalls.Supported("sendmmsg", SendMMsg) - table[316] = syscalls.Supported("renameat2", Renameat2) - delete(table, 319) // memfd_create - table[322] = syscalls.Supported("execveat", Execveat) - table[327] = syscalls.Supported("preadv2", Preadv2) - table[328] = syscalls.Supported("pwritev2", Pwritev2) - table[332] = syscalls.Supported("statx", Statx) -} diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go deleted file mode 100644 index 
a6b367468..000000000 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package vfs2 - -import ( - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/syscalls" -) - -// Override syscall table to add syscalls implementations from this package. -func Override(table map[uintptr]kernel.Syscall) { - table[63] = syscalls.Supported("read", Read) -} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go new file mode 100644 index 000000000..f1b697844 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -0,0 +1,172 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package vfs2 provides syscall implementations that use VFS2. 
+package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/sentry/syscalls" + "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" +) + +// Override syscall table to add syscalls implementations from this package. +func Override() { + // Override AMD64. + s := linux.AMD64 + s.Table[0] = syscalls.Supported("read", Read) + s.Table[1] = syscalls.Supported("write", Write) + s.Table[2] = syscalls.Supported("open", Open) + s.Table[3] = syscalls.Supported("close", Close) + s.Table[4] = syscalls.Supported("stat", Stat) + s.Table[5] = syscalls.Supported("fstat", Fstat) + s.Table[6] = syscalls.Supported("lstat", Lstat) + s.Table[7] = syscalls.Supported("poll", Poll) + s.Table[8] = syscalls.Supported("lseek", Lseek) + s.Table[9] = syscalls.Supported("mmap", Mmap) + s.Table[16] = syscalls.Supported("ioctl", Ioctl) + s.Table[17] = syscalls.Supported("pread64", Pread64) + s.Table[18] = syscalls.Supported("pwrite64", Pwrite64) + s.Table[19] = syscalls.Supported("readv", Readv) + s.Table[20] = syscalls.Supported("writev", Writev) + s.Table[21] = syscalls.Supported("access", Access) + s.Table[22] = syscalls.Supported("pipe", Pipe) + s.Table[23] = syscalls.Supported("select", Select) + s.Table[32] = syscalls.Supported("dup", Dup) + s.Table[33] = syscalls.Supported("dup2", Dup2) + delete(s.Table, 40) // sendfile + s.Table[41] = syscalls.Supported("socket", Socket) + s.Table[42] = syscalls.Supported("connect", Connect) + s.Table[43] = syscalls.Supported("accept", Accept) + s.Table[44] = syscalls.Supported("sendto", SendTo) + s.Table[45] = syscalls.Supported("recvfrom", RecvFrom) + s.Table[46] = syscalls.Supported("sendmsg", SendMsg) + s.Table[47] = syscalls.Supported("recvmsg", RecvMsg) + s.Table[48] = syscalls.Supported("shutdown", Shutdown) + s.Table[49] = syscalls.Supported("bind", Bind) + s.Table[50] = syscalls.Supported("listen", Listen) + s.Table[51] = syscalls.Supported("getsockname", GetSockName) + s.Table[52] = syscalls.Supported("getpeername", GetPeerName) + s.Table[53] = 
syscalls.Supported("socketpair", SocketPair) + s.Table[54] = syscalls.Supported("setsockopt", SetSockOpt) + s.Table[55] = syscalls.Supported("getsockopt", GetSockOpt) + s.Table[59] = syscalls.Supported("execve", Execve) + s.Table[72] = syscalls.Supported("fcntl", Fcntl) + delete(s.Table, 73) // flock + s.Table[74] = syscalls.Supported("fsync", Fsync) + s.Table[75] = syscalls.Supported("fdatasync", Fdatasync) + s.Table[76] = syscalls.Supported("truncate", Truncate) + s.Table[77] = syscalls.Supported("ftruncate", Ftruncate) + s.Table[78] = syscalls.Supported("getdents", Getdents) + s.Table[79] = syscalls.Supported("getcwd", Getcwd) + s.Table[80] = syscalls.Supported("chdir", Chdir) + s.Table[81] = syscalls.Supported("fchdir", Fchdir) + s.Table[82] = syscalls.Supported("rename", Rename) + s.Table[83] = syscalls.Supported("mkdir", Mkdir) + s.Table[84] = syscalls.Supported("rmdir", Rmdir) + s.Table[85] = syscalls.Supported("creat", Creat) + s.Table[86] = syscalls.Supported("link", Link) + s.Table[87] = syscalls.Supported("unlink", Unlink) + s.Table[88] = syscalls.Supported("symlink", Symlink) + s.Table[89] = syscalls.Supported("readlink", Readlink) + s.Table[90] = syscalls.Supported("chmod", Chmod) + s.Table[91] = syscalls.Supported("fchmod", Fchmod) + s.Table[92] = syscalls.Supported("chown", Chown) + s.Table[93] = syscalls.Supported("fchown", Fchown) + s.Table[94] = syscalls.Supported("lchown", Lchown) + s.Table[132] = syscalls.Supported("utime", Utime) + s.Table[133] = syscalls.Supported("mknod", Mknod) + s.Table[137] = syscalls.Supported("statfs", Statfs) + s.Table[138] = syscalls.Supported("fstatfs", Fstatfs) + s.Table[161] = syscalls.Supported("chroot", Chroot) + s.Table[162] = syscalls.Supported("sync", Sync) + delete(s.Table, 165) // mount + delete(s.Table, 166) // umount2 + delete(s.Table, 187) // readahead + s.Table[188] = syscalls.Supported("setxattr", Setxattr) + s.Table[189] = syscalls.Supported("lsetxattr", Lsetxattr) + s.Table[190] = 
syscalls.Supported("fsetxattr", Fsetxattr) + s.Table[191] = syscalls.Supported("getxattr", Getxattr) + s.Table[192] = syscalls.Supported("lgetxattr", Lgetxattr) + s.Table[193] = syscalls.Supported("fgetxattr", Fgetxattr) + s.Table[194] = syscalls.Supported("listxattr", Listxattr) + s.Table[195] = syscalls.Supported("llistxattr", Llistxattr) + s.Table[196] = syscalls.Supported("flistxattr", Flistxattr) + s.Table[197] = syscalls.Supported("removexattr", Removexattr) + s.Table[198] = syscalls.Supported("lremovexattr", Lremovexattr) + s.Table[199] = syscalls.Supported("fremovexattr", Fremovexattr) + delete(s.Table, 206) // io_setup + delete(s.Table, 207) // io_destroy + delete(s.Table, 208) // io_getevents + delete(s.Table, 209) // io_submit + delete(s.Table, 210) // io_cancel + s.Table[213] = syscalls.Supported("epoll_create", EpollCreate) + s.Table[217] = syscalls.Supported("getdents64", Getdents64) + delete(s.Table, 221) // fdavise64 + s.Table[232] = syscalls.Supported("epoll_wait", EpollWait) + s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl) + s.Table[235] = syscalls.Supported("utimes", Utimes) + delete(s.Table, 253) // inotify_init + delete(s.Table, 254) // inotify_add_watch + delete(s.Table, 255) // inotify_rm_watch + s.Table[257] = syscalls.Supported("openat", Openat) + s.Table[258] = syscalls.Supported("mkdirat", Mkdirat) + s.Table[259] = syscalls.Supported("mknodat", Mknodat) + s.Table[260] = syscalls.Supported("fchownat", Fchownat) + s.Table[261] = syscalls.Supported("futimens", Futimens) + s.Table[262] = syscalls.Supported("newfstatat", Newfstatat) + s.Table[263] = syscalls.Supported("unlinkat", Unlinkat) + s.Table[264] = syscalls.Supported("renameat", Renameat) + s.Table[265] = syscalls.Supported("linkat", Linkat) + s.Table[266] = syscalls.Supported("symlinkat", Symlinkat) + s.Table[267] = syscalls.Supported("readlinkat", Readlinkat) + s.Table[268] = syscalls.Supported("fchmodat", Fchmodat) + s.Table[269] = syscalls.Supported("faccessat", 
Faccessat) + s.Table[270] = syscalls.Supported("pselect", Pselect) + s.Table[271] = syscalls.Supported("ppoll", Ppoll) + delete(s.Table, 275) // splice + delete(s.Table, 276) // tee + s.Table[277] = syscalls.Supported("sync_file_range", SyncFileRange) + s.Table[280] = syscalls.Supported("utimensat", Utimensat) + s.Table[281] = syscalls.Supported("epoll_pwait", EpollPwait) + delete(s.Table, 282) // signalfd + s.Table[283] = syscalls.Supported("timerfd_create", TimerfdCreate) + s.Table[284] = syscalls.Supported("eventfd", Eventfd) + delete(s.Table, 285) // fallocate + s.Table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime) + s.Table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime) + s.Table[288] = syscalls.Supported("accept4", Accept4) + delete(s.Table, 289) // signalfd4 + s.Table[290] = syscalls.Supported("eventfd2", Eventfd2) + s.Table[291] = syscalls.Supported("epoll_create1", EpollCreate1) + s.Table[292] = syscalls.Supported("dup3", Dup3) + s.Table[293] = syscalls.Supported("pipe2", Pipe2) + delete(s.Table, 294) // inotify_init1 + s.Table[295] = syscalls.Supported("preadv", Preadv) + s.Table[296] = syscalls.Supported("pwritev", Pwritev) + s.Table[299] = syscalls.Supported("recvmmsg", RecvMMsg) + s.Table[306] = syscalls.Supported("syncfs", Syncfs) + s.Table[307] = syscalls.Supported("sendmmsg", SendMMsg) + s.Table[316] = syscalls.Supported("renameat2", Renameat2) + delete(s.Table, 319) // memfd_create + s.Table[322] = syscalls.Supported("execveat", Execveat) + s.Table[327] = syscalls.Supported("preadv2", Preadv2) + s.Table[328] = syscalls.Supported("pwritev2", Pwritev2) + s.Table[332] = syscalls.Supported("statx", Statx) + s.Init() + + // Override ARM64. 
+ s = linux.ARM64 + s.Table[63] = syscalls.Supported("read", Read) + s.Init() +} diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 0e71e800b..a907c103b 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -15,8 +15,6 @@ go_library( "fs.go", "limits.go", "loader.go", - "loader_amd64.go", - "loader_arm64.go", "network.go", "strace.go", "vfs.go", @@ -77,7 +75,6 @@ go_library( "//pkg/sentry/socket/unix", "//pkg/sentry/state", "//pkg/sentry/strace", - "//pkg/sentry/syscalls/linux", "//pkg/sentry/syscalls/linux/vfs2", "//pkg/sentry/time", "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", @@ -126,7 +123,6 @@ go_test( "//pkg/p9", "//pkg/sentry/contexttest", "//pkg/sentry/fs", - "//pkg/sentry/kernel", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/unet", diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 79ef3a880..8c8bad11c 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -77,8 +77,6 @@ import ( _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" ) -var syscallTable *kernel.SyscallTable - // Loader keeps state needed to start the kernel and run the container.. type Loader struct { // k is the kernel. @@ -204,14 +202,12 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("setting up memory usage: %v", err) } - // Patch the syscall table. - kernel.VFS2Enabled = args.Conf.VFS2 - if kernel.VFS2Enabled { - vfs2.Override(syscallTable.Table) + // Is this a VFSv2 kernel? + if args.Conf.VFS2 { + kernel.VFS2Enabled = true + vfs2.Override() } - kernel.RegisterSyscallTable(syscallTable) - // Create kernel and platform. p, err := createPlatform(args.Conf, args.Device) if err != nil { diff --git a/runsc/boot/loader_amd64.go b/runsc/boot/loader_amd64.go deleted file mode 100644 index 78df86611..000000000 --- a/runsc/boot/loader_amd64.go +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 - -package boot - -import ( - "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" -) - -func init() { - // Set the global syscall table. - syscallTable = linux.AMD64 -} diff --git a/runsc/boot/loader_arm64.go b/runsc/boot/loader_arm64.go deleted file mode 100644 index 250785010..000000000 --- a/runsc/boot/loader_arm64.go +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package boot - -import ( - "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" -) - -func init() { - // Set the global syscall table. 
- syscallTable = linux.ARM64 -} diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 7a30fea70..e448fd773 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -31,7 +31,6 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" @@ -69,11 +68,6 @@ func testSpec() *specs.Spec { } } -func resetSyscallTable() { - kernel.VFS2Enabled = false - kernel.FlushSyscallTablesTestOnly() -} - // startGofer starts a new gofer routine serving 'root' path. It returns the // sandbox side of the connection, and a function that when called will stop the // gofer. @@ -150,13 +144,11 @@ func createLoader(vfsEnabled bool, spec *specs.Spec) (*Loader, func(), error) { // TestRun runs a simple application in a sandbox and checks that it succeeds. func TestRun(t *testing.T) { - defer resetSyscallTable() doRun(t, false) } // TestRunVFS2 runs TestRun in VFSv2. func TestRunVFS2(t *testing.T) { - defer resetSyscallTable() doRun(t, true) } @@ -199,13 +191,11 @@ func doRun(t *testing.T, vfsEnabled bool) { // TestStartSignal tests that the controller Start message will cause // WaitForStartSignal to return. func TestStartSignal(t *testing.T) { - defer resetSyscallTable() doStartSignal(t, false) } // TestStartSignalVFS2 does TestStartSignal with VFS2. 
func TestStartSignalVFS2(t *testing.T) { - defer resetSyscallTable() doStartSignal(t, true) } @@ -477,8 +467,6 @@ func TestCreateMountNamespace(t *testing.T) { func TestCreateMountNamespaceVFS2(t *testing.T) { for _, tc := range createMountTestcases(true /* vfs2 */) { t.Run(tc.name, func(t *testing.T) { - defer resetSyscallTable() - spec := testSpec() spec.Mounts = tc.spec.Mounts spec.Root = tc.spec.Root diff --git a/runsc/cmd/syscalls.go b/runsc/cmd/syscalls.go index 7072547be..a37d66139 100644 --- a/runsc/cmd/syscalls.go +++ b/runsc/cmd/syscalls.go @@ -32,9 +32,10 @@ import ( // Syscalls implements subcommands.Command for the "syscalls" command. type Syscalls struct { - output string - os string - arch string + format string + os string + arch string + filename string } // CompatibilityInfo is a map of system and architecture to compatibility doc. @@ -95,16 +96,17 @@ func (*Syscalls) Usage() string { // SetFlags implements subcommands.Command.SetFlags. func (s *Syscalls) SetFlags(f *flag.FlagSet) { - f.StringVar(&s.output, "o", "table", "Output format (table, csv, json).") + f.StringVar(&s.format, "format", "table", "Output format (table, csv, json).") f.StringVar(&s.os, "os", osAll, "The OS (e.g. linux)") f.StringVar(&s.arch, "arch", archAll, "The CPU architecture (e.g. amd64).") + f.StringVar(&s.filename, "filename", "", "Output filename (otherwise stdout).") } // Execute implements subcommands.Command.Execute. func (s *Syscalls) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - out, ok := outputMap[s.output] + out, ok := outputMap[s.format] if !ok { - Fatalf("Unsupported output format %q", s.output) + Fatalf("Unsupported output format %q", s.format) } // Build map of all supported architectures. @@ -124,7 +126,14 @@ func (s *Syscalls) Execute(_ context.Context, f *flag.FlagSet, args ...interface Fatalf("%v", err) } - if err := out(os.Stdout, info); err != nil { + w := os.Stdout // Default. 
+ if s.filename != "" { + w, err = os.OpenFile(s.filename, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) + if err != nil { + Fatalf("Error opening %q: %v", s.filename, err) + } + } + if err := out(w, info); err != nil { Fatalf("Error writing output: %v", err) } -- cgit v1.2.3 From d0b1d0233dc8a8ac837d534cd0664eabb9dd0a71 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 7 May 2020 12:42:46 -0700 Subject: Move pkg/sentry/vfs/{eventfd,timerfd} to new packages in pkg/sentry/fsimpl. They don't depend on anything in VFS2, so they should be their own packages. PiperOrigin-RevId: 310416807 --- pkg/sentry/fsimpl/eventfd/BUILD | 33 ++++ pkg/sentry/fsimpl/eventfd/eventfd.go | 284 ++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/eventfd/eventfd_test.go | 97 ++++++++++ pkg/sentry/fsimpl/timerfd/BUILD | 17 ++ pkg/sentry/fsimpl/timerfd/timerfd.go | 143 +++++++++++++++ pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/kernel.go | 11 +- pkg/sentry/syscalls/linux/vfs2/BUILD | 2 + pkg/sentry/syscalls/linux/vfs2/eventfd.go | 4 +- pkg/sentry/syscalls/linux/vfs2/timerfd.go | 19 +- pkg/sentry/vfs/BUILD | 4 - pkg/sentry/vfs/eventfd.go | 282 ----------------------------- pkg/sentry/vfs/eventfd_test.go | 96 ---------- pkg/sentry/vfs/timerfd.go | 141 --------------- 14 files changed, 596 insertions(+), 538 deletions(-) create mode 100644 pkg/sentry/fsimpl/eventfd/BUILD create mode 100644 pkg/sentry/fsimpl/eventfd/eventfd.go create mode 100644 pkg/sentry/fsimpl/eventfd/eventfd_test.go create mode 100644 pkg/sentry/fsimpl/timerfd/BUILD create mode 100644 pkg/sentry/fsimpl/timerfd/timerfd.go delete mode 100644 pkg/sentry/vfs/eventfd.go delete mode 100644 pkg/sentry/vfs/eventfd_test.go delete mode 100644 pkg/sentry/vfs/timerfd.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fsimpl/eventfd/BUILD b/pkg/sentry/fsimpl/eventfd/BUILD new file mode 100644 index 000000000..ea167d38c --- /dev/null +++ b/pkg/sentry/fsimpl/eventfd/BUILD @@ -0,0 +1,33 @@ +load("//tools:defs.bzl", 
"go_library", "go_test") + +licenses(["notice"]) + +go_library( + name = "eventfd", + srcs = ["eventfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fdnotifier", + "//pkg/log", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) + +go_test( + name = "eventfd_test", + size = "small", + srcs = ["eventfd_test.go"], + library = ":eventfd", + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/contexttest", + "//pkg/sentry/vfs", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go new file mode 100644 index 000000000..c573d7935 --- /dev/null +++ b/pkg/sentry/fsimpl/eventfd/eventfd.go @@ -0,0 +1,284 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package eventfd implements event fds. +package eventfd + +import ( + "math" + "sync" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// EventFileDescription implements FileDescriptionImpl for file-based event +// notification (eventfd). Eventfds are usually internal to the Sentry but in +// certain situations they may be converted into a host-backed eventfd. 
+type EventFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + // queue is used to notify interested parties when the event object + // becomes readable or writable. + queue waiter.Queue `state:"zerovalue"` + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // val is the current value of the event counter. + val uint64 + + // semMode specifies whether the event is in "semaphore" mode. + semMode bool + + // hostfd indicates whether this eventfd is passed through to the host. + hostfd int +} + +var _ vfs.FileDescriptionImpl = (*EventFileDescription)(nil) + +// New creates a new event fd. +func New(vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) { + vd := vfsObj.NewAnonVirtualDentry("[eventfd]") + defer vd.DecRef() + efd := &EventFileDescription{ + val: initVal, + semMode: semMode, + hostfd: -1, + } + if err := efd.vfsfd.Init(efd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &efd.vfsfd, nil +} + +// HostFD returns the host eventfd associated with this event. 
+func (efd *EventFileDescription) HostFD() (int, error) { + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + return efd.hostfd, nil + } + + flags := linux.EFD_NONBLOCK + if efd.semMode { + flags |= linux.EFD_SEMAPHORE + } + + fd, _, errno := syscall.Syscall(syscall.SYS_EVENTFD2, uintptr(efd.val), uintptr(flags), 0) + if errno != 0 { + return -1, errno + } + + if err := fdnotifier.AddFD(int32(fd), &efd.queue); err != nil { + if closeErr := syscall.Close(int(fd)); closeErr != nil { + log.Warningf("close(%d) eventfd failed: %v", fd, closeErr) + } + return -1, err + } + + efd.hostfd = int(fd) + return efd.hostfd, nil +} + +// Release implements FileDescriptionImpl.Release() +func (efd *EventFileDescription) Release() { + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + fdnotifier.RemoveFD(int32(efd.hostfd)) + if closeErr := syscall.Close(int(efd.hostfd)); closeErr != nil { + log.Warningf("close(%d) eventfd failed: %v", efd.hostfd, closeErr) + } + efd.hostfd = -1 + } +} + +// Read implements FileDescriptionImpl.Read. +func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + if dst.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := efd.read(ctx, dst); err != nil { + return 0, err + } + return 8, nil +} + +// Write implements FileDescriptionImpl.Write. +func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { + if src.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := efd.write(ctx, src); err != nil { + return 0, err + } + return 8, nil +} + +// Preconditions: Must be called with efd.mu locked. 
+func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem.IOSequence) error { + var buf [8]byte + if _, err := syscall.Read(efd.hostfd, buf[:]); err != nil { + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err + } + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequence) error { + efd.mu.Lock() + if efd.hostfd >= 0 { + defer efd.mu.Unlock() + return efd.hostReadLocked(ctx, dst) + } + + // We can't complete the read if the value is currently zero. + if efd.val == 0 { + efd.mu.Unlock() + return syserror.ErrWouldBlock + } + + // Update the value based on the mode the event is operating in. + var val uint64 + if efd.semMode { + val = 1 + // Consistent with Linux, this is done even if writing to memory fails. + efd.val-- + } else { + val = efd.val + efd.val = 0 + } + + efd.mu.Unlock() + + // Notify writers. We do this even if we were already writable because + // it is possible that a writer is waiting to write the maximum value + // to the event. + efd.queue.Notify(waiter.EventOut) + + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +// Preconditions: Must be called with efd.mu locked. +func (efd *EventFileDescription) hostWriteLocked(val uint64) error { + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := syscall.Write(efd.hostfd, buf[:]) + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err +} + +func (efd *EventFileDescription) write(ctx context.Context, src usermem.IOSequence) error { + var buf [8]byte + if _, err := src.CopyIn(ctx, buf[:]); err != nil { + return err + } + val := usermem.ByteOrder.Uint64(buf[:]) + + return efd.Signal(val) +} + +// Signal is an internal function to signal the event fd. 
+func (efd *EventFileDescription) Signal(val uint64) error { + if val == math.MaxUint64 { + return syscall.EINVAL + } + + efd.mu.Lock() + + if efd.hostfd >= 0 { + defer efd.mu.Unlock() + return efd.hostWriteLocked(val) + } + + // We only allow writes that won't cause the value to go over the max + // uint64 minus 1. + if val > math.MaxUint64-1-efd.val { + efd.mu.Unlock() + return syserror.ErrWouldBlock + } + + efd.val += val + efd.mu.Unlock() + + // Always trigger a notification. + efd.queue.Notify(waiter.EventIn) + + return nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (efd *EventFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + efd.mu.Lock() + defer efd.mu.Unlock() + + if efd.hostfd >= 0 { + return fdnotifier.NonBlockingPoll(int32(efd.hostfd), mask) + } + + ready := waiter.EventMask(0) + if efd.val > 0 { + ready |= waiter.EventIn + } + + if efd.val < math.MaxUint64-1 { + ready |= waiter.EventOut + } + + return mask & ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (efd *EventFileDescription) EventRegister(entry *waiter.Entry, mask waiter.EventMask) { + efd.queue.EventRegister(entry, mask) + + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + fdnotifier.UpdateFD(int32(efd.hostfd)) + } +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (efd *EventFileDescription) EventUnregister(entry *waiter.Entry) { + efd.queue.EventUnregister(entry) + + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + fdnotifier.UpdateFD(int32(efd.hostfd)) + } +} diff --git a/pkg/sentry/fsimpl/eventfd/eventfd_test.go b/pkg/sentry/fsimpl/eventfd/eventfd_test.go new file mode 100644 index 000000000..20e3adffc --- /dev/null +++ b/pkg/sentry/fsimpl/eventfd/eventfd_test.go @@ -0,0 +1,97 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package eventfd + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +func TestEventFD(t *testing.T) { + initVals := []uint64{ + 0, + // Using a non-zero initial value verifies that writing to an + // eventfd signals when the eventfd's counter was already + // non-zero. + 343, + } + + for _, initVal := range initVals { + ctx := contexttest.Context(t) + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + + // Make a new eventfd that is writable. + eventfd, err := New(vfsObj, initVal, false, linux.O_RDWR) + if err != nil { + t.Fatalf("New() failed: %v", err) + } + defer eventfd.DecRef() + + // Register a callback for a write event. + w, ch := waiter.NewChannelEntry(nil) + eventfd.EventRegister(&w, waiter.EventIn) + defer eventfd.EventUnregister(&w) + + data := []byte("00000124") + // Create and submit a write request. + n, err := eventfd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + t.Fatal(err) + } + if n != 8 { + t.Errorf("eventfd.write wrote %d bytes, not full int64", n) + } + + // Check if the callback fired due to the write event. 
+ select { + case <-ch: + default: + t.Errorf("Didn't get notified of EventIn after write") + } + } +} + +func TestEventFDStat(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + + // Make a new eventfd that is writable. + eventfd, err := New(vfsObj, 0, false, linux.O_RDWR) + if err != nil { + t.Fatalf("New() failed: %v", err) + } + defer eventfd.DecRef() + + statx, err := eventfd.Stat(ctx, vfs.StatOptions{ + Mask: linux.STATX_BASIC_STATS, + }) + if err != nil { + t.Fatalf("eventfd.Stat failed: %v", err) + } + if statx.Size != 0 { + t.Errorf("eventfd size should be 0") + } +} diff --git a/pkg/sentry/fsimpl/timerfd/BUILD b/pkg/sentry/fsimpl/timerfd/BUILD new file mode 100644 index 000000000..fbb02a271 --- /dev/null +++ b/pkg/sentry/fsimpl/timerfd/BUILD @@ -0,0 +1,17 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "timerfd", + srcs = ["timerfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/context", + "//pkg/sentry/kernel/time", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go new file mode 100644 index 000000000..60c92d626 --- /dev/null +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -0,0 +1,143 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Package timerfd implements timer fds. +package timerfd + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/context" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// TimerFileDescription implements FileDescriptionImpl for timer fds. It also +// implements ktime.TimerListener. +type TimerFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + events waiter.Queue + timer *ktime.Timer + + // val is the number of timer expirations since the last successful + // call to PRead, or SetTime. val must be accessed using atomic memory + // operations. + val uint64 +} + +var _ vfs.FileDescriptionImpl = (*TimerFileDescription)(nil) +var _ ktime.TimerListener = (*TimerFileDescription)(nil) + +// New returns a new timer fd. +func New(vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) { + vd := vfsObj.NewAnonVirtualDentry("[timerfd]") + defer vd.DecRef() + tfd := &TimerFileDescription{} + tfd.timer = ktime.NewTimer(clock, tfd) + if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &tfd.vfsfd, nil +} + +// Read implements FileDescriptionImpl.Read. 
+func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + const sizeofUint64 = 8 + if dst.NumBytes() < sizeofUint64 { + return 0, syserror.EINVAL + } + if val := atomic.SwapUint64(&tfd.val, 0); val != 0 { + var buf [sizeofUint64]byte + usermem.ByteOrder.PutUint64(buf[:], val) + if _, err := dst.CopyOut(ctx, buf[:]); err != nil { + // Linux does not undo consuming the number of + // expirations even if writing to userspace fails. + return 0, err + } + return sizeofUint64, nil + } + return 0, syserror.ErrWouldBlock +} + +// Clock returns the timer fd's Clock. +func (tfd *TimerFileDescription) Clock() ktime.Clock { + return tfd.timer.Clock() +} + +// GetTime returns the associated Timer's setting and the time at which it was +// observed. +func (tfd *TimerFileDescription) GetTime() (ktime.Time, ktime.Setting) { + return tfd.timer.Get() +} + +// SetTime atomically changes the associated Timer's setting, resets the number +// of expirations to 0, and returns the previous setting and the time at which +// it was observed. +func (tfd *TimerFileDescription) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) { + return tfd.timer.SwapAnd(s, func() { atomic.StoreUint64(&tfd.val, 0) }) +} + +// Readiness implements waiter.Waitable.Readiness. +func (tfd *TimerFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + var ready waiter.EventMask + if atomic.LoadUint64(&tfd.val) != 0 { + ready |= waiter.EventIn + } + return ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (tfd *TimerFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + tfd.events.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (tfd *TimerFileDescription) EventUnregister(e *waiter.Entry) { + tfd.events.EventUnregister(e) +} + +// PauseTimer pauses the associated Timer. 
+func (tfd *TimerFileDescription) PauseTimer() { + tfd.timer.Pause() +} + +// ResumeTimer resumes the associated Timer. +func (tfd *TimerFileDescription) ResumeTimer() { + tfd.timer.Resume() +} + +// Release implements FileDescriptionImpl.Release() +func (tfd *TimerFileDescription) Release() { + tfd.timer.Destroy() +} + +// Notify implements ktime.TimerListener.Notify. +func (tfd *TimerFileDescription) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { + atomic.AddUint64(&tfd.val, exp) + tfd.events.Notify(waiter.EventIn) + return ktime.Setting{}, false +} + +// Destroy implements ktime.TimerListener.Destroy. +func (tfd *TimerFileDescription) Destroy() {} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index e47af66d6..8104f50f3 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -172,6 +172,7 @@ go_library( "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/sockfs", + "//pkg/sentry/fsimpl/timerfd", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index c91b9dce2..271ea5faf 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -48,10 +48,11 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" + oldtimerfd "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -1068,11 +1069,11 @@ func (k *Kernel) pauseTimeLocked() { if t.fdTable != nil { t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { - if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); 
ok { + if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.PauseTimer() } } else { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok { tfd.PauseTimer() } } @@ -1104,11 +1105,11 @@ func (k *Kernel) resumeTimeLocked() { if t.fdTable != nil { t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { - if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok { + if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.ResumeTimer() } } else { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok { tfd.ResumeTimer() } } diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 14838aa2c..c32f942fb 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -39,8 +39,10 @@ go_library( "//pkg/gohacks", "//pkg/sentry/arch", "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/eventfd", "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/signalfd", + "//pkg/sentry/fsimpl/timerfd", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go index bd2194972..aff1a2070 100644 --- a/pkg/sentry/syscalls/linux/vfs2/eventfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go @@ -17,6 +17,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" ) @@ -31,12 +32,13 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, syserror.EINVAL } + vfsObj := t.Kernel().VFS() fileFlags := uint32(linux.O_RDWR) if flags&linux.EFD_NONBLOCK != 0 { fileFlags |= linux.O_NONBLOCK } 
semMode := flags&linux.EFD_SEMAPHORE != 0 - eventfd, err := t.Kernel().VFS().NewEventFD(initVal, semMode, fileFlags) + eventfd, err := eventfd.New(vfsObj, initVal, semMode, fileFlags) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go index 839a07db1..5ac79bc09 100644 --- a/pkg/sentry/syscalls/linux/vfs2/timerfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go @@ -17,9 +17,9 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -32,9 +32,12 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel return 0, nil, syserror.EINVAL } - var fileFlags uint32 + // Timerfds aren't writable per se (their implementation of Write just + // returns EINVAL), but they are "opened for writing", which is necessary + // to actually reach said implementation of Write. + fileFlags := uint32(linux.O_RDWR) if flags&linux.TFD_NONBLOCK != 0 { - fileFlags = linux.O_NONBLOCK + fileFlags |= linux.O_NONBLOCK } var clock ktime.Clock @@ -46,10 +49,8 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel default: return 0, nil, syserror.EINVAL } - // Timerfds aren't writable per se (their implementation of Write just - // returns EINVAL), but they are "opened for writing", which is necessary - // to actually reach said implementation of Write. 
- file, err := t.Kernel().VFS().NewTimerFD(clock, linux.O_RDWR|fileFlags) + vfsObj := t.Kernel().VFS() + file, err := timerfd.New(vfsObj, clock, fileFlags) if err != nil { return 0, nil, err } @@ -80,7 +81,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne } defer file.DecRef() - tfd, ok := file.Impl().(*vfs.TimerFileDescription) + tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { return 0, nil, syserror.EINVAL } @@ -114,7 +115,7 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne } defer file.DecRef() - tfd, ok := file.Impl().(*vfs.TimerFileDescription) + tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { return 0, nil, syserror.EINVAL } diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 86046dd99..94d69c1cc 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -25,7 +25,6 @@ go_library( "device.go", "epoll.go", "epoll_interest_list.go", - "eventfd.go", "file_description.go", "file_description_impl_util.go", "filesystem.go", @@ -37,7 +36,6 @@ go_library( "pathname.go", "permissions.go", "resolving_path.go", - "timerfd.go", "vfs.go", ], visibility = ["//pkg/sentry:internal"], @@ -71,7 +69,6 @@ go_test( name = "vfs_test", size = "small", srcs = [ - "eventfd_test.go", "file_description_impl_util_test.go", "mount_test.go", ], @@ -83,6 +80,5 @@ go_test( "//pkg/sync", "//pkg/syserror", "//pkg/usermem", - "//pkg/waiter", ], ) diff --git a/pkg/sentry/vfs/eventfd.go b/pkg/sentry/vfs/eventfd.go deleted file mode 100644 index f39dacacf..000000000 --- a/pkg/sentry/vfs/eventfd.go +++ /dev/null @@ -1,282 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "math" - "sync" - "syscall" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fdnotifier" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/pkg/waiter" -) - -// EventFileDescription implements FileDescriptionImpl for file-based event -// notification (eventfd). Eventfds are usually internal to the Sentry but in -// certain situations they may be converted into a host-backed eventfd. -type EventFileDescription struct { - vfsfd FileDescription - FileDescriptionDefaultImpl - DentryMetadataFileDescriptionImpl - - // queue is used to notify interested parties when the event object - // becomes readable or writable. - queue waiter.Queue `state:"zerovalue"` - - // mu protects the fields below. - mu sync.Mutex `state:"nosave"` - - // val is the current value of the event counter. - val uint64 - - // semMode specifies whether the event is in "semaphore" mode. - semMode bool - - // hostfd indicates whether this eventfd is passed through to the host. - hostfd int -} - -var _ FileDescriptionImpl = (*EventFileDescription)(nil) - -// NewEventFD creates a new event fd. 
-func (vfs *VirtualFilesystem) NewEventFD(initVal uint64, semMode bool, flags uint32) (*FileDescription, error) { - vd := vfs.NewAnonVirtualDentry("[eventfd]") - defer vd.DecRef() - efd := &EventFileDescription{ - val: initVal, - semMode: semMode, - hostfd: -1, - } - if err := efd.vfsfd.Init(efd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ - UseDentryMetadata: true, - DenyPRead: true, - DenyPWrite: true, - }); err != nil { - return nil, err - } - return &efd.vfsfd, nil -} - -// HostFD returns the host eventfd associated with this event. -func (efd *EventFileDescription) HostFD() (int, error) { - efd.mu.Lock() - defer efd.mu.Unlock() - if efd.hostfd >= 0 { - return efd.hostfd, nil - } - - flags := linux.EFD_NONBLOCK - if efd.semMode { - flags |= linux.EFD_SEMAPHORE - } - - fd, _, errno := syscall.Syscall(syscall.SYS_EVENTFD2, uintptr(efd.val), uintptr(flags), 0) - if errno != 0 { - return -1, errno - } - - if err := fdnotifier.AddFD(int32(fd), &efd.queue); err != nil { - if closeErr := syscall.Close(int(fd)); closeErr != nil { - log.Warningf("close(%d) eventfd failed: %v", fd, closeErr) - } - return -1, err - } - - efd.hostfd = int(fd) - return efd.hostfd, nil -} - -// Release implements FileDescriptionImpl.Release() -func (efd *EventFileDescription) Release() { - efd.mu.Lock() - defer efd.mu.Unlock() - if efd.hostfd >= 0 { - fdnotifier.RemoveFD(int32(efd.hostfd)) - if closeErr := syscall.Close(int(efd.hostfd)); closeErr != nil { - log.Warningf("close(%d) eventfd failed: %v", efd.hostfd, closeErr) - } - efd.hostfd = -1 - } -} - -// Read implements FileDescriptionImpl.Read. -func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ ReadOptions) (int64, error) { - if dst.NumBytes() < 8 { - return 0, syscall.EINVAL - } - if err := efd.read(ctx, dst); err != nil { - return 0, err - } - return 8, nil -} - -// Write implements FileDescriptionImpl.Write. 
-func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ WriteOptions) (int64, error) { - if src.NumBytes() < 8 { - return 0, syscall.EINVAL - } - if err := efd.write(ctx, src); err != nil { - return 0, err - } - return 8, nil -} - -// Preconditions: Must be called with efd.mu locked. -func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem.IOSequence) error { - var buf [8]byte - if _, err := syscall.Read(efd.hostfd, buf[:]); err != nil { - if err == syscall.EWOULDBLOCK { - return syserror.ErrWouldBlock - } - return err - } - _, err := dst.CopyOut(ctx, buf[:]) - return err -} - -func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequence) error { - efd.mu.Lock() - if efd.hostfd >= 0 { - defer efd.mu.Unlock() - return efd.hostReadLocked(ctx, dst) - } - - // We can't complete the read if the value is currently zero. - if efd.val == 0 { - efd.mu.Unlock() - return syserror.ErrWouldBlock - } - - // Update the value based on the mode the event is operating in. - var val uint64 - if efd.semMode { - val = 1 - // Consistent with Linux, this is done even if writing to memory fails. - efd.val-- - } else { - val = efd.val - efd.val = 0 - } - - efd.mu.Unlock() - - // Notify writers. We do this even if we were already writable because - // it is possible that a writer is waiting to write the maximum value - // to the event. - efd.queue.Notify(waiter.EventOut) - - var buf [8]byte - usermem.ByteOrder.PutUint64(buf[:], val) - _, err := dst.CopyOut(ctx, buf[:]) - return err -} - -// Preconditions: Must be called with efd.mu locked. 
-func (efd *EventFileDescription) hostWriteLocked(val uint64) error { - var buf [8]byte - usermem.ByteOrder.PutUint64(buf[:], val) - _, err := syscall.Write(efd.hostfd, buf[:]) - if err == syscall.EWOULDBLOCK { - return syserror.ErrWouldBlock - } - return err -} - -func (efd *EventFileDescription) write(ctx context.Context, src usermem.IOSequence) error { - var buf [8]byte - if _, err := src.CopyIn(ctx, buf[:]); err != nil { - return err - } - val := usermem.ByteOrder.Uint64(buf[:]) - - return efd.Signal(val) -} - -// Signal is an internal function to signal the event fd. -func (efd *EventFileDescription) Signal(val uint64) error { - if val == math.MaxUint64 { - return syscall.EINVAL - } - - efd.mu.Lock() - - if efd.hostfd >= 0 { - defer efd.mu.Unlock() - return efd.hostWriteLocked(val) - } - - // We only allow writes that won't cause the value to go over the max - // uint64 minus 1. - if val > math.MaxUint64-1-efd.val { - efd.mu.Unlock() - return syserror.ErrWouldBlock - } - - efd.val += val - efd.mu.Unlock() - - // Always trigger a notification. - efd.queue.Notify(waiter.EventIn) - - return nil -} - -// Readiness implements waiter.Waitable.Readiness. -func (efd *EventFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { - efd.mu.Lock() - defer efd.mu.Unlock() - - if efd.hostfd >= 0 { - return fdnotifier.NonBlockingPoll(int32(efd.hostfd), mask) - } - - ready := waiter.EventMask(0) - if efd.val > 0 { - ready |= waiter.EventIn - } - - if efd.val < math.MaxUint64-1 { - ready |= waiter.EventOut - } - - return mask & ready -} - -// EventRegister implements waiter.Waitable.EventRegister. -func (efd *EventFileDescription) EventRegister(entry *waiter.Entry, mask waiter.EventMask) { - efd.queue.EventRegister(entry, mask) - - efd.mu.Lock() - defer efd.mu.Unlock() - if efd.hostfd >= 0 { - fdnotifier.UpdateFD(int32(efd.hostfd)) - } -} - -// EventUnregister implements waiter.Waitable.EventUnregister. 
-func (efd *EventFileDescription) EventUnregister(entry *waiter.Entry) { - efd.queue.EventUnregister(entry) - - efd.mu.Lock() - defer efd.mu.Unlock() - if efd.hostfd >= 0 { - fdnotifier.UpdateFD(int32(efd.hostfd)) - } -} diff --git a/pkg/sentry/vfs/eventfd_test.go b/pkg/sentry/vfs/eventfd_test.go deleted file mode 100644 index 2dff2d10b..000000000 --- a/pkg/sentry/vfs/eventfd_test.go +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "testing" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/pkg/waiter" -) - -func TestEventFD(t *testing.T) { - initVals := []uint64{ - 0, - // Using a non-zero initial value verifies that writing to an - // eventfd signals when the eventfd's counter was already - // non-zero. - 343, - } - - for _, initVal := range initVals { - ctx := contexttest.Context(t) - vfsObj := &VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { - t.Fatalf("VFS init: %v", err) - } - - // Make a new eventfd that is writable. - eventfd, err := vfsObj.NewEventFD(initVal, false, linux.O_RDWR) - if err != nil { - t.Fatalf("NewEventFD failed: %v", err) - } - defer eventfd.DecRef() - - // Register a callback for a write event. 
- w, ch := waiter.NewChannelEntry(nil) - eventfd.EventRegister(&w, waiter.EventIn) - defer eventfd.EventUnregister(&w) - - data := []byte("00000124") - // Create and submit a write request. - n, err := eventfd.Write(ctx, usermem.BytesIOSequence(data), WriteOptions{}) - if err != nil { - t.Fatal(err) - } - if n != 8 { - t.Errorf("eventfd.write wrote %d bytes, not full int64", n) - } - - // Check if the callback fired due to the write event. - select { - case <-ch: - default: - t.Errorf("Didn't get notified of EventIn after write") - } - } -} - -func TestEventFDStat(t *testing.T) { - ctx := contexttest.Context(t) - vfsObj := &VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { - t.Fatalf("VFS init: %v", err) - } - - // Make a new eventfd that is writable. - eventfd, err := vfsObj.NewEventFD(0, false, linux.O_RDWR) - if err != nil { - t.Fatalf("NewEventFD failed: %v", err) - } - defer eventfd.DecRef() - - statx, err := eventfd.Stat(ctx, StatOptions{ - Mask: linux.STATX_BASIC_STATS, - }) - if err != nil { - t.Fatalf("eventfd.Stat failed: %v", err) - } - if statx.Size != 0 { - t.Errorf("eventfd size should be 0") - } -} diff --git a/pkg/sentry/vfs/timerfd.go b/pkg/sentry/vfs/timerfd.go deleted file mode 100644 index cc536ceaf..000000000 --- a/pkg/sentry/vfs/timerfd.go +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package vfs - -import ( - "sync/atomic" - - "gvisor.dev/gvisor/pkg/context" - ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/pkg/waiter" -) - -// TimerFileDescription implements FileDescriptionImpl for timer fds. It also -// implements ktime.TimerListener. -type TimerFileDescription struct { - vfsfd FileDescription - FileDescriptionDefaultImpl - DentryMetadataFileDescriptionImpl - - events waiter.Queue - timer *ktime.Timer - - // val is the number of timer expirations since the last successful - // call to PRead, or SetTime. val must be accessed using atomic memory - // operations. - val uint64 -} - -var _ FileDescriptionImpl = (*TimerFileDescription)(nil) -var _ ktime.TimerListener = (*TimerFileDescription)(nil) - -// NewTimerFD returns a new timer fd. -func (vfs *VirtualFilesystem) NewTimerFD(clock ktime.Clock, flags uint32) (*FileDescription, error) { - vd := vfs.NewAnonVirtualDentry("[timerfd]") - defer vd.DecRef() - tfd := &TimerFileDescription{} - tfd.timer = ktime.NewTimer(clock, tfd) - if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ - UseDentryMetadata: true, - DenyPRead: true, - DenyPWrite: true, - }); err != nil { - return nil, err - } - return &tfd.vfsfd, nil -} - -// Read implements FileDescriptionImpl.Read. -func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { - const sizeofUint64 = 8 - if dst.NumBytes() < sizeofUint64 { - return 0, syserror.EINVAL - } - if val := atomic.SwapUint64(&tfd.val, 0); val != 0 { - var buf [sizeofUint64]byte - usermem.ByteOrder.PutUint64(buf[:], val) - if _, err := dst.CopyOut(ctx, buf[:]); err != nil { - // Linux does not undo consuming the number of - // expirations even if writing to userspace fails. 
- return 0, err - } - return sizeofUint64, nil - } - return 0, syserror.ErrWouldBlock -} - -// Clock returns the timer fd's Clock. -func (tfd *TimerFileDescription) Clock() ktime.Clock { - return tfd.timer.Clock() -} - -// GetTime returns the associated Timer's setting and the time at which it was -// observed. -func (tfd *TimerFileDescription) GetTime() (ktime.Time, ktime.Setting) { - return tfd.timer.Get() -} - -// SetTime atomically changes the associated Timer's setting, resets the number -// of expirations to 0, and returns the previous setting and the time at which -// it was observed. -func (tfd *TimerFileDescription) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) { - return tfd.timer.SwapAnd(s, func() { atomic.StoreUint64(&tfd.val, 0) }) -} - -// Readiness implements waiter.Waitable.Readiness. -func (tfd *TimerFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { - var ready waiter.EventMask - if atomic.LoadUint64(&tfd.val) != 0 { - ready |= waiter.EventIn - } - return ready -} - -// EventRegister implements waiter.Waitable.EventRegister. -func (tfd *TimerFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - tfd.events.EventRegister(e, mask) -} - -// EventUnregister implements waiter.Waitable.EventUnregister. -func (tfd *TimerFileDescription) EventUnregister(e *waiter.Entry) { - tfd.events.EventUnregister(e) -} - -// PauseTimer pauses the associated Timer. -func (tfd *TimerFileDescription) PauseTimer() { - tfd.timer.Pause() -} - -// ResumeTimer resumes the associated Timer. -func (tfd *TimerFileDescription) ResumeTimer() { - tfd.timer.Resume() -} - -// Release implements FileDescriptionImpl.Release() -func (tfd *TimerFileDescription) Release() { - tfd.timer.Destroy() -} - -// Notify implements ktime.TimerListener.Notify. 
-func (tfd *TimerFileDescription) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { - atomic.AddUint64(&tfd.val, exp) - tfd.events.Notify(waiter.EventIn) - return ktime.Setting{}, false -} - -// Destroy implements ktime.TimerListener.Destroy. -func (tfd *TimerFileDescription) Destroy() {} -- cgit v1.2.3 From 9115f26851b6f00ae01e9c130e3b5b342495c9e5 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 7 May 2020 14:00:36 -0700 Subject: Allocate device numbers for VFS2 filesystems. Updates #1197, #1198, #1672 PiperOrigin-RevId: 310432006 --- pkg/abi/linux/dev.go | 4 + pkg/sentry/fsimpl/devpts/devpts.go | 40 +++++-- pkg/sentry/fsimpl/ext/ext.go | 16 ++- pkg/sentry/fsimpl/ext/filesystem.go | 8 +- pkg/sentry/fsimpl/ext/inode.go | 2 + pkg/sentry/fsimpl/gofer/gofer.go | 19 +++- pkg/sentry/fsimpl/host/host.go | 143 +++++++++++++------------ pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 4 +- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 36 +++++-- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 6 +- pkg/sentry/fsimpl/kernfs/symlink.go | 8 +- pkg/sentry/fsimpl/pipefs/BUILD | 1 + pkg/sentry/fsimpl/pipefs/pipefs.go | 79 +++++++++----- pkg/sentry/fsimpl/proc/filesystem.go | 29 +++-- pkg/sentry/fsimpl/proc/subtasks.go | 12 +-- pkg/sentry/fsimpl/proc/task.go | 64 +++++------ pkg/sentry/fsimpl/proc/task_fds.go | 28 ++--- pkg/sentry/fsimpl/proc/task_files.go | 16 +-- pkg/sentry/fsimpl/proc/task_net.go | 38 +++---- pkg/sentry/fsimpl/proc/tasks.go | 47 ++++---- pkg/sentry/fsimpl/proc/tasks_files.go | 8 +- pkg/sentry/fsimpl/proc/tasks_sys.go | 108 +++++++++---------- pkg/sentry/fsimpl/sockfs/BUILD | 1 + pkg/sentry/fsimpl/sockfs/sockfs.go | 46 ++++++-- pkg/sentry/fsimpl/sys/sys.go | 21 +++- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 39 ++++--- pkg/sentry/kernel/kernel.go | 10 +- pkg/sentry/socket/hostinet/BUILD | 1 - pkg/sentry/socket/hostinet/socket_vfs2.go | 4 +- pkg/sentry/socket/netlink/BUILD | 1 - pkg/sentry/socket/netlink/provider_vfs2.go | 4 +- 
pkg/sentry/socket/netstack/BUILD | 1 - pkg/sentry/socket/netstack/netstack_vfs2.go | 4 +- pkg/sentry/socket/unix/BUILD | 1 - pkg/sentry/socket/unix/unix_vfs2.go | 4 +- pkg/sentry/vfs/anonfs.go | 2 +- pkg/sentry/vfs/device.go | 2 +- runsc/boot/loader.go | 5 +- 38 files changed, 517 insertions(+), 345 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/abi/linux/dev.go b/pkg/abi/linux/dev.go index 89f9a793f..fa3ae5f18 100644 --- a/pkg/abi/linux/dev.go +++ b/pkg/abi/linux/dev.go @@ -36,6 +36,10 @@ func DecodeDeviceID(rdev uint32) (uint16, uint32) { // // See Documentations/devices.txt and uapi/linux/major.h. const ( + // UNNAMED_MAJOR is the major device number for "unnamed" devices, whose + // minor numbers are dynamically allocated by the kernel. + UNNAMED_MAJOR = 0 + // MEM_MAJOR is the major device number for "memory" character devices. MEM_MAJOR = 1 diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 94db8fe5c..c03c65445 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -51,21 +51,37 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return nil, nil, syserror.EINVAL } - fs, root := fstype.newFilesystem(vfsObj, creds) - return fs.VFSFilesystem(), root.VFSDentry(), nil + fs, root, err := fstype.newFilesystem(vfsObj, creds) + if err != nil { + return nil, nil, err + } + return fs.Filesystem.VFSFilesystem(), root.VFSDentry(), nil +} + +type filesystem struct { + kernfs.Filesystem + + devMinor uint32 } // newFilesystem creates a new devpts filesystem with root directory and ptmx // master inode. It returns the filesystem and root Dentry. 
-func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*kernfs.Filesystem, *kernfs.Dentry) { - fs := &kernfs.Filesystem{} - fs.VFSFilesystem().Init(vfsObj, fstype, fs) +func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + + fs := &filesystem{ + devMinor: devMinor, + } + fs.Filesystem.VFSFilesystem().Init(vfsObj, fstype, fs) // Construct the root directory. This is always inode id 1. root := &rootInode{ slaves: make(map[uint32]*slaveInode), } - root.InodeAttrs.Init(creds, 1, linux.ModeDirectory|0555) + root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555) root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) root.dentry.Init(root) @@ -74,7 +90,7 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds master := &masterInode{ root: root, } - master.InodeAttrs.Init(creds, 2, linux.ModeCharacterDevice|0666) + master.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666) master.dentry.Init(master) // Add the master as a child of the root. @@ -83,7 +99,13 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds }) root.IncLinks(links) - return fs, &root.dentry + return fs, &root.dentry, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() } // rootInode is the root directory inode for the devpts mounts. @@ -140,7 +162,7 @@ func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error) } // Linux always uses pty index + 3 as the inode id. See // fs/devpts/inode.c:devpts_pty_new(). 
- slave.InodeAttrs.Init(creds, uint64(idx+3), linux.ModeCharacterDevice|0600) + slave.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600) slave.dentry.Init(slave) i.slaves[idx] = slave diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go index 7176af6d1..dac6effbf 100644 --- a/pkg/sentry/fsimpl/ext/ext.go +++ b/pkg/sentry/fsimpl/ext/ext.go @@ -105,36 +105,50 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // EACCESS should be returned according to mount(2). Filesystem independent // flags (like readonly) are currently not available in pkg/sentry/vfs. + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + dev, err := getDeviceFd(source, opts) if err != nil { return nil, nil, err } - fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)} + fs := filesystem{ + dev: dev, + inodeCache: make(map[uint32]*inode), + devMinor: devMinor, + } fs.vfsfs.Init(vfsObj, &fsType, &fs) fs.sb, err = readSuperBlock(dev) if err != nil { + fs.vfsfs.DecRef() return nil, nil, err } if fs.sb.Magic() != linux.EXT_SUPER_MAGIC { // mount(2) specifies that EINVAL should be returned if the superblock is // invalid. + fs.vfsfs.DecRef() return nil, nil, syserror.EINVAL } // Refuse to mount if the filesystem is incompatible. 
if !isCompatible(fs.sb) { + fs.vfsfs.DecRef() return nil, nil, syserror.EINVAL } fs.bgs, err = readBlockGroups(dev, fs.sb) if err != nil { + fs.vfsfs.DecRef() return nil, nil, err } rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode) if err != nil { + fs.vfsfs.DecRef() return nil, nil, err } rootInode.incRef() diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 77b644275..557963e03 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -64,6 +64,10 @@ type filesystem struct { // bgs represents all the block group descriptors for the filesystem. // Immutable after initialization. bgs []disklayout.BlockGroup + + // devMinor is this filesystem's device minor number. Immutable after + // initialization. + devMinor uint32 } // Compiles only if filesystem implements vfs.FilesystemImpl. @@ -366,7 +370,9 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() {} +func (fs *filesystem) Release() { + fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) +} // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index a98512350..485f86f4b 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -204,6 +204,8 @@ func (in *inode) statTo(stat *linux.Statx) { stat.Atime = in.diskInode.AccessTime().StatxTimestamp() stat.Ctime = in.diskInode.ChangeTime().StatxTimestamp() stat.Mtime = in.diskInode.ModificationTime().StatxTimestamp() + stat.DevMajor = linux.UNNAMED_MAJOR + stat.DevMinor = in.fs.devMinor // TODO(b/134676337): Set stat.Blocks which is the number of 512 byte blocks // (including metadata blocks) required to represent this file. 
} diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 9ab8fdc65..e68e37ebc 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -81,6 +81,9 @@ type filesystem struct { // clock is a realtime clock used to set timestamps in file operations. clock ktime.Clock + // devMinor is the filesystem's minor device number. devMinor is immutable. + devMinor uint32 + // uid and gid are the effective KUID and KGID of the filesystem's creator, // and are used as the owner and group for files that don't specify one. // uid and gid are immutable. @@ -399,14 +402,21 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } // Construct the filesystem object. + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + attachFile.close(ctx) + client.Close() + return nil, nil, err + } fs := &filesystem{ mfp: mfp, opts: fsopts, iopts: iopts, - uid: creds.EffectiveKUID, - gid: creds.EffectiveKGID, client: client, clock: ktime.RealtimeClockFromContext(ctx), + devMinor: devMinor, + uid: creds.EffectiveKUID, + gid: creds.EffectiveKGID, syncableDentries: make(map[*dentry]struct{}), specialFileFDs: make(map[*specialFileFD]struct{}), } @@ -464,6 +474,8 @@ func (fs *filesystem) Release() { // Close the connection to the server. This implicitly clunks all fids. fs.client.Close() } + + fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } // dentry implements vfs.DentryImpl. 
@@ -796,7 +808,8 @@ func (d *dentry) statTo(stat *linux.Statx) { stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime)) stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime)) stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime)) - // TODO(gvisor.dev/issue/1198): device number + stat.DevMajor = linux.UNNAMED_MAJOR + stat.DevMinor = d.fs.devMinor } func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error { diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 144e04905..55de9c438 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -115,15 +115,28 @@ func (filesystemType) Name() string { // // Note that there should only ever be one instance of host.filesystem, // a global mount for host fds. -func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { - fs := &filesystem{} +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, err + } + fs := &filesystem{ + devMinor: devMinor, + } fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) - return fs.VFSFilesystem() + return fs.VFSFilesystem(), nil } // filesystem implements vfs.FilesystemImpl. type filesystem struct { kernfs.Filesystem + + devMinor uint32 +} + +func (fs *filesystem) Release() { + fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() } func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { @@ -219,7 +232,7 @@ func (i *inode) Mode() linux.FileMode { } // Stat implements kernfs.Inode. 
-func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { +func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { if opts.Mask&linux.STATX__RESERVED != 0 { return linux.Statx{}, syserror.EINVAL } @@ -227,73 +240,73 @@ func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, erro return linux.Statx{}, syserror.EINVAL } + fs := vfsfs.Impl().(*filesystem) + // Limit our host call only to known flags. mask := opts.Mask & linux.STATX_ALL var s unix.Statx_t err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s) - // Fallback to fstat(2), if statx(2) is not supported on the host. - // - // TODO(b/151263641): Remove fallback. if err == syserror.ENOSYS { - return i.fstat(opts) - } else if err != nil { + // Fallback to fstat(2), if statx(2) is not supported on the host. + // + // TODO(b/151263641): Remove fallback. + return i.fstat(fs) + } + if err != nil { return linux.Statx{}, err } - ls := linux.Statx{Mask: mask} - // Unconditionally fill blksize, attributes, and device numbers, as indicated - // by /include/uapi/linux/stat.h. - // - // RdevMajor/RdevMinor are left as zero, so as not to expose host device - // numbers. - // - // TODO(gvisor.dev/issue/1672): Use kernfs-specific, internally defined - // device numbers. If we use the device number from the host, it may collide - // with another sentry-internal device number. We handle device/inode - // numbers without relying on the host to prevent collisions. - ls.Blksize = s.Blksize - ls.Attributes = s.Attributes - ls.AttributesMask = s.Attributes_mask - - if mask&linux.STATX_TYPE != 0 { + // Unconditionally fill blksize, attributes, and device numbers, as + // indicated by /include/uapi/linux/stat.h. Inode number is always + // available, since we use our own rather than the host's. 
+ ls := linux.Statx{ + Mask: linux.STATX_INO, + Blksize: s.Blksize, + Attributes: s.Attributes, + Ino: i.ino, + AttributesMask: s.Attributes_mask, + DevMajor: linux.UNNAMED_MAJOR, + DevMinor: fs.devMinor, + } + + // Copy other fields that were returned by the host. RdevMajor/RdevMinor + // are never copied (and therefore left as zero), so as not to expose host + // device numbers. + ls.Mask |= s.Mask & linux.STATX_ALL + if s.Mask&linux.STATX_TYPE != 0 { ls.Mode |= s.Mode & linux.S_IFMT } - if mask&linux.STATX_MODE != 0 { + if s.Mask&linux.STATX_MODE != 0 { ls.Mode |= s.Mode &^ linux.S_IFMT } - if mask&linux.STATX_NLINK != 0 { + if s.Mask&linux.STATX_NLINK != 0 { ls.Nlink = s.Nlink } - if mask&linux.STATX_UID != 0 { + if s.Mask&linux.STATX_UID != 0 { ls.UID = s.Uid } - if mask&linux.STATX_GID != 0 { + if s.Mask&linux.STATX_GID != 0 { ls.GID = s.Gid } - if mask&linux.STATX_ATIME != 0 { + if s.Mask&linux.STATX_ATIME != 0 { ls.Atime = unixToLinuxStatxTimestamp(s.Atime) } - if mask&linux.STATX_BTIME != 0 { + if s.Mask&linux.STATX_BTIME != 0 { ls.Btime = unixToLinuxStatxTimestamp(s.Btime) } - if mask&linux.STATX_CTIME != 0 { + if s.Mask&linux.STATX_CTIME != 0 { ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime) } - if mask&linux.STATX_MTIME != 0 { + if s.Mask&linux.STATX_MTIME != 0 { ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime) } - if mask&linux.STATX_SIZE != 0 { + if s.Mask&linux.STATX_SIZE != 0 { ls.Size = s.Size } - if mask&linux.STATX_BLOCKS != 0 { + if s.Mask&linux.STATX_BLOCKS != 0 { ls.Blocks = s.Blocks } - // Use our own internal inode number. - if mask&linux.STATX_INO != 0 { - ls.Ino = i.ino - } - return ls, nil } @@ -305,36 +318,30 @@ func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, erro // of a mask or sync flags. fstat(2) does not provide any metadata // equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so // those fields remain empty. 
-func (i *inode) fstat(opts vfs.StatOptions) (linux.Statx, error) { +func (i *inode) fstat(fs *filesystem) (linux.Statx, error) { var s unix.Stat_t if err := unix.Fstat(i.hostFD, &s); err != nil { return linux.Statx{}, err } - // Note that rdev numbers are left as 0; do not expose host device numbers. - ls := linux.Statx{ - Mask: linux.STATX_BASIC_STATS, - Blksize: uint32(s.Blksize), - Nlink: uint32(s.Nlink), - UID: s.Uid, - GID: s.Gid, - Mode: uint16(s.Mode), - Size: uint64(s.Size), - Blocks: uint64(s.Blocks), - Atime: timespecToStatxTimestamp(s.Atim), - Ctime: timespecToStatxTimestamp(s.Ctim), - Mtime: timespecToStatxTimestamp(s.Mtim), - } - - // Use our own internal inode number. - // - // TODO(gvisor.dev/issue/1672): Use a kernfs-specific device number as well. - // If we use the device number from the host, it may collide with another - // sentry-internal device number. We handle device/inode numbers without - // relying on the host to prevent collisions. - ls.Ino = i.ino - - return ls, nil + // As with inode.Stat(), we always use internal device and inode numbers, + // and never expose the host's represented device numbers. + return linux.Statx{ + Mask: linux.STATX_BASIC_STATS, + Blksize: uint32(s.Blksize), + Nlink: uint32(s.Nlink), + UID: s.Uid, + GID: s.Gid, + Mode: uint16(s.Mode), + Ino: i.ino, + Size: uint64(s.Size), + Blocks: uint64(s.Blocks), + Atime: timespecToStatxTimestamp(s.Atim), + Ctime: timespecToStatxTimestamp(s.Ctim), + Mtime: timespecToStatxTimestamp(s.Mtim), + DevMajor: linux.UNNAMED_MAJOR, + DevMinor: fs.devMinor, + }, nil } // SetStat implements kernfs.Inode. @@ -453,8 +460,6 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount) (*vfs.F } // fileDescription is embedded by host fd implementations of FileDescriptionImpl. -// -// TODO(gvisor.dev/issue/1672): Implement Waitable interface. 
type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -471,12 +476,12 @@ type fileDescription struct { // SetStat implements vfs.FileDescriptionImpl. func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) - return f.inode.SetStat(ctx, nil, creds, opts) + return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts) } // Stat implements vfs.FileDescriptionImpl. func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.Statx, error) { - return f.inode.Stat(nil, opts) + return f.inode.Stat(f.vfsfd.Mount().Filesystem(), opts) } // Release implements vfs.FileDescriptionImpl. diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index c7779fc11..1568a9d49 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -44,11 +44,11 @@ type DynamicBytesFile struct { var _ Inode = (*DynamicBytesFile)(nil) // Init initializes a dynamic bytes file. 
-func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) { +func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } - f.InodeAttrs.Init(creds, ino, linux.ModeRegular|perm) + f.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm) f.data = data } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 615592d5f..982daa2e6 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -192,15 +192,17 @@ func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, // // Must be initialized by Init prior to first use. type InodeAttrs struct { - ino uint64 - mode uint32 - uid uint32 - gid uint32 - nlink uint32 + devMajor uint32 + devMinor uint32 + ino uint64 + mode uint32 + uid uint32 + gid uint32 + nlink uint32 } // Init initializes this InodeAttrs. 
-func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMode) { +func (a *InodeAttrs) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) { if mode.FileType() == 0 { panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode)) } @@ -209,6 +211,8 @@ func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMo if mode.FileType() == linux.ModeDirectory { nlink = 2 } + a.devMajor = devMajor + a.devMinor = devMinor atomic.StoreUint64(&a.ino, ino) atomic.StoreUint32(&a.mode, uint32(mode)) atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID)) @@ -216,6 +220,16 @@ func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMo atomic.StoreUint32(&a.nlink, nlink) } +// DevMajor returns the device major number. +func (a *InodeAttrs) DevMajor() uint32 { + return a.devMajor +} + +// DevMinor returns the device minor number. +func (a *InodeAttrs) DevMinor() uint32 { + return a.devMinor +} + // Ino returns the inode id. func (a *InodeAttrs) Ino() uint64 { return atomic.LoadUint64(&a.ino) @@ -232,6 +246,8 @@ func (a *InodeAttrs) Mode() linux.FileMode { func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK + stat.DevMajor = a.devMajor + stat.DevMinor = a.devMinor stat.Ino = atomic.LoadUint64(&a.ino) stat.Mode = uint16(a.Mode()) stat.UID = atomic.LoadUint32(&a.uid) @@ -544,9 +560,9 @@ type StaticDirectory struct { var _ Inode = (*StaticDirectory)(nil) // NewStaticDir creates a new static directory and returns its dentry. 
-func NewStaticDir(creds *auth.Credentials, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry { +func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry { inode := &StaticDirectory{} - inode.Init(creds, ino, perm) + inode.Init(creds, devMajor, devMinor, ino, perm) dentry := &Dentry{} dentry.Init(inode) @@ -559,11 +575,11 @@ func NewStaticDir(creds *auth.Credentials, ino uint64, perm linux.FileMode, chil } // Init initializes StaticDirectory. -func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.FileMode) { +func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } - s.InodeAttrs.Init(creds, ino, linux.ModeDirectory|perm) + s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm) } // Open implements kernfs.Inode. 
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 1c5d3e7e7..412cf6ac9 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -75,7 +75,7 @@ type file struct { func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry { f := &file{} f.content = content - f.DynamicBytesFile.Init(creds, fs.NextIno(), f, 0777) + f.DynamicBytesFile.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777) d := &kernfs.Dentry{} d.Init(f) @@ -107,7 +107,7 @@ type readonlyDir struct { func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { dir := &readonlyDir{} - dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode) + dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) dir.dentry.Init(dir) @@ -137,7 +137,7 @@ type dir struct { func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { dir := &dir{} dir.fs = fs - dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode) + dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) dir.dentry.Init(dir) diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 0aa6dc979..2ab3f53fd 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -35,9 +35,9 @@ type StaticSymlink struct { var _ Inode = (*StaticSymlink)(nil) // NewStaticSymlink creates a new symlink file pointing to 'target'. 
-func NewStaticSymlink(creds *auth.Credentials, ino uint64, target string) *Dentry { +func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) *Dentry { inode := &StaticSymlink{} - inode.Init(creds, ino, target) + inode.Init(creds, devMajor, devMinor, ino, target) d := &Dentry{} d.Init(inode) @@ -45,9 +45,9 @@ func NewStaticSymlink(creds *auth.Credentials, ino uint64, target string) *Dentr } // Init initializes the instance. -func (s *StaticSymlink) Init(creds *auth.Credentials, ino uint64, target string) { +func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor uint32, ino uint64, target string) { s.target = target - s.InodeAttrs.Init(creds, ino, linux.ModeSymlink|0777) + s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777) } // Readlink implements Inode. diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD index 0d411606f..5950a2d59 100644 --- a/pkg/sentry/fsimpl/pipefs/BUILD +++ b/pkg/sentry/fsimpl/pipefs/BUILD @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/fspath", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index 5375e5e75..cab771211 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -17,8 +17,11 @@ package pipefs import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" @@ -40,20 +43,36 @@ func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile panic("pipefs.filesystemType.GetFilesystem should never be called") } -// TODO(gvisor.dev/issue/1193): -// -// - kernfs does not provide a way to implement statfs, from which we -// should 
indicate PIPEFS_MAGIC. -// -// - kernfs does not provide a way to override names for -// vfs.FilesystemImpl.PrependPath(); pipefs inodes should use synthetic -// name fmt.Sprintf("pipe:[%d]", inode.ino). +type filesystem struct { + kernfs.Filesystem + + devMinor uint32 +} // NewFilesystem sets up and returns a new vfs.Filesystem implemented by pipefs. -func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { - fs := &kernfs.Filesystem{} - fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) - return fs.VFSFilesystem() +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, err + } + fs := &filesystem{ + devMinor: devMinor, + } + fs.Filesystem.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) + return fs.Filesystem.VFSFilesystem(), nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + inode := vd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode) + b.PrependComponent(fmt.Sprintf("pipe:[%d]", inode.ino)) + return vfs.PrependPathSyntheticError{} } // inode implements kernfs.Inode. 
@@ -71,11 +90,11 @@ type inode struct { ctime ktime.Time } -func newInode(ctx context.Context, fs *kernfs.Filesystem) *inode { +func newInode(ctx context.Context, fs *filesystem) *inode { creds := auth.CredentialsFromContext(ctx) return &inode{ pipe: pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize), - ino: fs.NextIno(), + ino: fs.Filesystem.NextIno(), uid: creds.EffectiveKUID, gid: creds.EffectiveKGID, ctime: ktime.NowFromContext(ctx), @@ -98,19 +117,20 @@ func (i *inode) Mode() linux.FileMode { func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds()) return linux.Statx{ - Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, - Blksize: usermem.PageSize, - Nlink: 1, - UID: uint32(i.uid), - GID: uint32(i.gid), - Mode: pipeMode, - Ino: i.ino, - Size: 0, - Blocks: 0, - Atime: ts, - Ctime: ts, - Mtime: ts, - // TODO(gvisor.dev/issue/1197): Device number. + Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, + Blksize: usermem.PageSize, + Nlink: 1, + UID: uint32(i.uid), + GID: uint32(i.gid), + Mode: pipeMode, + Ino: i.ino, + Size: 0, + Blocks: 0, + Atime: ts, + Ctime: ts, + Mtime: ts, + DevMajor: linux.UNNAMED_MAJOR, + DevMinor: vfsfs.Impl().(*filesystem).devMinor, }, nil } @@ -122,6 +142,9 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth. return syserror.EPERM } +// TODO(gvisor.dev/issue/1193): kernfs does not provide a way to implement +// statfs, from which we should indicate PIPEFS_MAGIC. + // Open implements kernfs.Inode.Open. 
func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags) @@ -132,7 +155,7 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr // // Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, *vfs.FileDescription) { - fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) + fs := mnt.Filesystem().Impl().(*filesystem) inode := newInode(ctx, fs) var d kernfs.Dentry d.Init(inode) diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index 104fc9030..609210253 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -41,6 +41,12 @@ func (FilesystemType) Name() string { return Name } +type filesystem struct { + kernfs.Filesystem + + devMinor uint32 +} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 
func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { k := kernel.KernelFromContext(ctx) @@ -51,8 +57,13 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF if pidns == nil { return nil, nil, fmt.Errorf("procfs requires a PID namespace") } - - procfs := &kernfs.Filesystem{} + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + procfs := &filesystem{ + devMinor: devMinor, + } procfs.VFSFilesystem().Init(vfsObj, &ft, procfs) var cgroups map[string]string @@ -61,21 +72,27 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF cgroups = data.Cgroups } - _, dentry := newTasksInode(procfs, k, pidns, cgroups) + _, dentry := procfs.newTasksInode(k, pidns, cgroups) return procfs.VFSFilesystem(), dentry.VFSDentry(), nil } +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + // dynamicInode is an overfitted interface for common Inodes with // dynamicByteSource types used in procfs. 
type dynamicInode interface { kernfs.Inode vfs.DynamicBytesSource - Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) + Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) } -func newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { - inode.Init(creds, ino, inode, perm) +func (fs *filesystem) newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { + inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) d := &kernfs.Dentry{} d.Init(inode) diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index a5cfa8333..36a911db4 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -37,23 +37,23 @@ type subtasksInode struct { kernfs.OrderedChildren kernfs.AlwaysValid + fs *filesystem task *kernel.Task pidns *kernel.PIDNamespace - inoGen InoGenerator cgroupControllers map[string]string } var _ kernfs.Inode = (*subtasksInode)(nil) -func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen InoGenerator, cgroupControllers map[string]string) *kernfs.Dentry { +func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *kernfs.Dentry { subInode := &subtasksInode{ + fs: fs, task: task, pidns: pidns, - inoGen: inoGen, cgroupControllers: cgroupControllers, } // Note: credentials are overridden by taskOwnedInode. 
- subInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + subInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) inode := &taskOwnedInode{Inode: subInode, owner: task} @@ -78,7 +78,7 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, e return nil, syserror.ENOENT } - subTaskDentry := newTaskInode(i.inoGen, subTask, i.pidns, false, i.cgroupControllers) + subTaskDentry := i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers) return subTaskDentry.VFSDentry(), nil } @@ -102,7 +102,7 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(tid), 10), Type: linux.DT_DIR, - Ino: i.inoGen.NextIno(), + Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 66419d91b..482055db1 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -43,45 +43,45 @@ type taskInode struct { var _ kernfs.Inode = (*taskInode)(nil) -func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry { +func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry { // TODO(gvisor.dev/issue/164): Fail with ESRCH if task exited. 
contents := map[string]*kernfs.Dentry{ - "auxv": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}), - "cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), - "comm": newComm(task, inoGen.NextIno(), 0444), - "environ": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), - "exe": newExeSymlink(task, inoGen.NextIno()), - "fd": newFDDirInode(task, inoGen), - "fdinfo": newFDInfoDirInode(task, inoGen), - "gid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: true}), - "io": newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)), - "maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}), - "mountinfo": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mountInfoData{task: task}), - "mounts": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mountsData{task: task}), - "net": newTaskNetDir(task, inoGen), - "ns": newTaskOwnedDir(task, inoGen.NextIno(), 0511, map[string]*kernfs.Dentry{ - "net": newNamespaceSymlink(task, inoGen.NextIno(), "net"), - "pid": newNamespaceSymlink(task, inoGen.NextIno(), "pid"), - "user": newNamespaceSymlink(task, inoGen.NextIno(), "user"), + "auxv": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &auxvData{task: task}), + "cmdline": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), + "comm": fs.newComm(task, fs.NextIno(), 0444), + "environ": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), + "exe": fs.newExeSymlink(task, fs.NextIno()), + "fd": fs.newFDDirInode(task), + "fdinfo": fs.newFDInfoDirInode(task), + "gid_map": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}), + "io": fs.newTaskOwnedFile(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)), + "maps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mapsData{task: task}), + "mountinfo": 
fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountInfoData{task: task}), + "mounts": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountsData{task: task}), + "net": fs.newTaskNetDir(task), + "ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]*kernfs.Dentry{ + "net": fs.newNamespaceSymlink(task, fs.NextIno(), "net"), + "pid": fs.newNamespaceSymlink(task, fs.NextIno(), "pid"), + "user": fs.newNamespaceSymlink(task, fs.NextIno(), "user"), }), - "oom_score": newTaskOwnedFile(task, inoGen.NextIno(), 0444, newStaticFile("0\n")), - "oom_score_adj": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &oomScoreAdj{task: task}), - "smaps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}), - "stat": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), - "statm": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}), - "status": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}), - "uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}), + "oom_score": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newStaticFile("0\n")), + "oom_score_adj": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}), + "smaps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &smapsData{task: task}), + "stat": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), + "statm": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statmData{task: task}), + "status": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}), + "uid_map": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}), } if isThreadGroup { - contents["task"] = newSubtasks(task, pidns, inoGen, cgroupControllers) + contents["task"] = fs.newSubtasks(task, pidns, cgroupControllers) } if len(cgroupControllers) > 0 { - contents["cgroup"] = 
newTaskOwnedFile(task, inoGen.NextIno(), 0444, newCgroupData(cgroupControllers)) + contents["cgroup"] = fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers)) } taskInode := &taskInode{task: task} // Note: credentials are overridden by taskOwnedInode. - taskInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + taskInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) inode := &taskOwnedInode{Inode: taskInode, owner: task} dentry := &kernfs.Dentry{} @@ -126,9 +126,9 @@ type taskOwnedInode struct { var _ kernfs.Inode = (*taskOwnedInode)(nil) -func newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { +func (fs *filesystem) newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { // Note: credentials are overridden by taskOwnedInode. - inode.Init(task.Credentials(), ino, inode, perm) + inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) taskInode := &taskOwnedInode{Inode: inode, owner: task} d := &kernfs.Dentry{} @@ -136,11 +136,11 @@ func newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode return d } -func newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry { +func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry { dir := &kernfs.StaticDirectory{} // Note: credentials are overridden by taskOwnedInode. 
- dir.Init(task.Credentials(), ino, perm) + dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm) inode := &taskOwnedInode{Inode: dir, owner: task} d := &kernfs.Dentry{} diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 8ad976073..44ccc9e4a 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -53,8 +53,8 @@ func taskFDExists(t *kernel.Task, fd int32) bool { } type fdDir struct { - inoGen InoGenerator - task *kernel.Task + fs *filesystem + task *kernel.Task // When produceSymlinks is set, dirents produces for the FDs are reported // as symlink. Otherwise, they are reported as regular files. @@ -85,7 +85,7 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, abs dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(fd), 10), Type: typ, - Ino: i.inoGen.NextIno(), + Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { @@ -110,15 +110,15 @@ type fdDirInode struct { var _ kernfs.Inode = (*fdDirInode)(nil) -func newFDDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { +func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry { inode := &fdDirInode{ fdDir: fdDir{ - inoGen: inoGen, + fs: fs, task: task, produceSymlink: true, }, } - inode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -137,7 +137,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro if !taskFDExists(i.task, fd) { return nil, syserror.ENOENT } - taskDentry := newFDSymlink(i.task, fd, i.inoGen.NextIno()) + taskDentry := i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()) return taskDentry.VFSDentry(), nil } @@ -186,12 +186,12 @@ type fdSymlink struct { var _ kernfs.Inode = (*fdSymlink)(nil) -func 
newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry { +func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry { inode := &fdSymlink{ task: task, fd: fd, } - inode.Init(task.Credentials(), ino, linux.ModeSymlink|0777) + inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) @@ -234,14 +234,14 @@ type fdInfoDirInode struct { var _ kernfs.Inode = (*fdInfoDirInode)(nil) -func newFDInfoDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { +func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry { inode := &fdInfoDirInode{ fdDir: fdDir{ - inoGen: inoGen, - task: task, + fs: fs, + task: task, }, } - inode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -264,7 +264,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, task: i.task, fd: fd, } - dentry := newTaskOwnedFile(i.task, i.inoGen.NextIno(), 0444, data) + dentry := i.fs.newTaskOwnedFile(i.task, i.fs.NextIno(), 0444, data) return dentry.VFSDentry(), nil } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 515f25327..2f297e48a 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -241,9 +241,9 @@ type commInode struct { task *kernel.Task } -func newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry { +func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry { inode := &commInode{task: task} - inode.DynamicBytesFile.Init(task.Credentials(), ino, &commData{task: task}, perm) + inode.DynamicBytesFile.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm) d := 
&kernfs.Dentry{} d.Init(inode) @@ -596,9 +596,9 @@ type exeSymlink struct { var _ kernfs.Inode = (*exeSymlink)(nil) -func newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry { +func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry { inode := &exeSymlink{task: task} - inode.Init(task.Credentials(), ino, linux.ModeSymlink|0777) + inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) @@ -729,7 +729,7 @@ type namespaceSymlink struct { task *kernel.Task } -func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { +func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { // Namespace symlinks should contain the namespace name and the inode number // for the namespace instance, so for example user:[123456]. We currently fake // the inode number by sticking the symlink inode in its place. @@ -737,7 +737,7 @@ func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentr inode := &namespaceSymlink{task: task} // Note: credentials are overridden by taskOwnedInode. - inode.Init(task.Credentials(), ino, target) + inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) taskInode := &taskOwnedInode{Inode: inode, owner: task} d := &kernfs.Dentry{} @@ -780,11 +780,11 @@ type namespaceInode struct { var _ kernfs.Inode = (*namespaceInode)(nil) // Init initializes a namespace inode. 
-func (i *namespaceInode) Init(creds *auth.Credentials, ino uint64, perm linux.FileMode) { +func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } - i.InodeAttrs.Init(creds, ino, linux.ModeRegular|perm) + i.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm) } // Open implements Inode.Open. diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 9c329341a..6bde27376 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -37,7 +37,7 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -func newTaskNetDir(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { +func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry { k := task.Kernel() pidns := task.PIDNamespace() root := auth.NewRootCredentials(pidns.UserNamespace()) @@ -57,37 +57,37 @@ func newTaskNetDir(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { // TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task // network namespace. contents = map[string]*kernfs.Dentry{ - "dev": newDentry(root, inoGen.NextIno(), 0444, &netDevData{stack: stack}), - "snmp": newDentry(root, inoGen.NextIno(), 0444, &netSnmpData{stack: stack}), + "dev": fs.newDentry(root, fs.NextIno(), 0444, &netDevData{stack: stack}), + "snmp": fs.newDentry(root, fs.NextIno(), 0444, &netSnmpData{stack: stack}), // The following files are simple stubs until they are implemented in // netstack, if the file contains a header the stub is just the header // otherwise it is an empty file. 
- "arp": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(arp)), - "netlink": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(netlink)), - "netstat": newDentry(root, inoGen.NextIno(), 0444, &netStatData{}), - "packet": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(packet)), - "protocols": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(protocols)), + "arp": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(arp)), + "netlink": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(netlink)), + "netstat": fs.newDentry(root, fs.NextIno(), 0444, &netStatData{}), + "packet": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(packet)), + "protocols": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(protocols)), // Linux sets psched values to: nsec per usec, psched tick in ns, 1000000, // high res timer ticks per sec (ClockGetres returns 1ns resolution). - "psched": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(psched)), - "ptype": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(ptype)), - "route": newDentry(root, inoGen.NextIno(), 0444, &netRouteData{stack: stack}), - "tcp": newDentry(root, inoGen.NextIno(), 0444, &netTCPData{kernel: k}), - "udp": newDentry(root, inoGen.NextIno(), 0444, &netUDPData{kernel: k}), - "unix": newDentry(root, inoGen.NextIno(), 0444, &netUnixData{kernel: k}), + "psched": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(psched)), + "ptype": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(ptype)), + "route": fs.newDentry(root, fs.NextIno(), 0444, &netRouteData{stack: stack}), + "tcp": fs.newDentry(root, fs.NextIno(), 0444, &netTCPData{kernel: k}), + "udp": fs.newDentry(root, fs.NextIno(), 0444, &netUDPData{kernel: k}), + "unix": fs.newDentry(root, fs.NextIno(), 0444, &netUnixData{kernel: k}), } if stack.SupportsIPv6() { - contents["if_inet6"] = newDentry(root, inoGen.NextIno(), 0444, &ifinet6{stack: stack}) - contents["ipv6_route"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")) 
- contents["tcp6"] = newDentry(root, inoGen.NextIno(), 0444, &netTCP6Data{kernel: k}) - contents["udp6"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(upd6)) + contents["if_inet6"] = fs.newDentry(root, fs.NextIno(), 0444, &ifinet6{stack: stack}) + contents["ipv6_route"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")) + contents["tcp6"] = fs.newDentry(root, fs.NextIno(), 0444, &netTCP6Data{kernel: k}) + contents["udp6"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(upd6)) } } - return newTaskOwnedDir(task, inoGen.NextIno(), 0555, contents) + return fs.newTaskOwnedDir(task, fs.NextIno(), 0555, contents) } // ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6. diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 5aeda8c9b..b51d43954 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -33,11 +33,6 @@ const ( threadSelfName = "thread-self" ) -// InoGenerator generates unique inode numbers for a given filesystem. -type InoGenerator interface { - NextIno() uint64 -} - // tasksInode represents the inode for /proc/ directory. // // +stateify savable @@ -48,8 +43,8 @@ type tasksInode struct { kernfs.OrderedChildren kernfs.AlwaysValid - inoGen InoGenerator - pidns *kernel.PIDNamespace + fs *filesystem + pidns *kernel.PIDNamespace // '/proc/self' and '/proc/thread-self' have custom directory offsets in // Linux. So handle them outside of OrderedChildren. 
@@ -64,29 +59,29 @@ type tasksInode struct { var _ kernfs.Inode = (*tasksInode)(nil) -func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) { +func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) { root := auth.NewRootCredentials(pidns.UserNamespace()) contents := map[string]*kernfs.Dentry{ - "cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))), - "filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}), - "loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}), - "sys": newSysDir(root, inoGen, k), - "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), - "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"), - "net": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/net"), - "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{}), - "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), - "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}), + "cpuinfo": fs.newDentry(root, fs.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))), + "filesystems": fs.newDentry(root, fs.NextIno(), 0444, &filesystemsData{}), + "loadavg": fs.newDentry(root, fs.NextIno(), 0444, &loadavgData{}), + "sys": fs.newSysDir(root, k), + "meminfo": fs.newDentry(root, fs.NextIno(), 0444, &meminfoData{}), + "mounts": kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"), + "net": kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"), + "stat": fs.newDentry(root, fs.NextIno(), 0444, &statData{}), + "uptime": fs.newDentry(root, fs.NextIno(), 0444, &uptimeData{}), + "version": fs.newDentry(root, fs.NextIno(), 0444, &versionData{}), } inode := &tasksInode{ pidns: pidns, - inoGen: inoGen, - selfSymlink: 
newSelfSymlink(root, inoGen.NextIno(), pidns).VFSDentry(), - threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), pidns).VFSDentry(), + fs: fs, + selfSymlink: fs.newSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(), + threadSelfSymlink: fs.newThreadSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(), cgroupControllers: cgroupControllers, } - inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555) + inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -118,7 +113,7 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro return nil, syserror.ENOENT } - taskDentry := newTaskInode(i.inoGen, task, i.pidns, true, i.cgroupControllers) + taskDentry := i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers) return taskDentry.VFSDentry(), nil } @@ -144,7 +139,7 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback dirent := vfs.Dirent{ Name: selfName, Type: linux.DT_LNK, - Ino: i.inoGen.NextIno(), + Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { @@ -156,7 +151,7 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback dirent := vfs.Dirent{ Name: threadSelfName, Type: linux.DT_LNK, - Ino: i.inoGen.NextIno(), + Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { @@ -189,7 +184,7 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(tid), 10), Type: linux.DT_DIR, - Ino: i.inoGen.NextIno(), + Ino: i.fs.NextIno(), NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1, } if err := cb.Handle(dirent); err != nil { diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index e5f13b69e..7d8983aa5 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go 
@@ -41,9 +41,9 @@ type selfSymlink struct { var _ kernfs.Inode = (*selfSymlink)(nil) -func newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { +func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { inode := &selfSymlink{pidns: pidns} - inode.Init(creds, ino, linux.ModeSymlink|0777) + inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) @@ -83,9 +83,9 @@ type threadSelfSymlink struct { var _ kernfs.Inode = (*threadSelfSymlink)(nil) -func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { +func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { inode := &threadSelfSymlink{pidns: pidns} - inode.Init(creds, ino, linux.ModeSymlink|0777) + inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 0e90e02fe..6dac2afa4 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -30,89 +30,89 @@ import ( ) // newSysDir returns the dentry corresponding to /proc/sys directory. 
-func newSysDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { - return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "kernel": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "hostname": newDentry(root, inoGen.NextIno(), 0444, &hostnameData{}), - "shmall": newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMALL)), - "shmmax": newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMMAX)), - "shmmni": newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMMNI)), +func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry { + return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "kernel": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "hostname": fs.newDentry(root, fs.NextIno(), 0444, &hostnameData{}), + "shmall": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMALL)), + "shmmax": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMAX)), + "shmmni": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMNI)), }), - "vm": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "mmap_min_addr": newDentry(root, inoGen.NextIno(), 0444, &mmapMinAddrData{k: k}), - "overcommit_memory": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0\n")), + "vm": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "mmap_min_addr": fs.newDentry(root, fs.NextIno(), 0444, &mmapMinAddrData{k: k}), + "overcommit_memory": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0\n")), }), - "net": newSysNetDir(root, inoGen, k), + "net": fs.newSysNetDir(root, k), }) } // newSysNetDir returns the dentry corresponding to /proc/sys/net directory. 
-func newSysNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { +func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry { var contents map[string]*kernfs.Dentry // TODO(gvisor.dev/issue/1833): Support for using the network stack in the // network namespace of the calling process. if stack := k.RootNetworkNamespace().Stack(); stack != nil { contents = map[string]*kernfs.Dentry{ - "ipv4": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "tcp_sack": newDentry(root, inoGen.NextIno(), 0644, &tcpSackData{stack: stack}), + "ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}), // The following files are simple stubs until they are implemented in // netstack, most of these files are configuration related. We use the // value closest to the actual netstack behavior or any empty file, all // of these files will have mode 0444 (read-only for all users). 
- "ip_local_port_range": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("16000 65535")), - "ip_local_reserved_ports": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")), - "ipfrag_time": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("30")), - "ip_nonlocal_bind": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "ip_no_pmtu_disc": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), + "ip_local_port_range": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("16000 65535")), + "ip_local_reserved_ports": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")), + "ipfrag_time": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("30")), + "ip_nonlocal_bind": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "ip_no_pmtu_disc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), // tcp_allowed_congestion_control tell the user what they are able to // do as an unprivledged process so we leave it empty. - "tcp_allowed_congestion_control": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")), - "tcp_available_congestion_control": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("reno")), - "tcp_congestion_control": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("reno")), + "tcp_allowed_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")), + "tcp_available_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")), + "tcp_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")), // Many of the following stub files are features netstack doesn't // support. The unsupported features return "0" to indicate they are // disabled. 
- "tcp_base_mss": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1280")), - "tcp_dsack": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_early_retrans": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_fack": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_fastopen": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_fastopen_key": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")), - "tcp_invalid_ratelimit": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_intvl": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_probes": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_time": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("7200")), - "tcp_mtu_probing": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_no_metrics_save": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), - "tcp_probe_interval": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_probe_threshold": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_retries1": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("3")), - "tcp_retries2": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("15")), - "tcp_rfc1337": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), - "tcp_slow_start_after_idle": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), - "tcp_synack_retries": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("5")), - "tcp_syn_retries": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("3")), - "tcp_timestamps": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), + "tcp_base_mss": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1280")), + "tcp_dsack": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_early_retrans": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_fack": 
fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_fastopen": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_fastopen_key": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")), + "tcp_invalid_ratelimit": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_keepalive_intvl": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_keepalive_probes": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_keepalive_time": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("7200")), + "tcp_mtu_probing": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_no_metrics_save": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), + "tcp_probe_interval": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_probe_threshold": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_retries1": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")), + "tcp_retries2": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("15")), + "tcp_rfc1337": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), + "tcp_slow_start_after_idle": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), + "tcp_synack_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")), + "tcp_syn_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")), + "tcp_timestamps": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), }), - "core": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "default_qdisc": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("pfifo_fast")), - "message_burst": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("10")), - "message_cost": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("5")), - "optmem_max": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "rmem_default": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), - "rmem_max": newDentry(root, 
inoGen.NextIno(), 0444, newStaticFile("212992")), - "somaxconn": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("128")), - "wmem_default": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), - "wmem_max": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), + "core": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "default_qdisc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("pfifo_fast")), + "message_burst": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("10")), + "message_cost": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")), + "optmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "rmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), + "rmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), + "somaxconn": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("128")), + "wmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), + "wmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), }), } } - return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, contents) + return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents) } // mmapMinAddrData implements vfs.DynamicBytesSource for diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD index 52084ddb5..9453277b8 100644 --- a/pkg/sentry/fsimpl/sockfs/BUILD +++ b/pkg/sentry/fsimpl/sockfs/BUILD @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/fspath", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index 239a9f4b4..ee0828a15 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -16,8 +16,11 @@ package sockfs import ( + "fmt" + 
"gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -41,19 +44,42 @@ func (filesystemType) Name() string { return "sockfs" } +type filesystem struct { + kernfs.Filesystem + + devMinor uint32 +} + // NewFilesystem sets up and returns a new sockfs filesystem. // // Note that there should only ever be one instance of sockfs.Filesystem, // backing a global socket mount. -func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { - fs := &kernfs.Filesystem{} - fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) - return fs.VFSFilesystem() +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, err + } + fs := &filesystem{ + devMinor: devMinor, + } + fs.Filesystem.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) + return fs.Filesystem.VFSFilesystem(), nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + inode := vd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode) + b.PrependComponent(fmt.Sprintf("socket:[%d]", inode.InodeAttrs.Ino())) + return vfs.PrependPathSyntheticError{} } // inode implements kernfs.Inode. -// -// TODO(gvisor.dev/issue/1193): Device numbers. type inode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink @@ -67,11 +93,15 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr } // NewDentry constructs and returns a sockfs dentry. 
-func NewDentry(creds *auth.Credentials, ino uint64) *vfs.Dentry { +// +// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). +func NewDentry(creds *auth.Credentials, mnt *vfs.Mount) *vfs.Dentry { + fs := mnt.Filesystem().Impl().(*filesystem) + // File mode matches net/socket.c:sock_alloc. filemode := linux.FileMode(linux.S_IFSOCK | 0600) i := &inode{} - i.Init(creds, ino, filemode) + i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode) d := &kernfs.Dentry{} d.Init(i) diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 00f7d6214..0af373604 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -37,6 +37,8 @@ type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. type filesystem struct { kernfs.Filesystem + + devMinor uint32 } // Name implements vfs.FilesystemType.Name. @@ -46,7 +48,14 @@ func (FilesystemType) Name() string { // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { - fs := &filesystem{} + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + + fs := &filesystem{ + devMinor: devMinor, + } fs.VFSFilesystem().Init(vfsObj, &fsType, fs) k := kernel.KernelFromContext(ctx) maxCPUCores := k.ApplicationCores() @@ -77,6 +86,12 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return fs.VFSFilesystem(), root.VFSDentry(), nil } +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + // dir implements kernfs.Inode. 
type dir struct { kernfs.InodeAttrs @@ -90,7 +105,7 @@ type dir struct { func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { d := &dir{} - d.InodeAttrs.Init(creds, fs.NextIno(), linux.ModeDirectory|0755) + d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) d.dentry.Init(d) @@ -127,7 +142,7 @@ func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error { func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) *kernfs.Dentry { c := &cpuFile{maxCores: maxCores} - c.DynamicBytesFile.Init(creds, fs.NextIno(), c, mode) + c.DynamicBytesFile.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode) d := &kernfs.Dentry{} d.Init(c) return d diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index efc931468..405928bd0 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -63,6 +63,9 @@ type filesystem struct { // clock is a realtime clock used to set timestamps in file operations. clock time.Clock + // devMinor is the filesystem's minor device number. devMinor is immutable. + devMinor uint32 + // mu serializes changes to the Dentry tree. 
mu sync.RWMutex @@ -96,11 +99,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if memFileProvider == nil { panic("MemoryFileProviderFromContext returned nil") } - clock := time.RealtimeClockFromContext(ctx) - fs := filesystem{ - memFile: memFileProvider.MemoryFile(), - clock: clock, - } rootFileType := uint16(linux.S_IFDIR) newFSType := vfs.FilesystemType(&fstype) @@ -114,6 +112,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } } + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + clock := time.RealtimeClockFromContext(ctx) + fs := filesystem{ + memFile: memFileProvider.MemoryFile(), + clock: clock, + devMinor: devMinor, + } fs.vfsfs.Init(vfsObj, newFSType, &fs) var root *dentry @@ -125,6 +133,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt case linux.S_IFDIR: root = &fs.newDirectory(creds, 01777).dentry default: + fs.vfsfs.DecRef() return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) } return &fs.vfsfs, &root.vfsd, nil @@ -132,6 +141,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release() { + fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } // dentry implements vfs.DentryImpl. @@ -188,8 +198,8 @@ func (d *dentry) DecRef() { // inode represents a filesystem object. type inode struct { - // clock is a realtime clock used to set timestamps in file operations. - clock time.Clock + // fs is the owning filesystem. fs is immutable. + fs *filesystem // refs is a reference count. refs is accessed using atomic memory // operations. 
@@ -232,7 +242,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, if mode.FileType() == 0 { panic("file type is required in FileMode") } - i.clock = fs.clock + i.fs = fs i.refs = 1 i.mode = uint32(mode) i.uid = uint32(creds.EffectiveKUID) @@ -327,7 +337,8 @@ func (i *inode) statTo(stat *linux.Statx) { stat.Atime = linux.NsecToStatxTimestamp(i.atime) stat.Ctime = linux.NsecToStatxTimestamp(i.ctime) stat.Mtime = linux.NsecToStatxTimestamp(i.mtime) - // TODO(gvisor.dev/issue/1197): Device number. + stat.DevMajor = linux.UNNAMED_MAJOR + stat.DevMinor = i.fs.devMinor switch impl := i.impl.(type) { case *regularFile: stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS @@ -401,7 +412,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu return syserror.EINVAL } } - now := i.clock.Now().Nanoseconds() + now := i.fs.clock.Now().Nanoseconds() if mask&linux.STATX_ATIME != 0 { if stat.Atime.Nsec == linux.UTIME_NOW { atomic.StoreInt64(&i.atime, now) @@ -518,7 +529,7 @@ func (i *inode) touchAtime(mnt *vfs.Mount) { if err := mnt.CheckBeginWrite(); err != nil { return } - now := i.clock.Now().Nanoseconds() + now := i.fs.clock.Now().Nanoseconds() i.mu.Lock() atomic.StoreInt64(&i.atime, now) i.mu.Unlock() @@ -527,7 +538,7 @@ func (i *inode) touchAtime(mnt *vfs.Mount) { // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). func (i *inode) touchCtime() { - now := i.clock.Now().Nanoseconds() + now := i.fs.clock.Now().Nanoseconds() i.mu.Lock() atomic.StoreInt64(&i.ctime, now) i.mu.Unlock() @@ -535,7 +546,7 @@ func (i *inode) touchCtime() { // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). 
func (i *inode) touchCMtime() { - now := i.clock.Now().Nanoseconds() + now := i.fs.clock.Now().Nanoseconds() i.mu.Lock() atomic.StoreInt64(&i.mtime, now) atomic.StoreInt64(&i.ctime, now) @@ -545,7 +556,7 @@ func (i *inode) touchCMtime() { // Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds // inode.mu. func (i *inode) touchCMtimeLocked() { - now := i.clock.Now().Nanoseconds() + now := i.fs.clock.Now().Nanoseconds() atomic.StoreInt64(&i.mtime, now) atomic.StoreInt64(&i.ctime, now) } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 271ea5faf..3617da8c6 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -373,7 +373,10 @@ func (k *Kernel) Init(args InitKernelArgs) error { return fmt.Errorf("failed to initialize VFS: %v", err) } - pipeFilesystem := pipefs.NewFilesystem(&k.vfs) + pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs) + if err != nil { + return fmt.Errorf("failed to create pipefs filesystem: %v", err) + } defer pipeFilesystem.DecRef() pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{}) if err != nil { @@ -381,7 +384,10 @@ func (k *Kernel) Init(args InitKernelArgs) error { } k.pipeMount = pipeMount - socketFilesystem := sockfs.NewFilesystem(&k.vfs) + socketFilesystem, err := sockfs.NewFilesystem(&k.vfs) + if err != nil { + return fmt.Errorf("failed to create sockfs filesystem: %v", err) + } defer socketFilesystem.DecRef() socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) if err != nil { diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index deedd35f7..e82d6cd1e 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -26,7 +26,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/hostfd", "//pkg/sentry/inet", diff --git 
a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go index a8278bffc..677743113 100644 --- a/pkg/sentry/socket/hostinet/socket_vfs2.go +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -50,8 +49,7 @@ var _ = socket.SocketVFS2(&socketVFS2{}) func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol int, fd int, flags uint32) (*vfs.FileDescription, *syserr.Error) { mnt := t.Kernel().SocketMount() - fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) - d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + d := sockfs.NewDentry(t.Credentials(), mnt) s := &socketVFS2{ socketOpsCommon: socketOpsCommon{ diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 09ca00a4a..7212d8644 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -20,7 +20,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/socket/netlink/provider_vfs2.go b/pkg/sentry/socket/netlink/provider_vfs2.go index dcd92b5cd..bb205be0d 100644 --- a/pkg/sentry/socket/netlink/provider_vfs2.go +++ b/pkg/sentry/socket/netlink/provider_vfs2.go @@ -16,7 +16,6 @@ package netlink import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -52,8 +51,7 @@ func (*socketProviderVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol vfsfd := &s.vfsfd mnt := t.Kernel().SocketMount() - fs := 
mnt.Filesystem().Impl().(*kernfs.Filesystem) - d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + d := sockfs.NewDentry(t.Credentials(), mnt) if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index ccf9fcf5c..6129fb83d 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -27,7 +27,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index f7d9b2ff4..191970d41 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -18,7 +18,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -53,8 +52,7 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu } mnt := t.Kernel().SocketMount() - fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) - d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + d := sockfs.NewDentry(t.Credentials(), mnt) s := &SocketVFS2{ socketOpsCommon: socketOpsCommon{ diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 941a91097..de2cc4bdf 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -21,7 +21,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go 
index 06d838868..45e109361 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -50,8 +49,7 @@ var _ = socket.SocketVFS2(&SocketVFS2{}) // returns a corresponding file description. func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) { mnt := t.Kernel().SocketMount() - fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) - d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + d := sockfs.NewDentry(t.Credentials(), mnt) fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d) if err != nil { diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index adebaeefb..caf770fd5 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -202,7 +202,7 @@ func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts St Ino: 1, Size: 0, Blocks: 0, - DevMajor: 0, + DevMajor: linux.UNNAMED_MAJOR, DevMinor: fs.devMinor, }, nil } diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go index bda5576fa..1e9dffc8f 100644 --- a/pkg/sentry/vfs/device.go +++ b/pkg/sentry/vfs/device.go @@ -103,7 +103,7 @@ func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mo } // GetAnonBlockDevMinor allocates and returns an unused minor device number for -// an "anonymous" block device with major number 0. +// an "anonymous" block device with major number UNNAMED_MAJOR. 
func (vfs *VirtualFilesystem) GetAnonBlockDevMinor() (uint32, error) { vfs.anonBlockDevMinorMu.Lock() defer vfs.anonBlockDevMinorMu.Unlock() diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 8c8bad11c..f802bc9fb 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -334,7 +334,10 @@ func New(args Args) (*Loader, error) { if kernel.VFS2Enabled { // Set up host mount that will be used for imported fds. - hostFilesystem := hostvfs2.NewFilesystem(k.VFS()) + hostFilesystem, err := hostvfs2.NewFilesystem(k.VFS()) + if err != nil { + return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err) + } defer hostFilesystem.DecRef() hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}) if err != nil { -- cgit v1.2.3 From 47dfba76616a69887f0d5a4be6eb82b5dc5d0f52 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 14 May 2020 09:34:21 -0700 Subject: Port memfd_create to vfs2 and finish implementation of file seals. Closes #2612. PiperOrigin-RevId: 311548074 --- pkg/sentry/fsimpl/tmpfs/BUILD | 2 - pkg/sentry/fsimpl/tmpfs/filesystem.go | 21 +++++++++- pkg/sentry/fsimpl/tmpfs/regular_file.go | 42 +++++++++++++++++++ pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 2 +- pkg/sentry/fsimpl/tmpfs/stat_test.go | 2 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 38 ++++++++++++++++- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/kernel.go | 25 +++++++++++ pkg/sentry/syscalls/linux/vfs2/BUILD | 2 + pkg/sentry/syscalls/linux/vfs2/fd.go | 10 +++++ pkg/sentry/syscalls/linux/vfs2/memfd.go | 63 ++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/vfs2.go | 2 +- 12 files changed, 203 insertions(+), 7 deletions(-) create mode 100644 pkg/sentry/syscalls/linux/vfs2/memfd.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 9a076ad71..007be1572 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -52,7 +52,6 @@ go_library( 
"//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/lock", - "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", @@ -106,7 +105,6 @@ go_test( "//pkg/sentry/contexttest", "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/contexttest", "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index e0ad82769..80fa7b29d 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -772,5 +772,24 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() defer fs.mu.RUnlock() - return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) + mnt := vd.Mount() + d := vd.Dentry().Impl().(*dentry) + for { + if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { + return vfs.PrependPathAtVFSRootError{} + } + if &d.vfsd == mnt.Root() { + return nil + } + if d.parent == nil { + if d.name != "" { + // This must be an anonymous memfd file. 
+ b.PrependComponent("/" + d.name) + return vfs.PrependPathSyntheticError{} + } + return vfs.PrependPathAtNonMountRootError{} + } + b.PrependComponent(d.name) + d = d.parent + } } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 57e5e28ec..3f433d666 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -88,6 +88,7 @@ type regularFile struct { func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode { file := ®ularFile{ memFile: fs.memFile, + seals: linux.F_SEAL_SEAL, } file.inode.init(file, fs, creds, linux.S_IFREG|mode) file.inode.nlink = 1 // from parent directory @@ -577,3 +578,44 @@ exitLoop: return done, retErr } + +// GetSeals returns the current set of seals on a memfd inode. +func GetSeals(fd *vfs.FileDescription) (uint32, error) { + f, ok := fd.Impl().(*regularFileFD) + if !ok { + return 0, syserror.EINVAL + } + rf := f.inode().impl.(*regularFile) + rf.dataMu.RLock() + defer rf.dataMu.RUnlock() + return rf.seals, nil +} + +// AddSeals adds new file seals to a memfd inode. +func AddSeals(fd *vfs.FileDescription, val uint32) error { + f, ok := fd.Impl().(*regularFileFD) + if !ok { + return syserror.EINVAL + } + rf := f.inode().impl.(*regularFile) + rf.mapsMu.Lock() + defer rf.mapsMu.Unlock() + rf.dataMu.RLock() + defer rf.dataMu.RUnlock() + + if rf.seals&linux.F_SEAL_SEAL != 0 { + // Seal applied which prevents addition of any new seals. + return syserror.EPERM + } + + // F_SEAL_WRITE can only be added if there are no active writable maps. + if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 { + if rf.writableMappingPages > 0 { + return syserror.EBUSY + } + } + + // Seals can only be added, never removed. 
+ rf.seals |= val + return nil +} diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index f2bc96d51..64e1c40ad 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -21,8 +21,8 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs/lock" - "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go index f52755092..f7ee4aab2 100644 --- a/pkg/sentry/fsimpl/tmpfs/stat_test.go +++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go @@ -19,8 +19,8 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" "gvisor.dev/gvisor/pkg/sentry/vfs" ) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 405928bd0..1e781aecd 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -94,7 +94,7 @@ type FilesystemOpts struct { } // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 
-func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx) if memFileProvider == nil { panic("MemoryFileProviderFromContext returned nil") @@ -139,6 +139,11 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return &fs.vfsfs, &root.vfsd, nil } +// NewFilesystem returns a new tmpfs filesystem. +func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*vfs.Filesystem, *vfs.Dentry, error) { + return FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, "", vfs.GetFilesystemOptions{}) +} + // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release() { fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) @@ -658,3 +663,34 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name) } + +// NewMemfd creates a new tmpfs regular file and file description that can back +// an anonymous fd created by memfd_create. +func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name string) (*vfs.FileDescription, error) { + fs, ok := mount.Filesystem().Impl().(*filesystem) + if !ok { + panic("NewMemfd() called with non-tmpfs mount") + } + + // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with + // S_IRWXUGO. 
+ mode := linux.FileMode(0777) + inode := fs.newRegularFile(creds, mode) + rf := inode.impl.(*regularFile) + if allowSeals { + rf.seals = 0 + } + + d := fs.newDentry(inode) + defer d.DecRef() + d.name = name + + // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with + // FMODE_READ | FMODE_WRITE. + var fd regularFileFD + flags := uint32(linux.O_RDWR) + if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 8104f50f3..a28eab8b8 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -173,6 +173,7 @@ go_library( "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/fsimpl/timerfd", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 3617da8c6..5efeb3767 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -53,6 +53,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -259,6 +260,10 @@ type Kernel struct { // syscalls (as opposed to named pipes created by mknod()). pipeMount *vfs.Mount + // shmMount is the Mount used for anonymous files created by the + // memfd_create() syscalls. It is analagous to Linux's shm_mnt. + shmMount *vfs.Mount + // socketMount is the Mount used for sockets created by the socket() and // socketpair() syscalls. 
There are several cases where a socket dentry will // not be contained in socketMount: @@ -330,6 +335,9 @@ func (k *Kernel) Init(args InitKernelArgs) error { if args.Timekeeper == nil { return fmt.Errorf("Timekeeper is nil") } + if args.Timekeeper.clocks == nil { + return fmt.Errorf("Must call Timekeeper.SetClocks() before Kernel.Init()") + } if args.RootUserNamespace == nil { return fmt.Errorf("RootUserNamespace is nil") } @@ -384,6 +392,18 @@ func (k *Kernel) Init(args InitKernelArgs) error { } k.pipeMount = pipeMount + tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) + if err != nil { + return fmt.Errorf("failed to create tmpfs filesystem: %v", err) + } + defer tmpfsFilesystem.DecRef() + defer tmpfsRoot.DecRef() + shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to create tmpfs mount: %v", err) + } + k.shmMount = shmMount + socketFilesystem, err := sockfs.NewFilesystem(&k.vfs) if err != nil { return fmt.Errorf("failed to create sockfs filesystem: %v", err) @@ -1656,6 +1676,11 @@ func (k *Kernel) PipeMount() *vfs.Mount { return k.pipeMount } +// ShmMount returns the tmpfs mount. +func (k *Kernel) ShmMount() *vfs.Mount { + return k.shmMount +} + // SocketMount returns the sockfs mount. 
func (k *Kernel) SocketMount() *vfs.Mount { return k.socketMount diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index c32f942fb..f882ef840 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -13,6 +13,7 @@ go_library( "fscontext.go", "getdents.go", "ioctl.go", + "memfd.go", "mmap.go", "path.go", "pipe.go", @@ -43,6 +44,7 @@ go_library( "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/signalfd", "//pkg/sentry/fsimpl/timerfd", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 8181d80f4..ca0f7fd1e 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -17,6 +17,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" @@ -157,6 +158,15 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, syserror.EBADF } return uintptr(pipefile.PipeSize()), nil, nil + case linux.F_GET_SEALS: + val, err := tmpfs.GetSeals(file) + return uintptr(val), nil, err + case linux.F_ADD_SEALS: + if !file.IsWritable() { + return 0, nil, syserror.EPERM + } + err := tmpfs.AddSeals(file, args[2].Uint()) + return 0, nil, err default: // TODO(gvisor.dev/issue/1623): Everything else is not yet supported. return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go new file mode 100644 index 000000000..bbe248d17 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go @@ -0,0 +1,63 @@ +// Copyright 2020 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +const ( + memfdPrefix = "memfd:" + memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) + memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING) +) + +// MemfdCreate implements the linux syscall memfd_create(2). +func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Uint() + + if flags&^memfdAllFlags != 0 { + // Unknown bits in flags. 
+ return 0, nil, syserror.EINVAL + } + + allowSeals := flags&linux.MFD_ALLOW_SEALING != 0 + cloExec := flags&linux.MFD_CLOEXEC != 0 + + name, err := t.CopyInString(addr, memfdMaxNameLen) + if err != nil { + return 0, nil, err + } + + shmMount := t.Kernel().ShmMount() + file, err := tmpfs.NewMemfd(shmMount, t.Credentials(), allowSeals, memfdPrefix+name) + if err != nil { + return 0, nil, err + } + + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: cloExec, + }) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 9c04677f1..ec8da7f06 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -158,7 +158,7 @@ func Override() { s.Table[306] = syscalls.Supported("syncfs", Syncfs) s.Table[307] = syscalls.Supported("sendmmsg", SendMMsg) s.Table[316] = syscalls.Supported("renameat2", Renameat2) - delete(s.Table, 319) // memfd_create + s.Table[319] = syscalls.Supported("memfd_create", MemfdCreate) s.Table[322] = syscalls.Supported("execveat", Execveat) s.Table[327] = syscalls.Supported("preadv2", Preadv2) s.Table[328] = syscalls.Supported("pwritev2", Pwritev2) -- cgit v1.2.3 From 420b791a3d6e0e6e2fc30c6f8be013bce7ca6549 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 15 May 2020 20:03:54 -0700 Subject: Minor formatting updates for gvisor.dev. * Aggregate architecture Overview in "What is gVisor?" as it makes more sense in one place. * Drop "user-space kernel" and use "application kernel". The term "user-space kernel" is confusing when some platform implementation do not run in user-space (instead running in guest ring zero). * Clear up the relationship between the Platform page in the user guide and the Platform page in the architecture guide, and ensure they are cross-linked. 
* Restore the call-to-action quick start link in the main page, and drop the GitHub link (which also appears in the top-right). * Improve image formatting by centering all doc and blog images, and move the image captions to the alt text. PiperOrigin-RevId: 311845158 --- README.md | 79 ++--- g3doc/BUILD | 4 + g3doc/Layers.png | Bin 0 -> 11044 bytes g3doc/Layers.svg | 1 + g3doc/Machine-Virtualization.png | Bin 0 -> 13205 bytes g3doc/Machine-Virtualization.svg | 1 + g3doc/README.md | 161 +++++++++- g3doc/Rule-Based-Execution.png | Bin 0 -> 6780 bytes g3doc/Rule-Based-Execution.svg | 1 + g3doc/Sentry-Gofer.png | Bin 0 -> 9064 bytes g3doc/Sentry-Gofer.svg | 1 + g3doc/architecture_guide/BUILD | 30 +- g3doc/architecture_guide/Layers.png | Bin 11044 -> 0 bytes g3doc/architecture_guide/Layers.svg | 1 - .../architecture_guide/Machine-Virtualization.png | Bin 13205 -> 0 bytes .../architecture_guide/Machine-Virtualization.svg | 1 - g3doc/architecture_guide/README.md | 83 ----- g3doc/architecture_guide/Rule-Based-Execution.png | Bin 6780 -> 0 bytes g3doc/architecture_guide/Rule-Based-Execution.svg | 1 - g3doc/architecture_guide/Sentry-Gofer.png | Bin 9064 -> 0 bytes g3doc/architecture_guide/Sentry-Gofer.svg | 1 - g3doc/architecture_guide/performance.md | 35 ++- g3doc/architecture_guide/platforms.md | 109 +++---- g3doc/architecture_guide/platforms.png | Bin 0 -> 21384 bytes g3doc/architecture_guide/platforms.svg | 334 +++++++++++++++++++++ g3doc/architecture_guide/resources.md | 27 +- g3doc/architecture_guide/resources.png | Bin 0 -> 16621 bytes g3doc/architecture_guide/resources.svg | 208 +++++++++++++ g3doc/architecture_guide/security.md | 28 +- g3doc/architecture_guide/security.png | Bin 0 -> 16932 bytes g3doc/architecture_guide/security.svg | 153 ++++++++++ g3doc/user_guide/filesystem.md | 4 +- g3doc/user_guide/platforms.md | 100 +++--- pkg/sentry/arch/syscalls_arm64.go | 2 +- pkg/sentry/kernel/pipe/pipe_util.go | 2 +- pkg/sentry/kernel/task_syscall.go | 4 +- 
pkg/sentry/socket/netstack/netstack.go | 6 +- pkg/tcpip/stack/stack.go | 4 +- pkg/tcpip/transport/tcp/endpoint.go | 2 +- pkg/tcpip/transport/tcp/tcp_test.go | 2 +- runsc/cmd/help.go | 12 +- website/BUILD | 1 - website/_layouts/docs.html | 2 + website/_sass/front.scss | 4 +- website/_sass/style.scss | 10 + website/blog/2019-11-18-security-basics.md | 28 +- website/blog/2020-04-02-networking-security.md | 8 +- website/index.md | 10 +- 48 files changed, 1054 insertions(+), 406 deletions(-) create mode 100644 g3doc/Layers.png create mode 100644 g3doc/Layers.svg create mode 100644 g3doc/Machine-Virtualization.png create mode 100644 g3doc/Machine-Virtualization.svg create mode 100644 g3doc/Rule-Based-Execution.png create mode 100644 g3doc/Rule-Based-Execution.svg create mode 100644 g3doc/Sentry-Gofer.png create mode 100644 g3doc/Sentry-Gofer.svg delete mode 100644 g3doc/architecture_guide/Layers.png delete mode 100644 g3doc/architecture_guide/Layers.svg delete mode 100644 g3doc/architecture_guide/Machine-Virtualization.png delete mode 100644 g3doc/architecture_guide/Machine-Virtualization.svg delete mode 100644 g3doc/architecture_guide/README.md delete mode 100644 g3doc/architecture_guide/Rule-Based-Execution.png delete mode 100644 g3doc/architecture_guide/Rule-Based-Execution.svg delete mode 100644 g3doc/architecture_guide/Sentry-Gofer.png delete mode 100644 g3doc/architecture_guide/Sentry-Gofer.svg create mode 100644 g3doc/architecture_guide/platforms.png create mode 100644 g3doc/architecture_guide/platforms.svg create mode 100644 g3doc/architecture_guide/resources.png create mode 100644 g3doc/architecture_guide/resources.svg create mode 100644 g3doc/architecture_guide/security.png create mode 100644 g3doc/architecture_guide/security.svg (limited to 'pkg/sentry/kernel') diff --git a/README.md b/README.md index de3e06f4e..442f5672a 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ ## What is gVisor? 
-**gVisor** is a user-space kernel, written in Go, that implements a substantial +**gVisor** is an application kernel, written in Go, that implements a substantial portion of the Linux system surface. It includes an [Open Container Initiative (OCI)][oci] runtime called `runsc` that provides an isolation boundary between the application and the host kernel. The `runsc` @@ -15,16 +15,17 @@ containers. ## Why does gVisor exist? Containers are not a [**sandbox**][sandbox]. While containers have -revolutionized how we develop, package, and deploy applications, running -untrusted or potentially malicious code without additional isolation is not a -good idea. The efficiency and performance gains from using a single, shared -kernel also mean that container escape is possible with a single vulnerability. - -gVisor is a user-space kernel for containers. It limits the host kernel surface -accessible to the application while still giving the application access to all -the features it expects. Unlike most kernels, gVisor does not assume or require -a fixed set of physical resources; instead, it leverages existing host kernel -functionality and runs as a normal user-space process. In other words, gVisor +revolutionized how we develop, package, and deploy applications, using them to +run untrusted or potentially malicious code without additional isolation is not +a good idea. While using a single, shared kernel allows for efficiency and +performance gains, it also means that container escape is possible with a single +vulnerability. + +gVisor is an application kernel for containers. It limits the host kernel +surface accessible to the application while still giving the application access +to all the features it expects. Unlike most kernels, gVisor does not assume or +require a fixed set of physical resources; instead, it leverages existing host +kernel functionality and runs as a normal process. In other words, gVisor implements Linux by way of Linux.
gVisor should not be confused with technologies and tools to harden containers @@ -39,33 +40,24 @@ be found at [gvisor.dev][gvisor-dev]. ## Installing from source -gVisor currently requires x86\_64 Linux to build, though support for other -architectures may become available in the future. +gVisor builds on x86_64 and ARM64. Other architectures may become available in +the future. + +For the purposes of these instructions, [bazel][bazel] and other build +dependencies are wrapped in a build container. It is possible to use +[bazel][bazel] directly, or type `make help` for standard targets. ### Requirements Make sure the following dependencies are installed: * Linux 4.14.77+ ([older linux][old-linux]) -* [git][git] -* [Bazel][bazel] 1.2+ -* [Python][python] * [Docker version 17.09.0 or greater][docker] -* C++ toolchain supporting C++17 (GCC 7+, Clang 5+) -* Gold linker (e.g. `binutils-gold` package on Ubuntu) ### Building Build and install the `runsc` binary: -``` -bazel build runsc -sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin -``` - -If you don't want to install bazel on your system, you can build runsc in a -Docker container: - ``` make runsc sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin @@ -73,41 +65,19 @@ sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin ### Testing -The test suite can be run with Bazel: - -``` -bazel test //... -``` - -or in a Docker container: +To run standard test suites, you can use: ``` make unit-tests make tests ``` -### Using remote execution - -If you have a [Remote Build Execution][rbe] environment, you can use it to speed -up build and test cycles. 
- -You must authenticate with the project first: +To run specific tests, you can specify the target: ``` -gcloud auth application-default login --no-launch-browser +make test TARGET="//runsc:version_test" ``` -Then invoke bazel with the following flags: - -``` ---config=remote ---project_id=$PROJECT ---remote_instance_name=projects/$PROJECT/instances/default_instance -``` - -You can also add those flags to your local ~/.bazelrc to avoid needing to -specify them each time on the command line. - ### Using `go get` This project uses [bazel][bazel] to build and manage dependencies. A synthetic @@ -128,7 +98,7 @@ development on this branch is not supported. Development should occur on the ## Community & Governance -The governance model is documented in our [community][community] repository. +See [GOVERNANCE.md](GOVERNANCE.md) for project governance information. The [gvisor-users mailing list][gvisor-users-list] and [gvisor-dev mailing list][gvisor-dev-list] are good starting points for @@ -145,12 +115,9 @@ See [Contributing.md](CONTRIBUTING.md).
[bazel]: https://bazel.build [community]: https://gvisor.googlesource.com/community [docker]: https://www.docker.com -[git]: https://git-scm.com [gvisor-users-list]: https://groups.google.com/forum/#!forum/gvisor-users +[gvisor-dev]: https://gvisor.dev [gvisor-dev-list]: https://groups.google.com/forum/#!forum/gvisor-dev [oci]: https://www.opencontainers.org [old-linux]: https://gvisor.dev/docs/user_guide/networking/#gso -[python]: https://python.org -[rbe]: https://blog.bazel.build/2018/10/05/remote-build-execution.html [sandbox]: https://en.wikipedia.org/wiki/Sandbox_(computer_security) -[gvisor-dev]: https://gvisor.dev diff --git a/g3doc/BUILD b/g3doc/BUILD index 24177ad06..dbbf96204 100644 --- a/g3doc/BUILD +++ b/g3doc/BUILD @@ -9,6 +9,10 @@ doc( name = "index", src = "README.md", category = "Project", + data = glob([ + "*.png", + "*.svg", + ]), permalink = "/docs/", weight = "0", ) diff --git a/g3doc/Layers.png b/g3doc/Layers.png new file mode 100644 index 000000000..308c6c451 Binary files /dev/null and b/g3doc/Layers.png differ diff --git a/g3doc/Layers.svg b/g3doc/Layers.svg new file mode 100644 index 000000000..0a366f841 --- /dev/null +++ b/g3doc/Layers.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/g3doc/Machine-Virtualization.png b/g3doc/Machine-Virtualization.png new file mode 100644 index 000000000..1ba2ed6b2 Binary files /dev/null and b/g3doc/Machine-Virtualization.png differ diff --git a/g3doc/Machine-Virtualization.svg b/g3doc/Machine-Virtualization.svg new file mode 100644 index 000000000..5352da07b --- /dev/null +++ b/g3doc/Machine-Virtualization.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/g3doc/README.md b/g3doc/README.md index 7999f5d47..304a91493 100644 --- a/g3doc/README.md +++ b/g3doc/README.md @@ -1,6 +1,6 @@ # What is gVisor? 
-gVisor is a user-space kernel, written in Go, that implements a substantial +gVisor is an application kernel, written in Go, that implements a substantial portion of the [Linux system call interface][linux]. It provides an additional layer of isolation between running applications and the host operating system. @@ -9,19 +9,160 @@ that makes it easy to work with existing container tooling. The `runsc` runtime integrates with Docker and Kubernetes, making it simple to run sandboxed containers. -gVisor takes a distinct approach to container sandboxing and makes a different -set of technical trade-offs compared to existing sandbox technologies, thus -providing new tools and ideas for the container security landscape. - gVisor can be used with Docker, Kubernetes, or directly using `runsc`. Use the links below to see detailed instructions for each of them: -* [Docker](./user_guide/quick_start/docker/): The quickest and easiest way to - get started. -* [Kubernetes](./user_guide/quick_start/kubernetes/): Isolate Pods in your K8s - cluster with gVisor. -* [OCI Quick Start](./user_guide/quick_start/oci/): Expert mode. Customize +* [Docker](./user_guide/quick_start/docker.md): The quickest and easiest way + to get started. +* [Kubernetes](./user_guide/quick_start/kubernetes.md): Isolate Pods in your + K8s cluster with gVisor. +* [OCI Quick Start](./user_guide/quick_start/oci.md): Expert mode. Customize gVisor for your environment. +## What does gVisor do? + +gVisor provides a virtualized environment in order to sandbox containers. The +system interfaces normally implemented by the host kernel are moved into a +distinct, per-sandbox application kernel in order to minimize the risk of a +container escape exploit. gVisor does not introduce large fixed overheads +however, and still retains a process-like model with respect to resource +utilization. + +## How is this different? + +Two other approaches are commonly taken to provide stronger isolation than +native containers.
+ +**Machine-level virtualization**, such as [KVM][kvm] and [Xen][xen], exposes +virtualized hardware to a guest kernel via a Virtual Machine Monitor (VMM). This +virtualized hardware is generally enlightened (paravirtualized) and additional +mechanisms can be used to improve the visibility between the guest and host +(e.g. balloon drivers, paravirtualized spinlocks). Running containers in +distinct virtual machines can provide great isolation, compatibility and +performance (though nested virtualization may bring challenges in this area), +but for containers it often requires additional proxies and agents, and may +require a larger resource footprint and slower start-up times. + +![Machine-level virtualization](Machine-Virtualization.png "Machine-level virtualization") + +**Rule-based execution**, such as [seccomp][seccomp], [SELinux][selinux] and +[AppArmor][apparmor], allows the specification of a fine-grained security policy +for an application or container. These schemes typically rely on hooks +implemented inside the host kernel to enforce the rules. If the surface can be +made small enough, then this is an excellent way to sandbox applications and +maintain native performance. However, in practice it can be extremely difficult +(if not impossible) to reliably define a policy for arbitrary, previously +unknown applications, making this approach challenging to apply universally. + +![Rule-based execution](Rule-Based-Execution.png "Rule-based execution") + +Rule-based execution is often combined with additional layers for +defense-in-depth. + +**gVisor** provides a third isolation mechanism, distinct from those above. + +gVisor intercepts application system calls and acts as the guest kernel, without +the need for translation through virtualized hardware. gVisor may be thought of +as either a merged guest kernel and VMM, or as seccomp on steroids. This +architecture allows it to provide a flexible resource footprint (i.e. 
one based +on threads and memory mappings, not fixed guest physical resources) while also +lowering the fixed costs of virtualization. However, this comes at the price of +reduced application compatibility and higher per-system call overhead. + +![gVisor](Layers.png "gVisor") + +On top of this, gVisor employs rule-based execution to provide defense-in-depth +(details below). + +gVisor's approach is similar to [User Mode Linux (UML)][uml], although UML +virtualizes hardware internally and thus provides a fixed resource footprint. + +Each of the above approaches may excel in distinct scenarios. For example, +machine-level virtualization will face challenges achieving high density, while +gVisor may provide poor performance for system call heavy workloads. + +## Why Go? + +gVisor is written in [Go][golang] in order to avoid security pitfalls that can +plague kernels. With Go, there are strong types, built-in bounds checks, no +uninitialized variables, no use-after-free, no stack overflow, and a built-in +race detector. However, the use of Go has its challenges, and the runtime often +introduces performance overhead. + +## What are the different components? + +A gVisor sandbox consists of multiple processes. These processes collectively +comprise an environment in which one or more containers can be run. + +Each sandbox has its own isolated instance of: + +* The **Sentry**, which is a kernel that runs the containers and intercepts + and responds to system calls made by the application. + +Each container running in the sandbox has its own isolated instance of: + +* A **Gofer** which provides file system access to the containers. + +![gVisor architecture diagram](Sentry-Gofer.png "gVisor architecture diagram") + +## What is runsc? + +The entrypoint to running a sandboxed container is the `runsc` executable. +`runsc` implements the [Open Container Initiative (OCI)][oci] runtime +specification, which is used by Docker and Kubernetes. 
This means that OCI +compatible _filesystem bundles_ can be run by `runsc`. Filesystem bundles are +comprised of a `config.json` file containing container configuration, and a root +filesystem for the container. Please see the [OCI runtime spec][runtime-spec] +for more information on filesystem bundles. `runsc` implements multiple commands +that perform various functions such as starting, stopping, listing, and querying +the status of containers. + +### Sentry + + + +The Sentry is the largest component of gVisor. It can be thought of as an +application kernel. The Sentry implements all the kernel functionality needed by +the application, including: system calls, signal delivery, memory management and +page faulting logic, the threading model, and more. + +When the application makes a system call, the +[Platform](./architecture_guide/platforms.md) redirects the call to the Sentry, +which will do the necessary work to service it. It is important to note that the +Sentry does not pass system calls through to the host kernel. As a userspace +application, the Sentry will make some host system calls to support its +operation, but it does not allow the application to directly control the system +calls it makes. For example, the Sentry is not able to open files directly; file +system operations that extend beyond the sandbox (not internal `/proc` files, +pipes, etc) are sent to the Gofer, described below. + +### Gofer + + + +The Gofer is a standard host process which is started with each container and +communicates with the Sentry via the [9P protocol][9p] over a socket or shared +memory channel. The Sentry process is started in a restricted seccomp container +without access to file system resources. The Gofer mediates all access to +these resources, providing an additional level of isolation. + +### Application + +The application is a normal Linux binary provided to gVisor in an OCI runtime +bundle.
gVisor aims to provide an environment equivalent to Linux v4.4, so +applications should be able to run unmodified. However, gVisor does not +presently implement every system call, `/proc` file, or `/sys` file so some +incompatibilities may occur. See [Compatibility](./user_guide/compatibility.md) +for more information. + +[9p]: https://en.wikipedia.org/wiki/9P_(protocol) +[apparmor]: https://wiki.ubuntu.com/AppArmor +[golang]: https://golang.org +[kvm]: https://www.linux-kvm.org [linux]: https://en.wikipedia.org/wiki/Linux_kernel_interfaces [oci]: https://www.opencontainers.org +[runtime-spec]: https://github.com/opencontainers/runtime-spec +[seccomp]: https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt +[selinux]: https://selinuxproject.org +[uml]: http://user-mode-linux.sourceforge.net/ +[xen]: https://www.xenproject.org diff --git a/g3doc/Rule-Based-Execution.png b/g3doc/Rule-Based-Execution.png new file mode 100644 index 000000000..b42654a90 Binary files /dev/null and b/g3doc/Rule-Based-Execution.png differ diff --git a/g3doc/Rule-Based-Execution.svg b/g3doc/Rule-Based-Execution.svg new file mode 100644 index 000000000..bd6717043 --- /dev/null +++ b/g3doc/Rule-Based-Execution.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/g3doc/Sentry-Gofer.png b/g3doc/Sentry-Gofer.png new file mode 100644 index 000000000..ca2c27ef7 Binary files /dev/null and b/g3doc/Sentry-Gofer.png differ diff --git a/g3doc/Sentry-Gofer.svg b/g3doc/Sentry-Gofer.svg new file mode 100644 index 000000000..5c10750d2 --- /dev/null +++ b/g3doc/Sentry-Gofer.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/g3doc/architecture_guide/BUILD b/g3doc/architecture_guide/BUILD index 72038305b..404f627a4 100644 --- a/g3doc/architecture_guide/BUILD +++ b/g3doc/architecture_guide/BUILD @@ -5,31 +5,13 @@ package( licenses = ["notice"], ) -doc( - name = "index", - src = "README.md", - category = "Architecture Guide", - data = [ - "Layers.png", - "Layers.svg", -
"Machine-Virtualization.png", - "Machine-Virtualization.svg", - "Rule-Based-Execution.png", - "Rule-Based-Execution.svg", - "Sentry-Gofer.png", - "Sentry-Gofer.svg", - ], - permalink = "/docs/architecture_guide/", - weight = "0", -) - doc( name = "platforms", src = "platforms.md", category = "Architecture Guide", data = [ - "Sentry-Gofer.png", - "Sentry-Gofer.svg", + "platforms.png", + "platforms.svg", ], permalink = "/docs/architecture_guide/platforms/", weight = "40", @@ -39,6 +21,10 @@ doc( name = "resources", src = "resources.md", category = "Architecture Guide", + data = [ + "resources.png", + "resources.svg", + ], permalink = "/docs/architecture_guide/resources/", weight = "30", ) @@ -48,8 +34,8 @@ doc( src = "security.md", category = "Architecture Guide", data = [ - "Layers.png", - "Layers.svg", + "security.png", + "security.svg", ], permalink = "/docs/architecture_guide/security/", weight = "10", diff --git a/g3doc/architecture_guide/Layers.png b/g3doc/architecture_guide/Layers.png deleted file mode 100644 index 308c6c451..000000000 Binary files a/g3doc/architecture_guide/Layers.png and /dev/null differ diff --git a/g3doc/architecture_guide/Layers.svg b/g3doc/architecture_guide/Layers.svg deleted file mode 100644 index 0a366f841..000000000 --- a/g3doc/architecture_guide/Layers.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/g3doc/architecture_guide/Machine-Virtualization.png b/g3doc/architecture_guide/Machine-Virtualization.png deleted file mode 100644 index 1ba2ed6b2..000000000 Binary files a/g3doc/architecture_guide/Machine-Virtualization.png and /dev/null differ diff --git a/g3doc/architecture_guide/Machine-Virtualization.svg b/g3doc/architecture_guide/Machine-Virtualization.svg deleted file mode 100644 index 5352da07b..000000000 --- a/g3doc/architecture_guide/Machine-Virtualization.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/g3doc/architecture_guide/README.md b/g3doc/architecture_guide/README.md 
deleted file mode 100644 index ab9ef7174..000000000 --- a/g3doc/architecture_guide/README.md +++ /dev/null @@ -1,83 +0,0 @@ -# Overview - -gVisor provides a virtualized environment in order to sandbox untrusted -containers. The system interfaces normally implemented by the host kernel are -moved into a distinct, per-sandbox user space kernel in order to minimize the -risk of an exploit. gVisor does not introduce large fixed overheads however, and -still retains a process-like model with respect to resource utilization. - -## How is this different? - -Two other approaches are commonly taken to provide stronger isolation than -native containers. - -**Machine-level virtualization**, such as [KVM][kvm] and [Xen][xen], exposes -virtualized hardware to a guest kernel via a Virtual Machine Monitor (VMM). This -virtualized hardware is generally enlightened (paravirtualized) and additional -mechanisms can be used to improve the visibility between the guest and host -(e.g. balloon drivers, paravirtualized spinlocks). Running containers in -distinct virtual machines can provide great isolation, compatibility and -performance (though nested virtualization may bring challenges in this area), -but for containers it often requires additional proxies and agents, and may -require a larger resource footprint and slower start-up times. - -![Machine-level virtualization](Machine-Virtualization.png "Machine-level virtualization") - -**Rule-based execution**, such as [seccomp][seccomp], [SELinux][selinux] and -[AppArmor][apparmor], allows the specification of a fine-grained security policy -for an application or container. These schemes typically rely on hooks -implemented inside the host kernel to enforce the rules. If the surface can be -made small enough (i.e. a sufficiently complete policy defined), then this is an -excellent way to sandbox applications and maintain native performance. 
However, -in practice it can be extremely difficult (if not impossible) to reliably define -a policy for arbitrary, previously unknown applications, making this approach -challenging to apply universally. - -![Rule-based execution](Rule-Based-Execution.png "Rule-based execution") - -Rule-based execution is often combined with additional layers for -defense-in-depth. - -**gVisor** provides a third isolation mechanism, distinct from those above. - -gVisor intercepts application system calls and acts as the guest kernel, without -the need for translation through virtualized hardware. gVisor may be thought of -as either a merged guest kernel and VMM, or as seccomp on steroids. This -architecture allows it to provide a flexible resource footprint (i.e. one based -on threads and memory mappings, not fixed guest physical resources) while also -lowering the fixed costs of virtualization. However, this comes at the price of -reduced application compatibility and higher per-system call overhead. - -![gVisor](Layers.png "gVisor") - -On top of this, gVisor employs rule-based execution to provide defense-in-depth -(details below). - -gVisor's approach is similar to [User Mode Linux (UML)][uml], although UML -virtualizes hardware internally and thus provides a fixed resource footprint. - -Each of the above approaches may excel in distinct scenarios. For example, -machine-level virtualization will face challenges achieving high density, while -gVisor may provide poor performance for system call heavy workloads. - -### Why Go? - -gVisor is written in [Go][golang] in order to avoid security pitfalls that can -plague kernels. With Go, there are strong types, built-in bounds checks, no -uninitialized variables, no use-after-free, no stack overflow, and a built-in -race detector. (The use of Go has its challenges too, and isn't free.) - -### What about Gofers? - - - -Gofers mediate file system interactions, and are used to provide additional -isolation. 
For more details, see the [Platform Guide](./platforms.md). - -[apparmor]: https://wiki.ubuntu.com/AppArmor -[golang]: https://golang.org -[kvm]: https://www.linux-kvm.org -[seccomp]: https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt -[selinux]: https://selinuxproject.org -[uml]: http://user-mode-linux.sourceforge.net/ -[xen]: https://www.xenproject.org diff --git a/g3doc/architecture_guide/Rule-Based-Execution.png b/g3doc/architecture_guide/Rule-Based-Execution.png deleted file mode 100644 index b42654a90..000000000 Binary files a/g3doc/architecture_guide/Rule-Based-Execution.png and /dev/null differ diff --git a/g3doc/architecture_guide/Rule-Based-Execution.svg b/g3doc/architecture_guide/Rule-Based-Execution.svg deleted file mode 100644 index bd6717043..000000000 --- a/g3doc/architecture_guide/Rule-Based-Execution.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/g3doc/architecture_guide/Sentry-Gofer.png b/g3doc/architecture_guide/Sentry-Gofer.png deleted file mode 100644 index ca2c27ef7..000000000 Binary files a/g3doc/architecture_guide/Sentry-Gofer.png and /dev/null differ diff --git a/g3doc/architecture_guide/Sentry-Gofer.svg b/g3doc/architecture_guide/Sentry-Gofer.svg deleted file mode 100644 index 5c10750d2..000000000 --- a/g3doc/architecture_guide/Sentry-Gofer.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/g3doc/architecture_guide/performance.md b/g3doc/architecture_guide/performance.md index 3862d78ee..39dbb0045 100644 --- a/g3doc/architecture_guide/performance.md +++ b/g3doc/architecture_guide/performance.md @@ -13,12 +13,13 @@ forms: additional cycles and memory usage, which may manifest as increased latency, reduced throughput or density, or not at all. In general, these costs come from two different sources. -First, the existence of the [Sentry](../) means that additional memory will be -required, and application system calls must traverse additional layers of -software. 
The design emphasizes [security](../security/) and therefore we chose -to use a language for the Sentry that provides benefits in this domain but may -not yet offer the raw performance of other choices. Costs imposed by these -design choices are **structural costs**. +First, the existence of the [Sentry](../README.md#sentry) means that additional +memory will be required, and application system calls must traverse additional +layers of software. The design emphasizes +[security](/docs/architecture_guide/security/) and therefore we chose to use a +language for the Sentry that provides benefits in this domain but may not yet +offer the raw performance of other choices. Costs imposed by these design +choices are **structural costs**. Second, as gVisor is an independent implementation of the system call surface, many of the subsystems or specific calls are not as optimized as more mature @@ -50,7 +51,7 @@ Virtual Machines (VMs) with the following specifications: Through this document, `runsc` is used to indicate the runtime provided by gVisor. When relevant, we use the name `runsc-platform` to describe a specific -[platform choice](../platforms/). +[platform choice](/docs/architecture_guide/platforms/). **Except where specified, all tests below are conducted with the `ptrace` platform. The `ptrace` platform works everywhere and does not require hardware @@ -131,11 +132,11 @@ full start-up and run time for the workload, which trains a model. ## System calls Some **structural costs** of gVisor are heavily influenced by the -[platform choice](../platforms/), which implements system call interception. -Today, gVisor supports a variety of platforms. These platforms present distinct -performance, compatibility and security trade-offs. For example, the KVM -platform has low overhead system call interception but runs poorly with nested -virtualization. +[platform choice](/docs/architecture_guide/platforms/), which implements system +call interception. 
Today, gVisor supports a variety of platforms. These +platforms present distinct performance, compatibility and security trade-offs. +For example, the KVM platform has low overhead system call interception but runs +poorly with nested virtualization. {% include graph.html id="syscall" url="/performance/syscall.csv" title="perf.py syscall --runtime=runc --runtime=runsc-ptrace --runtime=runsc-kvm" y_min="100" @@ -163,7 +164,8 @@ overhead. Some of these costs above are **structural costs**, and `redis` is likely to remain a challenging performance scenario. However, optimizing the -[platform](../platforms/) will also have a dramatic impact. +[platform](/docs/architecture_guide/platforms/) will also have a dramatic +impact. ## Start-up time @@ -184,7 +186,7 @@ similarly loads a number of modules and binds an HTTP server. > Note: most of the time overhead above is associated Docker itself. This is > evident with the empty `runc` benchmark. To avoid these costs with `runsc`, > you may also consider using `runsc do` mode or invoking the -> [OCI runtime](../../user_guide/quick_start/oci/) directly. +> [OCI runtime](../user_guide/quick_start/oci.md) directly. ## Network @@ -222,8 +224,9 @@ In terms of raw disk I/O, gVisor does not introduce significant fundamental overhead. For general file operations, gVisor introduces a small fixed overhead for data that transitions across the sandbox boundary. This manifests as **structural costs** in some cases, since these operations must be routed -through the [Gofer](../) as a result of our [security model](../security/), but -in most cases are dominated by **implementation costs**, due to an internal +through the [Gofer](../README.md#gofer) as a result of our +[Security Model](/docs/architecture_guide/security/), but in most cases are +dominated by **implementation costs**, due to an internal [Virtual File System][vfs] (VFS) implementation that needs improvement. 
{% include graph.html id="fio-bw" url="/performance/fio.csv" title="perf.py fio diff --git a/g3doc/architecture_guide/platforms.md b/g3doc/architecture_guide/platforms.md index 6e63da8ce..d112c9a28 100644 --- a/g3doc/architecture_guide/platforms.md +++ b/g3doc/architecture_guide/platforms.md @@ -1,86 +1,61 @@ # Platform Guide -A gVisor sandbox consists of multiple processes when running. These processes -collectively comprise a shared environment in which one or more containers can -be run. +[TOC] -Each sandbox has its own isolated instance of: - -* The **Sentry**, A user-space kernel that runs the container and intercepts - and responds to system calls made by the application. - -Each container running in the sandbox has its own isolated instance of: - -* A **Gofer** which provides file system access to the container. - -![gVisor architecture diagram](Sentry-Gofer.png "gVisor architecture diagram") - -## runsc - -The entrypoint to running a sandboxed container is the `runsc` executable. -`runsc` implements the [Open Container Initiative (OCI)][oci] runtime -specification. This means that OCI compatible _filesystem bundles_ can be run by -`runsc`. Filesystem bundles are comprised of a `config.json` file containing -container configuration, and a root filesystem for the container. Please see the -[OCI runtime spec][runtime-spec] for more information on filesystem bundles. -`runsc` implements multiple commands that perform various functions such as -starting, stopping, listing, and querying the status of containers. +gVisor requires a platform to implement interception of syscalls, basic context +switching, and memory mapping functionality. Internally, gVisor uses an +abstraction sensibly called [Platform][platform]. A simplified version of this +interface looks like: -## Sentry +```golang +type Platform interface { + NewAddressSpace() (AddressSpace, error) + NewContext() Context +} -The Sentry is the largest component of gVisor. 
It can be thought of as a -userspace OS kernel. The Sentry implements all the kernel functionality needed -by the untrusted application. It implements all of the supported system calls, -signal delivery, memory management and page faulting logic, the threading model, -and more. +type Context interface { + Switch(as AddressSpace, ac arch.Context) (..., error) +} -When the untrusted application makes a system call, the currently used platform -redirects the call to the Sentry, which will do the necessary work to service -it. It is important to note that the Sentry will not simply pass through system -calls to the host kernel. As a userspace application, the Sentry will make some -host system calls to support its operation, but it will not allow the -application to directly control the system calls it makes. +type AddressSpace interface { + MapFile(addr usermem.Addr, f File, fr FileRange, at usermem.AccessType, ...) error + Unmap(addr usermem.Addr, length uint64) +} +``` -The Sentry aims to present an equivalent environment to (upstream) Linux v4.4. +There are a number of different ways to implement this interface that come with +various trade-offs, generally around performance and hardware requirements. -File system operations that extend beyond the sandbox (not internal /proc files, -pipes, etc) are sent to the Gofer, described below. +## Implementations -## Platforms +The choice of platform depends on the context in which `runsc` is executing. In +general, virtualized platforms may be limited to platforms that do not require +hardware virtualized support (since the hardware is already in use): -gVisor requires a platform to implement interception of syscalls, basic context -switching, and memory mapping functionality. +![Platforms](platforms.png "Platform examples.") ### ptrace -The ptrace platform uses `PTRACE_SYSEMU` to execute user code without allowing -it to execute host system calls. 
This platform can run anywhere that ptrace -works (even VMs without nested virtualization). - -### KVM (experimental) +The ptrace platform uses [PTRACE_SYSEMU][ptrace] to execute user code without +allowing it to execute host system calls. This platform can run anywhere that +`ptrace` works (even VMs without nested virtualization), which is ubiquitous. -The KVM platform allows the Sentry to act as both guest OS and VMM, switching -back and forth between the two worlds seamlessly. The KVM platform can run on -bare-metal or in a VM with nested virtualization enabled. While there is no -virtualized hardware layer -- the sandbox retains a process model -- gVisor -leverages virtualization extensions available on modern processors in order to -improve isolation and performance of address space switches. +Unfortunately, the ptrace platform has high context switch overhead, so system +call-heavy applications may pay a [performance penalty](./performance.md). -## Gofer +### KVM -The Gofer is a normal host Linux process. The Gofer is started with each sandbox -and connected to the Sentry. The Sentry process is started in a restricted -seccomp container without access to file system resources. The Gofer provides -the Sentry access to file system resources via the 9P protocol and provides an -additional level of isolation. +The KVM platform uses the kernel's [KVM][kvm] functionality to allow the Sentry +to act as both guest OS and VMM. The KVM platform can run on bare-metal or in a +VM with nested virtualization enabled. While there is no virtualized hardware +layer -- the sandbox retains a process model -- gVisor leverages virtualization +extensions available on modern processors in order to improve isolation and +performance of address space switches. -## Application +## Changing Platforms -The application (aka the untrusted application) is a normal Linux binary -provided to gVisor in an OCI runtime bundle. 
gVisor aims to provide an -environment equivalent to Linux v4.4, so applications should be able to run -unmodified. However, gVisor does not presently implement every system call, -/proc file, or /sys file so some incompatibilities may occur. +See [Changing Platforms](../user_guide/platforms.md). -[oci]: https://www.opencontainers.org -[runtime-spec]: https://github.com/opencontainers/runtime-spec +[kvm]: https://www.kernel.org/doc/Documentation/virtual/kvm/api.txt +[platform]: https://cs.opensource.google/gvisor/gvisor/+/release-20190304.1:pkg/sentry/platform/platform.go;l=33 +[ptrace]: http://man7.org/linux/man-pages/man2/ptrace.2.html diff --git a/g3doc/architecture_guide/platforms.png b/g3doc/architecture_guide/platforms.png new file mode 100644 index 000000000..005d56feb Binary files /dev/null and b/g3doc/architecture_guide/platforms.png differ diff --git a/g3doc/architecture_guide/platforms.svg b/g3doc/architecture_guide/platforms.svg new file mode 100644 index 000000000..b0bac9ba7 --- /dev/null +++ b/g3doc/architecture_guide/platforms.svg @@ -0,0 +1,334 @@ + + + + + + + + + + image/svg+xml + + + + + + + + + + gVisor + workload + host + + + + gVisor + workload + + KVM + + + gVisor + workload + + ptrace + + ptrace + VM + + guest + + + gVisor + workload + + ptrace + + diff --git a/g3doc/architecture_guide/resources.md b/g3doc/architecture_guide/resources.md index 894f995ae..1dec37bd1 100644 --- a/g3doc/architecture_guide/resources.md +++ b/g3doc/architecture_guide/resources.md @@ -10,9 +10,10 @@ sandbox to be highly dynamic in terms of resource usage: spanning a large number of cores and large amount of memory when busy, and yielding those resources back to the host when not. -Some of the details here may depend on the [platform](../platforms/), but in -general this page describes the resource model used by gVisor. If you're not -familiar with the terms here, uou may want to start with the [Overview](../). 
+In other words, the shape of the sandbox should closely track the shape of the +sandboxed process: + +![Resource model](resources.png "Workloads of different shapes.") ## Processes @@ -23,9 +24,9 @@ the sandbox (e.g. via a [Docker exec][exec]). ## Networking -Similarly to processes, the sandbox attaches a network endpoint to the system, -but runs it's own network stack. All network resources, other than packets in -flight, exist only inside the sandbox, bound by relevant resource limits. +The sandbox attaches a network endpoint to the system, but runs its own network +stack. All network resources, other than packets in flight on the host, exist +only inside the sandbox, bound by relevant resource limits. You can interact with network endpoints exposed by the sandbox, just as you would any other container, but network introspection similarly requires entering @@ -33,15 +34,14 @@ the sandbox. ## Files -Files may be backed by different implementations. For host-native files (where a -file descriptor is available), the Gofer may return a file descriptor to the -Sentry via [SCM_RIGHTS][scmrights][^1]. +Files in the sandbox may be backed by different implementations. For host-native +files (where a file descriptor is available), the Gofer may return a file +descriptor to the Sentry via [SCM_RIGHTS][scmrights][^1]. These files may be read from and written to through standard system calls, and also mapped into the associated application's address space. This allows the same host memory to be shared across multiple sandboxes, although this mechanism -does not preclude the use of side-channels (see the -[security model](../security/)). +does not preclude the use of side-channels (see [Security Model](./security.md)). Note that some file systems exist only within the context of the sandbox. For example, in many cases a `tmpfs` mount will be available at `/tmp` or @@ -64,8 +64,9 @@ scheduling decisions about all application threads.
## Time Time in the sandbox is provided by the Sentry, through its own [vDSO][vdso] and -timekeeping implementation. This is divorced from the host time, and no state is -shared with the host, although the time will be initialized with the host clock. +time-keeping implementation. This is distinct from the host time, and no state +is shared with the host, although the time will be initialized with the host +clock. The Sentry runs timers to note the passage of time, much like a kernel running on hardware (though the timers are software timers, in this case). These timers diff --git a/g3doc/architecture_guide/resources.png b/g3doc/architecture_guide/resources.png new file mode 100644 index 000000000..f715008ec Binary files /dev/null and b/g3doc/architecture_guide/resources.png differ diff --git a/g3doc/architecture_guide/resources.svg b/g3doc/architecture_guide/resources.svg new file mode 100644 index 000000000..fd7805d90 --- /dev/null +++ b/g3doc/architecture_guide/resources.svg @@ -0,0 +1,208 @@ + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + gVisor + gVisor + gVisor + workload + workload + workload + host + + diff --git a/g3doc/architecture_guide/security.md b/g3doc/architecture_guide/security.md index f78586291..b99b86332 100644 --- a/g3doc/architecture_guide/security.md +++ b/g3doc/architecture_guide/security.md @@ -86,15 +86,17 @@ a substitute for a secure architecture*. ## Goals: Limiting Exposure -gVisor’s primary design goal is to minimize the System API attack vector while -still providing a process model. There are two primary security principles that -inform this design. First, the application’s direct interactions with the host -System API are intercepted by the Sentry, which implements the System API -instead. Second, the System API accessible to the Sentry itself is minimized to -a safer, restricted set. 
The first principle minimizes the possibility of direct -exploitation of the host System API by applications, and the second principle -minimizes indirect exploitability, which is the exploitation by an exploited or -buggy Sentry (e.g. chaining an exploit). +![Threat model](security.png "Threat model.") + +gVisor’s primary design goal is to minimize the System API attack vector through +multiple layers of defense, while still providing a process model. There are two +primary security principles that inform this design. First, the application’s +direct interactions with the host System API are intercepted by the Sentry, +which implements the System API instead. Second, the System API accessible to +the Sentry itself is minimized to a safer, restricted set. The first principle +minimizes the possibility of direct exploitation of the host System API by +applications, and the second principle minimizes indirect exploitability, which +is the exploitation by an exploited or buggy Sentry (e.g. chaining an exploit). The first principle is similar to the security basis for a Virtual Machine (VM). With a VM, an application’s interactions with the host are replaced by @@ -210,9 +212,9 @@ crashes are recorded and triaged to similarly identify material issues. ### Is this more or less secure than a Virtual Machine? The security of a VM depends to a large extent on what is exposed from the host -kernel and user space support code. For example, device emulation code in the +kernel and userspace support code. For example, device emulation code in the host kernel (e.g. APIC) or optimizations (e.g. vhost) can be more complex than a -simple system call, and exploits carry the same risks. Similarly, the user space +simple system call, and exploits carry the same risks. Similarly, the userspace support code is frequently unsandboxed, and exploits, while rare, may allow unfettered access to the system. @@ -245,8 +247,8 @@ In gVisor, the platforms that use ptrace operate differently. 
The stubs that are traced are never allowed to continue execution into the host kernel and complete a call directly. Instead, all system calls are interpreted and handled by the Sentry itself, who reflects resulting register state back into the tracee before -continuing execution in user space. This is very similar to the mechanism used -by User-Mode Linux (UML). +continuing execution in userspace. This is very similar to the mechanism used by +User-Mode Linux (UML). [dirtycow]: https://en.wikipedia.org/wiki/Dirty_COW [clang]: https://en.wikipedia.org/wiki/C_(programming_language) diff --git a/g3doc/architecture_guide/security.png b/g3doc/architecture_guide/security.png new file mode 100644 index 000000000..c29befbf6 Binary files /dev/null and b/g3doc/architecture_guide/security.png differ diff --git a/g3doc/architecture_guide/security.svg b/g3doc/architecture_guide/security.svg new file mode 100644 index 000000000..0575e2dec --- /dev/null +++ b/g3doc/architecture_guide/security.svg @@ -0,0 +1,153 @@ + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/g3doc/user_guide/filesystem.md b/g3doc/user_guide/filesystem.md index 6c69f42a1..cd00762dd 100644 --- a/g3doc/user_guide/filesystem.md +++ b/g3doc/user_guide/filesystem.md @@ -4,8 +4,8 @@ gVisor accesses the filesystem through a file proxy, called the Gofer. The gofer runs as a separate process, that is isolated from the sandbox. Gofer instances -communicate with their respective sentry using the 9P protocol. For a more -detailed explanation see [Overview > Gofer](../../architecture_guide/#gofer). +communicate with their respective sentry using the 9P protocol. For another +explanation see [What is gVisor?](../README.md). 
## Sandbox overlay diff --git a/g3doc/user_guide/platforms.md b/g3doc/user_guide/platforms.md index eefb6b222..752025881 100644 --- a/g3doc/user_guide/platforms.md +++ b/g3doc/user_guide/platforms.md @@ -1,56 +1,27 @@ -# Platforms (KVM) +# Changing Platforms [TOC] -This document will help you set up your system to use a different gVisor -platform. +This guide describes how to change the +[platform](../architecture_guide/platforms.md) used by `runsc`. -## What is a Platform? +## Prerequisites -gVisor requires a *platform* to implement interception of syscalls, basic -context switching, and memory mapping functionality. These are described in more -depth in the [Platform Design](../../architecture_guide/platforms/). +If you intend to run the KVM platform, you will also need to have KVM installed on +your system. If you are running a Debian based system like Debian or Ubuntu you +can usually do this by ensuring the module is loaded, and permissions are +appropriately set on the `/dev/kvm` device. -## Selecting a Platform - -The platform is selected by the `--platform` command line flag passed to -`runsc`. By default, the ptrace platform is selected. To select a different -platform, modify your Docker configuration (`/etc/docker/daemon.json`) to pass -this argument: - -```json -{ - "runtimes": { - "runsc": { - "path": "/usr/local/bin/runsc", - "runtimeArgs": [ - "--platform=kvm" - ] - } - } -} -``` - -You must restart the Docker daemon after making changes to this file, typically -this is done via `systemd`: +If you have an Intel CPU: ```bash -sudo systemctl restart docker +sudo modprobe kvm-intel && sudo chmod a+rw /dev/kvm ``` -## Example: Using the KVM Platform - -The KVM platform is currently experimental; however, it provides several -benefits over the default ptrace platform. - -### Prerequisites - -You will also to have KVM installed on your system.
If you are running a Debian -based system like Debian or Ubuntu you can usually do this by installing the -`qemu-kvm` package. +If you have an AMD CPU: ```bash -sudo apt-get install qemu-kvm +sudo modprobe kvm-amd && sudo chmod a+rw /dev/kvm ``` If you are using a virtual machine you will need to make sure that nested @@ -68,31 +39,22 @@ cause of security issues (e.g. [CVE-2018-12904](https://nvd.nist.gov/vuln/detail/CVE-2018-12904)). It is not recommended for production.*** -### Configuring Docker - -Per above, you will need to configure Docker to use `runsc` with the KVM -platform. You will remember from the Docker Quick Start that you configured -Docker to use `runsc` as the runtime. Docker allows you to add multiple runtimes -to the Docker configuration. +## Configuring Docker -Add a new entry for the KVM platform entry to your Docker configuration -(`/etc/docker/daemon.json`) in order to provide the `--platform=kvm` runtime -argument. - -In the end, the file should look something like: +The platform is selected by the `--platform` command line flag passed to +`runsc`. By default, the ptrace platform is selected. For example, to select the +KVM platform, modify your Docker configuration (`/etc/docker/daemon.json`) to +pass the `--platform` argument: ```json { "runtimes": { "runsc": { - "path": "/usr/local/bin/runsc" - }, - "runsc-kvm": { "path": "/usr/local/bin/runsc", "runtimeArgs": [ "--platform=kvm" ] - } + } } } ``` @@ -104,13 +66,27 @@ this is done via `systemd`: sudo systemctl restart docker ``` -## Running a container +Note that you may configure multiple runtimes using different platforms. For +example, the following configuration has one configuration for ptrace and one +for the KVM platform: -Now run your container using the `runsc-kvm` runtime. 
This will run the -container using the KVM platform: - -```bash -docker run --runtime=runsc-kvm --rm hello-world +```json +{ + "runtimes": { + "runsc-ptrace": { + "path": "/usr/local/bin/runsc", + "runtimeArgs": [ + "--platform=ptrace" + ] + }, + "runsc-kvm": { + "path": "/usr/local/bin/runsc", + "runtimeArgs": [ + "--platform=kvm" + ] + } + } +} ``` [nested-azure]: https://docs.microsoft.com/en-us/azure/virtual-machines/windows/nested-virtualization diff --git a/pkg/sentry/arch/syscalls_arm64.go b/pkg/sentry/arch/syscalls_arm64.go index 92d062513..95dfd1e90 100644 --- a/pkg/sentry/arch/syscalls_arm64.go +++ b/pkg/sentry/arch/syscalls_arm64.go @@ -23,7 +23,7 @@ const restartSyscallNr = uintptr(128) // // In linux, at the entry of the syscall handler(el0_svc_common()), value of R0 // is saved to the pt_regs.orig_x0 in kernel code. But currently, the orig_x0 -// was not accessible to the user space application, so we have to do the same +// was not accessible to the userspace application, so we have to do the same // operation in the sentry code to save the R0 value into the App context. func (c *context64) SyscallSaveOrig() { c.OrigR0 = c.Regs.Regs[0] diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index 5a1d4fd57..aacf28da2 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -144,7 +144,7 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume if v > math.MaxInt32 { v = math.MaxInt32 // Silently truncate. } - // Copy result to user-space. + // Copy result to userspace. 
_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ AddressSpaceActive: true, }) diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index c9db78e06..a5903b0b5 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -199,10 +199,10 @@ func (t *Task) doSyscall() taskRunState { // // On x86, register rax was shared by syscall number and return // value, and at the entry of the syscall handler, the rax was - // saved to regs.orig_rax which was exposed to user space. + // saved to regs.orig_rax which was exposed to userspace. // But on arm64, syscall number was passed through X8, and the X0 // was shared by the first syscall argument and return value. The - // X0 was saved to regs.orig_x0 which was not exposed to user space. + // X0 was saved to regs.orig_x0 which was not exposed to userspace. // So we have to do the same operation here to save the X0 value // into the task context. t.Arch().SyscallSaveOrig() diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 9dea2b5ff..60df51dae 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -2718,7 +2718,7 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy v = math.MaxInt32 } - // Copy result to user-space. + // Copy result to userspace. _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ AddressSpaceActive: true, }) @@ -2787,7 +2787,7 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc if v > math.MaxInt32 { v = math.MaxInt32 } - // Copy result to user-space. + // Copy result to userspace. 
_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ AddressSpaceActive: true, }) @@ -2803,7 +2803,7 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc v = math.MaxInt32 } - // Copy result to user-space. + // Copy result to userspace. _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ AddressSpaceActive: true, }) diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index b39ffa9fb..0ab4c3e19 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -235,11 +235,11 @@ type RcvBufAutoTuneParams struct { // was started. MeasureTime time.Time - // CopiedBytes is the number of bytes copied to user space since + // CopiedBytes is the number of bytes copied to userspace since // this measure began. CopiedBytes int - // PrevCopiedBytes is the number of bytes copied to user space in + // PrevCopiedBytes is the number of bytes copied to userspace in // the previous RTT period. PrevCopiedBytes int diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 71735029e..b5ba972f1 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1097,7 +1097,7 @@ func (e *endpoint) initialReceiveWindow() int { } // ModerateRecvBuf adjusts the receive buffer and the advertised window -// based on the number of bytes copied to user space. +// based on the number of bytes copied to userspace. func (e *endpoint) ModerateRecvBuf(copied int) { e.LockUser() defer e.UnlockUser() diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 0b4512c65..6ef32a1b3 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -5869,7 +5869,7 @@ func TestReceiveBufferAutoTuning(t *testing.T) { // Invoke the moderation API. This is required for auto-tuning // to happen. 
This method is normally expected to be invoked // from a higher layer than tcpip.Endpoint. So we simulate - // copying to user-space by invoking it explicitly here. + // copying to userspace by invoking it explicitly here. c.EP.ModerateRecvBuf(totalCopied) // Now send a keep-alive packet to trigger an ACK so that we can diff --git a/runsc/cmd/help.go b/runsc/cmd/help.go index c7d210140..cd85dabbb 100644 --- a/runsc/cmd/help.go +++ b/runsc/cmd/help.go @@ -65,16 +65,10 @@ func (h *Help) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{} switch f.NArg() { case 0: fmt.Fprintf(h.cdr.Output, "Usage: %s \n\n", h.cdr.Name()) - fmt.Fprintf(h.cdr.Output, `runsc is a command line client for running applications packaged in the Open -Container Initiative (OCI) format. Applications run by runsc are run in an -isolated gVisor sandbox that emulates a Linux environment. + fmt.Fprintf(h.cdr.Output, `runsc is the gVisor container runtime. -gVisor is a user-space kernel, written in Go, that implements a substantial -portion of the Linux system call interface. It provides an additional layer -of isolation between running applications and the host operating system. - -Functionality is provided by subcommands. For additonal help on individual -subcommands use "%s %s ". +Functionality is provided by subcommands. For help with a specific subcommand, +use "%s %s ". 
`, h.cdr.Name(), h.Name()) h.cdr.VisitGroups(func(g *subcommands.CommandGroup) { diff --git a/website/BUILD b/website/BUILD index d6afd5f44..c97b2560b 100644 --- a/website/BUILD +++ b/website/BUILD @@ -138,7 +138,6 @@ docs( "//g3doc:community", "//g3doc:index", "//g3doc:roadmap", - "//g3doc/architecture_guide:index", "//g3doc/architecture_guide:performance", "//g3doc/architecture_guide:platforms", "//g3doc/architecture_guide:resources", diff --git a/website/_layouts/docs.html b/website/_layouts/docs.html index 33ea8e1de..549305089 100644 --- a/website/_layouts/docs.html +++ b/website/_layouts/docs.html @@ -51,7 +51,9 @@ categories: Create issue

{% endif %} +
{{ content }} +
diff --git a/website/_sass/front.scss b/website/_sass/front.scss index 44a7e3473..0e4208f3c 100644 --- a/website/_sass/front.scss +++ b/website/_sass/front.scss @@ -4,12 +4,14 @@ background-repeat: no-repeat; background-size: cover; background-blend-mode: darken; - background-color: rgba(0, 0, 0, 0.1); + background-color: rgba(0, 0, 0, 0.3); p { color: #fff; margin-top: 0; margin-bottom: 0; font-weight: 300; + font-size: 24px; + line-height: 30px; } } diff --git a/website/_sass/style.scss b/website/_sass/style.scss index 520ea469a..4deb945d4 100644 --- a/website/_sass/style.scss +++ b/website/_sass/style.scss @@ -142,3 +142,13 @@ table th { margin-top: 10px; margin-bottom: 20px; } + +.docs-content * img { + display: block; + margin: 20px auto; +} + +.blog-content * img { + display: block; + margin: 20px auto; +} diff --git a/website/blog/2019-11-18-security-basics.md b/website/blog/2019-11-18-security-basics.md index ed6d97ffe..fbdd511dd 100644 --- a/website/blog/2019-11-18-security-basics.md +++ b/website/blog/2019-11-18-security-basics.md @@ -56,15 +56,9 @@ in combination: redundant walls, scattered draw bridges, small bottle-neck entrances, moats, etc. A simplified version of the design is below -([more detailed version](/docs/architecture_guide/))[^2]: +([more detailed version](/docs/))[^2]: --------------------------------------------------------------------------------- - -![Figure 1](/assets/images/2019-11-18-security-basics-figure1.png) - -Figure 1: Simplified design of gVisor. - --------------------------------------------------------------------------------- +![Figure 1](/assets/images/2019-11-18-security-basics-figure1.png "Simplified design of gVisor.") In order to discuss design principles, the following components are important to know: @@ -134,13 +128,7 @@ minimum level of permission is required for it to perform its function. Specifically, the closer you are to the untrusted application, the less privilege you have. 
--------------------------------------------------------------------------------- - -![Figure 2](/assets/images/2019-11-18-security-basics-figure2.png) - -Figure 2: runsc components and their privileges. - --------------------------------------------------------------------------------- +![Figure 2](/assets/images/2019-11-18-security-basics-figure2.png "runsc components and their privileges.") This is evident in how runsc (the drop in gVisor binary for Docker/Kubernetes) constructs the sandbox. The Sentry has the least privilege possible (it can't @@ -222,15 +210,7 @@ the host Linux syscalls. In other words, with gVisor, applications get the vast majority (and growing) functionality of Linux containers for only 68 possible syscalls to the Host OS. 350 syscalls to 68 is attack surface reduction. --------------------------------------------------------------------------------- - -![Figure 3](/assets/images/2019-11-18-security-basics-figure3.png) - -Figure 3: Reduction of Attack Surface of the Syscall Table. Note that the -Senty's Syscall Emulation Layer keeps the Containerized Process from ever -calling the Host OS. - --------------------------------------------------------------------------------- +![Figure 3](/assets/images/2019-11-18-security-basics-figure3.png "Reduction of Attack Surface of the Syscall Table. Note that the Sentry's Syscall Emulation Layer keeps the Containerized Process from ever calling the Host OS.") ## Secure-by-default diff --git a/website/blog/2020-04-02-networking-security.md b/website/blog/2020-04-02-networking-security.md index 78f0a6714..5a5e38fd7 100644 --- a/website/blog/2020-04-02-networking-security.md +++ b/website/blog/2020-04-02-networking-security.md @@ -69,13 +69,7 @@ a similar syscall). Moreover, because packets typically come from off-host (e.g. the internet), the Host OS's packet processing code has received a lot of scrutiny, hopefully resulting in a high degree of hardening. 
--------------------------------------------------------------------------------- - -![Figure 1](/assets/images/2020-04-02-networking-security-figure1.png) - -Figure 1: Netstack and gVisor - --------------------------------------------------------------------------------- +![Figure 1](/assets/images/2020-04-02-networking-security-figure1.png "Network and gVisor.") ## Writing a network stack diff --git a/website/index.md b/website/index.md index 95d5d16f0..84f877d49 100644 --- a/website/index.md +++ b/website/index.md @@ -3,10 +3,10 @@
-

gVisor is an application kernel and container runtime providing defense-in-depth for containers anywhere.

+

gVisor is an application kernel for containers that provides efficient defense-in-depth anywhere.

+ Quick start  Learn More  - GitHub 

@@ -19,8 +19,8 @@

Container-native Security

-

By providing each container with its own userspace kernel, gVisor limits - the attack surface of the host. This protection does not limit +

By providing each container with its own application kernel, gVisor + limits the attack surface of the host. This protection does not limit functionality: gVisor runs unmodified binaries and integrates with container orchestration systems, such as Docker and Kubernetes, and supports features such as volumes and sidecars.

@@ -43,7 +43,7 @@ The pluggable platform architecture of gVisor allows it to run anywhere, enabling consistent security policies across multiple environments without having to rearchitect your infrastructure.

- Get Started » + Read More »
-- cgit v1.2.3 From af3121a52383fb60579d769994be5d91bd788015 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 26 May 2020 21:42:07 -0700 Subject: Implement splice(2) and tee(2) for VFS2. Updates #138 PiperOrigin-RevId: 313326354 --- pkg/buffer/safemem.go | 82 ++++----- pkg/sentry/fsimpl/tmpfs/regular_file.go | 2 +- pkg/sentry/kernel/pipe/BUILD | 2 + pkg/sentry/kernel/pipe/pipe.go | 6 + pkg/sentry/kernel/pipe/pipe_unsafe.go | 35 ++++ pkg/sentry/kernel/pipe/vfs.go | 219 ++++++++++++++++++++++- pkg/sentry/syscalls/linux/vfs2/BUILD | 1 + pkg/sentry/syscalls/linux/vfs2/splice.go | 286 +++++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/vfs2.go | 4 +- pkg/sentry/vfs/file_description.go | 5 + test/syscalls/linux/splice.cc | 49 ++++++ 11 files changed, 648 insertions(+), 43 deletions(-) create mode 100644 pkg/sentry/kernel/pipe/pipe_unsafe.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/splice.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/buffer/safemem.go b/pkg/buffer/safemem.go index 0e5b86344..b789e56e9 100644 --- a/pkg/buffer/safemem.go +++ b/pkg/buffer/safemem.go @@ -28,12 +28,11 @@ func (b *buffer) ReadBlock() safemem.Block { return safemem.BlockFromSafeSlice(b.ReadSlice()) } -// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. -// -// This will advance the write index. -func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { - need := int(srcs.NumBytes()) - if need == 0 { +// WriteFromSafememReader writes up to count bytes from r to v and advances the +// write index by the number of bytes written. It calls r.ReadToBlocks() at +// most once. +func (v *View) WriteFromSafememReader(r safemem.Reader, count uint64) (uint64, error) { + if count == 0 { return 0, nil } @@ -50,32 +49,33 @@ func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { } // Does the last block have sufficient capacity alone? 
- if l := firstBuf.WriteSize(); l >= need { - dst = safemem.BlockSeqOf(firstBuf.WriteBlock()) + if l := uint64(firstBuf.WriteSize()); l >= count { + dst = safemem.BlockSeqOf(firstBuf.WriteBlock().TakeFirst64(count)) } else { // Append blocks until sufficient. - need -= l + count -= l blocks = append(blocks, firstBuf.WriteBlock()) - for need > 0 { + for count > 0 { emptyBuf := bufferPool.Get().(*buffer) v.data.PushBack(emptyBuf) - need -= emptyBuf.WriteSize() - blocks = append(blocks, emptyBuf.WriteBlock()) + block := emptyBuf.WriteBlock().TakeFirst64(count) + count -= uint64(block.Len()) + blocks = append(blocks, block) } dst = safemem.BlockSeqFromSlice(blocks) } - // Perform the copy. - n, err := safemem.CopySeq(dst, srcs) + // Perform I/O. + n, err := r.ReadToBlocks(dst) v.size += int64(n) // Update all indices. - for left := int(n); left > 0; firstBuf = firstBuf.Next() { - if l := firstBuf.WriteSize(); left >= l { + for left := n; left > 0; firstBuf = firstBuf.Next() { + if l := firstBuf.WriteSize(); left >= uint64(l) { firstBuf.WriteMove(l) // Whole block. - left -= l + left -= uint64(l) } else { - firstBuf.WriteMove(left) // Partial block. + firstBuf.WriteMove(int(left)) // Partial block. left = 0 } } @@ -83,14 +83,16 @@ func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { return n, err } -// ReadToBlocks implements safemem.Reader.ReadToBlocks. -// -// This will not advance the read index; the caller should follow -// this call with a call to TrimFront in order to remove the read -// data from the buffer. This is done to support pipe sematics. -func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { - need := int(dsts.NumBytes()) - if need == 0 { +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. It advances the +// write index by the number of bytes written. 
+func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + return v.WriteFromSafememReader(&safemem.BlockSeqReader{srcs}, srcs.NumBytes()) +} + +// ReadToSafememWriter reads up to count bytes from v to w. It does not advance +// the read index. It calls w.WriteFromBlocks() at most once. +func (v *View) ReadToSafememWriter(w safemem.Writer, count uint64) (uint64, error) { + if count == 0 { return 0, nil } @@ -105,25 +107,27 @@ func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { } // Is all the data in a single block? - if l := firstBuf.ReadSize(); l >= need { - src = safemem.BlockSeqOf(firstBuf.ReadBlock()) + if l := uint64(firstBuf.ReadSize()); l >= count { + src = safemem.BlockSeqOf(firstBuf.ReadBlock().TakeFirst64(count)) } else { // Build a list of all the buffers. - need -= l + count -= l blocks = append(blocks, firstBuf.ReadBlock()) - for buf := firstBuf.Next(); buf != nil && need > 0; buf = buf.Next() { - need -= buf.ReadSize() - blocks = append(blocks, buf.ReadBlock()) + for buf := firstBuf.Next(); buf != nil && count > 0; buf = buf.Next() { + block := buf.ReadBlock().TakeFirst64(count) + count -= uint64(block.Len()) + blocks = append(blocks, block) } src = safemem.BlockSeqFromSlice(blocks) } - // Perform the copy. - n, err := safemem.CopySeq(dsts, src) - - // See above: we would normally advance the read index here, but we - // don't do that in order to support pipe semantics. We rely on a - // separate call to TrimFront() in this case. + // Perform I/O. As documented, we don't advance the read index. + return w.WriteFromBlocks(src) +} - return n, err +// ReadToBlocks implements safemem.Reader.ReadToBlocks. It does not advance the +// read index by the number of bytes read, such that it's only safe to call if +// the caller guarantees that ReadToBlocks will only be called once. 
+func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + return v.ReadToSafememWriter(&safemem.BlockSeqWriter{dsts}, dsts.NumBytes()) } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 3f433d666..fee174375 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -312,7 +312,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off f := fd.inode().impl.(*regularFile) if end := offset + srclen; end < offset { // Overflow. - return 0, syserror.EFBIG + return 0, syserror.EINVAL } var err error diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index f29dc0472..7bfa9075a 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -8,6 +8,7 @@ go_library( "device.go", "node.go", "pipe.go", + "pipe_unsafe.go", "pipe_util.go", "reader.go", "reader_writer.go", @@ -20,6 +21,7 @@ go_library( "//pkg/amutex", "//pkg/buffer", "//pkg/context", + "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs", diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 62c8691f1..79645d7d2 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -207,7 +207,10 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) { p.mu.Lock() defer p.mu.Unlock() + return p.readLocked(ctx, ops) +} +func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) { // Is the pipe empty? if p.view.Size() == 0 { if !p.HasWriters() { @@ -246,7 +249,10 @@ type writeOps struct { func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) { p.mu.Lock() defer p.mu.Unlock() + return p.writeLocked(ctx, ops) +} +func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) { // Can't write to a pipe with no readers. 
if !p.HasReaders() { return 0, syscall.EPIPE diff --git a/pkg/sentry/kernel/pipe/pipe_unsafe.go b/pkg/sentry/kernel/pipe/pipe_unsafe.go new file mode 100644 index 000000000..dd60cba24 --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe_unsafe.go @@ -0,0 +1,35 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "unsafe" +) + +// lockTwoPipes locks both x.mu and y.mu in an order that is guaranteed to be +// consistent for both lockTwoPipes(x, y) and lockTwoPipes(y, x), such that +// concurrent calls cannot deadlock. +// +// Preconditions: x != y. +func lockTwoPipes(x, y *Pipe) { + // Lock the two pipes in order of increasing address. + if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) { + x.mu.Lock() + y.mu.Lock() + } else { + y.mu.Lock() + x.mu.Lock() + } +} diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index b54f08a30..2602bed72 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -16,7 +16,9 @@ package pipe import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -150,7 +152,9 @@ func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) * return &fd.vfsfd } -// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. 
+// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements +// non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to +// other FileDescriptions for splice(2) and tee(2). type VFSPipeFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -229,3 +233,216 @@ func (fd *VFSPipeFD) PipeSize() int64 { func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) { return fd.pipe.SetFifoSize(size) } + +// IOSequence returns a usermem.IOSequence that reads up to count bytes from, +// or writes up to count bytes to, fd. +func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence { + return usermem.IOSequence{ + IO: fd, + Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}), + } +} + +// CopyIn implements usermem.IO.CopyIn. +func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) { + origCount := int64(len(dst)) + n, err := fd.pipe.read(ctx, readOps{ + left: func() int64 { + return int64(len(dst)) + }, + limit: func(l int64) { + dst = dst[:l] + }, + read: func(view *buffer.View) (int64, error) { + n, err := view.ReadAt(dst, 0) + view.TrimFront(int64(n)) + return int64(n), err + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventOut) + } + if err == nil && n != origCount { + return int(n), syserror.ErrWouldBlock + } + return int(n), err +} + +// CopyOut implements usermem.IO.CopyOut. 
+func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) { + origCount := int64(len(src)) + n, err := fd.pipe.write(ctx, writeOps{ + left: func() int64 { + return int64(len(src)) + }, + limit: func(l int64) { + src = src[:l] + }, + write: func(view *buffer.View) (int64, error) { + view.Append(src) + return int64(len(src)), nil + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventIn) + } + if err == nil && n != origCount { + return int(n), syserror.ErrWouldBlock + } + return int(n), err +} + +// ZeroOut implements usermem.IO.ZeroOut. +func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { + origCount := toZero + n, err := fd.pipe.write(ctx, writeOps{ + left: func() int64 { + return toZero + }, + limit: func(l int64) { + toZero = l + }, + write: func(view *buffer.View) (int64, error) { + view.Grow(view.Size()+toZero, true /* zero */) + return toZero, nil + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventIn) + } + if err == nil && n != origCount { + return n, syserror.ErrWouldBlock + } + return n, err +} + +// CopyInTo implements usermem.IO.CopyInTo. +func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { + count := ars.NumBytes() + if count == 0 { + return 0, nil + } + origCount := count + n, err := fd.pipe.read(ctx, readOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + read: func(view *buffer.View) (int64, error) { + n, err := view.ReadToSafememWriter(dst, uint64(count)) + view.TrimFront(int64(n)) + return int64(n), err + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventOut) + } + if err == nil && n != origCount { + return n, syserror.ErrWouldBlock + } + return n, err +} + +// CopyOutFrom implements usermem.IO.CopyOutFrom. 
+func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { + count := ars.NumBytes() + if count == 0 { + return 0, nil + } + origCount := count + n, err := fd.pipe.write(ctx, writeOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + write: func(view *buffer.View) (int64, error) { + n, err := view.WriteFromSafememReader(src, uint64(count)) + return int64(n), err + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventIn) + } + if err == nil && n != origCount { + return n, syserror.ErrWouldBlock + } + return n, err +} + +// SwapUint32 implements usermem.IO.SwapUint32. +func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { + // How did a pipe get passed as the virtual address space to futex(2)? + panic("VFSPipeFD.SwapUint32 called unexpectedly") +} + +// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32. +func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { + panic("VFSPipeFD.CompareAndSwapUint32 called unexpectedly") +} + +// LoadUint32 implements usermem.IO.LoadUint32. +func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) { + panic("VFSPipeFD.LoadUint32 called unexpectedly") +} + +// Splice reads up to count bytes from src and writes them to dst. It returns +// the number of bytes moved. +// +// Preconditions: count > 0. +func Splice(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) { + return spliceOrTee(ctx, dst, src, count, true /* removeFromSrc */) +} + +// Tee reads up to count bytes from src and writes them to dst, without +// removing the read bytes from src. It returns the number of bytes copied. +// +// Preconditions: count > 0. 
+func Tee(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) { + return spliceOrTee(ctx, dst, src, count, false /* removeFromSrc */) +} + +// Preconditions: count > 0. +func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFromSrc bool) (int64, error) { + if dst.pipe == src.pipe { + return 0, syserror.EINVAL + } + + lockTwoPipes(dst.pipe, src.pipe) + defer dst.pipe.mu.Unlock() + defer src.pipe.mu.Unlock() + + n, err := dst.pipe.writeLocked(ctx, writeOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + write: func(dstView *buffer.View) (int64, error) { + return src.pipe.readLocked(ctx, readOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + read: func(srcView *buffer.View) (int64, error) { + n, err := srcView.ReadToSafememWriter(dstView, uint64(count)) + if n > 0 && removeFromSrc { + srcView.TrimFront(int64(n)) + } + return int64(n), err + }, + }) + }, + }) + if n > 0 { + dst.pipe.Notify(waiter.EventIn) + src.pipe.Notify(waiter.EventOut) + } + return n, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index f882ef840..d56927ff5 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -22,6 +22,7 @@ go_library( "setstat.go", "signal.go", "socket.go", + "splice.go", "stat.go", "stat_amd64.go", "stat_arm64.go", diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go new file mode 100644 index 000000000..8f3c22a02 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -0,0 +1,286 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// Splice implements Linux syscall splice(2). +func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + inFD := args[0].Int() + inOffsetPtr := args[1].Pointer() + outFD := args[2].Int() + outOffsetPtr := args[3].Pointer() + count := int64(args[4].SizeT()) + flags := args[5].Int() + + if count == 0 { + return 0, nil, nil + } + if count > int64(kernel.MAX_RW_COUNT) { + count = int64(kernel.MAX_RW_COUNT) + } + + // Check for invalid flags. + if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { + return 0, nil, syserror.EINVAL + } + + // Get file descriptions. + inFile := t.GetFileVFS2(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef() + outFile := t.GetFileVFS2(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef() + + // Check that both files support the required directionality. + if !inFile.IsReadable() || !outFile.IsWritable() { + return 0, nil, syserror.EBADF + } + + // The operation is non-blocking if anything is non-blocking. + // + // N.B. 
This is a rather simplistic heuristic that avoids some + // poor edge case behavior since the exact semantics here are + // underspecified and vary between versions of Linux itself. + nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0) + + // At least one file description must represent a pipe. + inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) + outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) + if !inIsPipe && !outIsPipe { + return 0, nil, syserror.EINVAL + } + + // Copy in offsets. + inOffset := int64(-1) + if inOffsetPtr != 0 { + if inIsPipe { + return 0, nil, syserror.ESPIPE + } + if inFile.Options().DenyPRead { + return 0, nil, syserror.EINVAL + } + if _, err := t.CopyIn(inOffsetPtr, &inOffset); err != nil { + return 0, nil, err + } + if inOffset < 0 { + return 0, nil, syserror.EINVAL + } + } + outOffset := int64(-1) + if outOffsetPtr != 0 { + if outIsPipe { + return 0, nil, syserror.ESPIPE + } + if outFile.Options().DenyPWrite { + return 0, nil, syserror.EINVAL + } + if _, err := t.CopyIn(outOffsetPtr, &outOffset); err != nil { + return 0, nil, err + } + if outOffset < 0 { + return 0, nil, syserror.EINVAL + } + } + + // Move data. + var ( + n int64 + err error + inCh chan struct{} + outCh chan struct{} + ) + for { + // If both input and output are pipes, delegate to the pipe + // implementation. Otherwise, exactly one end is a pipe, which we + // ensure is consistently ordered after the non-pipe FD's locks by + // passing the pipe FD as usermem.IO to the non-pipe end. 
+ switch { + case inIsPipe && outIsPipe: + n, err = pipe.Splice(t, outPipeFD, inPipeFD, count) + case inIsPipe: + if outOffset != -1 { + n, err = outFile.PWrite(t, inPipeFD.IOSequence(count), outOffset, vfs.WriteOptions{}) + outOffset += n + } else { + n, err = outFile.Write(t, inPipeFD.IOSequence(count), vfs.WriteOptions{}) + } + case outIsPipe: + if inOffset != -1 { + n, err = inFile.PRead(t, outPipeFD.IOSequence(count), inOffset, vfs.ReadOptions{}) + inOffset += n + } else { + n, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{}) + } + } + if n != 0 || err != syserror.ErrWouldBlock || nonBlock { + break + } + + // Note that the blocking behavior here is a bit different than the + // normal pattern. Because we need to have both data to read and data + // to write simultaneously, we actually explicitly block on both of + // these cases in turn before returning to the splice operation. + if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { + if inCh == nil { + inCh = make(chan struct{}, 1) + inW, _ := waiter.NewChannelEntry(inCh) + inFile.EventRegister(&inW, eventMaskRead) + defer inFile.EventUnregister(&inW) + continue // Need to refresh readiness. + } + if err = t.Block(inCh); err != nil { + break + } + } + if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 { + if outCh == nil { + outCh = make(chan struct{}, 1) + outW, _ := waiter.NewChannelEntry(outCh) + outFile.EventRegister(&outW, eventMaskWrite) + defer outFile.EventUnregister(&outW) + continue // Need to refresh readiness. + } + if err = t.Block(outCh); err != nil { + break + } + } + } + + // Copy updated offsets out. + if inOffsetPtr != 0 { + if _, err := t.CopyOut(inOffsetPtr, &inOffset); err != nil { + return 0, nil, err + } + } + if outOffsetPtr != 0 { + if _, err := t.CopyOut(outOffsetPtr, &outOffset); err != nil { + return 0, nil, err + } + } + + if n == 0 { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Tee implements Linux syscall tee(2). 
+func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + inFD := args[0].Int() + outFD := args[1].Int() + count := int64(args[2].SizeT()) + flags := args[3].Int() + + if count == 0 { + return 0, nil, nil + } + if count > int64(kernel.MAX_RW_COUNT) { + count = int64(kernel.MAX_RW_COUNT) + } + + // Check for invalid flags. + if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { + return 0, nil, syserror.EINVAL + } + + // Get file descriptions. + inFile := t.GetFileVFS2(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef() + outFile := t.GetFileVFS2(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef() + + // Check that both files support the required directionality. + if !inFile.IsReadable() || !outFile.IsWritable() { + return 0, nil, syserror.EBADF + } + + // The operation is non-blocking if anything is non-blocking. + // + // N.B. This is a rather simplistic heuristic that avoids some + // poor edge case behavior since the exact semantics here are + // underspecified and vary between versions of Linux itself. + nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0) + + // Both file descriptions must represent pipes. + inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) + outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) + if !inIsPipe || !outIsPipe { + return 0, nil, syserror.EINVAL + } + + // Copy data. + var ( + inCh chan struct{} + outCh chan struct{} + ) + for { + n, err := pipe.Tee(t, outPipeFD, inPipeFD, count) + if n != 0 { + return uintptr(n), nil, nil + } + if err != syserror.ErrWouldBlock || nonBlock { + return 0, nil, err + } + + // Note that the blocking behavior here is a bit different than the + // normal pattern. 
Because we need to have both data to read and data + // to write simultaneously, we actually explicitly block on both of + // these cases in turn before returning to the tee operation. + if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { + if inCh == nil { + inCh = make(chan struct{}, 1) + inW, _ := waiter.NewChannelEntry(inCh) + inFile.EventRegister(&inW, eventMaskRead) + defer inFile.EventUnregister(&inW) + continue // Need to refresh readiness. + } + if err := t.Block(inCh); err != nil { + return 0, nil, err + } + } + if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 { + if outCh == nil { + outCh = make(chan struct{}, 1) + outW, _ := waiter.NewChannelEntry(outCh) + outFile.EventRegister(&outW, eventMaskWrite) + defer outFile.EventUnregister(&outW) + continue // Need to refresh readiness. + } + if err := t.Block(outCh); err != nil { + return 0, nil, err + } + } + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index a332d01bd..083fdcf82 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -134,8 +134,8 @@ func Override() { s.Table[269] = syscalls.Supported("faccessat", Faccessat) s.Table[270] = syscalls.Supported("pselect", Pselect) s.Table[271] = syscalls.Supported("ppoll", Ppoll) - delete(s.Table, 275) // splice - delete(s.Table, 276) // tee + s.Table[275] = syscalls.Supported("splice", Splice) + s.Table[276] = syscalls.Supported("tee", Tee) s.Table[277] = syscalls.Supported("sync_file_range", SyncFileRange) s.Table[280] = syscalls.Supported("utimensat", Utimensat) s.Table[281] = syscalls.Supported("epoll_pwait", EpollPwait) diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index cfabd936c..bb294563d 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -210,6 +210,11 @@ func (fd *FileDescription) VirtualDentry() VirtualDentry { return fd.vd } +// Options returns the options passed 
to fd.Init(). +func (fd *FileDescription) Options() FileDescriptionOptions { + return fd.opts +} + // StatusFlags returns file description status flags, as for fcntl(F_GETFL). func (fd *FileDescription) StatusFlags() uint32 { return atomic.LoadUint32(&fd.statusFlags) diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc index f103e2e56..08fc4b1b7 100644 --- a/test/syscalls/linux/splice.cc +++ b/test/syscalls/linux/splice.cc @@ -430,6 +430,55 @@ TEST(SpliceTest, TwoPipes) { EXPECT_EQ(memcmp(rbuf.data(), buf.data(), kPageSize), 0); } +TEST(SpliceTest, TwoPipesCircular) { + // This test deadlocks the sentry on VFS1 because VFS1 splice ordering is + // based on fs.File.UniqueID, which does not prevent circular ordering between + // e.g. inode-level locks taken by fs.FileOperations. + SKIP_IF(IsRunningWithVFS1()); + + // Create two pipes. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor first_rfd(fds[0]); + const FileDescriptor first_wfd(fds[1]); + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor second_rfd(fds[0]); + const FileDescriptor second_wfd(fds[1]); + + // On Linux, each pipe is normally limited to + // include/linux/pipe_fs_i.h:PIPE_DEF_BUFFERS buffers worth of data. + constexpr size_t PIPE_DEF_BUFFERS = 16; + + // Write some data to each pipe. Below we splice 1 byte at a time between + // pipes, which very quickly causes each byte to be stored in a separate + // buffer, so we must ensure that the total amount of data in the system is <= + // PIPE_DEF_BUFFERS bytes. + std::vector<char> buf(PIPE_DEF_BUFFERS / 2); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(first_wfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + ASSERT_THAT(write(second_wfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + + // Have another thread splice from the second pipe to the first, while we + // splice from the first to the second.
The test passes if this does not + // deadlock. + const int kIterations = 1000; + DisableSave ds; + ScopedThread t([&]() { + for (int i = 0; i < kIterations; i++) { + ASSERT_THAT( + splice(second_rfd.get(), nullptr, first_wfd.get(), nullptr, 1, 0), + SyscallSucceedsWithValue(1)); + } + }); + for (int i = 0; i < kIterations; i++) { + ASSERT_THAT( + splice(first_rfd.get(), nullptr, second_wfd.get(), nullptr, 1, 0), + SyscallSucceedsWithValue(1)); + } +} + TEST(SpliceTest, Blocking) { // Create two new pipes. int first[2], second[2]; -- cgit v1.2.3 From fe464f44b7d3696bafd9a2faf3750e1dc4d56d80 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 29 May 2020 08:07:44 -0700 Subject: Port inotify to vfs2, with support in tmpfs. Support in other filesystem impls is still needed. Unlike in Linux and vfs1, we need to plumb inotify down to each filesystem implementation in order to keep track of links/inode structures properly. IN_EXCL_UNLINK still needs to be implemented, as well as a few inotify hooks that are not present in either vfs1 or vfs2. Those will be addressed in subsequent changes. Updates #1479. 
PiperOrigin-RevId: 313781995 --- pkg/sentry/fsimpl/ext/dentry.go | 12 + pkg/sentry/fsimpl/gofer/gofer.go | 12 + pkg/sentry/fsimpl/kernfs/kernfs.go | 12 + pkg/sentry/fsimpl/tmpfs/BUILD | 1 + pkg/sentry/fsimpl/tmpfs/directory.go | 1 + pkg/sentry/fsimpl/tmpfs/filesystem.go | 40 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 60 ++- pkg/sentry/kernel/fd_table.go | 8 +- pkg/sentry/syscalls/linux/vfs2/BUILD | 1 + pkg/sentry/syscalls/linux/vfs2/inotify.go | 134 ++++++ pkg/sentry/syscalls/linux/vfs2/read_write.go | 36 ++ pkg/sentry/syscalls/linux/vfs2/vfs2.go | 8 +- pkg/sentry/vfs/BUILD | 15 + pkg/sentry/vfs/anonfs.go | 12 + pkg/sentry/vfs/dentry.go | 27 ++ pkg/sentry/vfs/inotify.go | 675 +++++++++++++++++++++++++++ pkg/sentry/vfs/vfs.go | 1 + test/syscalls/linux/BUILD | 5 +- test/syscalls/linux/inotify.cc | 176 ++++++- 19 files changed, 1211 insertions(+), 25 deletions(-) create mode 100644 pkg/sentry/syscalls/linux/vfs2/inotify.go create mode 100644 pkg/sentry/vfs/inotify.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go index bfbd7c3d4..4d0deaf03 100644 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -60,3 +60,15 @@ func (d *dentry) DecRef() { // inode.decRef(). d.inode.decRef() } + +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *dentry) InotifyWithParent(events uint32, cookie uint32) {} + +// Watches implements vfs.DentryImpl.Watches. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *dentry) Watches() *vfs.Watches { + return nil +} diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 131da332f..850482a19 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1039,6 +1039,18 @@ func (d *dentry) decRefLocked() { } } +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. 
+// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *dentry) InotifyWithParent(events uint32, cookie uint32) {} + +// Watches implements vfs.DentryImpl.Watches. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *dentry) Watches() *vfs.Watches { + return nil +} + // checkCachingLocked should be called after d's reference count becomes 0 or it // becomes disowned. // diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index a83151ad3..682545994 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -225,6 +225,18 @@ func (d *Dentry) destroy() { } } +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *Dentry) InotifyWithParent(events uint32, cookie uint32) {} + +// Watches implements vfs.DentryImpl.Watches. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *Dentry) Watches() *vfs.Watches { + return nil +} + // InsertChild inserts child into the vfs dentry cache with the given name under // this dentry. This does not update the directory inode, so calling this on // it's own isn't sufficient to insert a child into a directory. 
InsertChild diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 007be1572..062321cbc 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -59,6 +59,7 @@ go_library( "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/uniqueid", "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/sentry/vfs/lock", diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index f2399981b..8bc475f88 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -112,6 +112,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba dir.iterMu.Lock() defer dir.iterMu.Unlock() + fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0) fd.inode().touchAtime(fd.vfsfd.Mount()) if fd.off == 0 { diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 7c04570f1..b4159f904 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -177,6 +177,12 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if err := create(parentDir, name); err != nil { return err } + + ev := linux.IN_CREATE + if dir { + ev |= linux.IN_ISDIR + } + parentDir.inode.watches.Notify(name, uint32(ev), 0) parentDir.inode.touchCMtime() return nil } @@ -241,6 +247,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. 
return syserror.EMLINK } d.inode.incLinksLocked() + d.inode.watches.Notify("", linux.IN_ATTRIB, 0) parentDir.insertChildLocked(fs.newDentry(d.inode), name) return nil }) @@ -354,6 +361,7 @@ afterTrailingSymlink: if err != nil { return nil, err } + parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0) parentDir.inode.touchCMtime() return fd, nil } @@ -559,6 +567,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa newParentDir.inode.touchCMtime() } renamed.inode.touchCtime() + + vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir()) return nil } @@ -603,6 +613,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } parentDir.removeChildLocked(child) + parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0) // Remove links for child, child/., and child/.. child.inode.decLinksLocked() child.inode.decLinksLocked() @@ -620,7 +631,14 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts if err != nil { return err } - return d.inode.setStat(ctx, rp.Credentials(), &opts.Stat) + if err := d.inode.setStat(ctx, rp.Credentials(), &opts.Stat); err != nil { + return err + } + + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + d.InotifyWithParent(ev, 0) + } + return nil } // StatAt implements vfs.FilesystemImpl.StatAt. @@ -700,6 +718,12 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } + + // Generate inotify events. Note that this must take place before the link + // count of the child is decremented, or else the watches may be dropped + // before these events are added. 
+ vfs.InotifyRemoveChild(&child.inode.watches, &parentDir.inode.watches, name) + parentDir.removeChildLocked(child) child.inode.decLinksLocked() vfsObj.CommitDeleteDentry(&child.vfsd) @@ -756,7 +780,12 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt if err != nil { return err } - return d.inode.setxattr(rp.Credentials(), &opts) + if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil { + return err + } + + d.InotifyWithParent(linux.IN_ATTRIB, 0) + return nil } // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. @@ -767,7 +796,12 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, if err != nil { return err } - return d.inode.removexattr(rp.Credentials(), name) + if err := d.inode.removexattr(rp.Credentials(), name); err != nil { + return err + } + + d.InotifyWithParent(linux.IN_ATTRIB, 0) + return nil } // PrependPath implements vfs.FilesystemImpl.PrependPath. diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index b739095b7..1d83b6840 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -201,6 +201,26 @@ func (d *dentry) DecRef() { d.inode.decRef() } +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +func (d *dentry) InotifyWithParent(events uint32, cookie uint32) { + if d.inode.isDir() { + events |= linux.IN_ISDIR + } + + // The ordering below is important, Linux always notifies the parent first. + if d.parent != nil { + // Note that d.parent or d.name may be stale if there is a concurrent + // rename operation. Inotify does not provide consistency guarantees. + d.parent.inode.watches.Notify(d.name, events, cookie) + } + d.inode.watches.Notify("", events, cookie) +} + +// Watches implements vfs.DentryImpl.Watches. +func (d *dentry) Watches() *vfs.Watches { + return &d.inode.watches +} + // inode represents a filesystem object. type inode struct { // fs is the owning filesystem. fs is immutable. 
@@ -236,6 +256,9 @@ type inode struct { // Advisory file locks, which lock at the inode level. locks lock.FileLocks + // Inotify watches for this inode. + watches vfs.Watches + impl interface{} // immutable } @@ -257,6 +280,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, i.ctime = now i.mtime = now // i.nlink initialized by caller + i.watches = vfs.Watches{} i.impl = impl } @@ -307,6 +331,7 @@ func (i *inode) tryIncRef() bool { func (i *inode) decRef() { if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { + i.watches.HandleDeletion() if regFile, ok := i.impl.(*regularFile); ok { // Release memory used by regFile to store data. Since regFile is // no longer usable, we don't need to grab any locks or update any @@ -628,8 +653,12 @@ func (fd *fileDescription) filesystem() *filesystem { return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) } +func (fd *fileDescription) dentry() *dentry { + return fd.vfsfd.Dentry().Impl().(*dentry) +} + func (fd *fileDescription) inode() *inode { - return fd.vfsfd.Dentry().Impl().(*dentry).inode + return fd.dentry().inode } // Stat implements vfs.FileDescriptionImpl.Stat. @@ -642,7 +671,16 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) - return fd.inode().setStat(ctx, creds, &opts.Stat) + d := fd.dentry() + if err := d.inode.setStat(ctx, creds, &opts.Stat); err != nil { + return err + } + + // Generate inotify events. + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + d.InotifyWithParent(ev, 0) + } + return nil } // Listxattr implements vfs.FileDescriptionImpl.Listxattr. @@ -657,12 +695,26 @@ func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOption // Setxattr implements vfs.FileDescriptionImpl.Setxattr. 
func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { - return fd.inode().setxattr(auth.CredentialsFromContext(ctx), &opts) + d := fd.dentry() + if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil { + return err + } + + // Generate inotify events. + d.InotifyWithParent(linux.IN_ATTRIB, 0) + return nil } // Removexattr implements vfs.FileDescriptionImpl.Removexattr. func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { - return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name) + d := fd.dentry() + if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil { + return err + } + + // Generate inotify events. + d.InotifyWithParent(linux.IN_ATTRIB, 0) + return nil } // NewMemfd creates a new tmpfs regular file and file description that can back diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index ed40b5303..ef73e1169 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -152,7 +152,13 @@ func (f *FDTable) drop(file *fs.File) { // dropVFS2 drops the table reference. func (f *FDTable) dropVFS2(file *vfs.FileDescription) { // TODO(gvisor.dev/issue/1480): Release locks. - // TODO(gvisor.dev/issue/1479): Send inotify events. + + // Generate inotify events. + ev := uint32(linux.IN_CLOSE_NOWRITE) + if file.IsWritable() { + ev = linux.IN_CLOSE_WRITE + } + file.Dentry().InotifyWithParent(ev, 0) // Drop the table reference. 
file.DecRef() diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index d56927ff5..9c8b44f64 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -12,6 +12,7 @@ go_library( "filesystem.go", "fscontext.go", "getdents.go", + "inotify.go", "ioctl.go", "memfd.go", "mmap.go", diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go new file mode 100644 index 000000000..7d50b6a16 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go @@ -0,0 +1,134 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC + +// InotifyInit1 implements the inotify_init1() syscalls. 
+func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + if flags&^allFlags != 0 { + return 0, nil, syserror.EINVAL + } + + ino, err := vfs.NewInotifyFD(t, t.Kernel().VFS(), uint32(flags)) + if err != nil { + return 0, nil, err + } + defer ino.DecRef() + + fd, err := t.NewFDFromVFS2(0, ino, kernel.FDFlags{ + CloseOnExec: flags&linux.IN_CLOEXEC != 0, + }) + + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// InotifyInit implements the inotify_init() syscalls. +func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + args[0].Value = 0 + return InotifyInit1(t, args) +} + +// fdToInotify resolves an fd to an inotify object. If successful, the file will +// have an extra ref and the caller is responsible for releasing the ref. +func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, error) { + f := t.GetFileVFS2(fd) + if f == nil { + // Invalid fd. + return nil, nil, syserror.EBADF + } + + ino, ok := f.Impl().(*vfs.Inotify) + if !ok { + // Not an inotify fd. + f.DecRef() + return nil, nil, syserror.EINVAL + } + + return ino, f, nil +} + +// InotifyAddWatch implements the inotify_add_watch() syscall. +func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + mask := args[2].Uint() + + // "EINVAL: The given event mask contains no valid events." + // -- inotify_add_watch(2) + if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 { + return 0, nil, syserror.EINVAL + } + + // "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link." 
+ // -- inotify(7) + follow := followFinalSymlink + if mask&linux.IN_DONT_FOLLOW != 0 { + follow = nofollowFinalSymlink + } + + ino, f, err := fdToInotify(t, fd) + if err != nil { + return 0, nil, err + } + defer f.DecRef() + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + if mask&linux.IN_ONLYDIR != 0 { + path.Dir = true + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, follow) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{}) + if err != nil { + return 0, nil, err + } + defer d.DecRef() + + fd = ino.AddWatch(d.Dentry(), mask) + return uintptr(fd), nil, err +} + +// InotifyRmWatch implements the inotify_rm_watch() syscall. +func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + wd := args[1].Int() + + ino, f, err := fdToInotify(t, fd) + if err != nil { + return 0, nil, err + } + defer f.DecRef() + return 0, nil, ino.RmWatch(wd) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go index 3a7ef24f5..92b5631a3 100644 --- a/pkg/sentry/syscalls/linux/vfs2/read_write.go +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -93,11 +93,17 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { n, err := file.Read(t, dst, opts) if err != syserror.ErrWouldBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + } return n, err } @@ -128,6 +134,9 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt }
file.EventUnregister(&w) + if total > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + } return total, err } @@ -248,11 +257,17 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { n, err := file.PRead(t, dst, offset, opts) if err != syserror.ErrWouldBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + } return n, err } @@ -283,6 +298,9 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of } file.EventUnregister(&w) + if total > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + } return total, err } @@ -345,11 +363,17 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { n, err := file.Write(t, src, opts) if err != syserror.ErrWouldBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + } return n, err } @@ -380,6 +404,9 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op } file.EventUnregister(&w) + if total > 0 { + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + } return total, err } @@ -500,11 +527,17 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { n, err := file.PWrite(t, src, offset, opts) if err != syserror.ErrWouldBlock { + if n > 0 { + 
file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + } return n, err } @@ -535,6 +568,9 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o } file.EventUnregister(&w) + if total > 0 { + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + } return total, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 083fdcf82..ef8358b8a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -116,9 +116,9 @@ func Override() { s.Table[232] = syscalls.Supported("epoll_wait", EpollWait) s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl) s.Table[235] = syscalls.Supported("utimes", Utimes) - delete(s.Table, 253) // inotify_init - delete(s.Table, 254) // inotify_add_watch - delete(s.Table, 255) // inotify_rm_watch + s.Table[253] = syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil) + s.Table[254] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil) + s.Table[255] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil) s.Table[257] = syscalls.Supported("openat", Openat) s.Table[258] = syscalls.Supported("mkdirat", Mkdirat) s.Table[259] = syscalls.Supported("mknodat", Mknodat) @@ -151,7 +151,7 @@ func Override() { s.Table[291] = syscalls.Supported("epoll_create1", EpollCreate1) s.Table[292] = syscalls.Supported("dup3", Dup3) s.Table[293] = syscalls.Supported("pipe2", Pipe2) - delete(s.Table, 294) // inotify_init1 + s.Table[294] = syscalls.PartiallySupported("inotify_init1", InotifyInit1, "inotify events are only available inside the sandbox.", nil) s.Table[295] =
syscalls.Supported("preadv", Preadv) s.Table[296] = syscalls.Supported("pwritev", Pwritev) s.Table[299] = syscalls.Supported("recvmmsg", RecvMMsg) diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 94d69c1cc..774cc66cc 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -15,6 +15,18 @@ go_template_instance( }, ) +go_template_instance( + name = "event_list", + out = "event_list.go", + package = "vfs", + prefix = "event", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Event", + "Linker": "*Event", + }, +) + go_library( name = "vfs", srcs = [ @@ -25,11 +37,13 @@ go_library( "device.go", "epoll.go", "epoll_interest_list.go", + "event_list.go", "file_description.go", "file_description_impl_util.go", "filesystem.go", "filesystem_impl_util.go", "filesystem_type.go", + "inotify.go", "mount.go", "mount_unsafe.go", "options.go", @@ -57,6 +71,7 @@ go_library( "//pkg/sentry/limits", "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/uniqueid", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index caf770fd5..55a3d54cc 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -297,3 +297,15 @@ func (d *anonDentry) TryIncRef() bool { func (d *anonDentry) DecRef() { // no-op } + +// InotifyWithParent implements DentryImpl.InotifyWithParent. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *anonDentry) InotifyWithParent(events uint32, cookie uint32) {} + +// Watches implements DentryImpl.Watches. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *anonDentry) Watches() *Watches { + return nil +} diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 8624dbd5d..d61b9e09b 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -103,6 +103,22 @@ type DentryImpl interface { // DecRef decrements the Dentry's reference count. 
DecRef() + + // InotifyWithParent notifies all watches on the targets represented by this + // dentry and its parent. The parent's watches are notified first, followed + // by this dentry's. + // + // InotifyWithParent automatically adds the IN_ISDIR flag for dentries + // representing directories. + // + // Note that the events may not actually propagate up to the user, depending + // on the event masks. + InotifyWithParent(events uint32, cookie uint32) + + // Watches returns the set of inotify watches for the file corresponding to + // the Dentry. Dentries that are hard links to the same underlying file + // share the same watches. + Watches() *Watches } // IncRef increments d's reference count. @@ -133,6 +149,17 @@ func (d *Dentry) isMounted() bool { return atomic.LoadUint32(&d.mounts) != 0 } +// InotifyWithParent notifies all watches on the inodes for this dentry and +// its parent of events. +func (d *Dentry) InotifyWithParent(events uint32, cookie uint32) { + d.impl.InotifyWithParent(events, cookie) +} + +// Watches returns the set of inotify watches associated with d. +func (d *Dentry) Watches() *Watches { + return d.impl.Watches() +} + // The following functions are exported so that filesystem implementations can // use them. The vfs package, and users of VFS, should not call these // functions. diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go new file mode 100644 index 000000000..1d28ccb46 --- /dev/null +++ b/pkg/sentry/vfs/inotify.go @@ -0,0 +1,675 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "bytes" + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// inotifyEventBaseSize is the base size of linux's struct inotify_event. This +// must be a power 2 for rounding below. +const inotifyEventBaseSize = 16 + +// Inotify represents an inotify instance created by inotify_init(2) or +// inotify_init1(2). Inotify implements FileDescriptionImpl. +// +// Lock ordering: +// Inotify.mu -> Watches.mu -> Inotify.evMu +// +// +stateify savable +type Inotify struct { + vfsfd FileDescription + FileDescriptionDefaultImpl + DentryMetadataFileDescriptionImpl + + // Unique identifier for this inotify instance. We don't just reuse the + // inotify fd because fds can be duped. These should not be exposed to the + // user, since we may aggressively reuse an id on S/R. + id uint64 + + // queue is used to notify interested parties when the inotify instance + // becomes readable or writable. + queue waiter.Queue `state:"nosave"` + + // evMu *only* protects the events list. We need a separate lock while + // queuing events: using mu may violate lock ordering, since at that point + // the calling goroutine may already hold Watches.mu. + evMu sync.Mutex `state:"nosave"` + + // A list of pending events for this inotify instance. Protected by evMu. 
+ events eventList + + // A scratch buffer, used to serialize inotify events. Allocate this + // ahead of time for the sake of performance. Protected by evMu. + scratch []byte + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // nextWatchMinusOne is used to allocate watch descriptors on this Inotify + // instance. Note that Linux starts numbering watch descriptors from 1. + nextWatchMinusOne int32 + + // Map from watch descriptors to watch objects. + watches map[int32]*Watch +} + +var _ FileDescriptionImpl = (*Inotify)(nil) + +// NewInotifyFD constructs a new Inotify instance. +func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) (*FileDescription, error) { + // O_CLOEXEC affects file descriptors, so it must be handled outside of vfs. + flags &^= linux.O_CLOEXEC + if flags&^linux.O_NONBLOCK != 0 { + return nil, syserror.EINVAL + } + + id := uniqueid.GlobalFromContext(ctx) + vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id)) + defer vd.DecRef() + fd := &Inotify{ + id: id, + scratch: make([]byte, inotifyEventBaseSize), + watches: make(map[int32]*Watch), + } + if err := fd.vfsfd.Init(fd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// Release implements FileDescriptionImpl.Release. Release removes all +// watches and frees all resources for an inotify instance. +func (i *Inotify) Release() { + // We need to hold i.mu to avoid a race with concurrent calls to + // Inotify.handleDeletion from Watches. There's no risk of Watches + // accessing this Inotify after the destructor ends, because we remove all + // references to it below. + i.mu.Lock() + defer i.mu.Unlock() + for _, w := range i.watches { + // Remove references to the watch from the watches set on the target. 
We + // don't need to worry about the references from i.watches, since this + // file description is about to be destroyed. + w.set.Remove(i.id) + } +} + +// EventRegister implements waiter.Waitable. +func (i *Inotify) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + i.queue.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable. +func (i *Inotify) EventUnregister(e *waiter.Entry) { + i.queue.EventUnregister(e) +} + +// Readiness implements waiter.Waitable.Readiness. +// +// Readiness indicates whether there are pending events for an inotify instance. +func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask { + ready := waiter.EventMask(0) + + i.evMu.Lock() + defer i.evMu.Unlock() + + if !i.events.Empty() { + ready |= waiter.EventIn + } + + return mask & ready +} + +// PRead implements FileDescriptionImpl. +func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// PWrite implements FileDescriptionImpl. +func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements FileDescriptionImpl.Write. +func (*Inotify) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return 0, syserror.EBADF +} + +// Read implements FileDescriptionImpl.Read. +func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + if dst.NumBytes() < inotifyEventBaseSize { + return 0, syserror.EINVAL + } + + i.evMu.Lock() + defer i.evMu.Unlock() + + if i.events.Empty() { + // Nothing to read yet, tell caller to block. + return 0, syserror.ErrWouldBlock + } + + var writeLen int64 + for it := i.events.Front(); it != nil; { + // Advance `it` before the element is removed from the list, or else + // it.Next() will always be nil. 
+ event := it + it = it.Next() + + // Does the buffer have enough remaining space to hold the event we're + // about to write out? + if dst.NumBytes() < int64(event.sizeOf()) { + if writeLen > 0 { + // Buffer wasn't big enough for all pending events, but we did + // write some events out. + return writeLen, nil + } + return 0, syserror.EINVAL + } + + // Linux always dequeues an available event as long as there's enough + // buffer space to copy it out, even if the copy below fails. Emulate + // this behaviour. + i.events.Remove(event) + + // Buffer has enough space, copy event to the read buffer. + n, err := event.CopyTo(ctx, i.scratch, dst) + if err != nil { + return 0, err + } + + writeLen += n + dst = dst.DropFirst64(n) + } + return writeLen, nil +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch args[1].Int() { + case linux.FIONREAD: + i.evMu.Lock() + defer i.evMu.Unlock() + var n uint32 + for e := i.events.Front(); e != nil; e = e.Next() { + n += uint32(e.sizeOf()) + } + var buf [4]byte + usermem.ByteOrder.PutUint32(buf[:], n) + _, err := uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{}) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} + +func (i *Inotify) queueEvent(ev *Event) { + i.evMu.Lock() + + // Check if we should coalesce the event we're about to queue with the last + // one currently in the queue. Events are coalesced if they are identical. + if last := i.events.Back(); last != nil { + if ev.equals(last) { + // "Coalesce" the two events by simply not queuing the new one. We + // don't need to raise a waiter.EventIn notification because no new + // data is available for reading. + i.evMu.Unlock() + return + } + } + + i.events.PushBack(ev) + + // Release mutex before notifying waiters because we don't control what they + // can do. 
+ i.evMu.Unlock() + + i.queue.Notify(waiter.EventIn) +} + +// newWatchLocked creates and adds a new watch to target. +// +// Precondition: i.mu must be locked. +func (i *Inotify) newWatchLocked(target *Dentry, mask uint32) *Watch { + targetWatches := target.Watches() + w := &Watch{ + owner: i, + wd: i.nextWatchIDLocked(), + set: targetWatches, + mask: mask, + } + + // Hold the watch in this inotify instance as well as the watch set on the + // target. + i.watches[w.wd] = w + targetWatches.Add(w) + return w +} + +// nextWatchIDLocked allocates and returns a new watch descriptor. +// +// Precondition: i.mu must be locked. +func (i *Inotify) nextWatchIDLocked() int32 { + i.nextWatchMinusOne++ + return i.nextWatchMinusOne +} + +// handleDeletion handles the deletion of the target of watch w. It removes w +// from i.watches and a watch removal event is generated. +func (i *Inotify) handleDeletion(w *Watch) { + i.mu.Lock() + _, found := i.watches[w.wd] + delete(i.watches, w.wd) + i.mu.Unlock() + + if found { + i.queueEvent(newEvent(w.wd, "", linux.IN_IGNORED, 0)) + } +} + +// AddWatch constructs a new inotify watch and adds it to the target. It +// returns the watch descriptor returned by inotify_add_watch(2). +func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 { + // Note: Locking this inotify instance protects the result returned by + // Lookup() below. With the lock held, we know for sure the lookup result + // won't become stale because it's impossible for *this* instance to + // add/remove watches on target. + i.mu.Lock() + defer i.mu.Unlock() + + // Does the target already have a watch from this inotify instance? + if existing := target.Watches().Lookup(i.id); existing != nil { + newmask := mask + if mask&linux.IN_MASK_ADD != 0 { + // "Add (OR) events to watch mask for this pathname if it already + // exists (instead of replacing mask)."
-- inotify(7) + newmask |= atomic.LoadUint32(&existing.mask) + } + atomic.StoreUint32(&existing.mask, newmask) + return existing.wd + } + + // No existing watch, create a new watch. + w := i.newWatchLocked(target, mask) + return w.wd +} + +// RmWatch looks up an inotify watch for the given 'wd' and configures the +// target to stop sending events to this inotify instance. +func (i *Inotify) RmWatch(wd int32) error { + i.mu.Lock() + + // Find the watch we were asked to remove. + w, ok := i.watches[wd] + if !ok { + i.mu.Unlock() + return syserror.EINVAL + } + + // Remove the watch from this instance. + delete(i.watches, wd) + + // Remove the watch from the watch target. + w.set.Remove(w.OwnerID()) + i.mu.Unlock() + + // Generate the event for the removal. + i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0)) + + return nil +} + +// Watches is the collection of all inotify watches on a single file. +// +// +stateify savable +type Watches struct { + // mu protects the fields below. + mu sync.RWMutex `state:"nosave"` + + // ws is the map of active watches in this collection, keyed by the inotify + // instance id of the owner. + ws map[uint64]*Watch +} + +// Lookup returns the watch owned by an inotify instance with the given id. +// Returns nil if no such watch exists. +// +// Precondition: the inotify instance with the given id must be locked to +// prevent the returned watch from being concurrently modified or replaced in +// Inotify.watches. +func (w *Watches) Lookup(id uint64) *Watch { + w.mu.Lock() + defer w.mu.Unlock() + return w.ws[id] +} + +// Add adds watch into this set of watches. +// +// Precondition: the inotify instance with the given id must be locked. +func (w *Watches) Add(watch *Watch) { + w.mu.Lock() + defer w.mu.Unlock() + + owner := watch.OwnerID() + // Sanity check, we should never have two watches for one owner on the + // same target.
+ if _, exists := w.ws[owner]; exists { + panic(fmt.Sprintf("Watch collision with ID %+v", owner)) + } + if w.ws == nil { + w.ws = make(map[uint64]*Watch) + } + w.ws[owner] = watch +} + +// Remove removes a watch with the given id from this set of watches and +// releases it. The caller is responsible for generating any watch removal +// event, as appropriate. The provided id must match an existing watch in this +// collection. +// +// Precondition: the inotify instance with the given id must be locked. +func (w *Watches) Remove(id uint64) { + w.mu.Lock() + defer w.mu.Unlock() + + if w.ws == nil { + // This watch set is being destroyed. The thread executing the + // destructor is already in the process of deleting all our watches. We + // got here with no references on the target because we raced with the + // destructor notifying all the watch owners of destruction. See the + // comment in Watches.HandleDeletion for why this race exists. + return + } + + if _, ok := w.ws[id]; !ok { + // While there's technically no problem with silently ignoring a missing + // watch, this is almost certainly a bug. + panic(fmt.Sprintf("Attempt to remove a watch, but no watch found with provided id %+v.", id)) + } + delete(w.ws, id) +} + +// Notify queues a new event with all watches in this set. +func (w *Watches) Notify(name string, events, cookie uint32) { + // N.B. We don't defer the unlocks because Notify is in the hot path of + // all IO operations, and the defer costs too much for small IO + // operations. + w.mu.RLock() + for _, watch := range w.ws { + // TODO(gvisor.dev/issue/1479): Skip for IN_EXCL_UNLINK cases. + watch.Notify(name, events, cookie) + } + w.mu.RUnlock() +} + +// HandleDeletion is called when the watch target is destroyed to emit +// the appropriate events. +func (w *Watches) HandleDeletion() { + w.Notify("", linux.IN_DELETE_SELF, 0) + + // TODO(gvisor.dev/issue/1479): This doesn't work because maps are not copied + // by value. 
Ideally, we wouldn't have this circular locking so we can just + // notify of IN_DELETE_SELF in the same loop below. + // + // We can't hold w.mu while calling watch.handleDeletion to preserve lock + // ordering w.r.t. the owner inotify instances. Instead, atomically move + // the watches map into a local variable so we can iterate over it safely. + // + // Because of this however, it is possible for the watches' owners to reach + // this inode while the inode has no refs. This is still safe because the + // owners can only reach the inode until this function finishes calling + // watch.handleDeletion below and the inode is guaranteed to exist in the + // meantime. But we still have to be very careful not to rely on inode state + // that may have been already destroyed. + var ws map[uint64]*Watch + w.mu.Lock() + ws = w.ws + w.ws = nil + w.mu.Unlock() + + for _, watch := range ws { + // TODO(gvisor.dev/issue/1479): consider refactoring this. + watch.handleDeletion() + } +} + +// Watch represents a particular inotify watch created by inotify_add_watch. +// +// +stateify savable +type Watch struct { + // Inotify instance which owns this watch. + owner *Inotify + + // Descriptor for this watch. This is unique across an inotify instance. + wd int32 + + // set is the watch set containing this watch. It belongs to the target file + // of this watch. + set *Watches + + // Events being monitored via this watch. Must be accessed with atomic + // memory operations. + mask uint32 +} + +// OwnerID returns the id of the inotify instance that owns this watch. +func (w *Watch) OwnerID() uint64 { + return w.owner.id +} + +// ExcludeUnlinkedChildren indicates whether the watched object should continue +// to be notified of events of its children after they have been unlinked, e.g. +// for an open file descriptor. +// +// TODO(gvisor.dev/issue/1479): Implement IN_EXCL_UNLINK. +// We can do this by keeping track of the set of unlinked children in Watches +// to skip notification.
+func (w *Watch) ExcludeUnlinkedChildren() bool { + return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0 +} + +// Notify queues a new event on this watch. +func (w *Watch) Notify(name string, events uint32, cookie uint32) { + mask := atomic.LoadUint32(&w.mask) + if mask&events == 0 { + // We weren't watching for this event. + return + } + + // Event mask should include bits matched from the watch plus all control + // event bits. + unmaskableBits := ^uint32(0) &^ linux.IN_ALL_EVENTS + effectiveMask := unmaskableBits | mask + matchedEvents := effectiveMask & events + w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie)) +} + +// handleDeletion handles the deletion of w's target. +func (w *Watch) handleDeletion() { + w.owner.handleDeletion(w) +} + +// Event represents a struct inotify_event from linux. +// +// +stateify savable +type Event struct { + eventEntry + + wd int32 + mask uint32 + cookie uint32 + + // len is computed based on the name field and is set automatically by + // Event.setName. It should be 0 when no name is set; otherwise it is the + // length of the name slice. + len uint32 + + // The name field has special padding requirements and should only be set by + // calling Event.setName. + name []byte +} + +func newEvent(wd int32, name string, events, cookie uint32) *Event { + e := &Event{ + wd: wd, + mask: events, + cookie: cookie, + } + if name != "" { + e.setName(name) + } + return e +} + +// paddedBytes converts a go string to a null-terminated c-string, padded with +// null bytes to a total size of 'l'. 'l' must be large enough for all the bytes +// in 's' plus at least one null byte.
+func paddedBytes(s string, l uint32) []byte { + if l < uint32(len(s)+1) { + panic("Converting string to byte array results in truncation, this can lead to buffer-overflow due to the missing null-byte!") + } + b := make([]byte, l) + copy(b, s) + + // b was zero-value initialized during make(), so the rest of the slice is + // already filled with null bytes. + + return b +} + +// setName sets the optional name for this event. +func (e *Event) setName(name string) { + // We need to pad the name such that the entire event length ends up a + // multiple of inotifyEventBaseSize. + unpaddedLen := len(name) + 1 + // Round up to nearest multiple of inotifyEventBaseSize. + e.len = uint32((unpaddedLen + inotifyEventBaseSize - 1) & ^(inotifyEventBaseSize - 1)) + // Make sure we haven't overflowed and wrapped around when rounding. + if unpaddedLen > int(e.len) { + panic("Overflow when rounding inotify event size, the 'name' field was too big.") + } + e.name = paddedBytes(name, e.len) +} + +func (e *Event) sizeOf() int { + s := inotifyEventBaseSize + int(e.len) + if s < inotifyEventBaseSize { + panic("overflow") + } + return s +} + +// CopyTo serializes this event to dst. buf is used as a scratch buffer to +// construct the output. We use a buffer allocated ahead of time for +// performance. buf must be at least inotifyEventBaseSize bytes. +func (e *Event) CopyTo(ctx context.Context, buf []byte, dst usermem.IOSequence) (int64, error) { + usermem.ByteOrder.PutUint32(buf[0:], uint32(e.wd)) + usermem.ByteOrder.PutUint32(buf[4:], e.mask) + usermem.ByteOrder.PutUint32(buf[8:], e.cookie) + usermem.ByteOrder.PutUint32(buf[12:], e.len) + + writeLen := 0 + + n, err := dst.CopyOut(ctx, buf) + if err != nil { + return 0, err + } + writeLen += n + dst = dst.DropFirst(n) + + if e.len > 0 { + n, err = dst.CopyOut(ctx, e.name) + if err != nil { + return 0, err + } + writeLen += n + } + + // Sanity check.
+ if writeLen != e.sizeOf() { + panic(fmt.Sprintf("Serialized unexpected amount of data for an event, expected %d, wrote %d.", e.sizeOf(), writeLen)) + } + + return int64(writeLen), nil +} + +func (e *Event) equals(other *Event) bool { + return e.wd == other.wd && + e.mask == other.mask && + e.cookie == other.cookie && + e.len == other.len && + bytes.Equal(e.name, other.name) +} + +// InotifyEventFromStatMask generates the appropriate events for an operation +// that set the stats specified in mask. +func InotifyEventFromStatMask(mask uint32) uint32 { + var ev uint32 + if mask&(linux.STATX_UID|linux.STATX_GID|linux.STATX_MODE) != 0 { + ev |= linux.IN_ATTRIB + } + if mask&linux.STATX_SIZE != 0 { + ev |= linux.IN_MODIFY + } + + if (mask & (linux.STATX_ATIME | linux.STATX_MTIME)) == (linux.STATX_ATIME | linux.STATX_MTIME) { + // Both times indicates a utime(s) call. + ev |= linux.IN_ATTRIB + } else if mask&linux.STATX_ATIME != 0 { + ev |= linux.IN_ACCESS + } else if mask&linux.STATX_MTIME != 0 { + ev |= linux.IN_MODIFY + } + return ev +} + +// InotifyRemoveChild sends the appropriate notifications to the watch sets of +// the child being removed and its parent. +func InotifyRemoveChild(self, parent *Watches, name string) { + self.Notify("", linux.IN_ATTRIB, 0) + parent.Notify(name, linux.IN_DELETE, 0) + // TODO(gvisor.dev/issue/1479): implement IN_EXCL_UNLINK. +} + +// InotifyRename sends the appropriate notifications to the watch sets of the +// file being renamed and its old/new parents. +func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, oldName, newName string, isDir bool) { + var dirEv uint32 + if isDir { + dirEv = linux.IN_ISDIR + } + cookie := uniqueid.InotifyCookie(ctx) + oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie) + newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie) + // Somewhat surprisingly, self move events do not have a cookie.
+ renamed.Notify("", linux.IN_MOVE_SELF, 0) +} diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 6d2ba53ea..be6f21dba 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -422,6 +422,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential } } + fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0) return fd, nil } if !rp.handleError(err) { diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 5acdb8438..f4b5de18d 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -951,6 +951,7 @@ cc_binary( "//test/util:epoll_util", "//test/util:file_descriptor", "//test/util:fs_util", + "//test/util:posix_error", "//test/util:temp_path", "//test/util:test_main", "//test/util:test_util", @@ -1382,7 +1383,7 @@ cc_binary( srcs = ["partial_bad_buffer.cc"], linkstatic = 1, deps = [ - "//test/syscalls/linux:socket_test_util", + ":socket_test_util", "//test/util:file_descriptor", "//test/util:fs_util", "@com_google_absl//absl/time", @@ -3461,7 +3462,7 @@ cc_binary( deps = [ ":socket_test_util", gtest, - "//test/syscalls/linux:socket_netlink_route_util", + ":socket_netlink_route_util", "//test/util:capability_util", "//test/util:file_descriptor", "//test/util:fs_util", diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index 0e13ad190..e4565467b 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -33,6 +33,7 @@ #include "test/util/epoll_util.h" #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" +#include "test/util/posix_error.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -335,6 +336,11 @@ TEST(Inotify, InotifyFdNotWritable) { EXPECT_THAT(write(fd.get(), "x", 1), SyscallFailsWithErrno(EBADF)); } +TEST(Inotify, InitFlags) { + EXPECT_THAT(inotify_init1(IN_NONBLOCK | IN_CLOEXEC), SyscallSucceeds()); + EXPECT_THAT(inotify_init1(12345), 
SyscallFailsWithErrno(EINVAL)); +} + TEST(Inotify, NonBlockingReadReturnsEagain) { const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); @@ -395,7 +401,7 @@ TEST(Inotify, CanDeleteFileAfterRemovingWatch) { file1.reset(); } -TEST(Inotify, CanRemoveWatchAfterDeletingFile) { +TEST(Inotify, RemoveWatchAfterDeletingFileFails) { const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); @@ -491,17 +497,23 @@ TEST(Inotify, DeletingChildGeneratesEvents) { Event(IN_DELETE, root_wd, Basename(file1_path))})); } +// Creating a file in "parent/child" should generate events for child, but not +// parent. TEST(Inotify, CreatingFileGeneratesEvents) { - const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath child = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path())); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), parent.path(), IN_ALL_EVENTS)); const int wd = ASSERT_NO_ERRNO_AND_VALUE( - InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + InotifyAddWatch(fd.get(), child.path(), IN_ALL_EVENTS)); // Create a new file in the directory. 
const TempPath file1 = - ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(child.path())); const std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); @@ -554,6 +566,47 @@ TEST(Inotify, WritingFileGeneratesModifyEvent) { ASSERT_THAT(events, Are({Event(IN_MODIFY, wd, Basename(file1.path()))})); } +TEST(Inotify, SizeZeroReadWriteGeneratesNothing) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + + const FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR)); + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + + // Read from the empty file. + int val; + ASSERT_THAT(read(file1_fd.get(), &val, sizeof(val)), + SyscallSucceedsWithValue(0)); + + // Write zero bytes. 
+ ASSERT_THAT(write(file1_fd.get(), "", 0), SyscallSucceedsWithValue(0)); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({})); +} + +TEST(Inotify, FailedFileCreationGeneratesNoEvents) { + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), dir.path(), IN_ALL_EVENTS)); + + const char* p = dir.path().c_str(); + ASSERT_THAT(mkdir(p, 0777), SyscallFails()); + ASSERT_THAT(mknod(p, S_IFIFO, 0777), SyscallFails()); + ASSERT_THAT(symlink(p, p), SyscallFails()); + ASSERT_THAT(link(p, p), SyscallFails()); + std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({})); +} + TEST(Inotify, WatchSetAfterOpenReportsCloseFdEvent) { const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); const FileDescriptor fd = @@ -602,7 +655,7 @@ TEST(Inotify, ChildrenDeletionInWatchedDirGeneratesEvent) { Event(IN_DELETE | IN_ISDIR, wd, Basename(dir1_path))})); } -TEST(Inotify, WatchTargetDeletionGeneratesEvent) { +TEST(Inotify, RmdirOnWatchedTargetGeneratesEvent) { const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); @@ -1228,7 +1281,7 @@ TEST(Inotify, LinkGeneratesAttribAndCreateEvents) { InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); const int rc = link(file1.path().c_str(), link1.path().c_str()); - // link(2) is only supported on tmpfs in the sandbox. + // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox. 
SKIP_IF(IsRunningOnGvisor() && rc != 0 && (errno == EPERM || errno == ENOENT)); ASSERT_THAT(rc, SyscallSucceeds()); @@ -1322,21 +1375,27 @@ TEST(Inotify, HardlinksReuseSameWatch) { Event(IN_DELETE, root_wd, Basename(file1_path))})); } +// Calling mkdir within "parent/child" should generate an event for child, but +// not parent. TEST(Inotify, MkdirGeneratesCreateEventWithDirFlag) { - const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath child = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path())); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); - const int root_wd = ASSERT_NO_ERRNO_AND_VALUE( - InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), parent.path(), IN_ALL_EVENTS)); + const int child_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), child.path(), IN_ALL_EVENTS)); - const TempPath dir1(NewTempAbsPathInDir(root.path())); + const TempPath dir1(NewTempAbsPathInDir(child.path())); ASSERT_THAT(mkdir(dir1.path().c_str(), 0777), SyscallSucceeds()); const std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); ASSERT_THAT( events, - Are({Event(IN_CREATE | IN_ISDIR, root_wd, Basename(dir1.path()))})); + Are({Event(IN_CREATE | IN_ISDIR, child_wd, Basename(dir1.path()))})); } TEST(Inotify, MultipleInotifyInstancesAndWatchesAllGetEvents) { @@ -1597,6 +1656,8 @@ TEST(Inotify, EpollNoDeadlock) { } TEST(Inotify, SpliceEvent) { + // TODO(gvisor.dev/issue/138): Implement splice in VFS2. 
+ SKIP_IF(IsRunningOnGvisor() && !IsRunningWithVFS1()); int pipes[2]; ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds()); @@ -1624,6 +1685,99 @@ TEST(Inotify, SpliceEvent) { ASSERT_THAT(events, Are({Event(IN_ACCESS, watcher)})); } +// Watches on a parent should not be triggered by actions on a hard link to one +// of its children that has a different parent. +TEST(Inotify, LinkOnOtherParent) { + const TempPath dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path())); + std::string link_path = NewTempAbsPathInDir(dir2.path()); + + const int rc = link(file.path().c_str(), link_path.c_str()); + // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox. + SKIP_IF(IsRunningOnGvisor() && rc != 0 && + (errno == EPERM || errno == ENOENT)); + ASSERT_THAT(rc, SyscallSucceeds()); + + const FileDescriptor inotify_fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(inotify_fd.get(), dir1.path(), IN_ALL_EVENTS)); + + // Perform various actions on the link outside of dir1, which should trigger + // no inotify events. 
+ const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(link_path.c_str(), O_RDWR)); + int val = 0; + ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds()); + ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds()); + ASSERT_THAT(ftruncate(fd.get(), 12345), SyscallSucceeds()); + ASSERT_THAT(unlink(link_path.c_str()), SyscallSucceeds()); + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({})); +} + +TEST(Inotify, Exec) { + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath bin = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo(dir.path(), "/bin/true")); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), bin.path(), IN_ALL_EVENTS)); + + // Perform exec. + ScopedThread t([&bin]() { + ASSERT_THAT(execl(bin.path().c_str(), bin.path().c_str(), (char*)nullptr), + SyscallSucceeds()); + }); + t.Join(); + + std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + EXPECT_THAT(events, Are({Event(IN_OPEN, wd), Event(IN_ACCESS, wd)})); +} + +// Watches without IN_EXCL_UNLINK, should continue to emit events for file +// descriptors after their corresponding files have been unlinked. +// +// We need to disable S/R because there are filesystems where we cannot re-open +// fds to an unlinked file across S/R, e.g. gofer-backed filesytems. 
+TEST(Inotify, IncludeUnlinkedFile_NoRandomSave) { + const DisableSave ds; + + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(dir.path(), "123", TempPath::kDefaultFileMode)); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR)); + + const FileDescriptor inotify_fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int dir_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(inotify_fd.get(), dir.path(), IN_ALL_EVENTS)); + const int file_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(inotify_fd.get(), file.path(), IN_ALL_EVENTS)); + + ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds()); + int val = 0; + ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds()); + ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds()); + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({ + Event(IN_ATTRIB, file_wd), + Event(IN_DELETE, dir_wd, Basename(file.path())), + Event(IN_ACCESS, dir_wd, Basename(file.path())), + Event(IN_ACCESS, file_wd), + Event(IN_MODIFY, dir_wd, Basename(file.path())), + Event(IN_MODIFY, file_wd), + })); +} + } // namespace } // namespace testing } // namespace gvisor -- cgit v1.2.3 From ccf69bdd7e05a4e5f404fbef89a7f49f218645e2 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 29 May 2020 12:27:15 -0700 Subject: Implement IN_EXCL_UNLINK inotify option in vfs2. Limited to tmpfs. Inotify support in other filesystem implementations to follow. 
Updates #1479 PiperOrigin-RevId: 313828648 --- pkg/sentry/fsimpl/ext/dentry.go | 2 +- pkg/sentry/fsimpl/gofer/gofer.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs.go | 2 +- pkg/sentry/fsimpl/tmpfs/directory.go | 3 +- pkg/sentry/fsimpl/tmpfs/filesystem.go | 14 +-- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 18 +-- pkg/sentry/kernel/fd_table.go | 2 +- pkg/sentry/syscalls/linux/vfs2/read_write.go | 24 ++-- pkg/sentry/vfs/anonfs.go | 2 +- pkg/sentry/vfs/dentry.go | 6 +- pkg/sentry/vfs/inotify.go | 38 +++++-- pkg/sentry/vfs/vfs.go | 2 +- test/syscalls/linux/inotify.cc | 164 +++++++++++++++++++++++++++ 13 files changed, 235 insertions(+), 44 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go index 4d0deaf03..6bd1a9fc6 100644 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -64,7 +64,7 @@ func (d *dentry) DecRef() { // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. // // TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *dentry) InotifyWithParent(events uint32, cookie uint32) {} +func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {} // Watches implements vfs.DentryImpl.Watches. // diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 850482a19..3f3bd56f0 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1042,7 +1042,7 @@ func (d *dentry) decRefLocked() { // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. // // TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *dentry) InotifyWithParent(events uint32, cookie uint32) {} +func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {} // Watches implements vfs.DentryImpl.Watches. 
// diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 3f1220791..bbee8ccda 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -228,7 +228,7 @@ func (d *Dentry) destroy() { // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. // // TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *Dentry) InotifyWithParent(events uint32, cookie uint32) {} +func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {} // Watches implements vfs.DentryImpl.Watches. // diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 8bc475f88..70387cb9c 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -79,6 +79,7 @@ func (dir *directory) removeChildLocked(child *dentry) { dir.iterMu.Lock() dir.childList.Remove(child) dir.iterMu.Unlock() + child.unlinked = true } type directoryFD struct { @@ -112,7 +113,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba dir.iterMu.Lock() defer dir.iterMu.Unlock() - fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0) + fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) fd.inode().touchAtime(fd.vfsfd.Mount()) if fd.off == 0 { diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index b4159f904..183eb975c 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -182,7 +182,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if dir { ev |= linux.IN_ISDIR } - parentDir.inode.watches.Notify(name, uint32(ev), 0) + parentDir.inode.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent) parentDir.inode.touchCMtime() return nil } @@ -247,7 +247,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. 
return syserror.EMLINK } d.inode.incLinksLocked() - d.inode.watches.Notify("", linux.IN_ATTRIB, 0) + d.inode.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent) parentDir.insertChildLocked(fs.newDentry(d.inode), name) return nil }) @@ -361,7 +361,7 @@ afterTrailingSymlink: if err != nil { return nil, err } - parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0) + parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent) parentDir.inode.touchCMtime() return fd, nil } @@ -613,7 +613,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } parentDir.removeChildLocked(child) - parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0) + parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent) // Remove links for child, child/., and child/.. child.inode.decLinksLocked() child.inode.decLinksLocked() @@ -636,7 +636,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - d.InotifyWithParent(ev, 0) + d.InotifyWithParent(ev, 0, vfs.InodeEvent) } return nil } @@ -784,7 +784,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return err } - d.InotifyWithParent(linux.IN_ATTRIB, 0) + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } @@ -800,7 +800,7 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, return err } - d.InotifyWithParent(linux.IN_ATTRIB, 0) + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 1d83b6840..f0e098702 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -163,6 +163,11 @@ type dentry struct { // filesystem.mu. name string + // unlinked indicates whether this dentry has been unlinked from its parent. 
+ // It is only set to true on an unlink operation, and never set from true to + // false. unlinked is protected by filesystem.mu. + unlinked bool + // dentryEntry (ugh) links dentries into their parent directory.childList. dentryEntry @@ -202,7 +207,7 @@ func (d *dentry) DecRef() { } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. -func (d *dentry) InotifyWithParent(events uint32, cookie uint32) { +func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) { if d.inode.isDir() { events |= linux.IN_ISDIR } @@ -211,9 +216,9 @@ func (d *dentry) InotifyWithParent(events uint32, cookie uint32) { if d.parent != nil { // Note that d.parent or d.name may be stale if there is a concurrent // rename operation. Inotify does not provide consistency guarantees. - d.parent.inode.watches.Notify(d.name, events, cookie) + d.parent.inode.watches.NotifyWithExclusions(d.name, events, cookie, et, d.unlinked) } - d.inode.watches.Notify("", events, cookie) + d.inode.watches.Notify("", events, cookie, et) } // Watches implements vfs.DentryImpl.Watches. @@ -676,9 +681,8 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) return err } - // Generate inotify events. if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - d.InotifyWithParent(ev, 0) + d.InotifyWithParent(ev, 0, vfs.InodeEvent) } return nil } @@ -701,7 +705,7 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption } // Generate inotify events. - d.InotifyWithParent(linux.IN_ATTRIB, 0) + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } @@ -713,7 +717,7 @@ func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { } // Generate inotify events. 
- d.InotifyWithParent(linux.IN_ATTRIB, 0) + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index ef73e1169..dbfcef0fa 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -158,7 +158,7 @@ func (f *FDTable) dropVFS2(file *vfs.FileDescription) { if file.IsWritable() { ev = linux.IN_CLOSE_WRITE } - file.Dentry().InotifyWithParent(ev, 0) + file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent) // Drop the table reference. file.DecRef() diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go index 92b5631a3..7f9debd4a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/read_write.go +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -94,7 +94,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt n, err := file.Read(t, dst, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -102,7 +102,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -135,7 +135,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } @@ -258,7 +258,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of n, err := file.PRead(t, dst, offset, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + 
file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -266,7 +266,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -299,7 +299,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } @@ -364,7 +364,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op n, err := file.Write(t, src, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -372,7 +372,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -405,7 +405,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) } return total, err } @@ -528,7 +528,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o n, err := file.PWrite(t, src, offset, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -536,7 +536,7 @@ func 
pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -569,7 +569,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index 55a3d54cc..b7c6b60b8 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -301,7 +301,7 @@ func (d *anonDentry) DecRef() { // InotifyWithParent implements DentryImpl.InotifyWithParent. // // TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *anonDentry) InotifyWithParent(events uint32, cookie uint32) {} +func (d *anonDentry) InotifyWithParent(events uint32, cookie uint32, et EventType) {} // Watches implements DentryImpl.Watches. // diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index d61b9e09b..24af13eb1 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -113,7 +113,7 @@ type DentryImpl interface { // // Note that the events may not actually propagate up to the user, depending // on the event masks. - InotifyWithParent(events uint32, cookie uint32) + InotifyWithParent(events uint32, cookie uint32, et EventType) // Watches returns the set of inotify watches for the file corresponding to // the Dentry. Dentries that are hard links to the same underlying file @@ -151,8 +151,8 @@ func (d *Dentry) isMounted() bool { // InotifyWithParent notifies all watches on the inodes for this dentry and // its parent of events. 
-func (d *Dentry) InotifyWithParent(events uint32, cookie uint32) { - d.impl.InotifyWithParent(events, cookie) +func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et EventType) { + d.impl.InotifyWithParent(events, cookie, et) } // Watches returns the set of inotify watches associated with d. diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 1d28ccb46..05a3051a4 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -33,6 +33,19 @@ import ( // must be a power 2 for rounding below. const inotifyEventBaseSize = 16 +// EventType defines different kinds of inotfiy events. +// +// The way events are labelled appears somewhat arbitrary, but they must match +// Linux so that IN_EXCL_UNLINK behaves as it does in Linux. +type EventType uint8 + +// PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and +// FSNOTIFY_EVENT_INODE in Linux. +const ( + PathEvent EventType = iota + InodeEvent EventType = iota +) + // Inotify represents an inotify instance created by inotify_init(2) or // inotify_init1(2). Inotify implements FileDescriptionImpl. // @@ -419,13 +432,22 @@ func (w *Watches) Remove(id uint64) { } // Notify queues a new event with all watches in this set. -func (w *Watches) Notify(name string, events, cookie uint32) { +func (w *Watches) Notify(name string, events, cookie uint32, et EventType) { + w.NotifyWithExclusions(name, events, cookie, et, false) +} + +// NotifyWithExclusions queues a new event with watches in this set. Watches +// with IN_EXCL_UNLINK are skipped if the event is coming from a child that +// has been unlinked. +func (w *Watches) NotifyWithExclusions(name string, events, cookie uint32, et EventType, unlinked bool) { // N.B. We don't defer the unlocks because Notify is in the hot path of // all IO operations, and the defer costs too much for small IO // operations. w.mu.RLock() for _, watch := range w.ws { - // TODO(gvisor.dev/issue/1479): Skip for IN_EXCL_UNLINK cases. 
+ if unlinked && watch.ExcludeUnlinkedChildren() && et == PathEvent { + continue + } watch.Notify(name, events, cookie) } w.mu.RUnlock() @@ -434,7 +456,7 @@ func (w *Watches) Notify(name string, events, cookie uint32) { // HandleDeletion is called when the watch target is destroyed to emit // the appropriate events. func (w *Watches) HandleDeletion() { - w.Notify("", linux.IN_DELETE_SELF, 0) + w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent) // TODO(gvisor.dev/issue/1479): This doesn't work because maps are not copied // by value. Ideally, we wouldn't have this circular locking so we can just @@ -655,8 +677,8 @@ func InotifyEventFromStatMask(mask uint32) uint32 { // InotifyRemoveChild sends the appriopriate notifications to the watch sets of // the child being removed and its parent. func InotifyRemoveChild(self, parent *Watches, name string) { - self.Notify("", linux.IN_ATTRIB, 0) - parent.Notify(name, linux.IN_DELETE, 0) + self.Notify("", linux.IN_ATTRIB, 0, InodeEvent) + parent.Notify(name, linux.IN_DELETE, 0, InodeEvent) // TODO(gvisor.dev/issue/1479): implement IN_EXCL_UNLINK. } @@ -668,8 +690,8 @@ func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, dirEv = linux.IN_ISDIR } cookie := uniqueid.InotifyCookie(ctx) - oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie) - newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie) + oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent) + newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent) // Somewhat surprisingly, self move events do not have a cookie. 
- renamed.Notify("", linux.IN_MOVE_SELF, 0) + renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent) } diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index be6f21dba..52643a7c5 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -422,7 +422,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential } } - fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0) + fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0, PathEvent) return fd, nil } if !rp.handleError(err) { diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index e4565467b..2306d9cab 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -1778,6 +1778,170 @@ TEST(Inotify, IncludeUnlinkedFile_NoRandomSave) { })); } +// Watches created with IN_EXCL_UNLINK will stop emitting events on fds for +// children that have already been unlinked. +// +// We need to disable S/R because there are filesystems where we cannot re-open +// fds to an unlinked file across S/R, e.g. gofer-backed filesytems. +TEST(Inotify, ExcludeUnlink_NoRandomSave) { + const DisableSave ds; + // TODO(gvisor.dev/issue/1624): This test fails on VFS1. + SKIP_IF(IsRunningWithVFS1()); + + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path())); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR)); + + const FileDescriptor inotify_fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( + inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK)); + + // Unlink the child, which should cause further operations on the open file + // descriptor to be ignored. 
+ ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds()); + int val = 0; + ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds()); + ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds()); + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({Event(IN_DELETE, wd, Basename(file.path()))})); +} + +// We need to disable S/R because there are filesystems where we cannot re-open +// fds to an unlinked file across S/R, e.g. gofer-backed filesytems. +TEST(Inotify, ExcludeUnlinkDirectory_NoRandomSave) { + const DisableSave ds; + + const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + TempPath dir = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path())); + std::string dirPath = dir.path(); + const FileDescriptor inotify_fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(dirPath.c_str(), O_RDONLY | O_DIRECTORY)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( + inotify_fd.get(), parent.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK)); + + // Unlink the dir, and then close the open fd. + ASSERT_THAT(rmdir(dirPath.c_str()), SyscallSucceeds()); + dir.reset(); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + // No close event should appear. + ASSERT_THAT(events, + Are({Event(IN_DELETE | IN_ISDIR, wd, Basename(dirPath))})); +} + +// If "dir/child" and "dir/child2" are links to the same file, and "dir/child" +// is unlinked, a watch on "dir" with IN_EXCL_UNLINK will exclude future events +// for fds on "dir/child" but not "dir/child2". +// +// We need to disable S/R because there are filesystems where we cannot re-open +// fds to an unlinked file across S/R, e.g. gofer-backed filesytems. +TEST(Inotify, ExcludeUnlinkMultipleChildren_NoRandomSave) { + const DisableSave ds; + // TODO(gvisor.dev/issue/1624): This test fails on VFS1. 
+ SKIP_IF(IsRunningWithVFS1()); + + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path())); + std::string path1 = file.path(); + std::string path2 = NewTempAbsPathInDir(dir.path()); + + const int rc = link(path1.c_str(), path2.c_str()); + // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox. + SKIP_IF(IsRunningOnGvisor() && rc != 0 && + (errno == EPERM || errno == ENOENT)); + ASSERT_THAT(rc, SyscallSucceeds()); + const FileDescriptor fd1 = + ASSERT_NO_ERRNO_AND_VALUE(Open(path1.c_str(), O_RDWR)); + const FileDescriptor fd2 = + ASSERT_NO_ERRNO_AND_VALUE(Open(path2.c_str(), O_RDWR)); + + const FileDescriptor inotify_fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( + inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK)); + + // After unlinking path1, only events on the fd for path2 should be generated. + ASSERT_THAT(unlink(path1.c_str()), SyscallSucceeds()); + ASSERT_THAT(write(fd1.get(), "x", 1), SyscallSucceeds()); + ASSERT_THAT(write(fd2.get(), "x", 1), SyscallSucceeds()); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({ + Event(IN_DELETE, wd, Basename(path1)), + Event(IN_MODIFY, wd, Basename(path2)), + })); +} + +// On native Linux, actions of data type FSNOTIFY_EVENT_INODE are not affected +// by IN_EXCL_UNLINK (see +// fs/notify/inotify/inotify_fsnotify.c:inotify_handle_event). Inode-level +// events include changes to metadata and extended attributes. +// +// We need to disable S/R because there are filesystems where we cannot re-open +// fds to an unlinked file across S/R, e.g. gofer-backed filesytems. 
+TEST(Inotify, ExcludeUnlinkInodeEvents_NoRandomSave) { + const DisableSave ds; + + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path())); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path().c_str(), O_RDWR)); + const FileDescriptor inotify_fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( + inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK)); + + // NOTE(b/157163751): Create another link before unlinking. This is needed for + // the gofer filesystem in gVisor, where open fds will not work once the link + // count hits zero. In VFS2, we end up skipping the gofer test anyway, because + // hard links are not supported for gofer fs. + if (IsRunningOnGvisor()) { + std::string link_path = NewTempAbsPath(); + const int rc = link(file.path().c_str(), link_path.c_str()); + // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox. + SKIP_IF(rc != 0 && (errno == EPERM || errno == ENOENT)); + ASSERT_THAT(rc, SyscallSucceeds()); + } + + // Even after unlinking, inode-level operations will trigger events regardless + // of IN_EXCL_UNLINK. + ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds()); + + // Perform various actions on fd. 
+ ASSERT_THAT(ftruncate(fd.get(), 12345), SyscallSucceeds()); + std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({ + Event(IN_DELETE, wd, Basename(file.path())), + Event(IN_MODIFY, wd, Basename(file.path())), + })); + + struct timeval times[2] = {{1, 0}, {2, 0}}; + ASSERT_THAT(futimes(fd.get(), times), SyscallSucceeds()); + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd, Basename(file.path()))})); + + // S/R is disabled on this entire test due to behavior with unlink; it must + // also be disabled after this point because of fchmod. + ASSERT_THAT(fchmod(fd.get(), 0777), SyscallSucceeds()); + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd, Basename(file.path()))})); +} + } // namespace } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 8c1f5b5cd8b634a5e7255944f42e82c5c9de3149 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 5 Jun 2020 14:43:56 -0700 Subject: Unshare files on exec The current task can share its fdtable with a few other tasks, but after exec, this should be a completely separate process. PiperOrigin-RevId: 314999565 --- pkg/sentry/kernel/task_exec.go | 4 ++++ test/syscalls/linux/exec.cc | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 00c425cca..9b69f3cbe 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -198,6 +198,10 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{}) t.tg.pidns.owner.mu.Unlock() + oldFDTable := t.fdTable + t.fdTable = t.fdTable.Fork() + oldFDTable.DecRef() + // Remove FDs with the CloseOnExec flag set. 
t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool { return flags.CloseOnExec diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc index 12c9b05ca..e09afafe9 100644 --- a/test/syscalls/linux/exec.cc +++ b/test/syscalls/linux/exec.cc @@ -673,6 +673,33 @@ TEST(ExecveatTest, SymlinkNoFollowWithRelativePath) { EXPECT_EQ(execve_errno, ELOOP); } +TEST(ExecveatTest, UnshareFiles) { + TempPath tempFile = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), "bar", 0755)); + const FileDescriptor fd_closed_on_exec = + ASSERT_NO_ERRNO_AND_VALUE(Open(tempFile.path(), O_RDONLY | O_CLOEXEC)); + + pid_t child; + EXPECT_THAT(child = syscall(__NR_clone, SIGCHLD | CLONE_VFORK | CLONE_FILES, + 0, 0, 0, 0), + SyscallSucceeds()); + if (child == 0) { + ExecveArray argv = {"test"}; + ExecveArray envp; + ASSERT_THAT( + execve(RunfilePath(kBasicWorkload).c_str(), argv.get(), envp.get()), + SyscallSucceeds()); + _exit(1); + } + + int status; + ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds()); + EXPECT_EQ(status, 0); + + struct stat st; + EXPECT_THAT(fstat(fd_closed_on_exec.get(), &st), SyscallSucceeds()); +} + TEST(ExecveatTest, SymlinkNoFollowWithAbsolutePath) { std::string parent_dir = "/tmp"; TempPath link = ASSERT_NO_ERRNO_AND_VALUE( -- cgit v1.2.3 From 21b6bc7280f68f43360a008ffd02a4f461ec9fc8 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Fri, 5 Jun 2020 19:10:28 -0700 Subject: Implement mount(2) and umount2(2) for VFS2. This is mostly syscall plumbing, VFS2 already implements the internals of mounts. In addition to the syscall defintions, the following mount-related mechanisms are updated: - Implement MS_NOATIME for VFS2, but only for tmpfs and goferfs. The other VFS2 filesystems don't implement node-level timestamps yet. - Implement the 'mode', 'uid' and 'gid' mount options for VFS2's tmpfs. 
- Plumb mount namespace ownership, which is necessary for checking appropriate capabilities during mount(2). Updates #1035 PiperOrigin-RevId: 315035352 --- pkg/sentry/fsimpl/gofer/time.go | 3 + pkg/sentry/fsimpl/tmpfs/tmpfs.go | 41 +++++++- pkg/sentry/kernel/auth/credentials.go | 28 +++++ pkg/sentry/syscalls/linux/vfs2/BUILD | 1 + pkg/sentry/syscalls/linux/vfs2/mount.go | 145 ++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/vfs2.go | 4 +- pkg/sentry/vfs/genericfstree/genericfstree.go | 3 +- pkg/sentry/vfs/mount.go | 34 +++--- pkg/sentry/vfs/options.go | 4 + pkg/sentry/vfs/vfs.go | 2 +- runsc/boot/vfs.go | 2 +- 11 files changed, 247 insertions(+), 20 deletions(-) create mode 100644 pkg/sentry/syscalls/linux/vfs2/mount.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go index 2608e7e1d..1d5aa82dc 100644 --- a/pkg/sentry/fsimpl/gofer/time.go +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -38,6 +38,9 @@ func statxTimestampFromDentry(ns int64) linux.StatxTimestamp { // Preconditions: fs.interop != InteropModeShared. 
func (d *dentry) touchAtime(mnt *vfs.Mount) { + if mnt.Flags.NoATime { + return + } if err := mnt.CheckBeginWrite(); err != nil { return } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index f0e098702..3777ebdf2 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -30,6 +30,7 @@ package tmpfs import ( "fmt" "math" + "strconv" "strings" "sync/atomic" @@ -124,14 +125,45 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } fs.vfsfs.Init(vfsObj, newFSType, &fs) + mopts := vfs.GenericParseMountOptions(opts.Data) + + defaultMode := linux.FileMode(0777) + if modeStr, ok := mopts["mode"]; ok { + mode, err := strconv.ParseUint(modeStr, 8, 32) + if err != nil { + return nil, nil, fmt.Errorf("Mount option \"mode='%v'\" not parsable: %v", modeStr, err) + } + defaultMode = linux.FileMode(mode) + } + + defaultOwnerCreds := creds.Fork() + if uidStr, ok := mopts["uid"]; ok { + uid, err := strconv.ParseInt(uidStr, 10, 32) + if err != nil { + return nil, nil, fmt.Errorf("Mount option \"uid='%v'\" not parsable: %v", uidStr, err) + } + if err := defaultOwnerCreds.SetUID(auth.UID(uid)); err != nil { + return nil, nil, fmt.Errorf("Error using mount option \"uid='%v'\": %v", uidStr, err) + } + } + if gidStr, ok := mopts["gid"]; ok { + gid, err := strconv.ParseInt(gidStr, 10, 32) + if err != nil { + return nil, nil, fmt.Errorf("Mount option \"gid='%v'\" not parsable: %v", gidStr, err) + } + if err := defaultOwnerCreds.SetGID(auth.GID(gid)); err != nil { + return nil, nil, fmt.Errorf("Error using mount option \"gid='%v'\": %v", gidStr, err) + } + } + var root *dentry switch rootFileType { case linux.S_IFREG: - root = fs.newDentry(fs.newRegularFile(creds, 0777)) + root = fs.newDentry(fs.newRegularFile(defaultOwnerCreds, defaultMode)) case linux.S_IFLNK: - root = fs.newDentry(fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget)) + root = fs.newDentry(fs.newSymlink(defaultOwnerCreds, 
tmpfsOpts.RootSymlinkTarget)) case linux.S_IFDIR: - root = &fs.newDirectory(creds, 01777).dentry + root = &fs.newDirectory(defaultOwnerCreds, defaultMode).dentry default: fs.vfsfs.DecRef() return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) @@ -562,6 +594,9 @@ func (i *inode) isDir() bool { } func (i *inode) touchAtime(mnt *vfs.Mount) { + if mnt.Flags.NoATime { + return + } if err := mnt.CheckBeginWrite(); err != nil { return } diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index e057d2c6d..6862f2ef5 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -232,3 +232,31 @@ func (c *Credentials) UseGID(gid GID) (KGID, error) { } return NoID, syserror.EPERM } + +// SetUID translates the provided uid to the root user namespace and updates c's +// uids to it. This performs no permissions or capabilities checks, the caller +// is responsible for ensuring the calling context is permitted to modify c. +func (c *Credentials) SetUID(uid UID) error { + kuid := c.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return syserror.EINVAL + } + c.RealKUID = kuid + c.EffectiveKUID = kuid + c.SavedKUID = kuid + return nil +} + +// SetGID translates the provided gid to the root user namespace and updates c's +// gids to it. This performs no permissions or capabilities checks, the caller +// is responsible for ensuring the calling context is permitted to modify c. 
+func (c *Credentials) SetGID(gid GID) error { + kgid := c.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + c.RealKGID = kgid + c.EffectiveKGID = kgid + c.SavedKGID = kgid + return nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 9c8b44f64..c0d005247 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -16,6 +16,7 @@ go_library( "ioctl.go", "memfd.go", "mmap.go", + "mount.go", "path.go", "pipe.go", "poll.go", diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go new file mode 100644 index 000000000..adeaa39cc --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/mount.go @@ -0,0 +1,145 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Mount implements Linux syscall mount(2). 
+func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + sourceAddr := args[0].Pointer() + targetAddr := args[1].Pointer() + typeAddr := args[2].Pointer() + flags := args[3].Uint64() + dataAddr := args[4].Pointer() + + // For null-terminated strings related to mount(2), Linux copies in at most + // a page worth of data. See fs/namespace.c:copy_mount_string(). + fsType, err := t.CopyInString(typeAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + source, err := t.CopyInString(sourceAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + + targetPath, err := copyInPath(t, targetAddr) + if err != nil { + return 0, nil, err + } + + data := "" + if dataAddr != 0 { + // In Linux, a full page is always copied in regardless of null + // character placement, and the address is passed to each file system. + // Most file systems always treat this data as a string, though, and so + // do all of the ones we implement. + data, err = t.CopyInString(dataAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + } + + // Ignore magic value that was required before Linux 2.4. + if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL { + flags = flags &^ linux.MS_MGC_MSK + } + + // Must have CAP_SYS_ADMIN in the current mount namespace's associated user + // namespace. + creds := t.Credentials() + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) { + return 0, nil, syserror.EPERM + } + + const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND | + linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE | + linux.MS_UNBINDABLE | linux.MS_MOVE + + // Silently allow MS_NOSUID, since we don't implement set-id bits + // anyway. + const unsupportedFlags = linux.MS_NODEV | + linux.MS_NODIRATIME | linux.MS_STRICTATIME + + // Linux just allows passing any flags to mount(2) - it won't fail when + // unknown or unsupported flags are passed. 
Since we don't implement + // everything, we fail explicitly on flags that are unimplemented. + if flags&(unsupportedOps|unsupportedFlags) != 0 { + return 0, nil, syserror.EINVAL + } + + var opts vfs.MountOptions + if flags&linux.MS_NOATIME == linux.MS_NOATIME { + opts.Flags.NoATime = true + } + if flags&linux.MS_NOEXEC == linux.MS_NOEXEC { + opts.Flags.NoExec = true + } + if flags&linux.MS_RDONLY == linux.MS_RDONLY { + opts.ReadOnly = true + } + opts.GetFilesystemOptions.Data = data + + target, err := getTaskPathOperation(t, linux.AT_FDCWD, targetPath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer target.Release() + + return 0, nil, t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts) +} + +// Umount2 implements Linux syscall umount2(2). +func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Int() + + // Must have CAP_SYS_ADMIN in the mount namespace's associated user + // namespace. + // + // Currently, this is always the init task's user namespace. 
+ creds := t.Credentials() + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) { + return 0, nil, syserror.EPERM + } + + const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE + if flags&unsupported != 0 { + return 0, nil, syserror.EINVAL + } + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + opts := vfs.UmountOptions{ + Flags: uint32(flags), + } + + return 0, nil, t.Kernel().VFS().UmountAt(t, creds, &tpop.pop, &opts) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index ef8358b8a..7b6e7571a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -90,8 +90,8 @@ func Override() { s.Table[138] = syscalls.Supported("fstatfs", Fstatfs) s.Table[161] = syscalls.Supported("chroot", Chroot) s.Table[162] = syscalls.Supported("sync", Sync) - delete(s.Table, 165) // mount - delete(s.Table, 166) // umount2 + s.Table[165] = syscalls.Supported("mount", Mount) + s.Table[166] = syscalls.Supported("umount2", Umount2) delete(s.Table, 187) // readahead s.Table[188] = syscalls.Supported("setxattr", Setxattr) s.Table[189] = syscalls.Supported("lsetxattr", Lsetxattr) diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go index 286510195..8882fa84a 100644 --- a/pkg/sentry/vfs/genericfstree/genericfstree.go +++ b/pkg/sentry/vfs/genericfstree/genericfstree.go @@ -43,7 +43,7 @@ type Dentry struct { // IsAncestorDentry returns true if d is an ancestor of d2; that is, d is // either d2's parent or an ancestor of d2's parent. func IsAncestorDentry(d, d2 *Dentry) bool { - for { + for d2 != nil { // Stop at root, where d2.parent == nil. 
if d2.parent == d { return true } @@ -52,6 +52,7 @@ func IsAncestorDentry(d, d2 *Dentry) bool { } d2 = d2.parent } + return false } // ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d. diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 3adb7c97d..32f901bd8 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -55,6 +55,10 @@ type Mount struct { // ID is the immutable mount ID. ID uint64 + // Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except + // for MS_RDONLY which is tracked in "writers". Immutable. + Flags MountFlags + // key is protected by VirtualFilesystem.mountMu and // VirtualFilesystem.mounts.seq, and may be nil. References are held on // key.parent and key.point if they are not nil. @@ -81,10 +85,6 @@ type Mount struct { // umounted is true. umounted is protected by VirtualFilesystem.mountMu. umounted bool - // flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except - // for MS_RDONLY which is tracked in "writers". - flags MountFlags - // The lower 63 bits of writers is the number of calls to // Mount.CheckBeginWrite() that have not yet been paired with a call to // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect. @@ -95,10 +95,10 @@ type Mount struct { func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount { mnt := &Mount{ ID: atomic.AddUint64(&vfs.lastMountID, 1), + Flags: opts.Flags, vfs: vfs, fs: fs, root: root, - flags: opts.Flags, ns: mntns, refs: 1, } @@ -113,13 +113,12 @@ func (mnt *Mount) Options() MountOptions { mnt.vfs.mountMu.Lock() defer mnt.vfs.mountMu.Unlock() return MountOptions{ - Flags: mnt.flags, + Flags: mnt.Flags, ReadOnly: mnt.readOnly(), } } -// A MountNamespace is a collection of Mounts. -// +// A MountNamespace is a collection of Mounts.// // MountNamespaces are reference-counted. 
Unless otherwise specified, all // MountNamespace methods require that a reference is held. // @@ -127,6 +126,9 @@ func (mnt *Mount) Options() MountOptions { // // +stateify savable type MountNamespace struct { + // Owner is the usernamespace that owns this mount namespace. + Owner *auth.UserNamespace + // root is the MountNamespace's root mount. root is immutable. root *Mount @@ -163,6 +165,7 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth return nil, err } mntns := &MountNamespace{ + Owner: creds.UserNamespace, refs: 1, mountpoints: make(map[*Dentry]uint32), } @@ -279,6 +282,9 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti } // MNT_FORCE is currently unimplemented except for the permission check. + // Force unmounting specifically requires CAP_SYS_ADMIN in the root user + // namespace, and not in the owner user namespace for the target mount. See + // fs/namespace.c:SYSCALL_DEFINE2(umount, ...) if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) { return syserror.EPERM } @@ -753,7 +759,10 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi if mnt.readOnly() { opts = "ro" } - if mnt.flags.NoExec { + if mnt.Flags.NoATime { + opts = ",noatime" + } + if mnt.Flags.NoExec { opts += ",noexec" } @@ -838,11 +847,12 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo if mnt.readOnly() { opts = "ro" } - if mnt.flags.NoExec { + if mnt.Flags.NoATime { + opts = ",noatime" + } + if mnt.Flags.NoExec { opts += ",noexec" } - // TODO(gvisor.dev/issue/1193): Add "noatime" if MS_NOATIME is - // set. fmt.Fprintf(buf, "%s ", opts) // (7) Optional fields: zero or more fields of the form "tag[:value]". 
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index 53d364c5c..f223aeda8 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -75,6 +75,10 @@ type MknodOptions struct { type MountFlags struct { // NoExec is equivalent to MS_NOEXEC. NoExec bool + + // NoATime is equivalent to MS_NOATIME and indicates that the + // filesystem should not update access time in-place. + NoATime bool } // MountOptions contains options to VirtualFilesystem.MountAt(). diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 52643a7c5..9acca8bc7 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -405,7 +405,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential vfs.putResolvingPath(rp) if opts.FileExec { - if fd.Mount().flags.NoExec { + if fd.Mount().Flags.NoExec { fd.DecRef() return nil, syserror.EACCES } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 6c84f0794..7ed6801b4 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -272,7 +272,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF case "ro": opts.ReadOnly = true case "noatime": - // TODO(gvisor.dev/issue/1193): Implement MS_NOATIME. + opts.Flags.NoATime = true case "noexec": opts.Flags.NoExec = true default: -- cgit v1.2.3 From 67565078bbcdd8f797206d996605df8f6658d00a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 9 Jun 2020 18:44:57 -0700 Subject: Implement flock(2) in VFS2 LockFD is the generic implementation that can be embedded in FileDescriptionImpl implementations. Unique lock ID is maintained in vfs.FileDescription and is created on demand. 
Updates #1480 PiperOrigin-RevId: 315604825 --- pkg/sentry/devices/memdev/full.go | 1 + pkg/sentry/devices/memdev/null.go | 1 + pkg/sentry/devices/memdev/random.go | 1 + pkg/sentry/devices/memdev/zero.go | 1 + pkg/sentry/fs/file.go | 2 +- pkg/sentry/fs/lock/lock.go | 41 +++----- pkg/sentry/fs/lock/lock_set_functions.go | 8 +- pkg/sentry/fs/lock/lock_test.go | 111 +++++++++++----------- pkg/sentry/fsimpl/devpts/BUILD | 1 + pkg/sentry/fsimpl/devpts/devpts.go | 5 +- pkg/sentry/fsimpl/devpts/master.go | 5 + pkg/sentry/fsimpl/devpts/slave.go | 5 + pkg/sentry/fsimpl/eventfd/eventfd.go | 1 + pkg/sentry/fsimpl/ext/BUILD | 1 + pkg/sentry/fsimpl/ext/file_description.go | 1 + pkg/sentry/fsimpl/ext/inode.go | 6 ++ pkg/sentry/fsimpl/ext/regular_file.go | 1 + pkg/sentry/fsimpl/gofer/BUILD | 2 + pkg/sentry/fsimpl/gofer/filesystem.go | 9 +- pkg/sentry/fsimpl/gofer/gofer.go | 23 +++++ pkg/sentry/fsimpl/gofer/special_file.go | 4 +- pkg/sentry/fsimpl/host/BUILD | 1 + pkg/sentry/fsimpl/host/host.go | 8 +- pkg/sentry/fsimpl/kernfs/BUILD | 2 + pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 10 +- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 9 +- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 5 +- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 13 ++- pkg/sentry/fsimpl/pipefs/BUILD | 1 + pkg/sentry/fsimpl/pipefs/pipefs.go | 6 +- pkg/sentry/fsimpl/proc/BUILD | 1 + pkg/sentry/fsimpl/proc/subtasks.go | 5 +- pkg/sentry/fsimpl/proc/task.go | 5 +- pkg/sentry/fsimpl/proc/task_fds.go | 7 +- pkg/sentry/fsimpl/proc/task_files.go | 10 +- pkg/sentry/fsimpl/proc/tasks.go | 5 +- pkg/sentry/fsimpl/signalfd/signalfd.go | 1 + pkg/sentry/fsimpl/sys/BUILD | 1 + pkg/sentry/fsimpl/sys/sys.go | 7 +- pkg/sentry/fsimpl/timerfd/timerfd.go | 1 + pkg/sentry/fsimpl/tmpfs/filesystem.go | 6 +- pkg/sentry/fsimpl/tmpfs/regular_file.go | 23 ----- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 41 +------- pkg/sentry/kernel/fd_table.go | 15 +-- pkg/sentry/kernel/kernel.go | 5 - pkg/sentry/kernel/pipe/BUILD | 1 + pkg/sentry/kernel/pipe/vfs.go 
| 13 ++- pkg/sentry/socket/hostinet/BUILD | 1 + pkg/sentry/socket/hostinet/socket_vfs2.go | 3 + pkg/sentry/socket/netlink/BUILD | 1 + pkg/sentry/socket/netlink/socket_vfs2.go | 8 +- pkg/sentry/socket/netstack/BUILD | 1 + pkg/sentry/socket/netstack/netstack_vfs2.go | 3 + pkg/sentry/socket/unix/BUILD | 1 + pkg/sentry/socket/unix/unix_vfs2.go | 7 +- pkg/sentry/syscalls/linux/sys_file.go | 39 ++------ pkg/sentry/syscalls/linux/vfs2/BUILD | 2 + pkg/sentry/syscalls/linux/vfs2/lock.go | 64 +++++++++++++ pkg/sentry/syscalls/linux/vfs2/vfs2.go | 2 +- pkg/sentry/vfs/BUILD | 1 + pkg/sentry/vfs/epoll.go | 1 + pkg/sentry/vfs/file_description.go | 25 ++++- pkg/sentry/vfs/file_description_impl_util.go | 80 ++++++++++++---- pkg/sentry/vfs/file_description_impl_util_test.go | 1 + pkg/sentry/vfs/inotify.go | 1 + test/syscalls/linux/BUILD | 3 + test/syscalls/linux/flock.cc | 75 ++++++++++++--- 67 files changed, 470 insertions(+), 281 deletions(-) create mode 100644 pkg/sentry/syscalls/linux/vfs2/lock.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go index c7e197691..af66fe4dc 100644 --- a/pkg/sentry/devices/memdev/full.go +++ b/pkg/sentry/devices/memdev/full.go @@ -42,6 +42,7 @@ type fullFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD } // Release implements vfs.FileDescriptionImpl.Release. diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go index 33d060d02..92d3d71be 100644 --- a/pkg/sentry/devices/memdev/null.go +++ b/pkg/sentry/devices/memdev/null.go @@ -43,6 +43,7 @@ type nullFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD } // Release implements vfs.FileDescriptionImpl.Release. 
diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go index acfa23149..6b81da5ef 100644 --- a/pkg/sentry/devices/memdev/random.go +++ b/pkg/sentry/devices/memdev/random.go @@ -48,6 +48,7 @@ type randomFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD // off is the "file offset". off is accessed using atomic memory // operations. diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go index 3b1372b9e..c6f15054d 100644 --- a/pkg/sentry/devices/memdev/zero.go +++ b/pkg/sentry/devices/memdev/zero.go @@ -44,6 +44,7 @@ type zeroFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD } // Release implements vfs.FileDescriptionImpl.Release. diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 2a278fbe3..ca41520b4 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -146,7 +146,7 @@ func (f *File) DecRef() { f.DecRefWithDestructor(func() { // Drop BSD style locks. lockRng := lock.LockRange{Start: 0, End: lock.LockEOF} - f.Dirent.Inode.LockCtx.BSD.UnlockRegion(lock.UniqueID(f.UniqueID), lockRng) + f.Dirent.Inode.LockCtx.BSD.UnlockRegion(f, lockRng) // Release resources held by the FileOperations. f.FileOperations.Release() diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index 926538d90..8a5d9c7eb 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -62,7 +62,7 @@ import ( type LockType int // UniqueID is a unique identifier of the holder of a regional file lock. -type UniqueID uint64 +type UniqueID interface{} const ( // ReadLock describes a POSIX regional file lock to be taken @@ -98,12 +98,7 @@ type Lock struct { // If len(Readers) > 0 then HasWriter must be false. Readers map[UniqueID]bool - // HasWriter indicates that this is a write lock held by a single - // UniqueID. 
- HasWriter bool - - // Writer is only valid if HasWriter is true. It identifies a - // single write lock holder. + // Writer holds the writer unique ID. It's nil if there are no writers. Writer UniqueID } @@ -186,7 +181,6 @@ func makeLock(uid UniqueID, t LockType) Lock { case ReadLock: value.Readers[uid] = true case WriteLock: - value.HasWriter = true value.Writer = uid default: panic(fmt.Sprintf("makeLock: invalid lock type %d", t)) @@ -196,10 +190,7 @@ func makeLock(uid UniqueID, t LockType) Lock { // isHeld returns true if uid is a holder of Lock. func (l Lock) isHeld(uid UniqueID) bool { - if l.HasWriter && l.Writer == uid { - return true - } - return l.Readers[uid] + return l.Writer == uid || l.Readers[uid] } // lock sets uid as a holder of a typed lock on Lock. @@ -214,20 +205,20 @@ func (l *Lock) lock(uid UniqueID, t LockType) { } // We cannot downgrade a write lock to a read lock unless the // uid is the same. - if l.HasWriter { + if l.Writer != nil { if l.Writer != uid { panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer)) } // Ensure that there is only one reader if upgrading. l.Readers = make(map[UniqueID]bool) // Ensure that there is no longer a writer. - l.HasWriter = false + l.Writer = nil } l.Readers[uid] = true return case WriteLock: // If we are already the writer, then this is a no-op. - if l.HasWriter && l.Writer == uid { + if l.Writer == uid { return } // We can only upgrade a read lock to a write lock if there @@ -243,7 +234,6 @@ func (l *Lock) lock(uid UniqueID, t LockType) { } // Ensure that there is only a writer. l.Readers = make(map[UniqueID]bool) - l.HasWriter = true l.Writer = uid default: panic(fmt.Sprintf("lock: invalid lock type %d", t)) @@ -277,9 +267,8 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { switch t { case ReadLock: return l.lockable(r, func(value Lock) bool { - // If there is no writer, there's no problem adding - // another reader. 
- if !value.HasWriter { + // If there is no writer, there's no problem adding another reader. + if value.Writer == nil { return true } // If there is a writer, then it must be the same uid @@ -289,10 +278,9 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { case WriteLock: return l.lockable(r, func(value Lock) bool { // If there are only readers. - if !value.HasWriter { - // Then this uid can only take a write lock if - // this is a private upgrade, meaning that the - // only reader is uid. + if value.Writer == nil { + // Then this uid can only take a write lock if this is a private + // upgrade, meaning that the only reader is uid. return len(value.Readers) == 1 && value.Readers[uid] } // If the uid is already a writer on this region, then @@ -304,7 +292,8 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { } } -// lock returns true if uid took a lock of type t on the entire range of LockRange. +// lock returns true if uid took a lock of type t on the entire range of +// LockRange. // // Preconditions: r.Start <= r.End (will panic otherwise). func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool { @@ -339,7 +328,7 @@ func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool { seg, _ = l.SplitUnchecked(seg, r.End) } - // Set the lock on the segment. This is guaranteed to + // Set the lock on the segment. This is guaranteed to // always be safe, given canLock above. value := seg.ValuePtr() value.lock(uid, t) @@ -386,7 +375,7 @@ func (l *LockSet) unlock(uid UniqueID, r LockRange) { value := seg.Value() var remove bool - if value.HasWriter && value.Writer == uid { + if value.Writer == uid { // If we are unlocking a writer, then since there can // only ever be one writer and no readers, then this // lock should always be removed from the set. 
diff --git a/pkg/sentry/fs/lock/lock_set_functions.go b/pkg/sentry/fs/lock/lock_set_functions.go index 8a3ace0c1..50a16e662 100644 --- a/pkg/sentry/fs/lock/lock_set_functions.go +++ b/pkg/sentry/fs/lock/lock_set_functions.go @@ -44,14 +44,9 @@ func (lockSetFunctions) Merge(r1 LockRange, val1 Lock, r2 LockRange, val2 Lock) return Lock{}, false } } - if val1.HasWriter != val2.HasWriter { + if val1.Writer != val2.Writer { return Lock{}, false } - if val1.HasWriter { - if val1.Writer != val2.Writer { - return Lock{}, false - } - } return val1, true } @@ -62,7 +57,6 @@ func (lockSetFunctions) Split(r LockRange, val Lock, split uint64) (Lock, Lock) for k, v := range val.Readers { val0.Readers[k] = v } - val0.HasWriter = val.HasWriter val0.Writer = val.Writer return val, val0 diff --git a/pkg/sentry/fs/lock/lock_test.go b/pkg/sentry/fs/lock/lock_test.go index ba002aeb7..fad90984b 100644 --- a/pkg/sentry/fs/lock/lock_test.go +++ b/pkg/sentry/fs/lock/lock_test.go @@ -42,9 +42,6 @@ func equals(e0, e1 []entry) bool { if !reflect.DeepEqual(e0[i].LockRange, e1[i].LockRange) { return false } - if e0[i].Lock.HasWriter != e1[i].Lock.HasWriter { - return false - } if e0[i].Lock.Writer != e1[i].Lock.Writer { return false } @@ -105,7 +102,7 @@ func TestCanLock(t *testing.T) { LockRange: LockRange{2048, 3072}, }, { - Lock: Lock{HasWriter: true, Writer: 1}, + Lock: Lock{Writer: 1}, LockRange: LockRange{3072, 4096}, }, }) @@ -241,7 +238,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -254,7 +251,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -273,7 +270,7 @@ func TestSetLock(t *testing.T) { LockRange: LockRange{0, 4096}, }, { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{4096, 
LockEOF}, }, }, @@ -301,7 +298,7 @@ func TestSetLock(t *testing.T) { // 0 4096 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 4096}, }, { @@ -318,7 +315,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -550,7 +547,7 @@ func TestSetLock(t *testing.T) { LockRange: LockRange{0, 1024}, }, { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{1024, 4096}, }, { @@ -594,7 +591,7 @@ func TestSetLock(t *testing.T) { LockRange: LockRange{0, 1024}, }, { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{1024, 3072}, }, { @@ -633,7 +630,7 @@ func TestSetLock(t *testing.T) { // 0 1024 2048 4096 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 1024}, }, { @@ -663,11 +660,11 @@ func TestSetLock(t *testing.T) { // 0 1024 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{1024, LockEOF}, }, }, @@ -675,28 +672,30 @@ func TestSetLock(t *testing.T) { } for _, test := range tests { - l := fill(test.before) + t.Run(test.name, func(t *testing.T) { + l := fill(test.before) - r := LockRange{Start: test.start, End: test.end} - success := l.lock(test.uid, test.lockType, r) - var got []entry - for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - got = append(got, entry{ - Lock: seg.Value(), - LockRange: seg.Range(), - }) - } + r := LockRange{Start: test.start, End: test.end} + success := l.lock(test.uid, test.lockType, r) + var got []entry + for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + got = append(got, entry{ + Lock: seg.Value(), + LockRange: 
seg.Range(), + }) + } - if success != test.success { - t.Errorf("%s: setlock(%v, %+v, %d, %d) got success %v, want %v", test.name, test.before, r, test.uid, test.lockType, success, test.success) - continue - } + if success != test.success { + t.Errorf("setlock(%v, %+v, %d, %d) got success %v, want %v", test.before, r, test.uid, test.lockType, success, test.success) + return + } - if success { - if !equals(got, test.after) { - t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after) + if success { + if !equals(got, test.after) { + t.Errorf("got set %+v, want %+v", got, test.after) + } } - } + }) } } @@ -782,7 +781,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -824,7 +823,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -837,7 +836,7 @@ func TestUnlock(t *testing.T) { // 0 4096 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -876,7 +875,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -889,7 +888,7 @@ func TestUnlock(t *testing.T) { // 0 4096 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 4096}, }, }, @@ -906,7 +905,7 @@ func TestUnlock(t *testing.T) { LockRange: LockRange{0, 1024}, }, { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{1024, 4096}, }, { @@ -974,7 +973,7 @@ func TestUnlock(t *testing.T) { // 0 1024 4096 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 1024}, }, { @@ -991,7 +990,7 @@ 
func TestUnlock(t *testing.T) { // 0 8 4096 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 8}, }, { @@ -1008,7 +1007,7 @@ func TestUnlock(t *testing.T) { // 0 1024 4096 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 1024}, }, { @@ -1025,7 +1024,7 @@ func TestUnlock(t *testing.T) { // 0 1024 4096 8192 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 1024}, }, { @@ -1041,19 +1040,21 @@ func TestUnlock(t *testing.T) { } for _, test := range tests { - l := fill(test.before) + t.Run(test.name, func(t *testing.T) { + l := fill(test.before) - r := LockRange{Start: test.start, End: test.end} - l.unlock(test.uid, r) - var got []entry - for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - got = append(got, entry{ - Lock: seg.Value(), - LockRange: seg.Range(), - }) - } - if !equals(got, test.after) { - t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after) - } + r := LockRange{Start: test.start, End: test.end} + l.unlock(test.uid, r) + var got []entry + for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + got = append(got, entry{ + Lock: seg.Value(), + LockRange: seg.Range(), + }) + } + if !equals(got, test.after) { + t.Errorf("got set %+v, want %+v", got, test.after) + } + }) } } diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD index 585764223..cf440dce8 100644 --- a/pkg/sentry/fsimpl/devpts/BUILD +++ b/pkg/sentry/fsimpl/devpts/BUILD @@ -23,6 +23,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/unimpl", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index c03c65445..9b0e0cca2 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ 
b/pkg/sentry/fsimpl/devpts/devpts.go @@ -28,6 +28,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -116,6 +117,8 @@ type rootInode struct { kernfs.InodeNotSymlink kernfs.OrderedChildren + locks lock.FileLocks + // Keep a reference to this inode's dentry. dentry kernfs.Dentry @@ -183,7 +186,7 @@ func (i *rootInode) masterClose(t *Terminal) { // Open implements kernfs.Inode.Open. func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 7a7ce5d81..1d22adbe3 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -34,6 +35,8 @@ type masterInode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink + locks lock.FileLocks + // Keep a reference to this inode's dentry. 
dentry kernfs.Dentry @@ -55,6 +58,7 @@ func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vf inode: mi, t: t, } + fd.LockFD.Init(&mi.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { mi.DecRef() return nil, err @@ -85,6 +89,7 @@ func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds type masterFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD inode *masterInode t *Terminal diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go index 526cd406c..7fe475080 100644 --- a/pkg/sentry/fsimpl/devpts/slave.go +++ b/pkg/sentry/fsimpl/devpts/slave.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -33,6 +34,8 @@ type slaveInode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink + locks lock.FileLocks + // Keep a reference to this inode's dentry. 
dentry kernfs.Dentry @@ -51,6 +54,7 @@ func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs fd := &slaveFileDescription{ inode: si, } + fd.LockFD.Init(&si.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { si.DecRef() return nil, err @@ -91,6 +95,7 @@ func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds type slaveFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD inode *slaveInode } diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go index c573d7935..d12d78b84 100644 --- a/pkg/sentry/fsimpl/eventfd/eventfd.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd.go @@ -37,6 +37,7 @@ type EventFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD // queue is used to notify interested parties when the event object // becomes readable or writable. 
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index ff861d0fe..973fa0def 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -60,6 +60,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/syscalls/linux", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go index 92f7da40d..90b086468 100644 --- a/pkg/sentry/fsimpl/ext/file_description.go +++ b/pkg/sentry/fsimpl/ext/file_description.go @@ -26,6 +26,7 @@ import ( type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD } func (fd *fileDescription) filesystem() *filesystem { diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 485f86f4b..e4b434b13 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -54,6 +55,8 @@ type inode struct { // diskInode gives us access to the inode struct on disk. Immutable. diskInode disklayout.Inode + locks lock.FileLocks + // This is immutable. The first field of the implementations must have inode // as the first field to ensure temporality. 
impl interface{} @@ -157,6 +160,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt switch in.impl.(type) { case *regularFile: var fd regularFileFD + fd.LockFD.Init(&in.locks) if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } @@ -168,6 +172,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt return nil, syserror.EISDIR } var fd directoryFD + fd.LockFD.Init(&in.locks) if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } @@ -178,6 +183,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt return nil, syserror.ELOOP } var fd symlinkFD + fd.LockFD.Init(&in.locks) fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil default: diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go index 30135ddb0..f7015c44f 100644 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -77,6 +77,7 @@ func (in *inode) isRegular() bool { // vfs.FileDescriptionImpl. type regularFileFD struct { fileDescription + vfs.LockFD // off is the file offset. off is accessed using atomic memory operations. 
off int64 diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index f5f35a3bc..5cdeeaeb5 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -54,6 +54,7 @@ go_library( "//pkg/p9", "//pkg/safemem", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/host", "//pkg/sentry/hostfd", "//pkg/sentry/kernel", @@ -68,6 +69,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/syserr", "//pkg/syserror", "//pkg/unet", diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 36e0e1856..40933b74b 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -801,6 +801,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf return nil, err } fd := &regularFileFD{} + fd.LockFD.Init(&d.locks) if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ AllowDirectIO: true, }); err != nil { @@ -826,6 +827,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf } } fd := &directoryFD{} + fd.LockFD.Init(&d.locks) if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } @@ -842,7 +844,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf } case linux.S_IFIFO: if d.isSynthetic() { - return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags) + return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks) } } return d.openSpecialFileLocked(ctx, mnt, opts) @@ -902,7 +904,7 @@ retry: return nil, err } } - fd, err := newSpecialFileFD(h, mnt, d, opts.Flags) + fd, err := newSpecialFileFD(h, mnt, d, &d.locks, opts.Flags) if err != nil { h.close(ctx) return nil, err @@ -989,6 +991,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving var childVFSFD *vfs.FileDescription if
useRegularFileFD { fd := &regularFileFD{} + fd.LockFD.Init(&child.locks) if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ AllowDirectIO: true, }); err != nil { @@ -1003,7 +1006,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving if fdobj != nil { h.fd = int32(fdobj.Release()) } - fd, err := newSpecialFileFD(h, mnt, child, opts.Flags) + fd, err := newSpecialFileFD(h, mnt, child, &d.locks, opts.Flags) if err != nil { h.close(ctx) return nil, err diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 3f3bd56f0..0d88a328e 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -45,6 +45,7 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -52,6 +53,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/usermem" @@ -662,6 +664,8 @@ type dentry struct { // If this dentry represents a synthetic named pipe, pipe is the pipe // endpoint bound to this file.
pipe *pipe.VFSPipe + + locks lock.FileLocks } // dentryAttrMask returns a p9.AttrMask enabling all attributes used by the @@ -1366,6 +1370,9 @@ func (d *dentry) decLinks() { type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD + + lockLogging sync.Once } func (fd *fileDescription) filesystem() *filesystem { @@ -1416,3 +1423,19 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { return fd.dentry().removexattr(ctx, auth.CredentialsFromContext(ctx), name) } + +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + fd.lockLogging.Do(func() { + log.Infof("File lock using gofer file handled internally.") + }) + return fd.LockFD.LockBSD(ctx, uid, t, block) +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. 
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { + fd.lockLogging.Do(func() { + log.Infof("Range lock using gofer file handled internally.") + }) + return fd.LockFD.LockPOSIX(ctx, uid, t, rng, block) +} diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index ff6126b87..289efdd25 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -51,7 +52,7 @@ type specialFileFD struct { off int64 } -func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) { +func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *lock.FileLocks, flags uint32) (*specialFileFD, error) { ftype := d.fileType() seekable := ftype == linux.S_IFREG mayBlock := ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK @@ -60,6 +61,7 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*speci seekable: seekable, mayBlock: mayBlock, } + fd.LockFD.Init(locks) if mayBlock && h.fd >= 0 { if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index ca0fe6d2b..54f16ad63 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -39,6 +39,7 @@ go_library( "//pkg/sentry/unimpl", "//pkg/sentry/uniqueid", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 18b127521..5ec5100b8 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ 
-34,6 +34,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -182,6 +183,8 @@ type inode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink + locks lock.FileLocks + // When the reference count reaches zero, the host fd is closed. refs.AtomicRefCount @@ -468,7 +471,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u return nil, err } // Currently, we only allow Unix sockets to be imported. - return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d) + return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d, &i.locks) } // TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that @@ -478,6 +481,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u fileDescription: fileDescription{inode: i}, termios: linux.DefaultSlaveTermios, } + fd.LockFD.Init(&i.locks) vfsfd := &fd.vfsfd if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { return nil, err @@ -486,6 +490,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u } fd := &fileDescription{inode: i} + fd.LockFD.Init(&i.locks) vfsfd := &fd.vfsfd if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { return nil, err @@ -497,6 +502,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD // inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but // cached to reduce indirections and casting. 
fileDescription does not hold diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index ef34cb28a..0299dbde9 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -49,6 +49,7 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", @@ -67,6 +68,7 @@ go_test( "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/syserror", "//pkg/usermem", "@com_github_google_go-cmp//cmp:go_default_library", diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 1568a9d49..6418de0a3 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -38,7 +39,8 @@ type DynamicBytesFile struct { InodeNotDirectory InodeNotSymlink - data vfs.DynamicBytesSource + locks lock.FileLocks + data vfs.DynamicBytesSource } var _ Inode = (*DynamicBytesFile)(nil) @@ -55,7 +57,7 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint // Open implements Inode.Open. 
func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &DynamicBytesFD{} - if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil { + if err := fd.Init(rp.Mount(), vfsd, f.data, &f.locks, opts.Flags); err != nil { return nil, err } return &fd.vfsfd, nil @@ -77,13 +79,15 @@ func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credent type DynamicBytesFD struct { vfs.FileDescriptionDefaultImpl vfs.DynamicBytesFileDescriptionImpl + vfs.LockFD vfsfd vfs.FileDescription inode Inode } // Init initializes a DynamicBytesFD. -func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) error { +func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *lock.FileLocks, flags uint32) error { + fd.LockFD.Init(locks) if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { return err } diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 8284e76a7..33a5968ca 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -42,6 +43,7 @@ import ( type GenericDirectoryFD struct { vfs.FileDescriptionDefaultImpl vfs.DirectoryFileDescriptionDefaultImpl + vfs.LockFD vfsfd vfs.FileDescription children *OrderedChildren @@ -55,9 +57,9 @@ type GenericDirectoryFD struct { // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its // dentry. 
-func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) { +func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *lock.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) { fd := &GenericDirectoryFD{} - if err := fd.Init(children, opts); err != nil { + if err := fd.Init(children, locks, opts); err != nil { return nil, err } if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { @@ -69,11 +71,12 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildre // Init initializes a GenericDirectoryFD. Use it when overriding // GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the // correct implementation. -func (fd *GenericDirectoryFD) Init(children *OrderedChildren, opts *vfs.OpenOptions) error { +func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *lock.FileLocks, opts *vfs.OpenOptions) error { if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { // Can't open directories for writing. return syserror.EISDIR } + fd.LockFD.Init(locks) fd.children = children return nil } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 982daa2e6..0e4927215 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -555,6 +556,8 @@ type StaticDirectory struct { InodeAttrs InodeNoDynamicLookup OrderedChildren + + locks lock.FileLocks } var _ Inode = (*StaticDirectory)(nil) @@ -584,7 +587,7 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint3 // Open implements kernfs.Inode. 
func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &opts) + fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 412cf6ac9..6749facf7 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -100,8 +101,10 @@ type readonlyDir struct { kernfs.InodeNotSymlink kernfs.InodeNoDynamicLookup kernfs.InodeDirectoryNoNewChildren - kernfs.OrderedChildren + + locks lock.FileLocks + dentry kernfs.Dentry } @@ -117,7 +120,7 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod } func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts) if err != nil { return nil, err } @@ -128,10 +131,12 @@ type dir struct { attrs kernfs.InodeNotSymlink kernfs.InodeNoDynamicLookup + kernfs.OrderedChildren + + locks lock.FileLocks fs *filesystem dentry kernfs.Dentry - kernfs.OrderedChildren } func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { @@ -147,7 +152,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte } func (d *dir) Open(ctx context.Context, rp 
*vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD index 5950a2d59..c618dbe6c 100644 --- a/pkg/sentry/fsimpl/pipefs/BUILD +++ b/pkg/sentry/fsimpl/pipefs/BUILD @@ -15,6 +15,7 @@ go_library( "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/syserror", "//pkg/usermem", ], diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index cab771211..e4dabaa33 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -81,7 +82,8 @@ type inode struct { kernfs.InodeNotSymlink kernfs.InodeNoopRefCount - pipe *pipe.VFSPipe + locks lock.FileLocks + pipe *pipe.VFSPipe ino uint64 uid auth.KUID @@ -147,7 +149,7 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth. // Open implements kernfs.Inode.Open. 
func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags) + return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags, &i.locks) } // NewConnectedPipeFDs returns a pair of FileDescriptions representing the read diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 17c1342b5..351ba4ee9 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -35,6 +35,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/syserror", "//pkg/tcpip/header", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 36a911db4..e2cdb7ee9 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -37,6 +38,8 @@ type subtasksInode struct { kernfs.OrderedChildren kernfs.AlwaysValid + locks lock.FileLocks + fs *filesystem task *kernel.Task pidns *kernel.PIDNamespace @@ -153,7 +156,7 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro // Open implements kernfs.Inode. 
func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &subtasksFD{task: i.task} - if err := fd.Init(&i.OrderedChildren, &opts); err != nil { + if err := fd.Init(&i.OrderedChildren, &i.locks, &opts); err != nil { return nil, err } if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 482055db1..44078a765 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -38,6 +39,8 @@ type taskInode struct { kernfs.InodeAttrs kernfs.OrderedChildren + locks lock.FileLocks + task *kernel.Task } @@ -103,7 +106,7 @@ func (i *taskInode) Valid(ctx context.Context) bool { // Open implements kernfs.Inode. 
func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 44ccc9e4a..ef6c1d04f 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -53,6 +54,8 @@ func taskFDExists(t *kernel.Task, fd int32) bool { } type fdDir struct { + locks lock.FileLocks + fs *filesystem task *kernel.Task @@ -143,7 +146,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro // Open implements kernfs.Inode. func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) if err != nil { return nil, err } @@ -270,7 +273,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, // Open implements kernfs.Inode. 
func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 2f297e48a..e5eaa91cd 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -30,6 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -775,6 +776,8 @@ type namespaceInode struct { kernfs.InodeNoopRefCount kernfs.InodeNotDirectory kernfs.InodeNotSymlink + + locks lock.FileLocks } var _ kernfs.Inode = (*namespaceInode)(nil) @@ -791,6 +794,7 @@ func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32 func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &namespaceFD{inode: i} i.IncRef() + fd.LockFD.Init(&i.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } @@ -801,6 +805,7 @@ func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd * // /proc/[pid]/ns/*. type namespaceFD struct { vfs.FileDescriptionDefaultImpl + vfs.LockFD vfsfd vfs.FileDescription inode *namespaceInode @@ -825,8 +830,3 @@ func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) err func (fd *namespaceFD) Release() { fd.inode.DecRef() } - -// OnClose implements FileDescriptionImpl. 
-func (*namespaceFD) OnClose(context.Context) error { - return nil -} diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index b51d43954..58c8b9d05 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -43,6 +44,8 @@ type tasksInode struct { kernfs.OrderedChildren kernfs.AlwaysValid + locks lock.FileLocks + fs *filesystem pidns *kernel.PIDNamespace @@ -197,7 +200,7 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback // Open implements kernfs.Inode. func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go index d29ef3f83..242ba9b5d 100644 --- a/pkg/sentry/fsimpl/signalfd/signalfd.go +++ b/pkg/sentry/fsimpl/signalfd/signalfd.go @@ -31,6 +31,7 @@ type SignalFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD // target is the original signal target task. 
// diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index a741e2bb6..237f17def 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -15,6 +15,7 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/syserror", ], ) diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 0af373604..b84463d3a 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -98,8 +99,10 @@ type dir struct { kernfs.InodeNoDynamicLookup kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren - kernfs.OrderedChildren + + locks lock.FileLocks + dentry kernfs.Dentry } @@ -121,7 +124,7 @@ func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.Set // Open implements kernfs.Inode.Open. 
func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go index 60c92d626..2dc90d484 100644 --- a/pkg/sentry/fsimpl/timerfd/timerfd.go +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -32,6 +32,7 @@ type TimerFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD events waiter.Queue timer *ktime.Timer diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index e801680e8..72399b321 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -399,6 +399,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open switch impl := d.inode.impl.(type) { case *regularFile: var fd regularFileFD + fd.LockFD.Init(&d.inode.locks) if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } @@ -414,15 +415,16 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open return nil, syserror.EISDIR } var fd directoryFD + fd.LockFD.Init(&d.inode.locks) if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil case *symlink: - // Can't open symlinks without O_PATH (which is unimplemented). + // TODO(gvisor.dev/issue/2782): Can't open symlinks without O_PATH. 
return nil, syserror.ELOOP case *namedPipe: - return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags) + return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks) case *deviceFile: return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts) case *socketFile: diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 4f2ae04d2..77447b32c 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -366,28 +365,6 @@ func (fd *regularFileFD) Sync(ctx context.Context) error { return nil } -// LockBSD implements vfs.FileDescriptionImpl.LockBSD. -func (fd *regularFileFD) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error { - return fd.inode().lockBSD(uid, t, block) -} - -// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. -func (fd *regularFileFD) UnlockBSD(ctx context.Context, uid lock.UniqueID) error { - fd.inode().unlockBSD(uid) - return nil -} - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error { - return fd.inode().lockPOSIX(uid, t, rng, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error { - fd.inode().unlockPOSIX(uid, rng) - return nil -} - // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. 
func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { file := fd.inode().impl.(*regularFile) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 7ce1b86c7..71a7522af 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -36,7 +36,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -311,7 +310,6 @@ type inode struct { ctime int64 // nanoseconds mtime int64 // nanoseconds - // Advisory file locks, which lock at the inode level. locks lock.FileLocks // Inotify watches for this inode. @@ -539,44 +537,6 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu return nil } -// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. -func (i *inode) lockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { - switch i.impl.(type) { - case *regularFile: - return i.locks.LockBSD(uid, t, block) - } - return syserror.EBADF -} - -// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. -func (i *inode) unlockBSD(uid fslock.UniqueID) error { - switch i.impl.(type) { - case *regularFile: - i.locks.UnlockBSD(uid) - return nil - } - return syserror.EBADF -} - -// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. -func (i *inode) lockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { - switch i.impl.(type) { - case *regularFile: - return i.locks.LockPOSIX(uid, t, rng, block) - } - return syserror.EBADF -} - -// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. 
-func (i *inode) unlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) error { - switch i.impl.(type) { - case *regularFile: - i.locks.UnlockPOSIX(uid, rng) - return nil - } - return syserror.EBADF -} - // allocatedBlocksForSize returns the number of 512B blocks needed to // accommodate the given size in bytes, as appropriate for struct // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block @@ -708,6 +668,7 @@ func (i *inode) userXattrSupported() bool { type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD } func (fd *fileDescription) filesystem() *filesystem { diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index dbfcef0fa..b35afafe3 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -80,9 +80,6 @@ type FDTable struct { refs.AtomicRefCount k *Kernel - // uid is a unique identifier. - uid uint64 - // mu protects below. mu sync.Mutex `state:"nosave"` @@ -130,7 +127,7 @@ func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { // drop drops the table reference. func (f *FDTable) drop(file *fs.File) { // Release locks. - file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lock.UniqueID(f.uid), lock.LockRange{0, lock.LockEOF}) + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF}) // Send inotify events. d := file.Dirent @@ -164,17 +161,9 @@ func (f *FDTable) dropVFS2(file *vfs.FileDescription) { file.DecRef() } -// ID returns a unique identifier for this FDTable. -func (f *FDTable) ID() uint64 { - return f.uid -} - // NewFDTable allocates a new FDTable that may be used by tasks in k. 
func (k *Kernel) NewFDTable() *FDTable { - f := &FDTable{ - k: k, - uid: atomic.AddUint64(&k.fdMapUids, 1), - } + f := &FDTable{k: k} f.init() return f } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 5efeb3767..bcbeb6a39 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -194,11 +194,6 @@ type Kernel struct { // cpuClockTickerSetting is protected by runningTasksMu. cpuClockTickerSetting ktime.Setting - // fdMapUids is an ever-increasing counter for generating FDTable uids. - // - // fdMapUids is mutable, and is accessed using atomic memory operations. - fdMapUids uint64 - // uniqueID is used to generate unique identifiers. // // uniqueID is mutable, and is accessed using atomic memory operations. diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 7bfa9075a..0db546b98 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -27,6 +27,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 2602bed72..c0e9ee1f4 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -61,11 +62,13 @@ func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe { // // Preconditions: statusFlags should not contain an open access mode. 
func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) { - return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags) + // Connected pipes share the same locks. + locks := &lock.FileLocks{} + return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks) } // Open opens the pipe represented by vp. -func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, error) { +func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *lock.FileLocks) (*vfs.FileDescription, error) { vp.mu.Lock() defer vp.mu.Unlock() @@ -75,7 +78,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s return nil, syserror.EINVAL } - fd := vp.newFD(mnt, vfsd, statusFlags) + fd := vp.newFD(mnt, vfsd, statusFlags, locks) // Named pipes have special blocking semantics during open: // @@ -127,10 +130,11 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s } // Preconditions: vp.mu must be held. 
-func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *vfs.FileDescription { +func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *lock.FileLocks) *vfs.FileDescription { fd := &VFSPipeFD{ pipe: &vp.pipe, } + fd.LockFD.Init(locks) fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, @@ -159,6 +163,7 @@ type VFSPipeFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD pipe *Pipe } diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index e82d6cd1e..60c9896fc 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -34,6 +34,7 @@ go_library( "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip/stack", diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go index 677743113..027add1fd 100644 --- a/pkg/sentry/socket/hostinet/socket_vfs2.go +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -35,6 +36,7 @@ import ( type socketVFS2 struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD // We store metadata for hostinet sockets internally. Technically, we should // access metadata (e.g. 
through stat, chmod) on the host for correctness, @@ -59,6 +61,7 @@ func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol in fd: fd, }, } + s.LockFD.Init(&lock.FileLocks{}) if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { return nil, syserr.FromError(err) } diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 7212d8644..420e573c9 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -29,6 +29,7 @@ go_library( "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go index b854bf990..8bfee5193 100644 --- a/pkg/sentry/socket/netlink/socket_vfs2.go +++ b/pkg/sentry/socket/netlink/socket_vfs2.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -40,6 +41,7 @@ type SocketVFS2 struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD socketOpsCommon } @@ -66,7 +68,7 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV return nil, err } - return &SocketVFS2{ + fd := &SocketVFS2{ socketOpsCommon: socketOpsCommon{ ports: t.Kernel().NetlinkPorts(), protocol: protocol, @@ -75,7 +77,9 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV connection: connection, sendBufferSize: defaultSendBufferSize, }, - }, nil + } + fd.LockFD.Init(&lock.FileLocks{}) + return fd, nil } // Readiness implements waiter.Waitable.Readiness. 
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index 8f0f5466e..0f592ecc3 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -37,6 +37,7 @@ go_library( "//pkg/sentry/socket/netfilter", "//pkg/sentry/unimpl", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index fcd8013c0..1412a4810 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -38,6 +39,7 @@ type SocketVFS2 struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD socketOpsCommon } @@ -64,6 +66,7 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu protocol: protocol, }, } + s.LockFD.Init(&lock.FileLocks{}) vfsfd := &s.vfsfd if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ DenyPRead: true, diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index de2cc4bdf..7d4cc80fe 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -29,6 +29,7 @@ go_library( "//pkg/sentry/socket/netstack", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 45e109361..8c32371a2 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/netstack" 
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -39,6 +40,7 @@ type SocketVFS2 struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD socketOpsCommon } @@ -51,7 +53,7 @@ func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) mnt := t.Kernel().SocketMount() d := sockfs.NewDentry(t.Credentials(), mnt) - fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d) + fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d, &lock.FileLocks{}) if err != nil { return nil, syserr.FromError(err) } @@ -60,7 +62,7 @@ func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) // NewFileDescription creates and returns a socket file description // corresponding to the given mount and dentry. -func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry) (*vfs.FileDescription, error) { +func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry, locks *lock.FileLocks) (*vfs.FileDescription, error) { // You can create AF_UNIX, SOCK_RAW sockets. They're the same as // SOCK_DGRAM and don't require CAP_NET_RAW. 
if stype == linux.SOCK_RAW { @@ -73,6 +75,7 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3 stype: stype, }, } + sock.LockFD.Init(locks) vfsfd := &sock.vfsfd if err := vfsfd.Init(sock, flags, mnt, d, &vfs.FileDescriptionOptions{ DenyPRead: true, diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 35a98212a..8347617bd 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -998,9 +998,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err } - // The lock uid is that of the Task's FDTable. - lockUniqueID := lock.UniqueID(t.FDTable().ID()) - // These locks don't block; execute the non-blocking operation using the inode's lock // context directly. switch flock.Type { @@ -1010,12 +1007,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, nil) { return 0, nil, syserror.EAGAIN } } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, t) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, t) { return 0, nil, syserror.EINTR } } @@ -1026,18 +1023,18 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. 
- if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, nil) { return 0, nil, syserror.EAGAIN } } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, t) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, t) { return 0, nil, syserror.EINTR } } return 0, nil, nil case linux.F_UNLCK: - file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lockUniqueID, rng) + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(t.FDTable(), rng) return 0, nil, nil default: return 0, nil, syserror.EINVAL @@ -2157,22 +2154,6 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall nonblocking := operation&linux.LOCK_NB != 0 operation &^= linux.LOCK_NB - // flock(2): - // Locks created by flock() are associated with an open file table entry. This means that - // duplicate file descriptors (created by, for example, fork(2) or dup(2)) refer to the - // same lock, and this lock may be modified or released using any of these descriptors. Furthermore, - // the lock is released either by an explicit LOCK_UN operation on any of these duplicate - // descriptors, or when all such descriptors have been closed. - // - // If a process uses open(2) (or similar) to obtain more than one descriptor for the same file, - // these descriptors are treated independently by flock(). An attempt to lock the file using - // one of these file descriptors may be denied by a lock that the calling process has already placed via - // another descriptor. - // - // We use the File UniqueID as the lock UniqueID because it needs to reference the same lock across dup(2) - // and fork(2). - lockUniqueID := lock.UniqueID(file.UniqueID) - // A BSD style lock spans the entire file. 
rng := lock.LockRange{ Start: 0, @@ -2183,29 +2164,29 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.LOCK_EX: if nonblocking { // Since we're nonblocking we pass a nil lock.Blocker implementation. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, nil) { return 0, nil, syserror.EWOULDBLOCK } } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, t) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, t) { return 0, nil, syserror.EINTR } } case linux.LOCK_SH: if nonblocking { // Since we're nonblocking we pass a nil lock.Blocker implementation. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, nil) { return 0, nil, syserror.EWOULDBLOCK } } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, t) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, t) { return 0, nil, syserror.EINTR } } case linux.LOCK_UN: - file.Dirent.Inode.LockCtx.BSD.UnlockRegion(lockUniqueID, rng) + file.Dirent.Inode.LockCtx.BSD.UnlockRegion(file, rng) default: // flock(2): EINVAL operation is invalid. 
return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index c0d005247..9f93f4354 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -14,6 +14,7 @@ go_library( "getdents.go", "inotify.go", "ioctl.go", + "lock.go", "memfd.go", "mmap.go", "mount.go", @@ -42,6 +43,7 @@ go_library( "//pkg/fspath", "//pkg/gohacks", "//pkg/sentry/arch", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/eventfd", "//pkg/sentry/fsimpl/pipefs", diff --git a/pkg/sentry/syscalls/linux/vfs2/lock.go b/pkg/sentry/syscalls/linux/vfs2/lock.go new file mode 100644 index 000000000..bf19028c4 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/lock.go @@ -0,0 +1,64 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Flock implements linux syscall flock(2). +func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + operation := args[1].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + // flock(2): EBADF fd is not an open file descriptor. 
+ return 0, nil, syserror.EBADF + } + defer file.DecRef() + + nonblocking := operation&linux.LOCK_NB != 0 + operation &^= linux.LOCK_NB + + var blocker lock.Blocker + if !nonblocking { + blocker = t + } + + switch operation { + case linux.LOCK_EX: + if err := file.LockBSD(t, lock.WriteLock, blocker); err != nil { + return 0, nil, err + } + case linux.LOCK_SH: + if err := file.LockBSD(t, lock.ReadLock, blocker); err != nil { + return 0, nil, err + } + case linux.LOCK_UN: + if err := file.UnlockBSD(t); err != nil { + return 0, nil, err + } + default: + // flock(2): EINVAL operation is invalid. + return 0, nil, syserror.EINVAL + } + + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 7b6e7571a..954c82f97 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -62,7 +62,7 @@ func Override() { s.Table[55] = syscalls.Supported("getsockopt", GetSockOpt) s.Table[59] = syscalls.Supported("execve", Execve) s.Table[72] = syscalls.Supported("fcntl", Fcntl) - delete(s.Table, 73) // flock + s.Table[73] = syscalls.Supported("flock", Flock) s.Table[74] = syscalls.Supported("fsync", Fsync) s.Table[75] = syscalls.Supported("fdatasync", Fdatasync) s.Table[76] = syscalls.Supported("truncate", Truncate) diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 774cc66cc..16d9f3a28 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -72,6 +72,7 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/uniqueid", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index 8297f964b..599c3131c 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -31,6 +31,7 @@ type EpollInstance struct { vfsfd FileDescription FileDescriptionDefaultImpl DentryMetadataFileDescriptionImpl + NoLockFD // q holds waiters on this EpollInstance.
q waiter.Queue diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index bb294563d..97b9b18d7 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -73,6 +73,8 @@ type FileDescription struct { // writable is analogous to Linux's FMODE_WRITE. writable bool + usedLockBSD uint32 + // impl is the FileDescriptionImpl associated with this Filesystem. impl is // immutable. This should be the last field in FileDescription. impl FileDescriptionImpl @@ -175,6 +177,12 @@ func (fd *FileDescription) DecRef() { } ep.interestMu.Unlock() } + + // If BSD locks were used, release any lock that it may have acquired. + if atomic.LoadUint32(&fd.usedLockBSD) != 0 { + fd.impl.UnlockBSD(context.Background(), fd) + } + // Release implementation resources. fd.impl.Release() if fd.writable { @@ -420,13 +428,9 @@ type FileDescriptionImpl interface { Removexattr(ctx context.Context, name string) error // LockBSD tries to acquire a BSD-style advisory file lock. - // - // TODO(gvisor.dev/issue/1480): BSD-style file locking LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error - // LockBSD releases a BSD-style advisory file lock. - // - // TODO(gvisor.dev/issue/1480): BSD-style file locking + // UnlockBSD releases a BSD-style advisory file lock. UnlockBSD(ctx context.Context, uid lock.UniqueID) error // LockPOSIX tries to acquire a POSIX-style advisory file lock. @@ -736,3 +740,14 @@ func (fd *FileDescription) InodeID() uint64 { func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error { return fd.Sync(ctx) } + +// LockBSD tries to acquire a BSD-style advisory file lock. +func (fd *FileDescription) LockBSD(ctx context.Context, lockType lock.LockType, blocker lock.Blocker) error { + atomic.StoreUint32(&fd.usedLockBSD, 1) + return fd.impl.LockBSD(ctx, fd, lockType, blocker) +} + +// UnlockBSD releases a BSD-style advisory file lock. 
+func (fd *FileDescription) UnlockBSD(ctx context.Context) error { + return fd.impl.UnlockBSD(ctx, fd) +} diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index f4c111926..af7213dfd 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -21,8 +21,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs/lock" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -153,26 +154,6 @@ func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) return syserror.ENOTSUP } -// LockBSD implements FileDescriptionImpl.LockBSD. -func (FileDescriptionDefaultImpl) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error { - return syserror.EBADF -} - -// UnlockBSD implements FileDescriptionImpl.UnlockBSD. -func (FileDescriptionDefaultImpl) UnlockBSD(ctx context.Context, uid lock.UniqueID) error { - return syserror.EBADF -} - -// LockPOSIX implements FileDescriptionImpl.LockPOSIX. -func (FileDescriptionDefaultImpl) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error { - return syserror.EBADF -} - -// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. -func (FileDescriptionDefaultImpl) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error { - return syserror.EBADF -} - // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl that always represent directories to obtain // implementations of non-directory I/O methods that return EISDIR. 
@@ -384,3 +365,60 @@ func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.M fd.IncRef() return nil } + +// LockFD may be used by most implementations of FileDescriptionImpl.Lock* +// functions. Caller must call Init(). +type LockFD struct { + locks *lock.FileLocks +} + +// Init initializes fd with FileLocks to use. +func (fd *LockFD) Init(locks *lock.FileLocks) { + fd.locks = locks +} + +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + return fd.locks.LockBSD(uid, t, block) +} + +// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { + fd.locks.UnlockBSD(uid) + return nil +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *LockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { + return fd.locks.LockPOSIX(uid, t, rng, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *LockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, rng fslock.LockRange) error { + fd.locks.UnlockPOSIX(uid, rng) + return nil +} + +// NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface +// returning ENOLCK. +type NoLockFD struct{} + +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + return syserror.ENOLCK +} + +// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { + return syserror.ENOLCK +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. 
+func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { + return syserror.ENOLCK +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, rng fslock.LockRange) error { + return syserror.ENOLCK +} diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 3a75d4d62..5061f6ac9 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -33,6 +33,7 @@ import ( type fileDescription struct { vfsfd FileDescription FileDescriptionDefaultImpl + NoLockFD } // genCount contains the number of times its DynamicBytesSource.Generate() diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 05a3051a4..7fa7d2d0c 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -57,6 +57,7 @@ type Inotify struct { vfsfd FileDescription FileDescriptionDefaultImpl DentryMetadataFileDescriptionImpl + NoLockFD // Unique identifier for this inotify instance. We don't just reuse the // inotify fd because fds can be duped. 
These should not be exposed to the diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index ae2aa44dc..4a1486e14 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -802,10 +802,13 @@ cc_binary( ], linkstatic = 1, deps = [ + ":socket_test_util", "//test/util:file_descriptor", "@com_google_absl//absl/strings", "@com_google_absl//absl/time", gtest, + "//test/util:epoll_util", + "//test/util:eventfd_util", "//test/util:posix_error", "//test/util:temp_path", "//test/util:test_main", diff --git a/test/syscalls/linux/flock.cc b/test/syscalls/linux/flock.cc index 3ecb8db8e..638a93979 100644 --- a/test/syscalls/linux/flock.cc +++ b/test/syscalls/linux/flock.cc @@ -21,6 +21,7 @@ #include "absl/time/clock.h" #include "absl/time/time.h" #include "test/syscalls/linux/file_base.h" +#include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" @@ -34,11 +35,6 @@ namespace { class FlockTest : public FileTest {}; -TEST_F(FlockTest, BadFD) { - // EBADF: fd is not an open file descriptor. - ASSERT_THAT(flock(-1, 0), SyscallFailsWithErrno(EBADF)); -} - TEST_F(FlockTest, InvalidOpCombinations) { // The operation cannot be both exclusive and shared. EXPECT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_SH | LOCK_NB), @@ -57,15 +53,6 @@ TEST_F(FlockTest, NoOperationSpecified) { SyscallFailsWithErrno(EINVAL)); } -TEST(FlockTestNoFixture, FlockSupportsPipes) { - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - - EXPECT_THAT(flock(fds[0], LOCK_EX | LOCK_NB), SyscallSucceeds()); - EXPECT_THAT(close(fds[0]), SyscallSucceeds()); - EXPECT_THAT(close(fds[1]), SyscallSucceeds()); -} - TEST_F(FlockTest, TestSimpleExLock) { // Test that we can obtain an exclusive lock (no other holders) // and that we can unlock it. 
@@ -583,6 +570,66 @@ TEST_F(FlockTest, BlockingLockFirstExclusiveSecondExclusive_NoRandomSave) { EXPECT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceeds()); } +TEST(FlockTestNoFixture, BadFD) { + // EBADF: fd is not an open file descriptor. + ASSERT_THAT(flock(-1, 0), SyscallFailsWithErrno(EBADF)); +} + +TEST(FlockTestNoFixture, FlockDir) { + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY, 0000)); + EXPECT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceeds()); +} + +TEST(FlockTestNoFixture, FlockSymlink) { + // TODO(gvisor.dev/issue/2782): Replace with IsRunningWithVFS1() when O_PATH + // is supported. + SKIP_IF(IsRunningOnGvisor()); + + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + auto symlink = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo(GetAbsoluteTestTmpdir(), file.path())); + + auto fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(symlink.path(), O_RDONLY | O_PATH, 0000)); + EXPECT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallFailsWithErrno(EBADF)); +} + +TEST(FlockTestNoFixture, FlockProc) { + auto fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/status", O_RDONLY, 0000)); + EXPECT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceeds()); +} + +TEST(FlockTestNoFixture, FlockPipe) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + + EXPECT_THAT(flock(fds[0], LOCK_EX | LOCK_NB), SyscallSucceeds()); + // Check that the pipe was locked above. 
+ EXPECT_THAT(flock(fds[1], LOCK_EX | LOCK_NB), SyscallFailsWithErrno(EAGAIN)); + + EXPECT_THAT(flock(fds[0], LOCK_UN), SyscallSucceeds()); + EXPECT_THAT(flock(fds[1], LOCK_EX | LOCK_NB), SyscallSucceeds()); + + EXPECT_THAT(close(fds[0]), SyscallSucceeds()); + EXPECT_THAT(close(fds[1]), SyscallSucceeds()); +} + +TEST(FlockTestNoFixture, FlockSocket) { + int sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_THAT(sock, SyscallSucceeds()); + + struct sockaddr_un addr = + ASSERT_NO_ERRNO_AND_VALUE(UniqueUnixAddr(true /* abstract */, AF_UNIX)); + ASSERT_THAT( + bind(sock, reinterpret_cast(&addr), sizeof(addr)), + SyscallSucceeds()); + + EXPECT_THAT(flock(sock, LOCK_EX | LOCK_NB), SyscallSucceeds()); + EXPECT_THAT(close(sock), SyscallSucceeds()); +} + } // namespace } // namespace testing -- cgit v1.2.3 From f1f85f475d5f98ad9e9d574a11997c0829c3b189 Mon Sep 17 00:00:00 2001 From: Gaurav Singh Date: Fri, 22 May 2020 23:45:21 -0400 Subject: sentry: use defer wg.Done() unconditionally Signed-off-by: Gaurav Singh --- pkg/sentry/kernel/timekeeper.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index da0ea7bb5..0adf25691 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -186,6 +186,7 @@ func (t *Timekeeper) startUpdater() { timer := time.NewTicker(sentrytime.ApproxUpdateInterval) t.wg.Add(1) go func() { // S/R-SAFE: stopped during save. + defer t.wg.Done() for { // Start with an update immediately, so the clocks are // ready ASAP. 
@@ -220,7 +221,6 @@ func (t *Timekeeper) startUpdater() { select { case <-timer.C: case <-t.stop: - t.wg.Done() return } } -- cgit v1.2.3 From 6ec9d60403fdf7a33072eaa023e62bfd56ed9f5c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 12 Jun 2020 11:56:43 -0700 Subject: vfs2: implement fcntl(fd, F_SETFL, flags) PiperOrigin-RevId: 316148074 --- pkg/sentry/kernel/fd_table.go | 23 +++++++++++++++++++++++ pkg/sentry/syscalls/linux/sys_file.go | 4 ++-- pkg/sentry/syscalls/linux/vfs2/fd.go | 4 ++-- test/syscalls/linux/fcntl.cc | 9 +++++++++ 4 files changed, 36 insertions(+), 4 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index b35afafe3..48911240f 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -458,6 +458,29 @@ func (f *FDTable) SetFlags(fd int32, flags FDFlags) error { return nil } +// SetFlagsVFS2 sets the flags for the given file descriptor. +// +// EBADF is returned if fd is negative or does not refer to an open file. +func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + f.mu.Lock() + defer f.mu.Unlock() + + file, _, _ := f.getVFS2(fd) + if file == nil { + // No file found. + return syscall.EBADF + } + + // Update the flags. + f.setVFS2(fd, file, flags) + return nil +} + // Get returns a reference to the file and the flags for the FD or nil if no // file is defined for the given fd. 
// diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 8347617bd..696e1c8d3 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -935,10 +935,10 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(flags.ToLinuxFDFlags()), nil, nil case linux.F_SETFD: flags := args[2].Uint() - t.FDTable().SetFlags(fd, kernel.FDFlags{ + err := t.FDTable().SetFlags(fd, kernel.FDFlags{ CloseOnExec: flags&linux.FD_CLOEXEC != 0, }) - return 0, nil, nil + return 0, nil, err case linux.F_GETFL: return uintptr(file.Flags().ToLinux()), nil, nil case linux.F_SETFL: diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 6006758a5..f9ccb303c 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -134,10 +134,10 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(flags.ToLinuxFDFlags()), nil, nil case linux.F_SETFD: flags := args[2].Uint() - t.FDTable().SetFlags(fd, kernel.FDFlags{ + err := t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{ CloseOnExec: flags&linux.FD_CLOEXEC != 0, }) - return 0, nil, nil + return 0, nil, err case linux.F_GETFL: return uintptr(file.StatusFlags()), nil, nil case linux.F_SETFL: diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc index c7cc5816e..35e8a4ff3 100644 --- a/test/syscalls/linux/fcntl.cc +++ b/test/syscalls/linux/fcntl.cc @@ -115,6 +115,15 @@ PosixErrorOr SubprocessLock(std::string const& path, bool for_write, return std::move(cleanup); } +TEST(FcntlTest, SetCloExecBadFD) { + // Open an eventfd file descriptor with FD_CLOEXEC descriptor flag not set. 
+ FileDescriptor f = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, 0)); + auto fd = f.get(); + f.reset(); + ASSERT_THAT(fcntl(fd, F_GETFD), SyscallFailsWithErrno(EBADF)); + ASSERT_THAT(fcntl(fd, F_SETFD, FD_CLOEXEC), SyscallFailsWithErrno(EBADF)); +} + TEST(FcntlTest, SetCloExec) { // Open an eventfd file descriptor with FD_CLOEXEC descriptor flag not set. FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, 0)); -- cgit v1.2.3 From 3b0b1f104d963a1d11973c444934e6744ab7e79b Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 16 Jun 2020 00:14:07 -0700 Subject: Miscellaneous VFS2 fixes. PiperOrigin-RevId: 316627764 --- pkg/sentry/fsimpl/gofer/gofer.go | 26 ++++++++++++++++---------- pkg/sentry/kernel/kernel.go | 2 +- pkg/sentry/syscalls/linux/vfs2/ioctl.go | 29 +++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 11 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 0d88a328e..ac051b3a7 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -835,6 +835,14 @@ func (d *dentry) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME stat.Blksize = atomic.LoadUint32(&d.blockSize) stat.Nlink = atomic.LoadUint32(&d.nlink) + if stat.Nlink == 0 { + // The remote filesystem doesn't support link count; just make + // something up. This is consistent with Linux, where + // fs/inode.c:inode_init_always() initializes link count to 1, and + // fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if + // it's not provided by the remote filesystem. 
+ stat.Nlink = 1 + } stat.UID = atomic.LoadUint32(&d.uid) stat.GID = atomic.LoadUint32(&d.gid) stat.Mode = uint16(atomic.LoadUint32(&d.mode)) @@ -1346,23 +1354,21 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool } // incLinks increments link count. -// -// Preconditions: d.nlink != 0 && d.nlink < math.MaxUint32. func (d *dentry) incLinks() { - v := atomic.AddUint32(&d.nlink, 1) - if v < 2 { - panic(fmt.Sprintf("dentry.nlink is invalid (was 0 or overflowed): %d", v)) + if atomic.LoadUint32(&d.nlink) == 0 { + // The remote filesystem doesn't support link count. + return } + atomic.AddUint32(&d.nlink, 1) } // decLinks decrements link count. -// -// Preconditions: d.nlink > 1. func (d *dentry) decLinks() { - v := atomic.AddUint32(&d.nlink, ^uint32(0)) - if v == 0 { - panic(fmt.Sprintf("dentry.nlink must be greater than 0: %d", v)) + if atomic.LoadUint32(&d.nlink) == 0 { + // The remote filesystem doesn't support link count. + return } + atomic.AddUint32(&d.nlink, ^uint32(0)) } // fileDescription is embedded by gofer implementations of diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index bcbeb6a39..52491da7a 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -892,7 +892,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, if mntnsVFS2 == nil { // MountNamespaceVFS2 adds a reference to the namespace, which is // transferred to the new process. - mntnsVFS2 = k.GlobalInit().Leader().MountNamespaceVFS2() + mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2() } // Get the root directory from the MountNamespace. 
root := args.MountNamespaceVFS2.Root() diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go index 5a2418da9..0399c0db4 100644 --- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -15,6 +15,7 @@ package vfs2 import ( + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" @@ -30,6 +31,34 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } defer file.DecRef() + // Handle ioctls that apply to all FDs. + switch args[1].Int() { + case linux.FIONCLEX: + t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{ + CloseOnExec: false, + }) + return 0, nil, nil + + case linux.FIOCLEX: + t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{ + CloseOnExec: true, + }) + return 0, nil, nil + + case linux.FIONBIO: + var set int32 + if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + return 0, nil, err + } + flags := file.StatusFlags() + if set != 0 { + flags |= linux.O_NONBLOCK + } else { + flags &^= linux.O_NONBLOCK + } + return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags) + } + ret, err := file.Ioctl(t, t.MemoryManager(), args) return ret, nil, err } -- cgit v1.2.3 From 810748f5c9c72f713d81d14bcc89a8eb4ca49eb6 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 16 Jun 2020 08:47:04 -0700 Subject: Port aio to VFS2. In order to make sure all aio goroutines have stopped during S/R, a new WaitGroup was added to TaskSet, analogous to runningGoroutines. This WaitGroup is incremented with each aio goroutine, and waited on during kernel.Pause. The old VFS1 aio code was changed to use this new WaitGroup, rather than fs.Async. The only uses of fs.Async are now inode and mount Release operations, which do not call fs.Async recursively. This fixes a lock-ordering violation that can cause deadlocks. Updates #1035. 
PiperOrigin-RevId: 316689380 --- pkg/abi/linux/aio.go | 60 ++++++++- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/aio.go | 81 +++++++++++++ pkg/sentry/kernel/context.go | 53 -------- pkg/sentry/kernel/kernel.go | 10 +- pkg/sentry/kernel/threads.go | 7 ++ pkg/sentry/syscalls/linux/sys_aio.go | 169 +++++++++----------------- pkg/sentry/syscalls/linux/vfs2/BUILD | 3 + pkg/sentry/syscalls/linux/vfs2/aio.go | 216 +++++++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/vfs2.go | 6 +- 10 files changed, 431 insertions(+), 175 deletions(-) create mode 100644 pkg/sentry/kernel/aio.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/aio.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/abi/linux/aio.go b/pkg/abi/linux/aio.go index 3c6e0079d..86ee3f8b5 100644 --- a/pkg/abi/linux/aio.go +++ b/pkg/abi/linux/aio.go @@ -14,7 +14,63 @@ package linux +import "encoding/binary" + +// AIORingSize is sizeof(struct aio_ring). +const AIORingSize = 32 + +// I/O commands. const ( - // AIORingSize is sizeof(struct aio_ring). - AIORingSize = 32 + IOCB_CMD_PREAD = 0 + IOCB_CMD_PWRITE = 1 + IOCB_CMD_FSYNC = 2 + IOCB_CMD_FDSYNC = 3 + // 4 was the experimental IOCB_CMD_PREADX. + IOCB_CMD_POLL = 5 + IOCB_CMD_NOOP = 6 + IOCB_CMD_PREADV = 7 + IOCB_CMD_PWRITEV = 8 ) + +// I/O flags. +const ( + IOCB_FLAG_RESFD = 1 + IOCB_FLAG_IOPRIO = 2 +) + +// IOCallback describes an I/O request. +// +// The priority field is currently ignored in the implementation below. Also +// note that the IOCB_FLAG_RESFD feature is not supported. +type IOCallback struct { + Data uint64 + Key uint32 + _ uint32 + + OpCode uint16 + ReqPrio int16 + FD int32 + + Buf uint64 + Bytes uint64 + Offset int64 + + Reserved2 uint64 + Flags uint32 + + // eventfd to signal if IOCB_FLAG_RESFD is set in flags. + ResFD int32 +} + +// IOEvent describes an I/O result. +// +// +stateify savable +type IOEvent struct { + Data uint64 + Obj uint64 + Result int64 + Result2 int64 +} + +// IOEventSize is the size of an ioEvent encoded. 
+var IOEventSize = binary.Size(IOEvent{}) diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index a28eab8b8..1510a7c26 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -85,6 +85,7 @@ go_library( name = "kernel", srcs = [ "abstract_socket_namespace.go", + "aio.go", "context.go", "fd_table.go", "fd_table_unsafe.go", diff --git a/pkg/sentry/kernel/aio.go b/pkg/sentry/kernel/aio.go new file mode 100644 index 000000000..0ac78c0b8 --- /dev/null +++ b/pkg/sentry/kernel/aio.go @@ -0,0 +1,81 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "time" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" +) + +// AIOCallback is a function that does asynchronous I/O on behalf of a task. +type AIOCallback func(context.Context) + +// QueueAIO queues an AIOCallback which will be run asynchronously. +func (t *Task) QueueAIO(cb AIOCallback) { + ctx := taskAsyncContext{t: t} + wg := &t.TaskSet().aioGoroutines + wg.Add(1) + go func() { + cb(ctx) + wg.Done() + }() +} + +type taskAsyncContext struct { + context.NoopSleeper + t *Task +} + +// Debugf implements log.Logger.Debugf. +func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { + ctx.t.Debugf(format, v...) +} + +// Infof implements log.Logger.Infof. +func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { + ctx.t.Infof(format, v...) 
+} + +// Warningf implements log.Logger.Warningf. +func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { + ctx.t.Warningf(format, v...) +} + +// IsLogging implements log.Logger.IsLogging. +func (ctx taskAsyncContext) IsLogging(level log.Level) bool { + return ctx.t.IsLogging(level) +} + +// Deadline implements context.Context.Deadline. +func (ctx taskAsyncContext) Deadline() (time.Time, bool) { + return ctx.t.Deadline() +} + +// Done implements context.Context.Done. +func (ctx taskAsyncContext) Done() <-chan struct{} { + return ctx.t.Done() +} + +// Err implements context.Context.Err. +func (ctx taskAsyncContext) Err() error { + return ctx.t.Err() +} + +// Value implements context.Context.Value. +func (ctx taskAsyncContext) Value(key interface{}) interface{} { + return ctx.t.Value(key) +} diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index 0c40bf315..dd5f0f5fa 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -18,7 +18,6 @@ import ( "time" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" ) // contextID is the kernel package's type for context.Context.Value keys. @@ -113,55 +112,3 @@ func (*Task) Done() <-chan struct{} { func (*Task) Err() error { return nil } - -// AsyncContext returns a context.Context that may be used by goroutines that -// do work on behalf of t and therefore share its contextual values, but are -// not t's task goroutine (e.g. asynchronous I/O). -func (t *Task) AsyncContext() context.Context { - return taskAsyncContext{t: t} -} - -type taskAsyncContext struct { - context.NoopSleeper - t *Task -} - -// Debugf implements log.Logger.Debugf. -func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { - ctx.t.Debugf(format, v...) -} - -// Infof implements log.Logger.Infof. -func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { - ctx.t.Infof(format, v...) -} - -// Warningf implements log.Logger.Warningf. 
-func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { - ctx.t.Warningf(format, v...) -} - -// IsLogging implements log.Logger.IsLogging. -func (ctx taskAsyncContext) IsLogging(level log.Level) bool { - return ctx.t.IsLogging(level) -} - -// Deadline implements context.Context.Deadline. -func (ctx taskAsyncContext) Deadline() (time.Time, bool) { - return ctx.t.Deadline() -} - -// Done implements context.Context.Done. -func (ctx taskAsyncContext) Done() <-chan struct{} { - return ctx.t.Done() -} - -// Err implements context.Context.Err. -func (ctx taskAsyncContext) Err() error { - return ctx.t.Err() -} - -// Value implements context.Context.Value. -func (ctx taskAsyncContext) Value(key interface{}) interface{} { - return ctx.t.Value(key) -} diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 52491da7a..554a42e05 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -452,9 +452,7 @@ func (k *Kernel) SaveTo(w io.Writer) error { return err } - // Ensure that all pending asynchronous work is complete: - // - inode and mount release - // - asynchronuous IO + // Ensure that all inode and mount release operations have completed. fs.AsyncBarrier() // Once all fs work has completed (flushed references have all been released), @@ -1249,13 +1247,15 @@ func (k *Kernel) Kill(es ExitStatus) { } // Pause requests that all tasks in k temporarily stop executing, and blocks -// until all tasks in k have stopped. Multiple calls to Pause nest and require -// an equal number of calls to Unpause to resume execution. +// until all tasks and asynchronous I/O operations in k have stopped. Multiple +// calls to Pause nest and require an equal number of calls to Unpause to +// resume execution. func (k *Kernel) Pause() { k.extMu.Lock() k.tasks.BeginExternalStop() k.extMu.Unlock() k.tasks.runningGoroutines.Wait() + k.tasks.aioGoroutines.Wait() } // Unpause ends the effect of a previous call to Pause. 
If Unpause is called diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index bf2dabb6e..872e1a82d 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -87,6 +87,13 @@ type TaskSet struct { // at time of save (but note that this is not necessarily the same thing as // sync.WaitGroup's zero value). runningGoroutines sync.WaitGroup `state:"nosave"` + + // aioGoroutines is the number of goroutines running async I/O + // callbacks. + // + // aioGoroutines is not saved but is required to be zero at the time of + // save. + aioGoroutines sync.WaitGroup `state:"nosave"` } // newTaskSet returns a new, empty TaskSet. diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index d781d6a04..ba2557c52 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -15,8 +15,8 @@ package linux import ( - "encoding/binary" - + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -27,59 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// I/O commands. -const ( - _IOCB_CMD_PREAD = 0 - _IOCB_CMD_PWRITE = 1 - _IOCB_CMD_FSYNC = 2 - _IOCB_CMD_FDSYNC = 3 - _IOCB_CMD_NOOP = 6 - _IOCB_CMD_PREADV = 7 - _IOCB_CMD_PWRITEV = 8 -) - -// I/O flags. -const ( - _IOCB_FLAG_RESFD = 1 -) - -// ioCallback describes an I/O request. -// -// The priority field is currently ignored in the implementation below. Also -// note that the IOCB_FLAG_RESFD feature is not supported. -type ioCallback struct { - Data uint64 - Key uint32 - Reserved1 uint32 - - OpCode uint16 - ReqPrio int16 - FD int32 - - Buf uint64 - Bytes uint64 - Offset int64 - - Reserved2 uint64 - Flags uint32 - - // eventfd to signal if IOCB_FLAG_RESFD is set in flags. - ResFD int32 -} - -// ioEvent describes an I/O result. 
-// -// +stateify savable -type ioEvent struct { - Data uint64 - Obj uint64 - Result int64 - Result2 int64 -} - -// ioEventSize is the size of an ioEvent encoded. -var ioEventSize = binary.Size(ioEvent{}) - // IoSetup implements linux syscall io_setup(2). func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { nrEvents := args[0].Int() @@ -192,7 +139,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } } - ev := v.(*ioEvent) + ev := v.(*linux.IOEvent) // Copy out the result. if _, err := t.CopyOut(eventsAddr, ev); err != nil { @@ -204,7 +151,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } // Keep rolling. - eventsAddr += usermem.Addr(ioEventSize) + eventsAddr += usermem.Addr(linux.IOEventSize) } // Everything finished. @@ -231,7 +178,7 @@ func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadl } // memoryFor returns appropriate memory for the given callback. -func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) { +func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) { bytes := int(cb.Bytes) if bytes < 0 { // Linux also requires that this field fit in ssize_t. @@ -242,17 +189,17 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) { // we have no guarantee that t's AddressSpace will be active during the // I/O. 
switch cb.OpCode { - case _IOCB_CMD_PREAD, _IOCB_CMD_PWRITE: + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE: return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ AddressSpaceActive: false, }) - case _IOCB_CMD_PREADV, _IOCB_CMD_PWRITEV: + case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV: return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ AddressSpaceActive: false, }) - case _IOCB_CMD_FSYNC, _IOCB_CMD_FDSYNC, _IOCB_CMD_NOOP: + case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP: return usermem.IOSequence{}, nil default: @@ -261,54 +208,62 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) { } } -func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioCallback, ioseq usermem.IOSequence, ctx *mm.AIOContext, eventFile *fs.File) { - if ctx.Dead() { - ctx.CancelPendingRequest() - return - } - ev := &ioEvent{ - Data: cb.Data, - Obj: uint64(cbAddr), - } +// IoCancel implements linux syscall io_cancel(2). +// +// It is not presently supported (ENOSYS indicates no support on this +// architecture). +func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, syserror.ENOSYS +} - // Construct a context.Context that will not be interrupted if t is - // interrupted. 
- c := t.AsyncContext() +// LINT.IfChange - var err error - switch cb.OpCode { - case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV: - ev.Result, err = file.Preadv(c, ioseq, cb.Offset) - case _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV: - ev.Result, err = file.Pwritev(c, ioseq, cb.Offset) - case _IOCB_CMD_FSYNC: - err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncAll) - case _IOCB_CMD_FDSYNC: - err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncData) - } +func getAIOCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, actx *mm.AIOContext, eventFile *fs.File) kernel.AIOCallback { + return func(ctx context.Context) { + if actx.Dead() { + actx.CancelPendingRequest() + return + } + ev := &linux.IOEvent{ + Data: cb.Data, + Obj: uint64(cbAddr), + } - // Update the result. - if err != nil { - err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file) - ev.Result = -int64(kernel.ExtractErrno(err, 0)) - } + var err error + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV: + ev.Result, err = file.Preadv(ctx, ioseq, cb.Offset) + case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: + ev.Result, err = file.Pwritev(ctx, ioseq, cb.Offset) + case linux.IOCB_CMD_FSYNC: + err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) + case linux.IOCB_CMD_FDSYNC: + err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncData) + } + + // Update the result. + if err != nil { + err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file) + ev.Result = -int64(kernel.ExtractErrno(err, 0)) + } - file.DecRef() + file.DecRef() - // Queue the result for delivery. - ctx.FinishRequest(ev) + // Queue the result for delivery. + actx.FinishRequest(ev) - // Notify the event file if one was specified. This needs to happen - // *after* queueing the result to avoid racing with the thread we may - // wake up. 
- if eventFile != nil { - eventFile.FileOperations.(*eventfd.EventOperations).Signal(1) - eventFile.DecRef() + // Notify the event file if one was specified. This needs to happen + // *after* queueing the result to avoid racing with the thread we may + // wake up. + if eventFile != nil { + eventFile.FileOperations.(*eventfd.EventOperations).Signal(1) + eventFile.DecRef() + } } } // submitCallback processes a single callback. -func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Addr) error { +func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error { file := t.GetFile(cb.FD) if file == nil { // File not found. @@ -318,7 +273,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad // Was there an eventFD? Extract it. var eventFile *fs.File - if cb.Flags&_IOCB_FLAG_RESFD != 0 { + if cb.Flags&linux.IOCB_FLAG_RESFD != 0 { eventFile = t.GetFile(cb.ResFD) if eventFile == nil { // Bad FD. @@ -340,7 +295,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad // Check offset for reads/writes. switch cb.OpCode { - case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV, _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV: + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: if cb.Offset < 0 { return syserror.EINVAL } @@ -366,7 +321,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad // Perform the request asynchronously. file.IncRef() - fs.Async(func() { performCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile) }) + t.QueueAIO(getAIOCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile)) // All set. return nil @@ -395,7 +350,7 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } // Copy in this callback. 
- var cb ioCallback + var cb linux.IOCallback cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative)) if _, err := t.CopyIn(cbAddr, &cb); err != nil { @@ -424,10 +379,4 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return uintptr(nrEvents), nil, nil } -// IoCancel implements linux syscall io_cancel(2). -// -// It is not presently supported (ENOSYS indicates no support on this -// architecture). -func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, syserror.ENOSYS -} +// LINT.ThenChange(vfs2/aio.go) diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 9f93f4354..c301a0991 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -5,6 +5,7 @@ package(licenses = ["notice"]) go_library( name = "vfs2", srcs = [ + "aio.go", "epoll.go", "eventfd.go", "execve.go", @@ -40,6 +41,7 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/bits", + "//pkg/context", "//pkg/fspath", "//pkg/gohacks", "//pkg/sentry/arch", @@ -57,6 +59,7 @@ go_library( "//pkg/sentry/limits", "//pkg/sentry/loader", "//pkg/sentry/memmap", + "//pkg/sentry/mm", "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go new file mode 100644 index 000000000..e5cdefc50 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/aio.go @@ -0,0 +1,216 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/mm" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// IoSubmit implements linux syscall io_submit(2). +func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Uint64() + nrEvents := args[1].Int() + addr := args[2].Pointer() + + if nrEvents < 0 { + return 0, nil, syserror.EINVAL + } + + for i := int32(0); i < nrEvents; i++ { + // Copy in the address. + cbAddrNative := t.Arch().Native(0) + if _, err := t.CopyIn(addr, cbAddrNative); err != nil { + if i > 0 { + // Some successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Copy in this callback. + var cb linux.IOCallback + cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative)) + if _, err := t.CopyIn(cbAddr, &cb); err != nil { + if i > 0 { + // Some have been successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Process this callback. + if err := submitCallback(t, id, &cb, cbAddr); err != nil { + if i > 0 { + // Partial success. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Advance to the next one. 
+ addr += usermem.Addr(t.Arch().Width()) + } + + return uintptr(nrEvents), nil, nil +} + +// submitCallback processes a single callback. +func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error { + if cb.Reserved2 != 0 { + return syserror.EINVAL + } + + fd := t.GetFileVFS2(cb.FD) + if fd == nil { + return syserror.EBADF + } + defer fd.DecRef() + + // Was there an eventFD? Extract it. + var eventFD *vfs.FileDescription + if cb.Flags&linux.IOCB_FLAG_RESFD != 0 { + eventFD = t.GetFileVFS2(cb.ResFD) + if eventFD == nil { + return syserror.EBADF + } + defer eventFD.DecRef() + + // Check that it is an eventfd. + if _, ok := eventFD.Impl().(*eventfd.EventFileDescription); !ok { + return syserror.EINVAL + } + } + + ioseq, err := memoryFor(t, cb) + if err != nil { + return err + } + + // Check offset for reads/writes. + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: + if cb.Offset < 0 { + return syserror.EINVAL + } + } + + // Prepare the request. + aioCtx, ok := t.MemoryManager().LookupAIOContext(t, id) + if !ok { + return syserror.EINVAL + } + if ready := aioCtx.Prepare(); !ready { + // Context is busy. + return syserror.EAGAIN + } + + if eventFD != nil { + // The request is set. Make sure there's a ref on the file. + // + // This is necessary when the callback executes on completion, + // which is also what will release this reference. + eventFD.IncRef() + } + + // Perform the request asynchronously. 
+ fd.IncRef() + t.QueueAIO(getAIOCallback(t, fd, eventFD, cbAddr, cb, ioseq, aioCtx)) + return nil +} + +func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, aioCtx *mm.AIOContext) kernel.AIOCallback { + return func(ctx context.Context) { + if aioCtx.Dead() { + aioCtx.CancelPendingRequest() + return + } + ev := &linux.IOEvent{ + Data: cb.Data, + Obj: uint64(cbAddr), + } + + var err error + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV: + ev.Result, err = fd.PRead(ctx, ioseq, cb.Offset, vfs.ReadOptions{}) + case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: + ev.Result, err = fd.PWrite(ctx, ioseq, cb.Offset, vfs.WriteOptions{}) + case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC: + err = fd.Sync(ctx) + } + + // Update the result. + if err != nil { + err = slinux.HandleIOErrorVFS2(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", fd) + ev.Result = -int64(kernel.ExtractErrno(err, 0)) + } + + fd.DecRef() + + // Queue the result for delivery. + aioCtx.FinishRequest(ev) + + // Notify the event file if one was specified. This needs to happen + // *after* queueing the result to avoid racing with the thread we may + // wake up. + if eventFD != nil { + eventFD.Impl().(*eventfd.EventFileDescription).Signal(1) + eventFD.DecRef() + } + } +} + +// memoryFor returns appropriate memory for the given callback. +func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) { + bytes := int(cb.Bytes) + if bytes < 0 { + // Linux also requires that this field fit in ssize_t. + return usermem.IOSequence{}, syserror.EINVAL + } + + // Since this I/O will be asynchronous with respect to t's task goroutine, + // we have no guarantee that t's AddressSpace will be active during the + // I/O. 
+ switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE: + return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ + AddressSpaceActive: false, + }) + + case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV: + return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ + AddressSpaceActive: false, + }) + + case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP: + return usermem.IOSequence{}, nil + + default: + // Not a supported command. + return usermem.IOSequence{}, syserror.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 954c82f97..caa6a98ff 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -105,11 +105,7 @@ func Override() { s.Table[197] = syscalls.Supported("removexattr", Removexattr) s.Table[198] = syscalls.Supported("lremovexattr", Lremovexattr) s.Table[199] = syscalls.Supported("fremovexattr", Fremovexattr) - delete(s.Table, 206) // io_setup - delete(s.Table, 207) // io_destroy - delete(s.Table, 208) // io_getevents - delete(s.Table, 209) // io_submit - delete(s.Table, 210) // io_cancel + s.Table[209] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}) s.Table[213] = syscalls.Supported("epoll_create", EpollCreate) s.Table[217] = syscalls.Supported("getdents64", Getdents64) delete(s.Table, 221) // fdavise64 -- cgit v1.2.3 From 96519e2c9d3fa1f15537c4dfc081a19d8d1ce1a2 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 17 Jun 2020 10:02:41 -0700 Subject: Implement POSIX locks - Change FileDescriptionImpl Lock/UnlockPOSIX signature to take {start,length,whence}, so the correct offset can be calculated in the implementations. - Create PosixLocker interface to make it possible to share the same locking code from different implementations. 
Closes #1480 PiperOrigin-RevId: 316910286 --- pkg/sentry/fsimpl/devpts/BUILD | 2 +- pkg/sentry/fsimpl/devpts/devpts.go | 3 +- pkg/sentry/fsimpl/devpts/master.go | 14 +++- pkg/sentry/fsimpl/devpts/slave.go | 14 +++- pkg/sentry/fsimpl/ext/BUILD | 2 +- pkg/sentry/fsimpl/ext/directory.go | 11 +++ pkg/sentry/fsimpl/ext/inode.go | 3 +- pkg/sentry/fsimpl/ext/regular_file.go | 11 +++ pkg/sentry/fsimpl/ext/symlink.go | 1 + pkg/sentry/fsimpl/gofer/BUILD | 1 - pkg/sentry/fsimpl/gofer/gofer.go | 12 ++- pkg/sentry/fsimpl/gofer/special_file.go | 3 +- pkg/sentry/fsimpl/host/BUILD | 2 +- pkg/sentry/fsimpl/host/host.go | 14 +++- pkg/sentry/fsimpl/host/tty.go | 11 +++ pkg/sentry/fsimpl/kernfs/BUILD | 3 +- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 16 +++- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 16 +++- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 3 +- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 5 +- pkg/sentry/fsimpl/overlay/BUILD | 2 +- pkg/sentry/fsimpl/overlay/overlay.go | 14 +++- pkg/sentry/fsimpl/pipefs/BUILD | 1 - pkg/sentry/fsimpl/pipefs/pipefs.go | 3 +- pkg/sentry/fsimpl/proc/BUILD | 2 +- pkg/sentry/fsimpl/proc/subtasks.go | 3 +- pkg/sentry/fsimpl/proc/task.go | 3 +- pkg/sentry/fsimpl/proc/task_fds.go | 3 +- pkg/sentry/fsimpl/proc/task_files.go | 14 +++- pkg/sentry/fsimpl/proc/tasks.go | 3 +- pkg/sentry/fsimpl/sys/BUILD | 1 - pkg/sentry/fsimpl/sys/sys.go | 3 +- pkg/sentry/fsimpl/tmpfs/BUILD | 1 - pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 33 +++----- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 15 +++- pkg/sentry/kernel/fd_table.go | 10 ++- pkg/sentry/kernel/pipe/BUILD | 2 +- pkg/sentry/kernel/pipe/vfs.go | 18 +++- pkg/sentry/socket/hostinet/BUILD | 2 +- pkg/sentry/socket/hostinet/socket_vfs2.go | 14 +++- pkg/sentry/socket/netlink/BUILD | 2 +- pkg/sentry/socket/netlink/socket_vfs2.go | 14 +++- pkg/sentry/socket/netstack/BUILD | 2 +- pkg/sentry/socket/netstack/netstack_vfs2.go | 14 +++- pkg/sentry/socket/unix/BUILD | 2 +- pkg/sentry/socket/unix/unix_vfs2.go | 16 +++- 
pkg/sentry/syscalls/linux/vfs2/fd.go | 38 +++++++++ pkg/sentry/vfs/BUILD | 2 +- pkg/sentry/vfs/file_description.go | 18 ++-- pkg/sentry/vfs/file_description_impl_util.go | 25 ++---- pkg/sentry/vfs/lock.go | 109 +++++++++++++++++++++++++ pkg/sentry/vfs/lock/BUILD | 13 --- pkg/sentry/vfs/lock/lock.go | 72 ---------------- test/syscalls/linux/BUILD | 1 + test/syscalls/linux/fcntl.cc | 96 +++++++++++++++------- 55 files changed, 484 insertions(+), 234 deletions(-) create mode 100644 pkg/sentry/vfs/lock.go delete mode 100644 pkg/sentry/vfs/lock/BUILD delete mode 100644 pkg/sentry/vfs/lock/lock.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD index cf440dce8..93512c9b6 100644 --- a/pkg/sentry/fsimpl/devpts/BUILD +++ b/pkg/sentry/fsimpl/devpts/BUILD @@ -18,12 +18,12 @@ go_library( "//pkg/context", "//pkg/safemem", "//pkg/sentry/arch", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/unimpl", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 9b0e0cca2..e6fda2b4f 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -117,7 +116,7 @@ type rootInode struct { kernfs.InodeNotSymlink kernfs.OrderedChildren - locks lock.FileLocks + locks vfs.FileLocks // Keep a reference to this inode's dentry. 
dentry kernfs.Dentry diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 1d22adbe3..69879498a 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -18,11 +18,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -35,7 +35,7 @@ type masterInode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink - locks lock.FileLocks + locks vfs.FileLocks // Keep a reference to this inode's dentry. dentry kernfs.Dentry @@ -189,6 +189,16 @@ func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions return mfd.inode.Stat(fs, opts) } +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (mfd *masterFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return mfd.Locks().LockPOSIX(ctx, &mfd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (mfd *masterFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return mfd.Locks().UnlockPOSIX(ctx, &mfd.vfsfd, uid, start, length, whence) +} + // maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid. 
func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) { switch cmd { diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go index 7fe475080..cf1a0f0ac 100644 --- a/pkg/sentry/fsimpl/devpts/slave.go +++ b/pkg/sentry/fsimpl/devpts/slave.go @@ -18,10 +18,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -34,7 +34,7 @@ type slaveInode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink - locks lock.FileLocks + locks vfs.FileLocks // Keep a reference to this inode's dentry. dentry kernfs.Dentry @@ -185,3 +185,13 @@ func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem() return sfd.inode.Stat(fs, opts) } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (sfd *slaveFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return sfd.Locks().LockPOSIX(ctx, &sfd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
+func (sfd *slaveFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return sfd.Locks().UnlockPOSIX(ctx, &sfd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index 973fa0def..ef24f8159 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -54,13 +54,13 @@ go_library( "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/ext/disklayout", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/syscalls/linux", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index 43be6928a..357512c7e 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -305,3 +306,13 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in fd.off = offset return offset, nil } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *directoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
+func (fd *directoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 5caaf14ed..30636cf66 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -55,7 +54,7 @@ type inode struct { // diskInode gives us access to the inode struct on disk. Immutable. diskInode disklayout.Inode - locks lock.FileLocks + locks vfs.FileLocks // This is immutable. The first field of the implementations must have inode // as the first field to ensure temporality. diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go index 152036b2e..66d14bb95 100644 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -149,3 +150,13 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt // TODO(b/134676337): Implement mmap(2). return syserror.ENODEV } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
+func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go index acb28d85b..62efd4095 100644 --- a/pkg/sentry/fsimpl/ext/symlink.go +++ b/pkg/sentry/fsimpl/ext/symlink.go @@ -66,6 +66,7 @@ func (in *inode) isSymlink() bool { // O_PATH. For this reason most of the functions return EBADF. type symlinkFD struct { fileDescription + vfs.NoLockFD } // Compiles only if symlinkFD implements vfs.FileDescriptionImpl. diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 5cdeeaeb5..4a800dcf9 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -69,7 +69,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/syserr", "//pkg/syserror", "//pkg/unet", diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index ac051b3a7..d8ae475ed 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -53,7 +53,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/usermem" @@ -665,7 +664,7 @@ type dentry struct { // endpoint bound to this file. pipe *pipe.VFSPipe - locks lock.FileLocks + locks vfs.FileLocks } // dentryAttrMask returns a p9.AttrMask enabling all attributes used by the @@ -1439,9 +1438,14 @@ func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, t f } // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. 
-func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { fd.lockLogging.Do(func() { log.Infof("Range lock using gofer file handled internally.") }) - return fd.LockFD.LockPOSIX(ctx, uid, t, rng, block) + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) } diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 289efdd25..e6e29b329 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -52,7 +51,7 @@ type specialFileFD struct { off int64 } -func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *lock.FileLocks, flags uint32) (*specialFileFD, error) { +func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) { ftype := d.fileType() seekable := ftype == linux.S_IFREG mayBlock := ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 54f16ad63..44a09d87a 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -27,6 +27,7 @@ go_library( "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs/fsutil", 
+ "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/hostfd", "//pkg/sentry/kernel", @@ -39,7 +40,6 @@ go_library( "//pkg/sentry/unimpl", "//pkg/sentry/uniqueid", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 5ec5100b8..7906242c9 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -28,13 +28,13 @@ import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -183,7 +183,7 @@ type inode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink - locks lock.FileLocks + locks vfs.FileLocks // When the reference count reaches zero, the host fd is closed. refs.AtomicRefCount @@ -718,3 +718,13 @@ func (f *fileDescription) EventUnregister(e *waiter.Entry) { func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask) } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (f *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return f.Locks().LockPOSIX(ctx, &f.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
+func (f *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return f.Locks().UnlockPOSIX(ctx, &f.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go index 68af6e5af..0fbc543b1 100644 --- a/pkg/sentry/fsimpl/host/tty.go +++ b/pkg/sentry/fsimpl/host/tty.go @@ -18,6 +18,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -377,3 +378,13 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) return kernel.ERESTARTSYS } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (t *TTYFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, typ fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return t.Locks().LockPOSIX(ctx, &t.vfsfd, uid, typ, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
+func (t *TTYFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return t.Locks().UnlockPOSIX(ctx, &t.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index 0299dbde9..179df6c1e 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -45,11 +45,11 @@ go_library( "//pkg/fspath", "//pkg/log", "//pkg/refs", + "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", @@ -68,7 +68,6 @@ go_test( "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/syserror", "//pkg/usermem", "@com_github_google_go-cmp//cmp:go_default_library", diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 6418de0a3..c1215b70a 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -19,9 +19,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -39,7 +39,7 @@ type DynamicBytesFile struct { InodeNotDirectory InodeNotSymlink - locks lock.FileLocks + locks vfs.FileLocks data vfs.DynamicBytesSource } @@ -86,7 +86,7 @@ type DynamicBytesFD struct { } // Init initializes a DynamicBytesFD. 
-func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *lock.FileLocks, flags uint32) error { +func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error { fd.LockFD.Init(locks) if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { return err @@ -135,3 +135,13 @@ func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error { // DynamicBytesFiles are immutable. return syserror.EPERM } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *DynamicBytesFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *DynamicBytesFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 33a5968ca..5f7853a2a 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -19,10 +19,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -57,7 +57,7 @@ type GenericDirectoryFD struct { // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its // dentry. 
-func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *lock.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) { +func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) { fd := &GenericDirectoryFD{} if err := fd.Init(children, locks, opts); err != nil { return nil, err @@ -71,7 +71,7 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildre // Init initializes a GenericDirectoryFD. Use it when overriding // GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the // correct implementation. -func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *lock.FileLocks, opts *vfs.OpenOptions) error { +func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) error { if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { // Can't open directories for writing. return syserror.EISDIR @@ -235,3 +235,13 @@ func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptio inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode return inode.SetStat(ctx, fd.filesystem(), creds, opts) } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *GenericDirectoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
+func (fd *GenericDirectoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 0e4927215..650bd7b88 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -557,7 +556,7 @@ type StaticDirectory struct { InodeNoDynamicLookup OrderedChildren - locks lock.FileLocks + locks vfs.FileLocks } var _ Inode = (*StaticDirectory)(nil) diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 6749facf7..dc407eb1d 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -103,7 +102,7 @@ type readonlyDir struct { kernfs.InodeDirectoryNoNewChildren kernfs.OrderedChildren - locks lock.FileLocks + locks vfs.FileLocks dentry kernfs.Dentry } @@ -133,7 +132,7 @@ type dir struct { kernfs.InodeNoDynamicLookup kernfs.OrderedChildren - locks lock.FileLocks + locks vfs.FileLocks fs *filesystem dentry kernfs.Dentry diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD index f9413bbdd..8cf5b35d3 100644 --- a/pkg/sentry/fsimpl/overlay/BUILD +++ b/pkg/sentry/fsimpl/overlay/BUILD @@ -29,11 +29,11 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fspath", + "//pkg/sentry/fs/lock", 
"//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index e660d0e2c..e11a3ff19 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -35,9 +35,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -415,7 +415,7 @@ type dentry struct { devMinor uint32 ino uint64 - locks lock.FileLocks + locks vfs.FileLocks } // newDentry creates a new dentry. The dentry initially has no references; it @@ -610,3 +610,13 @@ func (fd *fileDescription) filesystem() *filesystem { func (fd *fileDescription) dentry() *dentry { return fd.vfsfd.Dentry().Impl().(*dentry) } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD index c618dbe6c..5950a2d59 100644 --- a/pkg/sentry/fsimpl/pipefs/BUILD +++ b/pkg/sentry/fsimpl/pipefs/BUILD @@ -15,7 +15,6 @@ go_library( "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/syserror", "//pkg/usermem", ], diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index e4dabaa33..dd7eaf4a8 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -82,7 +81,7 @@ type inode struct { kernfs.InodeNotSymlink kernfs.InodeNoopRefCount - locks lock.FileLocks + locks vfs.FileLocks pipe *pipe.VFSPipe ino uint64 diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 351ba4ee9..6014138ff 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -22,6 +22,7 @@ go_library( "//pkg/log", "//pkg/refs", "//pkg/safemem", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/inet", @@ -35,7 +36,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/syserror", "//pkg/tcpip/header", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index e2cdb7ee9..36a89540c 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" 
"gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -38,7 +37,7 @@ type subtasksInode struct { kernfs.OrderedChildren kernfs.AlwaysValid - locks lock.FileLocks + locks vfs.FileLocks fs *filesystem task *kernel.Task diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 44078a765..8bb2b0ce1 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -39,7 +38,7 @@ type taskInode struct { kernfs.InodeAttrs kernfs.OrderedChildren - locks lock.FileLocks + locks vfs.FileLocks task *kernel.Task } diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index ef6c1d04f..7debdb07a 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -54,7 +53,7 @@ func taskFDExists(t *kernel.Task, fd int32) bool { } type fdDir struct { - locks lock.FileLocks + locks vfs.FileLocks fs *filesystem task *kernel.Task diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index e5eaa91cd..ba4405026 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -30,7 +31,6 @@ import ( 
"gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -777,7 +777,7 @@ type namespaceInode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink - locks lock.FileLocks + locks vfs.FileLocks } var _ kernfs.Inode = (*namespaceInode)(nil) @@ -830,3 +830,13 @@ func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) err func (fd *namespaceFD) Release() { fd.inode.DecRef() } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *namespaceFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *namespaceFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 58c8b9d05..2f214d0c2 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -44,7 +43,7 @@ type tasksInode struct { kernfs.OrderedChildren kernfs.AlwaysValid - locks lock.FileLocks + locks vfs.FileLocks fs *filesystem pidns *kernel.PIDNamespace diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index 237f17def..a741e2bb6 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -15,7 +15,6 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - 
"//pkg/sentry/vfs/lock", "//pkg/syserror", ], ) diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index b84463d3a..fe02f7ee9 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -101,7 +100,7 @@ type dir struct { kernfs.InodeDirectoryNoNewChildren kernfs.OrderedChildren - locks lock.FileLocks + locks vfs.FileLocks dentry kernfs.Dentry } diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 062321cbc..e73732a6b 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -62,7 +62,6 @@ go_library( "//pkg/sentry/uniqueid", "//pkg/sentry/usage", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sentry/vfs/memxattr", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 64e1c40ad..146c7fdfe 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -138,48 +138,37 @@ func TestLocks(t *testing.T) { } defer cleanup() - var ( - uid1 lock.UniqueID - uid2 lock.UniqueID - // Non-blocking. 
- block lock.Blocker - ) - - uid1 = 123 - uid2 = 456 - - if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, block); err != nil { + uid1 := 123 + uid2 := 456 + if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, block); err != nil { + if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block), syserror.ErrWouldBlock; got != want { + if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil), syserror.ErrWouldBlock; got != want { t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want) } if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil { t.Fatalf("fd.Impl().UnlockBSD failed: err = %v", err) } - if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block); err != nil { + if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - rng1 := lock.LockRange{0, 1} - rng2 := lock.LockRange{1, 2} - - if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, rng1, block); err != nil { + if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, 0, 1, linux.SEEK_SET, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng2, block); err != nil { + if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 1, 2, linux.SEEK_SET, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, rng1, block); err != nil { + if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, 0, 1, linux.SEEK_SET, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng1, block), syserror.ErrWouldBlock; got != 
want { + if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 0, 1, linux.SEEK_SET, nil), syserror.ErrWouldBlock; got != want { t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want) } - if err := fd.Impl().UnlockPOSIX(ctx, uid1, rng1); err != nil { + if err := fd.Impl().UnlockPOSIX(ctx, uid1, 0, 1, linux.SEEK_SET); err != nil { t.Fatalf("fd.Impl().UnlockPOSIX failed: err = %v", err) } } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 71a7522af..d0a3e1a5c 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -36,11 +36,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sentry/vfs/memxattr" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -310,7 +310,7 @@ type inode struct { ctime int64 // nanoseconds mtime int64 // nanoseconds - locks lock.FileLocks + locks vfs.FileLocks // Inotify watches for this inode. watches vfs.Watches @@ -761,9 +761,20 @@ func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name s // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with // FMODE_READ | FMODE_WRITE. var fd regularFileFD + fd.Init(&inode.locks) flags := uint32(linux.O_RDWR) if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. 
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 48911240f..4b7d234a4 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -29,6 +29,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" ) // FDFlags define flags for an individual descriptor. @@ -148,7 +149,12 @@ func (f *FDTable) drop(file *fs.File) { // dropVFS2 drops the table reference. func (f *FDTable) dropVFS2(file *vfs.FileDescription) { - // TODO(gvisor.dev/issue/1480): Release locks. + // Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the + // entire file. + err := file.UnlockPOSIX(context.Background(), f, 0, 0, linux.SEEK_SET) + if err != nil && err != syserror.ENOLCK { + panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) + } // Generate inotify events. ev := uint32(linux.IN_CLOSE_NOWRITE) @@ -157,7 +163,7 @@ func (f *FDTable) dropVFS2(file *vfs.FileDescription) { } file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent) - // Drop the table reference. + // Drop the table's reference. 
file.DecRef() } diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 0db546b98..449643118 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -26,8 +26,8 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index c0e9ee1f4..a4519363f 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -63,12 +63,12 @@ func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe { // Preconditions: statusFlags should not contain an open access mode. func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) { // Connected pipes share the same locks. - locks := &lock.FileLocks{} + locks := &vfs.FileLocks{} return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks) } // Open opens the pipe represented by vp. -func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *lock.FileLocks) (*vfs.FileDescription, error) { +func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) { vp.mu.Lock() defer vp.mu.Unlock() @@ -130,7 +130,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s } // Preconditions: vp.mu must be held. 
-func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *lock.FileLocks) *vfs.FileDescription { +func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) *vfs.FileDescription { fd := &VFSPipeFD{ pipe: &vp.pipe, } @@ -451,3 +451,13 @@ func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFr } return n, err } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *VFSPipeFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *VFSPipeFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 60c9896fc..ff81ea6e6 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -26,6 +26,7 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/hostfd", "//pkg/sentry/inet", @@ -34,7 +35,6 @@ go_library( "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip/stack", diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go index 027add1fd..ad5f64799 100644 --- a/pkg/sentry/socket/hostinet/socket_vfs2.go +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -21,12 +21,12 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" 
"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -61,7 +61,7 @@ func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol in fd: fd, }, } - s.LockFD.Init(&lock.FileLocks{}) + s.LockFD.Init(&vfs.FileLocks{}) if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { return nil, syserr.FromError(err) } @@ -134,6 +134,16 @@ func (s *socketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs return int64(n), err } +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (s *socketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
+func (s *socketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) +} + type socketProviderVFS2 struct { family int } diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 420e573c9..d5ca3ac56 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -20,6 +20,7 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", @@ -29,7 +30,6 @@ go_library( "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go index 8bfee5193..dbcd8b49a 100644 --- a/pkg/sentry/socket/netlink/socket_vfs2.go +++ b/pkg/sentry/socket/netlink/socket_vfs2.go @@ -18,12 +18,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -78,7 +78,7 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV sendBufferSize: defaultSendBufferSize, }, } - fd.LockFD.Init(&lock.FileLocks{}) + fd.LockFD.Init(&vfs.FileLocks{}) return fd, nil } @@ -140,3 +140,13 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) return int64(n), 
err.ToError() } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index 0f592ecc3..ea6ebd0e2 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -28,6 +28,7 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", @@ -37,7 +38,6 @@ go_library( "//pkg/sentry/socket/netfilter", "//pkg/sentry/unimpl", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index 1412a4810..d65a89316 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -19,13 +19,13 @@ import ( "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -66,7 +66,7 @@ func NewVFS2(t *kernel.Task, family 
int, skType linux.SockType, protocol int, qu protocol: protocol, }, } - s.LockFD.Init(&lock.FileLocks{}) + s.LockFD.Init(&vfs.FileLocks{}) vfsfd := &s.vfsfd if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ DenyPRead: true, @@ -318,3 +318,13 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by return SetSockOpt(t, s, s.Endpoint, level, name, optVal) } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 7d4cc80fe..cca5e70f1 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", @@ -29,7 +30,6 @@ go_library( "//pkg/sentry/socket/netstack", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 8c32371a2..ff2149250 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" 
"gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -26,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -53,7 +53,7 @@ func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) mnt := t.Kernel().SocketMount() d := sockfs.NewDentry(t.Credentials(), mnt) - fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d, &lock.FileLocks{}) + fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d, &vfs.FileLocks{}) if err != nil { return nil, syserr.FromError(err) } @@ -62,7 +62,7 @@ func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) // NewFileDescription creates and returns a socket file description // corresponding to the given mount and dentry. -func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry, locks *lock.FileLocks) (*vfs.FileDescription, error) { +func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry, locks *vfs.FileLocks) (*vfs.FileDescription, error) { // You can create AF_UNIX, SOCK_RAW sockets. They're the same as // SOCK_DGRAM and don't require CAP_NET_RAW. if stype == linux.SOCK_RAW { @@ -300,6 +300,16 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by return netstack.SetSockOpt(t, s, s.ep, level, name, optVal) } +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
+func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) +} + // providerVFS2 is a unix domain socket provider for VFS2. type providerVFS2 struct{} diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index f9ccb303c..f5eaa076b 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -17,10 +17,12 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -167,8 +169,44 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } err := tmpfs.AddSeals(file, args[2].Uint()) return 0, nil, err + case linux.F_SETLK, linux.F_SETLKW: + return 0, nil, posixLock(t, args, file, cmd) default: // TODO(gvisor.dev/issue/2920): Everything else is not yet supported. return 0, nil, syserror.EINVAL } } + +func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, cmd int32) error { + // Copy in the lock request. 
+ flockAddr := args[2].Pointer() + var flock linux.Flock + if _, err := t.CopyIn(flockAddr, &flock); err != nil { + return err + } + + var blocker lock.Blocker + if cmd == linux.F_SETLKW { + blocker = t + } + + switch flock.Type { + case linux.F_RDLCK: + if !file.IsReadable() { + return syserror.EBADF + } + return file.LockPOSIX(t, t.FDTable(), lock.ReadLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker) + + case linux.F_WRLCK: + if !file.IsWritable() { + return syserror.EBADF + } + return file.LockPOSIX(t, t.FDTable(), lock.WriteLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker) + + case linux.F_UNLCK: + return file.UnlockPOSIX(t, t.FDTable(), uint64(flock.Start), uint64(flock.Len), flock.Whence) + + default: + return syserror.EINVAL + } +} diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 16d9f3a28..642769e7c 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -44,6 +44,7 @@ go_library( "filesystem_impl_util.go", "filesystem_type.go", "inotify.go", + "lock.go", "mount.go", "mount_unsafe.go", "options.go", @@ -72,7 +73,6 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/uniqueid", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 13c48824e..e0538ea53 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -438,14 +438,10 @@ type FileDescriptionImpl interface { UnlockBSD(ctx context.Context, uid lock.UniqueID) error // LockPOSIX tries to acquire a POSIX-style advisory file lock. 
- // - // TODO(gvisor.dev/issue/1480): POSIX-style file locking - LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error + LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, length uint64, whence int16, block lock.Blocker) error // UnlockPOSIX releases a POSIX-style advisory file lock. - // - // TODO(gvisor.dev/issue/1480): POSIX-style file locking - UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error + UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, length uint64, whence int16) error } // Dirent holds the information contained in struct linux_dirent64. @@ -764,3 +760,13 @@ func (fd *FileDescription) LockBSD(ctx context.Context, lockType lock.LockType, func (fd *FileDescription) UnlockBSD(ctx context.Context) error { return fd.impl.UnlockBSD(ctx, fd) } + +// LockPOSIX locks a POSIX-style file range lock. +func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, end uint64, whence int16, block lock.Blocker) error { + return fd.impl.LockPOSIX(ctx, uid, t, start, end, whence, block) +} + +// UnlockPOSIX unlocks a POSIX-style file range lock. 
+func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, end uint64, whence int16) error { + return fd.impl.UnlockPOSIX(ctx, uid, start, end, whence) +} diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index af7213dfd..1e66997ce 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -369,14 +368,19 @@ func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.M // LockFD may be used by most implementations of FileDescriptionImpl.Lock* // functions. Caller must call Init(). type LockFD struct { - locks *lock.FileLocks + locks *FileLocks } // Init initializes fd with FileLocks to use. -func (fd *LockFD) Init(locks *lock.FileLocks) { +func (fd *LockFD) Init(locks *FileLocks) { fd.locks = locks } +// Locks returns the locks associated with this file. +func (fd *LockFD) Locks() *FileLocks { + return fd.locks +} + // LockBSD implements vfs.FileDescriptionImpl.LockBSD. func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { return fd.locks.LockBSD(uid, t, block) @@ -388,17 +392,6 @@ func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { return nil } -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *LockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { - return fd.locks.LockPOSIX(uid, t, rng, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
-func (fd *LockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, rng fslock.LockRange) error { - fd.locks.UnlockPOSIX(uid, rng) - return nil -} - // NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface // returning ENOLCK. type NoLockFD struct{} @@ -414,11 +407,11 @@ func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { } // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { +func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { return syserror.ENOLCK } // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, rng fslock.LockRange) error { +func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { return syserror.ENOLCK } diff --git a/pkg/sentry/vfs/lock.go b/pkg/sentry/vfs/lock.go new file mode 100644 index 000000000..6c7583a81 --- /dev/null +++ b/pkg/sentry/vfs/lock.go @@ -0,0 +1,109 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package lock provides POSIX and BSD style file locking for VFS2 file +// implementations. 
+// +// The actual implementations can be found in the lock package under +// sentry/fs/lock. +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/syserror" +) + +// FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2) +// and flock(2) respectively in Linux. It can be embedded into various file +// implementations for VFS2 that support locking. +// +// Note that in Linux these two types of locks are _not_ cooperative, because +// race and deadlock conditions make merging them prohibitive. We do the same +// and keep them oblivious to each other. +type FileLocks struct { + // bsd is a set of BSD-style advisory file wide locks, see flock(2). + bsd fslock.Locks + + // posix is a set of POSIX-style regional advisory locks, see fcntl(2). + posix fslock.Locks +} + +// LockBSD tries to acquire a BSD-style lock on the entire file. +func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) { + return nil + } + return syserror.ErrWouldBlock +} + +// UnlockBSD releases a BSD-style lock on the entire file. +// +// This operation is always successful, even if there did not exist a lock on +// the requested region held by uid in the first place. +func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) { + fl.bsd.UnlockRegion(uid, fslock.LockRange{0, fslock.LockEOF}) +} + +// LockPOSIX tries to acquire a POSIX-style lock on a file region. 
+func (fl *FileLocks) LockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + rng, err := computeRange(ctx, fd, start, length, whence) + if err != nil { + return err + } + if fl.posix.LockRegion(uid, t, rng, block) { + return nil + } + return syserror.ErrWouldBlock +} + +// UnlockPOSIX releases a POSIX-style lock on a file region. +// +// This operation is always successful, even if there did not exist a lock on +// the requested region held by uid in the first place. +func (fl *FileLocks) UnlockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, start, length uint64, whence int16) error { + rng, err := computeRange(ctx, fd, start, length, whence) + if err != nil { + return err + } + fl.posix.UnlockRegion(uid, rng) + return nil +} + +func computeRange(ctx context.Context, fd *FileDescription, start uint64, length uint64, whence int16) (fslock.LockRange, error) { + var off int64 + switch whence { + case linux.SEEK_SET: + off = 0 + case linux.SEEK_CUR: + // Note that Linux does not hold any mutexes while retrieving the file + // offset, see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk. 
+ curOff, err := fd.Seek(ctx, 0, linux.SEEK_CUR) + if err != nil { + return fslock.LockRange{}, err + } + off = curOff + case linux.SEEK_END: + stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_SIZE}) + if err != nil { + return fslock.LockRange{}, err + } + off = int64(stat.Size) + default: + return fslock.LockRange{}, syserror.EINVAL + } + + return fslock.ComputeRange(int64(start), int64(length), off) +} diff --git a/pkg/sentry/vfs/lock/BUILD b/pkg/sentry/vfs/lock/BUILD deleted file mode 100644 index d9ab063b7..000000000 --- a/pkg/sentry/vfs/lock/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "lock", - srcs = ["lock.go"], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/sentry/fs/lock", - "//pkg/syserror", - ], -) diff --git a/pkg/sentry/vfs/lock/lock.go b/pkg/sentry/vfs/lock/lock.go deleted file mode 100644 index 724dfe743..000000000 --- a/pkg/sentry/vfs/lock/lock.go +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package lock provides POSIX and BSD style file locking for VFS2 file -// implementations. -// -// The actual implementations can be found in the lock package under -// sentry/fs/lock. 
-package lock - -import ( - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" - "gvisor.dev/gvisor/pkg/syserror" -) - -// FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2) -// and flock(2) respectively in Linux. It can be embedded into various file -// implementations for VFS2 that support locking. -// -// Note that in Linux these two types of locks are _not_ cooperative, because -// race and deadlock conditions make merging them prohibitive. We do the same -// and keep them oblivious to each other. -type FileLocks struct { - // bsd is a set of BSD-style advisory file wide locks, see flock(2). - bsd fslock.Locks - - // posix is a set of POSIX-style regional advisory locks, see fcntl(2). - posix fslock.Locks -} - -// LockBSD tries to acquire a BSD-style lock on the entire file. -func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { - if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) { - return nil - } - return syserror.ErrWouldBlock -} - -// UnlockBSD releases a BSD-style lock on the entire file. -// -// This operation is always successful, even if there did not exist a lock on -// the requested region held by uid in the first place. -func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) { - fl.bsd.UnlockRegion(uid, fslock.LockRange{0, fslock.LockEOF}) -} - -// LockPOSIX tries to acquire a POSIX-style lock on a file region. -func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { - if fl.posix.LockRegion(uid, t, rng, block) { - return nil - } - return syserror.ErrWouldBlock -} - -// UnlockPOSIX releases a POSIX-style lock on a file region. -// -// This operation is always successful, even if there did not exist a lock on -// the requested region held by uid in the first place. 
-func (fl *FileLocks) UnlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) { - fl.posix.UnlockRegion(uid, rng) -} diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 96044928e..078e4a284 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -791,6 +791,7 @@ cc_binary( deps = [ ":socket_test_util", "//test/util:cleanup", + "//test/util:epoll_util", "//test/util:eventfd_util", "//test/util:fs_util", "@com_google_absl//absl/base:core_headers", diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc index 35e8a4ff3..25bef2522 100644 --- a/test/syscalls/linux/fcntl.cc +++ b/test/syscalls/linux/fcntl.cc @@ -191,45 +191,85 @@ TEST(FcntlTest, SetFlags) { EXPECT_EQ(rflags, expected); } -TEST_F(FcntlLockTest, SetLockBadFd) { +void TestLock(int fd, short lock_type = F_RDLCK) { // NOLINT, type in flock struct flock fl; - fl.l_type = F_WRLCK; + fl.l_type = lock_type; fl.l_whence = SEEK_SET; fl.l_start = 0; - // len 0 has a special meaning: lock all bytes despite how - // large the file grows. + // len 0 locks all bytes despite how large the file grows. fl.l_len = 0; - EXPECT_THAT(fcntl(-1, F_SETLK, &fl), SyscallFailsWithErrno(EBADF)); + EXPECT_THAT(fcntl(fd, F_SETLK, &fl), SyscallSucceeds()); } -TEST_F(FcntlLockTest, SetLockPipe) { - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - +void TestLockBadFD(int fd, + short lock_type = F_RDLCK) { // NOLINT, type in flock struct flock fl; - fl.l_type = F_WRLCK; + fl.l_type = lock_type; fl.l_whence = SEEK_SET; fl.l_start = 0; - // Same as SetLockBadFd, but doesn't matter, we expect this to fail. + // len 0 locks all bytes despite how large the file grows. 
fl.l_len = 0; - EXPECT_THAT(fcntl(fds[0], F_SETLK, &fl), SyscallFailsWithErrno(EBADF)); - EXPECT_THAT(close(fds[0]), SyscallSucceeds()); - EXPECT_THAT(close(fds[1]), SyscallSucceeds()); + EXPECT_THAT(fcntl(fd, F_SETLK, &fl), SyscallFailsWithErrno(EBADF)); } +TEST_F(FcntlLockTest, SetLockBadFd) { TestLockBadFD(-1); } + TEST_F(FcntlLockTest, SetLockDir) { auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); - FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY, 0666)); + auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY, 0000)); + TestLock(fd.get()); +} - struct flock fl; - fl.l_type = F_RDLCK; - fl.l_whence = SEEK_SET; - fl.l_start = 0; - // Same as SetLockBadFd. - fl.l_len = 0; +TEST_F(FcntlLockTest, SetLockSymlink) { + // TODO(gvisor.dev/issue/2782): Replace with IsRunningWithVFS1() when O_PATH + // is supported. + SKIP_IF(IsRunningOnGvisor()); - EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + auto symlink = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo(GetAbsoluteTestTmpdir(), file.path())); + + auto fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(symlink.path(), O_RDONLY | O_PATH, 0000)); + TestLockBadFD(fd.get()); +} + +TEST_F(FcntlLockTest, SetLockProc) { + auto fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/status", O_RDONLY, 0000)); + TestLock(fd.get()); +} + +TEST_F(FcntlLockTest, SetLockPipe) { + SKIP_IF(IsRunningWithVFS1()); + + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + + TestLock(fds[0]); + TestLockBadFD(fds[0], F_WRLCK); + + TestLock(fds[1], F_WRLCK); + TestLockBadFD(fds[1]); + + EXPECT_THAT(close(fds[0]), SyscallSucceeds()); + EXPECT_THAT(close(fds[1]), SyscallSucceeds()); +} + +TEST_F(FcntlLockTest, SetLockSocket) { + SKIP_IF(IsRunningWithVFS1()); + + int sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_THAT(sock, SyscallSucceeds()); + + struct sockaddr_un addr = + ASSERT_NO_ERRNO_AND_VALUE(UniqueUnixAddr(true 
/* abstract */, AF_UNIX)); + ASSERT_THAT( + bind(sock, reinterpret_cast(&addr), sizeof(addr)), + SyscallSucceeds()); + + TestLock(sock); + EXPECT_THAT(close(sock), SyscallSucceeds()); } TEST_F(FcntlLockTest, SetLockBadOpenFlagsWrite) { @@ -241,8 +281,7 @@ TEST_F(FcntlLockTest, SetLockBadOpenFlagsWrite) { fl0.l_type = F_WRLCK; fl0.l_whence = SEEK_SET; fl0.l_start = 0; - // Same as SetLockBadFd. - fl0.l_len = 0; + fl0.l_len = 0; // Lock all file // Expect that setting a write lock using a read only file descriptor // won't work. @@ -704,7 +743,7 @@ TEST_F(FcntlLockTest, SetWriteLockThenBlockingWriteLock) { << "Exited with code: " << status; } -// This test will veirfy that blocking works as expected when another process +// This test will verify that blocking works as expected when another process // holds a read lock when obtaining a write lock. This test will hold the lock // for some amount of time and then wait for the second process to send over the // socket_fd the amount of time it was blocked for before the lock succeeded. @@ -1109,8 +1148,7 @@ int main(int argc, char** argv) { fl.l_start = absl::GetFlag(FLAGS_child_setlock_start); fl.l_len = absl::GetFlag(FLAGS_child_setlock_len); - // Test the fcntl, no need to log, the error is unambiguously - // from fcntl at this point. + // Test the fcntl. int err = 0; int ret = 0; @@ -1123,6 +1161,8 @@ int main(int argc, char** argv) { if (ret == -1 && errno != 0) { err = errno; + std::cerr << "CHILD lock " << setlock_on << " failed " << err + << std::endl; } // If there is a socket fd let's send back the time in microseconds it took -- cgit v1.2.3 From 364ac92baf83f2352f78b718090472639bd92a76 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 23 Jun 2020 23:32:23 -0700 Subject: Support for saving pointers to fields in the state package. Previously, it was not possible to encode/decode an object graph which contained a pointer to a field within another type. 
This was because the encoder was previously unable to disambiguate a pointer to an object and a pointer within the object. This CL remedies this by constructing an address map tracking the full memory ranges that objects occupy. The encoded Refvalue message has been extended to allow references to children objects within another object. Because the encoding process may learn about object structure over time, we cannot encode any objects until the entire graph has been generated. This CL also updates the state package to use standard interfaces instead of reflection-based dispatch in order to improve performance overall. This includes a custom wire protocol to significantly reduce the number of allocations and take advantage of structure packing. As part of these changes, there are a small number of minor changes in other places of the code base: * The lists used during encoding are changed to use intrusive lists with the objectEncodeState directly, which required that the ilist Len() method is updated to work properly with the ElementMapper mechanism. * A bug is fixed in the list code wherein Remove() called on an element that is already removed can corrupt the list (removing the element if there's only a single element). Now the behavior is correct. * Standard error wrapping is introduced. * Compressio was updated to implement the new wire.Reader and wire.Writer interface methods directly. The lack of a ReadByte and WriteByte caused issues not due to interface dispatch, but because underlying slices for a Read or Write call through an interface would always escape to the heap! * Stateify has been updated to support the new APIs. See README.md for a description of how the new mechanism works.
PiperOrigin-RevId: 318010298 --- pkg/compressio/compressio.go | 54 +- pkg/gohacks/BUILD | 1 + pkg/ilist/list.go | 6 +- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/kernel.go | 22 +- pkg/sentry/pgalloc/BUILD | 1 + pkg/sentry/pgalloc/save_restore.go | 13 +- pkg/state/BUILD | 68 ++- pkg/state/README.md | 158 ++++++ pkg/state/decode.go | 918 ++++++++++++++++++-------------- pkg/state/decode_unsafe.go | 27 + pkg/state/encode.go | 1025 ++++++++++++++++++++++++------------ pkg/state/encode_unsafe.go | 48 -- pkg/state/map.go | 232 -------- pkg/state/object.proto | 140 ----- pkg/state/pretty/BUILD | 13 + pkg/state/pretty/pretty.go | 273 ++++++++++ pkg/state/printer.go | 251 --------- pkg/state/state.go | 360 ++++++------- pkg/state/state_norace.go | 19 + pkg/state/state_race.go | 19 + pkg/state/state_test.go | 721 ------------------------- pkg/state/statefile/BUILD | 1 + pkg/state/statefile/statefile.go | 15 +- pkg/state/stats.go | 117 ++-- pkg/state/tests/BUILD | 43 ++ pkg/state/tests/array.go | 35 ++ pkg/state/tests/array_test.go | 134 +++++ pkg/state/tests/bench.go | 24 + pkg/state/tests/bench_test.go | 153 ++++++ pkg/state/tests/bool_test.go | 31 ++ pkg/state/tests/float_test.go | 118 +++++ pkg/state/tests/integer.go | 163 ++++++ pkg/state/tests/integer_test.go | 94 ++++ pkg/state/tests/load.go | 61 +++ pkg/state/tests/load_test.go | 70 +++ pkg/state/tests/map.go | 28 + pkg/state/tests/map_test.go | 90 ++++ pkg/state/tests/register.go | 21 + pkg/state/tests/register_test.go | 167 ++++++ pkg/state/tests/string_test.go | 34 ++ pkg/state/tests/struct.go | 65 +++ pkg/state/tests/struct_test.go | 89 ++++ pkg/state/tests/tests.go | 215 ++++++++ pkg/state/types.go | 361 +++++++++++++ pkg/state/wire/BUILD | 12 + pkg/state/wire/wire.go | 970 ++++++++++++++++++++++++++++++++++ runsc/cmd/BUILD | 2 +- runsc/cmd/statefile.go | 12 +- tools/checkescape/checkescape.go | 4 +- tools/go_stateify/main.go | 182 ++++--- 51 files changed, 5171 insertions(+), 2510 deletions(-) create mode 
100644 pkg/state/README.md create mode 100644 pkg/state/decode_unsafe.go delete mode 100644 pkg/state/map.go delete mode 100644 pkg/state/object.proto create mode 100644 pkg/state/pretty/BUILD create mode 100644 pkg/state/pretty/pretty.go delete mode 100644 pkg/state/printer.go create mode 100644 pkg/state/state_norace.go create mode 100644 pkg/state/state_race.go delete mode 100644 pkg/state/state_test.go create mode 100644 pkg/state/tests/BUILD create mode 100644 pkg/state/tests/array.go create mode 100644 pkg/state/tests/array_test.go create mode 100644 pkg/state/tests/bench.go create mode 100644 pkg/state/tests/bench_test.go create mode 100644 pkg/state/tests/bool_test.go create mode 100644 pkg/state/tests/float_test.go create mode 100644 pkg/state/tests/integer.go create mode 100644 pkg/state/tests/integer_test.go create mode 100644 pkg/state/tests/load.go create mode 100644 pkg/state/tests/load_test.go create mode 100644 pkg/state/tests/map.go create mode 100644 pkg/state/tests/map_test.go create mode 100644 pkg/state/tests/register.go create mode 100644 pkg/state/tests/register_test.go create mode 100644 pkg/state/tests/string_test.go create mode 100644 pkg/state/tests/struct.go create mode 100644 pkg/state/tests/struct_test.go create mode 100644 pkg/state/tests/tests.go create mode 100644 pkg/state/types.go create mode 100644 pkg/state/wire/BUILD create mode 100644 pkg/state/wire/wire.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/compressio/compressio.go b/pkg/compressio/compressio.go index 5f52cbe74..b094c5662 100644 --- a/pkg/compressio/compressio.go +++ b/pkg/compressio/compressio.go @@ -346,20 +346,22 @@ func (p *pool) schedule(c *chunk, callback func(*chunk) error) error { } } -// reader chunks reads and decompresses. -type reader struct { +// Reader is a compressed reader. +type Reader struct { pool // in is the source. in io.Reader } +var _ io.Reader = (*Reader)(nil) + // NewReader returns a new compressed reader. 
If key is non-nil, the data stream // is assumed to contain expected hash values, which will be compared against // hash values computed from the compressed bytes. See package comments for // details. -func NewReader(in io.Reader, key []byte) (io.Reader, error) { - r := &reader{ +func NewReader(in io.Reader, key []byte) (*Reader, error) { + r := &Reader{ in: in, } @@ -394,8 +396,19 @@ var errNewBuffer = errors.New("buffer ready") // ErrHashMismatch is returned if the hash does not match. var ErrHashMismatch = errors.New("hash mismatch") +// ReadByte implements wire.Reader.ReadByte. +func (r *Reader) ReadByte() (byte, error) { + var p [1]byte + n, err := r.Read(p[:]) + if n != 1 { + return p[0], err + } + // Suppress EOF. + return p[0], nil +} + // Read implements io.Reader.Read. -func (r *reader) Read(p []byte) (int, error) { +func (r *Reader) Read(p []byte) (int, error) { r.mu.Lock() defer r.mu.Unlock() @@ -551,8 +564,8 @@ func (r *reader) Read(p []byte) (int, error) { return done, nil } -// writer chunks and schedules writes. -type writer struct { +// Writer is a compressed writer. +type Writer struct { pool // out is the underlying writer. @@ -562,6 +575,8 @@ type writer struct { closed bool } +var _ io.Writer = (*Writer)(nil) + // NewWriter returns a new compressed writer. If key is non-nil, hash values are // generated and written out for compressed bytes. See package comments for // details. @@ -569,8 +584,8 @@ type writer struct { // The recommended chunkSize is on the order of 1M. Extra memory may be // buffered (in the form of read-ahead, or buffered writes), and is limited to // O(chunkSize * [1+GOMAXPROCS]). 
-func NewWriter(out io.Writer, key []byte, chunkSize uint32, level int) (io.WriteCloser, error) { - w := &writer{ +func NewWriter(out io.Writer, key []byte, chunkSize uint32, level int) (*Writer, error) { + w := &Writer{ pool: pool{ chunkSize: chunkSize, buf: bufPool.Get().(*bytes.Buffer), @@ -597,7 +612,7 @@ func NewWriter(out io.Writer, key []byte, chunkSize uint32, level int) (io.Write } // flush writes a single buffer. -func (w *writer) flush(c *chunk) error { +func (w *Writer) flush(c *chunk) error { // Prefix each chunk with a length; this allows the reader to safely // limit reads while buffering. l := uint32(c.compressed.Len()) @@ -624,8 +639,23 @@ func (w *writer) flush(c *chunk) error { return nil } +// WriteByte implements wire.Writer.WriteByte. +// +// Note that this implementation is necessary on the object itself, as an +// interface-based dispatch cannot tell whether the array backing the slice +// escapes, therefore the all bytes written will generate an escape. +func (w *Writer) WriteByte(b byte) error { + var p [1]byte + p[0] = b + n, err := w.Write(p[:]) + if n != 1 { + return err + } + return nil +} + // Write implements io.Writer.Write. -func (w *writer) Write(p []byte) (int, error) { +func (w *Writer) Write(p []byte) (int, error) { w.mu.Lock() defer w.mu.Unlock() @@ -710,7 +740,7 @@ func (w *writer) Write(p []byte) (int, error) { } // Close implements io.Closer.Close. -func (w *writer) Close() error { +func (w *Writer) Close() error { w.mu.Lock() defer w.mu.Unlock() diff --git a/pkg/gohacks/BUILD b/pkg/gohacks/BUILD index 798a65eca..35683fe98 100644 --- a/pkg/gohacks/BUILD +++ b/pkg/gohacks/BUILD @@ -7,5 +7,6 @@ go_library( srcs = [ "gohacks_unsafe.go", ], + stateify = False, visibility = ["//:sandbox"], ) diff --git a/pkg/ilist/list.go b/pkg/ilist/list.go index 0d07da3b1..f4a4c33d3 100644 --- a/pkg/ilist/list.go +++ b/pkg/ilist/list.go @@ -90,7 +90,7 @@ func (l *List) Back() Element { // // NOTE: This is an O(n) operation. 
func (l *List) Len() (count int) { - for e := l.Front(); e != nil; e = e.Next() { + for e := l.Front(); e != nil; e = (ElementMapper{}.linkerFor(e)).Next() { count++ } return count @@ -182,13 +182,13 @@ func (l *List) Remove(e Element) { if prev != nil { ElementMapper{}.linkerFor(prev).SetNext(next) - } else { + } else if l.head == e { l.head = next } if next != nil { ElementMapper{}.linkerFor(next).SetPrev(prev) - } else { + } else if l.tail == e { l.tail = prev } diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 1510a7c26..25fe1921b 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -200,6 +200,7 @@ go_library( "//pkg/sentry/vfs", "//pkg/state", "//pkg/state/statefile", + "//pkg/state/wire", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 554a42e05..2177b785a 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -34,7 +34,6 @@ package kernel import ( "errors" "fmt" - "io" "path/filepath" "sync/atomic" "time" @@ -73,6 +72,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/state/wire" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" ) @@ -417,7 +417,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { // SaveTo saves the state of k to w. // // Preconditions: The kernel must be paused throughout the call to SaveTo. -func (k *Kernel) SaveTo(w io.Writer) error { +func (k *Kernel) SaveTo(w wire.Writer) error { saveStart := time.Now() ctx := k.SupervisorContext() @@ -473,18 +473,18 @@ func (k *Kernel) SaveTo(w io.Writer) error { // // N.B. This will also be saved along with the full kernel save below. 
cpuidStart := time.Now() - if err := state.Save(k.SupervisorContext(), w, k.FeatureSet(), nil); err != nil { + if _, err := state.Save(k.SupervisorContext(), w, k.FeatureSet()); err != nil { return err } log.Infof("CPUID save took [%s].", time.Since(cpuidStart)) // Save the kernel state. kernelStart := time.Now() - var stats state.Stats - if err := state.Save(k.SupervisorContext(), w, k, &stats); err != nil { + stats, err := state.Save(k.SupervisorContext(), w, k) + if err != nil { return err } - log.Infof("Kernel save stats: %s", &stats) + log.Infof("Kernel save stats: %s", stats.String()) log.Infof("Kernel save took [%s].", time.Since(kernelStart)) // Save the memory file's state. @@ -629,7 +629,7 @@ func (ts *TaskSet) unregisterEpollWaiters() { } // LoadFrom returns a new Kernel loaded from args. -func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error { +func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clocks) error { loadStart := time.Now() initAppCores := k.applicationCores @@ -640,7 +640,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) // don't need to explicitly install it in the Kernel. cpuidStart := time.Now() var features cpuid.FeatureSet - if err := state.Load(k.SupervisorContext(), r, &features, nil); err != nil { + if _, err := state.Load(k.SupervisorContext(), r, &features); err != nil { return err } log.Infof("CPUID load took [%s].", time.Since(cpuidStart)) @@ -655,11 +655,11 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) // Load the kernel state. 
kernelStart := time.Now() - var stats state.Stats - if err := state.Load(k.SupervisorContext(), r, k, &stats); err != nil { + stats, err := state.Load(k.SupervisorContext(), r, k) + if err != nil { return err } - log.Infof("Kernel load stats: %s", &stats) + log.Infof("Kernel load stats: %s", stats.String()) log.Infof("Kernel load took [%s].", time.Since(kernelStart)) // rootNetworkNamespace should be populated after loading the state file. diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index a9836ba71..e1fcb175f 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -92,6 +92,7 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/state", + "//pkg/state/wire", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go index f8385c146..78317fa35 100644 --- a/pkg/sentry/pgalloc/save_restore.go +++ b/pkg/sentry/pgalloc/save_restore.go @@ -26,11 +26,12 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/state/wire" "gvisor.dev/gvisor/pkg/usermem" ) // SaveTo writes f's state to the given stream. -func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error { +func (f *MemoryFile) SaveTo(ctx context.Context, w wire.Writer) error { // Wait for reclaim. f.mu.Lock() defer f.mu.Unlock() @@ -79,10 +80,10 @@ func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error { } // Save metadata. - if err := state.Save(ctx, w, &f.fileSize, nil); err != nil { + if _, err := state.Save(ctx, w, &f.fileSize); err != nil { return err } - if err := state.Save(ctx, w, &f.usage, nil); err != nil { + if _, err := state.Save(ctx, w, &f.usage); err != nil { return err } @@ -115,9 +116,9 @@ func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error { } // LoadFrom loads MemoryFile state from the given stream. 
-func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader) error { +func (f *MemoryFile) LoadFrom(ctx context.Context, r wire.Reader) error { // Load metadata. - if err := state.Load(ctx, r, &f.fileSize, nil); err != nil { + if _, err := state.Load(ctx, r, &f.fileSize); err != nil { return err } if err := f.file.Truncate(f.fileSize); err != nil { @@ -125,7 +126,7 @@ func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader) error { } newMappings := make([]uintptr, f.fileSize>>chunkShift) f.mappings.Store(newMappings) - if err := state.Load(ctx, r, &f.usage, nil); err != nil { + if _, err := state.Load(ctx, r, &f.usage); err != nil { return err } diff --git a/pkg/state/BUILD b/pkg/state/BUILD index 2b1350135..089b3bbef 100644 --- a/pkg/state/BUILD +++ b/pkg/state/BUILD @@ -1,8 +1,46 @@ -load("//tools:defs.bzl", "go_library", "go_test", "proto_library") +load("//tools:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) +go_template_instance( + name = "pending_list", + out = "pending_list.go", + package = "state", + prefix = "pending", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*objectEncodeState", + "ElementMapper": "pendingMapper", + "Linker": "*pendingEntry", + }, +) + +go_template_instance( + name = "deferred_list", + out = "deferred_list.go", + package = "state", + prefix = "deferred", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*objectEncodeState", + "ElementMapper": "deferredMapper", + "Linker": "*deferredEntry", + }, +) + +go_template_instance( + name = "complete_list", + out = "complete_list.go", + package = "state", + prefix = "complete", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*objectDecodeState", + "Linker": "*objectDecodeState", + }, +) + go_template_instance( name = "addr_range", out = "addr_range.go", @@ -29,7 +67,7 @@ go_template_instance( types = { "Key": "uintptr", "Range": "addrRange", - "Value": 
"reflect.Value", + "Value": "*objectEncodeState", "Functions": "addrSetFunctions", }, ) @@ -39,32 +77,24 @@ go_library( srcs = [ "addr_range.go", "addr_set.go", + "complete_list.go", "decode.go", + "decode_unsafe.go", + "deferred_list.go", "encode.go", "encode_unsafe.go", - "map.go", - "printer.go", + "pending_list.go", "state.go", + "state_norace.go", + "state_race.go", "stats.go", + "types.go", ], marshal = False, stateify = False, visibility = ["//:sandbox"], deps = [ - ":object_go_proto", - "@com_github_golang_protobuf//proto:go_default_library", + "//pkg/log", + "//pkg/state/wire", ], ) - -proto_library( - name = "object", - srcs = ["object.proto"], - visibility = ["//:sandbox"], -) - -go_test( - name = "state_test", - timeout = "long", - srcs = ["state_test.go"], - library = ":state", -) diff --git a/pkg/state/README.md b/pkg/state/README.md new file mode 100644 index 000000000..1aa401193 --- /dev/null +++ b/pkg/state/README.md @@ -0,0 +1,158 @@ +# State Encoding and Decoding + +The state package implements the encoding and decoding of data structures for +`go_stateify`. This package is designed for use cases other than the standard +encoding packages, e.g. `gob` and `json`. Principally: + +* This package operates on complex object graphs and accurately serializes and + restores all relationships. That is, you can have things like: intrusive + pointers, cycles, and pointer chains of arbitrary depths. These are not + handled appropriately by existing encoders. This is not an implementation + flaw: the formats themselves are not capable of representing these graphs, + as they can only generate directed trees. + +* This package allows installing order-dependent load callbacks and then + resolves that graph at load time, with cycle detection. Similarly, there is + no analogous feature possible in the standard encoders. + +* This package handles the resolution of interfaces, based on a registered + type name. 
For interface objects type information is saved in the serialized + format. This is generally true for `gob` as well, but it works differently. + +Here's an overview of how encoding and decoding work. + +## Encoding + +Encoding produces a `statefile`, which contains a list of chunks of the form +`(header, payload)`. The payload can either be some raw data, or a series of +encoded wire objects representing some object graph. All encoded objects are +defined in the `wire` subpackage. + +Encoding of an object graph begins with `encodeState.Save`. + +### 1. Memory Map & Encoding + +To discover relationships between potentially interdependent data structures +(for example, a struct may contain pointers to members of other data +structures), the encoder first walks the object graph and constructs a memory +map of the objects in the input graph. As this walk progresses, objects are +queued in the `pending` list and items are placed on the `deferred` list as they +are discovered. No single object will be encoded multiple times, but the +discovered relationships between objects may change as more parts of the overall +object graph are discovered. + +The encoder starts at the root object and recursively visits all reachable +objects, recording the address ranges containing the underlying data for each +object. This is stored as a segment set (`addrSet`), mapping address ranges to +the `objectEncodeState` of the object occupying the range; see `encodeState.values`. Note that there +is special handling for zero-sized types and map objects during this process. + +Additionally, the encoder assigns each object a unique identifier which is used +to indicate relationships between objects in the statefile; see `objectID` in +`encode.go`. + +### 2. Type Serialization + +The encoder will subsequently serialize all information about discovered types, +including field names. These are used during decoding to reconcile these types +with other internally registered types. + +### 3.
Object Serialization + +With a full address map, and all objects correctly encoded, all object encodings +are serialized. The assigned `objectID`s aren't explicitly encoded in the +statefile. The order of object messages in the stream determines their IDs. + +### Example + +Given the following data structure definitions: + +```go +type system struct { + o *outer + i *inner +} + +type outer struct { + a int64 + cn *container +} + +type container struct { + n uint64 + elem *inner +} + +type inner struct { + c container + x, y uint64 +} +``` + +Initialized like this: + +```go +o := outer{ + a: 10, + cn: nil, +} +i := inner{ + x: 20, + y: 30, + c: container{}, +} +s := system{ + o: &o, + i: &i, +} + +o.cn = &i.c +o.cn.elem = &i + +``` + +Encoding will produce an object stream like this: + +``` +g0r1 = struct{ + i: g0r3, + o: g0r2, +} +g0r2 = struct{ + a: 10, + cn: g0r3.c, +} +g0r3 = struct{ + c: struct{ + elem: g0r3, + n: 0u, + }, + x: 20u, + y: 30u, +} +``` + +Note how `g0r3.c` is correctly encoded as the underlying `container` object for +`inner.c`, and how the pointer from `outer.cn` points to it, despite `system.i` +being discovered after the pointer to it in `system.o.cn`. Also note that +decoding isn't strictly reliant on the order of the encoded object stream, as long +as the relationships between objects are correctly encoded. + +## Decoding + +Decoding reads the statefile and reconstructs the object graph. Decoding begins +in `decodeState.Load`. Decoding is performed in a single pass over the object +stream in the statefile, and a subsequent pass over all deserialized objects is +done to fire off all loading callbacks in the correctly defined order. Note that +introducing cycles is possible here, but these are detected and an error will be +returned. + +Decoding is relatively straightforward. For most primitive values, the decoder +constructs an appropriate object and fills it with the values encoded in the +statefile.
Pointers need special handling, as they must point to a value +allocated elsewhere. When values are constructed, the decoder indexes them by +their `objectID`s in `decodeState.objectsByID`. The target of a pointer is +resolved by searching for it in this index by its `objectID`; see +`decodeState.register`. For pointers to values inside another value (fields in a +struct, elements of an array), the decoder uses the accessor path to walk to +the appropriate location; see `walkChild`. diff --git a/pkg/state/decode.go b/pkg/state/decode.go index 590c241a3..c9971cdf6 100644 --- a/pkg/state/decode.go +++ b/pkg/state/decode.go @@ -17,28 +17,49 @@ package state import ( "bytes" "context" - "encoding/binary" - "errors" "fmt" - "io" + "math" "reflect" - "sort" - "github.com/golang/protobuf/proto" - pb "gvisor.dev/gvisor/pkg/state/object_go_proto" + "gvisor.dev/gvisor/pkg/state/wire" ) -// objectState represents an object that may be in the process of being +// internalCallback is an interface called on object completion. +// +// There are two implementations: objectDecodeState & userCallback. +type internalCallback interface { + // source returns the dependent object. May be nil. + source() *objectDecodeState + + // callbackRun executes the callback. + callbackRun() +} + +// userCallback is an implementation of internalCallback. +type userCallback func() + +// source implements internalCallback.source. +func (userCallback) source() *objectDecodeState { + return nil +} + +// callbackRun implements internalCallback.callbackRun. +func (uc userCallback) callbackRun() { + uc() +} + +// objectDecodeState represents an object that may be in the process of being // decoded. Specifically, it represents either a decoded object, or an an // interest in a future object that will be decoded. When that interest is // registered (via register), the storage for the object will be created, but // it will not be decoded until the object is encountered in the stream.
-type objectState struct { +type objectDecodeState struct { // id is the id for this object. - // - // If this field is zero, then this is an anonymous (unregistered, - // non-reference primitive) object. This is immutable. - id uint64 + id objectID + + // typ is the id for this typeID. This may be zero if this is not a + // type-registered structure. + typ typeID // obj is the object. This may or may not be valid yet, depending on // whether complete returns true. However, regardless of whether the @@ -57,69 +78,52 @@ type objectState struct { // blockedBy is the number of dependencies this object has. blockedBy int - // blocking is a list of the objects blocked by this one. - blocking []*objectState + // callbacksInline is inline storage for callbacks. + callbacksInline [2]internalCallback // callbacks is a set of callbacks to execute on load. - callbacks []func() - - // path is the decoding path to the object. - path recoverable -} - -// complete indicates the object is complete. -func (os *objectState) complete() bool { - return os.blockedBy == 0 && len(os.callbacks) == 0 -} - -// checkComplete checks for completion. If the object is complete, pending -// callbacks will be executed and checkComplete will be called on downstream -// objects (those depending on this one). -func (os *objectState) checkComplete(stats *Stats) { - if os.blockedBy > 0 { - return - } - stats.Start(os.obj) + callbacks []internalCallback - // Fire all callbacks. - for _, fn := range os.callbacks { - fn() - } - os.callbacks = nil - - // Clear all blocked objects. - for _, other := range os.blocking { - other.blockedBy-- - other.checkComplete(stats) - } - os.blocking = nil - stats.Done() + completeEntry } -// waitFor queues a dependency on the given object. 
-func (os *objectState) waitFor(other *objectState, callback func()) { - os.blockedBy++ - other.blocking = append(other.blocking, os) - if callback != nil { - other.callbacks = append(other.callbacks, callback) +// addCallback adds a callback to the objectDecodeState. +func (ods *objectDecodeState) addCallback(ic internalCallback) { + if ods.callbacks == nil { + ods.callbacks = ods.callbacksInline[:0] } + ods.callbacks = append(ods.callbacks, ic) } // findCycleFor returns when the given object is found in the blocking set. -func (os *objectState) findCycleFor(target *objectState) []*objectState { - for _, other := range os.blocking { - if other == target { - return []*objectState{target} +func (ods *objectDecodeState) findCycleFor(target *objectDecodeState) []*objectDecodeState { + for _, ic := range ods.callbacks { + other := ic.source() + if other != nil && other == target { + return []*objectDecodeState{target} } else if childList := other.findCycleFor(target); childList != nil { return append(childList, other) } } - return nil + + // This should not occur. + Failf("no deadlock found?") + panic("unreachable") } // findCycle finds a dependency cycle. -func (os *objectState) findCycle() []*objectState { - return append(os.findCycleFor(os), os) +func (ods *objectDecodeState) findCycle() []*objectDecodeState { + return append(ods.findCycleFor(ods), ods) +} + +// source implements internalCallback.source. +func (ods *objectDecodeState) source() *objectDecodeState { + return ods +} + +// callbackRun implements internalCallback.callbackRun. +func (ods *objectDecodeState) callbackRun() { + ods.blockedBy-- } // decodeState is a graph of objects in the process of being decoded. @@ -137,30 +141,66 @@ type decodeState struct { // ctx is the decode context. ctx context.Context + // r is the input stream. + r wire.Reader + + // types is the type database. + types typeDecodeDatabase + // objectByID is the set of objects in progress. 
- objectsByID map[uint64]*objectState + objectsByID []*objectDecodeState // deferred are objects that have been read, by no interest has been // registered yet. These will be decoded once interest in registered. - deferred map[uint64]*pb.Object + deferred map[objectID]wire.Object - // outstanding is the number of outstanding objects. - outstanding uint32 + // pending is the set of objects that are not yet complete. + pending completeList - // r is the input stream. - r io.Reader - - // stats is the passed stats object. - stats *Stats - - // recoverable is the panic recover facility. - recoverable + // stats tracks time data. + stats Stats } // lookup looks up an object in decodeState or returns nil if no such object // has been previously registered. -func (ds *decodeState) lookup(id uint64) *objectState { - return ds.objectsByID[id] +func (ds *decodeState) lookup(id objectID) *objectDecodeState { + if len(ds.objectsByID) < int(id) { + return nil + } + return ds.objectsByID[id-1] +} + +// checkComplete checks for completion. +func (ds *decodeState) checkComplete(ods *objectDecodeState) bool { + // Still blocked? + if ods.blockedBy > 0 { + return false + } + + // Track stats if relevant. + if ods.callbacks != nil && ods.typ != 0 { + ds.stats.start(ods.typ) + defer ds.stats.done() + } + + // Fire all callbacks. + for _, ic := range ods.callbacks { + ic.callbackRun() + } + + // Mark completed. + cbs := ods.callbacks + ods.callbacks = nil + ds.pending.Remove(ods) + + // Recursively check others. + for _, ic := range cbs { + if other := ic.source(); other != nil && other.blockedBy == 0 { + ds.checkComplete(other) + } + } + + return true // All set. } // wait registers a dependency on an object. @@ -168,11 +208,8 @@ func (ds *decodeState) lookup(id uint64) *objectState { // As a special case, we always allow _useable_ references back to the first // decoding object because it may have fields that are already decoded. 
We also // allow trivial self reference, since they can be handled internally. -func (ds *decodeState) wait(waiter *objectState, id uint64, callback func()) { +func (ds *decodeState) wait(waiter *objectDecodeState, id objectID, callback func()) { switch id { - case 0: - // Nil pointer; nothing to wait for. - fallthrough case waiter.id: // Trivial self reference. fallthrough @@ -184,107 +221,188 @@ func (ds *decodeState) wait(waiter *objectState, id uint64, callback func()) { return } + // Mark as blocked. + waiter.blockedBy++ + // No nil can be returned here. - waiter.waitFor(ds.lookup(id), callback) + other := ds.lookup(id) + if callback != nil { + // Add the additional user callback. + other.addCallback(userCallback(callback)) + } + + // Mark waiter as unblocked. + other.addCallback(waiter) } // waitObject notes a blocking relationship. -func (ds *decodeState) waitObject(os *objectState, p *pb.Object, callback func()) { - if rv, ok := p.Value.(*pb.Object_RefValue); ok { +func (ds *decodeState) waitObject(ods *objectDecodeState, encoded wire.Object, callback func()) { + if rv, ok := encoded.(*wire.Ref); ok && rv.Root != 0 { // Refs can encode pointers and maps. - ds.wait(os, rv.RefValue, callback) - } else if sv, ok := p.Value.(*pb.Object_SliceValue); ok { + ds.wait(ods, objectID(rv.Root), callback) + } else if sv, ok := encoded.(*wire.Slice); ok && sv.Ref.Root != 0 { // See decodeObject; we need to wait for the array (if non-nil). - ds.wait(os, sv.SliceValue.RefValue, callback) - } else if iv, ok := p.Value.(*pb.Object_InterfaceValue); ok { + ds.wait(ods, objectID(sv.Ref.Root), callback) + } else if iv, ok := encoded.(*wire.Interface); ok { // It's an interface (wait recurisvely). - ds.waitObject(os, iv.InterfaceValue.Value, callback) + ds.waitObject(ods, iv.Value, callback) } else if callback != nil { // Nothing to wait for: execute the callback immediately. callback() } } +// walkChild returns a child object from obj, given an accessor path. 
This is +// the decode-side equivalent to traverse in encode.go. +// +// For the purposes of this function, a child object is either a field within a +// struct or an array element, with one such indirection per element in +// path. The returned value may be an unexported field, so it may not be +// directly assignable. See unsafePointerTo. +func walkChild(path []wire.Dot, obj reflect.Value) reflect.Value { + // See wire.Ref.Dots. The path here is specified in reverse order. + for i := len(path) - 1; i >= 0; i-- { + switch pc := path[i].(type) { + case *wire.FieldName: // Must be a pointer. + if obj.Kind() != reflect.Struct { + Failf("next component in child path is a field name, but the current object is not a struct. Path: %v, current obj: %#v", path, obj) + } + obj = obj.FieldByName(string(*pc)) + case wire.Index: // Embedded. + if obj.Kind() != reflect.Array { + Failf("next component in child path is an array index, but the current object is not an array. Path: %v, current obj: %#v", path, obj) + } + obj = obj.Index(int(pc)) + default: + panic("unreachable: switch should be exhaustive") + } + } + return obj +} + // register registers a decode with a type. // // This type is only used to instantiate a new object if it has not been -// registered previously. -func (ds *decodeState) register(id uint64, typ reflect.Type) *objectState { - os, ok := ds.objectsByID[id] - if ok { - return os +// registered previously. This depends on the type provided if none is +// available in the object itself. +func (ds *decodeState) register(r *wire.Ref, typ reflect.Type) reflect.Value { + // Grow the objectsByID slice. + id := objectID(r.Root) + if len(ds.objectsByID) < int(id) { + ds.objectsByID = append(ds.objectsByID, make([]*objectDecodeState, int(id)-len(ds.objectsByID))...) + } + + // Does this object already exist? + ods := ds.objectsByID[id-1] + if ods != nil { + return walkChild(r.Dots, ods.obj) + } + + // Create the object. 
+ if len(r.Dots) != 0 { + typ = ds.findType(r.Type) } + v := reflect.New(typ) + ods = &objectDecodeState{ + id: id, + obj: v.Elem(), + } + ds.objectsByID[id-1] = ods + ds.pending.PushBack(ods) - // Record in the object index. - if typ.Kind() == reflect.Map { - os = &objectState{id: id, obj: reflect.MakeMap(typ), path: ds.recoverable.copy()} - } else { - os = &objectState{id: id, obj: reflect.New(typ).Elem(), path: ds.recoverable.copy()} + // Process any deferred objects & callbacks. + if encoded, ok := ds.deferred[id]; ok { + delete(ds.deferred, id) + ds.decodeObject(ods, ods.obj, encoded) } - ds.objectsByID[id] = os - if o, ok := ds.deferred[id]; ok { - // There is a deferred object. - delete(ds.deferred, id) // Free memory. - ds.decodeObject(os, os.obj, o, "", nil) - } else { - // There is no deferred object. - ds.outstanding++ + return walkChild(r.Dots, ods.obj) +} + +// objectDecoder is for decoding structs. +type objectDecoder struct { + // ds is decodeState. + ds *decodeState + + // ods is the current object being decoded. + ods *objectDecodeState + + // rte is the reconciled type information. + rte *reconciledTypeEntry + + // encoded is the encoded object state. + encoded *wire.Struct +} + +// load is a helper for the public methods on Source. +func (od *objectDecoder) load(slot int, objPtr reflect.Value, wait bool, fn func()) { + // Note that we have reconciled the type and may remap the fields here + // to match what's expected by the decoder. The "slot" parameter here + // is in terms of the local type, where the fields in the encoded + // object are in terms of the wire object's type, which might be in a + // different order (but will have the same fields). + v := *od.encoded.Field(od.rte.FieldOrder[slot]) + od.ds.decodeObject(od.ods, objPtr.Elem(), v) + if wait { + // Mark this individual object as a blocker. + od.ds.waitObject(od.ods, v, fn) } +} - return os +// afterLoad implements Source.AfterLoad.
+func (od *objectDecoder) afterLoad(fn func()) { + // Queue the local callback; this will execute when all of the above + // data dependencies have been cleared. + od.ods.addCallback(userCallback(fn)) } // decodeStruct decodes a struct value. -func (ds *decodeState) decodeStruct(os *objectState, obj reflect.Value, s *pb.Struct) { - // Set the fields. - m := Map{newInternalMap(nil, ds, os)} - defer internalMapPool.Put(m.internalMap) - for _, field := range s.Fields { - m.data = append(m.data, entry{ - name: field.Name, - object: field.Value, - }) - } - - // Sort the fields for efficient searching. - // - // Technically, these should already appear in sorted order in the - // state ordering, so this cost is effectively a single scan to ensure - // that the order is correct. - if len(m.data) > 1 { - sort.Slice(m.data, func(i, j int) bool { - return m.data[i].name < m.data[j].name - }) - } - - // Invoke the load; this will recursively decode other objects. - fns, ok := registeredTypes.lookupFns(obj.Addr().Type()) - if ok { - // Invoke the loader. - fns.invokeLoad(obj.Addr(), m) - } else if obj.NumField() == 0 { - // Allow anonymous empty structs. - return - } else { +func (ds *decodeState) decodeStruct(ods *objectDecodeState, obj reflect.Value, encoded *wire.Struct) { + if encoded.TypeID == 0 { + // Allow anonymous empty structs, but only if the encoded + // object also has no fields. + if encoded.Fields() == 0 && obj.NumField() == 0 { + return + } + // Propagate an error. - panic(fmt.Errorf("unregistered type %s", obj.Type())) + Failf("empty struct on wire %#v has field mismatch with type %q", encoded, obj.Type().Name()) + } + + // Lookup the object type. + rte := ds.types.Lookup(typeID(encoded.TypeID), obj.Type()) + ods.typ = typeID(encoded.TypeID) + + // Invoke the loader. 
+ od := objectDecoder{ + ds: ds, + ods: ods, + rte: rte, + encoded: encoded, + } + ds.stats.start(ods.typ) + defer ds.stats.done() + if sl, ok := obj.Addr().Interface().(SaverLoader); ok { + // Note: may be a registered empty struct which does not + // implement the saver/loader interfaces. + sl.StateLoad(Source{internal: od}) } } // decodeMap decodes a map value. -func (ds *decodeState) decodeMap(os *objectState, obj reflect.Value, m *pb.Map) { +func (ds *decodeState) decodeMap(ods *objectDecodeState, obj reflect.Value, encoded *wire.Map) { if obj.IsNil() { + // See pointerTo. obj.Set(reflect.MakeMap(obj.Type())) } - for i := 0; i < len(m.Keys); i++ { + for i := 0; i < len(encoded.Keys); i++ { // Decode the objects. kv := reflect.New(obj.Type().Key()).Elem() vv := reflect.New(obj.Type().Elem()).Elem() - ds.decodeObject(os, kv, m.Keys[i], ".(key %d)", i) - ds.decodeObject(os, vv, m.Values[i], "[%#v]", kv.Interface()) - ds.waitObject(os, m.Keys[i], nil) - ds.waitObject(os, m.Values[i], nil) + ds.decodeObject(ods, kv, encoded.Keys[i]) + ds.decodeObject(ods, vv, encoded.Values[i]) + ds.waitObject(ods, encoded.Keys[i], nil) + ds.waitObject(ods, encoded.Values[i], nil) // Set in the map. obj.SetMapIndex(kv, vv) @@ -292,271 +410,294 @@ func (ds *decodeState) decodeMap(os *objectState, obj reflect.Value, m *pb.Map) } // decodeArray decodes an array value. -func (ds *decodeState) decodeArray(os *objectState, obj reflect.Value, a *pb.Array) { - if len(a.Contents) != obj.Len() { - panic(fmt.Errorf("mismatching array length expect=%d, actual=%d", obj.Len(), len(a.Contents))) +func (ds *decodeState) decodeArray(ods *objectDecodeState, obj reflect.Value, encoded *wire.Array) { + if len(encoded.Contents) != obj.Len() { + Failf("mismatching array length expect=%d, actual=%d", obj.Len(), len(encoded.Contents)) } // Decode the contents into the array. 
- for i := 0; i < len(a.Contents); i++ { - ds.decodeObject(os, obj.Index(i), a.Contents[i], "[%d]", i) - ds.waitObject(os, a.Contents[i], nil) + for i := 0; i < len(encoded.Contents); i++ { + ds.decodeObject(ods, obj.Index(i), encoded.Contents[i]) + ds.waitObject(ods, encoded.Contents[i], nil) } } -// decodeInterface decodes an interface value. -func (ds *decodeState) decodeInterface(os *objectState, obj reflect.Value, i *pb.Interface) { - // Is this a nil value? - if i.Type == "" { - return // Just leave obj alone. +// findType finds the type for the given wire.TypeSpecs. +func (ds *decodeState) findType(t wire.TypeSpec) reflect.Type { + switch x := t.(type) { + case wire.TypeID: + typ := ds.types.LookupType(typeID(x)) + rte := ds.types.Lookup(typeID(x), typ) + return rte.LocalType + case *wire.TypeSpecPointer: + return reflect.PtrTo(ds.findType(x.Type)) + case *wire.TypeSpecArray: + return reflect.ArrayOf(int(x.Count), ds.findType(x.Type)) + case *wire.TypeSpecSlice: + return reflect.SliceOf(ds.findType(x.Type)) + case *wire.TypeSpecMap: + return reflect.MapOf(ds.findType(x.Key), ds.findType(x.Value)) + default: + // Should not happen. + Failf("unknown type %#v", t) } + panic("unreachable") +} - // Get the dispatchable type. This may not be used if the given - // reference has already been resolved, but if not we need to know the - // type to create. - t, ok := registeredTypes.lookupType(i.Type) - if !ok { - panic(fmt.Errorf("no valid type for %q", i.Type)) +// decodeInterface decodes an interface value. +func (ds *decodeState) decodeInterface(ods *objectDecodeState, obj reflect.Value, encoded *wire.Interface) { + if _, ok := encoded.Type.(wire.TypeSpecNil); ok { + // Special case; the nil object. Just decode directly, which + // will read nil from the wire (if encoded correctly). 
+ ds.decodeObject(ods, obj, encoded.Value) + return } - if obj.Kind() != reflect.Map { - // Set the obj to be the given typed value; this actually sets - // obj to be a non-zero value -- namely, it inserts type - // information. There's no need to do this for maps. - obj.Set(reflect.Zero(t)) + // We now need to resolve the actual type. + typ := ds.findType(encoded.Type) + + // We need to imbue type information here, then we can proceed to + // decode normally. In order to avoid issues with setting value-types, + // we create a new non-interface version of this object. We will then + // set the interface object to be equal to whatever we decode. + origObj := obj + obj = reflect.New(typ).Elem() + defer origObj.Set(obj) + + // With the object now having sufficient type information to actually + // have Set called on it, we can proceed to decode the value. + ds.decodeObject(ods, obj, encoded.Value) +} + +// isFloatEq determines if x and y represent the same value. +func isFloatEq(x float64, y float64) bool { + switch { + case math.IsNaN(x): + return math.IsNaN(y) + case math.IsInf(x, 1): + return math.IsInf(y, 1) + case math.IsInf(x, -1): + return math.IsInf(y, -1) + default: + return x == y } +} - // Decode the dereferenced element; there is no need to wait here, as - // the interface object shares the current object state. - ds.decodeObject(os, obj, i.Value, ".(%s)", i.Type) +// isComplexEq determines if x and y represent the same value. +func isComplexEq(x complex128, y complex128) bool { + return isFloatEq(real(x), real(y)) && isFloatEq(imag(x), imag(y)) } // decodeObject decodes a object value. 
-func (ds *decodeState) decodeObject(os *objectState, obj reflect.Value, object *pb.Object, format string, param interface{}) { - ds.push(false, format, param) - ds.stats.Add(obj) - ds.stats.Start(obj) - - switch x := object.GetValue().(type) { - case *pb.Object_BoolValue: - obj.SetBool(x.BoolValue) - case *pb.Object_StringValue: - obj.SetString(string(x.StringValue)) - case *pb.Object_Int64Value: - obj.SetInt(x.Int64Value) - if obj.Int() != x.Int64Value { - panic(fmt.Errorf("signed integer truncated in %v for %s", object, obj.Type())) +func (ds *decodeState) decodeObject(ods *objectDecodeState, obj reflect.Value, encoded wire.Object) { + switch x := encoded.(type) { + case wire.Nil: // Fast path: first. + // We leave obj alone here. That's because if obj represents an + // interface, it may have been imbued with type information in + // decodeInterface, and we don't want to destroy that. + case *wire.Ref: + // Nil pointers may be encoded in a "forceValue" context. For + // those we just leave it alone as the value will already be + // correct (nil). + if id := objectID(x.Root); id == 0 { + return } - case *pb.Object_Uint64Value: - obj.SetUint(x.Uint64Value) - if obj.Uint() != x.Uint64Value { - panic(fmt.Errorf("unsigned integer truncated in %v for %s", object, obj.Type())) - } - case *pb.Object_DoubleValue: - obj.SetFloat(x.DoubleValue) - if obj.Float() != x.DoubleValue { - panic(fmt.Errorf("float truncated in %v for %s", object, obj.Type())) - } - case *pb.Object_RefValue: - // Resolve the pointer itself, even though the object may not - // be decoded yet. You need to use wait() in order to ensure - // that is the case. See wait above, and Map.Barrier. - if id := x.RefValue; id != 0 { - // Decoding the interface should have imparted type - // information, so from this point it's safe to resolve - // and use this dynamic information for actually - // creating the object in register. - // - // (For non-interfaces this is a no-op). 
- dyntyp := reflect.TypeOf(obj.Interface()) - if dyntyp.Kind() == reflect.Map { - // Remove the map object count here to avoid - // double counting, as this object will be - // counted again when it gets processed later. - // We do not add a reference count as the - // reference is artificial. - ds.stats.Remove(obj) - obj.Set(ds.register(id, dyntyp).obj) - } else if dyntyp.Kind() == reflect.Ptr { - ds.push(true /* dereference */, "", nil) - obj.Set(ds.register(id, dyntyp.Elem()).obj.Addr()) - ds.pop() - } else { - obj.Set(ds.register(id, dyntyp.Elem()).obj.Addr()) + + // Note that if this is a map type, we go through a level of + // indirection to allow for map aliasing. + if obj.Kind() == reflect.Map { + v := ds.register(x, obj.Type()) + if v.IsNil() { + // Note that we don't want to clobber the map + // if has already been decoded by decodeMap. We + // just make it so that we have a consistent + // reference when that eventually does happen. + v.Set(reflect.MakeMap(v.Type())) } - } else { - // We leave obj alone here. That's because if obj - // represents an interface, it may have been embued - // with type information in decodeInterface, and we - // don't want to destroy that information. + obj.Set(v) + return } - case *pb.Object_SliceValue: - // It's okay to slice the array here, since the contents will - // still be provided later on. These semantics are a bit - // strange but they are handled in the Map.Barrier properly. - // - // The special semantics of zero ref apply here too. - if id := x.SliceValue.RefValue; id != 0 && x.SliceValue.Capacity > 0 { - v := reflect.ArrayOf(int(x.SliceValue.Capacity), obj.Type().Elem()) - obj.Set(ds.register(id, v).obj.Slice3(0, int(x.SliceValue.Length), int(x.SliceValue.Capacity))) + + // Normal assignment: authoritative only if no dots. 
+ v := ds.register(x, obj.Type().Elem()) + if v.IsValid() { + obj.Set(unsafePointerTo(v)) } - case *pb.Object_ArrayValue: - ds.decodeArray(os, obj, x.ArrayValue) - case *pb.Object_StructValue: - ds.decodeStruct(os, obj, x.StructValue) - case *pb.Object_MapValue: - ds.decodeMap(os, obj, x.MapValue) - case *pb.Object_InterfaceValue: - ds.decodeInterface(os, obj, x.InterfaceValue) - case *pb.Object_ByteArrayValue: - copyArray(obj, reflect.ValueOf(x.ByteArrayValue)) - case *pb.Object_Uint16ArrayValue: - // 16-bit slices are serialized as 32-bit slices. - // See object.proto for details. - s := x.Uint16ArrayValue.Values - t := obj.Slice(0, obj.Len()).Interface().([]uint16) - if len(t) != len(s) { - panic(fmt.Errorf("mismatching array length expect=%d, actual=%d", len(t), len(s))) + case wire.Bool: + obj.SetBool(bool(x)) + case wire.Int: + obj.SetInt(int64(x)) + if obj.Int() != int64(x) { + Failf("signed integer truncated from %v to %v", int64(x), obj.Int()) } - for i := range s { - t[i] = uint16(s[i]) + case wire.Uint: + obj.SetUint(uint64(x)) + if obj.Uint() != uint64(x) { + Failf("unsigned integer truncated from %v to %v", uint64(x), obj.Uint()) } - case *pb.Object_Uint32ArrayValue: - copyArray(obj, reflect.ValueOf(x.Uint32ArrayValue.Values)) - case *pb.Object_Uint64ArrayValue: - copyArray(obj, reflect.ValueOf(x.Uint64ArrayValue.Values)) - case *pb.Object_UintptrArrayValue: - copyArray(obj, castSlice(reflect.ValueOf(x.UintptrArrayValue.Values), reflect.TypeOf(uintptr(0)))) - case *pb.Object_Int8ArrayValue: - copyArray(obj, castSlice(reflect.ValueOf(x.Int8ArrayValue.Values), reflect.TypeOf(int8(0)))) - case *pb.Object_Int16ArrayValue: - // 16-bit slices are serialized as 32-bit slices. - // See object.proto for details. 
- s := x.Int16ArrayValue.Values - t := obj.Slice(0, obj.Len()).Interface().([]int16) - if len(t) != len(s) { - panic(fmt.Errorf("mismatching array length expect=%d, actual=%d", len(t), len(s))) + case wire.Float32: + obj.SetFloat(float64(x)) + case wire.Float64: + obj.SetFloat(float64(x)) + if !isFloatEq(obj.Float(), float64(x)) { + Failf("floating point number truncated from %v to %v", float64(x), obj.Float()) } - for i := range s { - t[i] = int16(s[i]) + case *wire.Complex64: + obj.SetComplex(complex128(*x)) + case *wire.Complex128: + obj.SetComplex(complex128(*x)) + if !isComplexEq(obj.Complex(), complex128(*x)) { + Failf("complex number truncated from %v to %v", complex128(*x), obj.Complex()) } - case *pb.Object_Int32ArrayValue: - copyArray(obj, reflect.ValueOf(x.Int32ArrayValue.Values)) - case *pb.Object_Int64ArrayValue: - copyArray(obj, reflect.ValueOf(x.Int64ArrayValue.Values)) - case *pb.Object_BoolArrayValue: - copyArray(obj, reflect.ValueOf(x.BoolArrayValue.Values)) - case *pb.Object_Float64ArrayValue: - copyArray(obj, reflect.ValueOf(x.Float64ArrayValue.Values)) - case *pb.Object_Float32ArrayValue: - copyArray(obj, reflect.ValueOf(x.Float32ArrayValue.Values)) + case *wire.String: + obj.SetString(string(*x)) + case *wire.Slice: + // See *wire.Ref above; same applies. + if id := objectID(x.Ref.Root); id == 0 { + return + } + // Note that it's fine to slice the array here and assume that + // contents will still be filled in later on. + typ := reflect.ArrayOf(int(x.Capacity), obj.Type().Elem()) // The object type. + v := ds.register(&x.Ref, typ) + obj.Set(v.Slice3(0, int(x.Length), int(x.Capacity))) + case *wire.Array: + ds.decodeArray(ods, obj, x) + case *wire.Struct: + ds.decodeStruct(ods, obj, x) + case *wire.Map: + ds.decodeMap(ods, obj, x) + case *wire.Interface: + ds.decodeInterface(ods, obj, x) default: // Shoud not happen, not propagated as an error. 
- panic(fmt.Sprintf("unknown object %v for %s", object, obj.Type())) - } - - ds.stats.Done() - ds.pop() -} - -func copyArray(dest reflect.Value, src reflect.Value) { - if dest.Len() != src.Len() { - panic(fmt.Errorf("mismatching array length expect=%d, actual=%d", dest.Len(), src.Len())) + Failf("unknown object %#v for %q", encoded, obj.Type().Name()) } - reflect.Copy(dest, castSlice(src, dest.Type().Elem())) } -// Deserialize deserializes the object state. +// Load deserializes the object graph rooted at obj. // // This function may panic and should be run in safely(). -func (ds *decodeState) Deserialize(obj reflect.Value) { - ds.objectsByID[1] = &objectState{id: 1, obj: obj, path: ds.recoverable.copy()} - ds.outstanding = 1 // The root object. +func (ds *decodeState) Load(obj reflect.Value) { + ds.stats.init() + defer ds.stats.fini(func(id typeID) string { + return ds.types.LookupName(id) + }) + + // Create the root object. + ds.objectsByID = append(ds.objectsByID, &objectDecodeState{ + id: 1, + obj: obj, + }) + + // Read the number of objects. + lastID, object, err := ReadHeader(ds.r) + if err != nil { + Failf("header error: %w", err) + } + if !object { + Failf("object missing") + } + + // Decode all objects. + var ( + encoded wire.Object + ods *objectDecodeState + id = objectID(1) + tid = typeID(1) + ) + if err := safely(func() { + // Decode all objects in the stream. + // + // Note that the structure of this decoding loop should match + // the raw decoding loop in printer.go. + for id <= objectID(lastID) { + // Unmarshal the object. + encoded = wire.Load(ds.r) + + // Is this a type object? Handle inline. + if wt, ok := encoded.(*wire.Type); ok { + ds.types.Register(wt) + tid++ + encoded = nil + continue + } - // Decode all objects in the stream. - // - // See above, we never process objects while we have no outstanding - // interests (other than the very first object). 
- for id := uint64(1); ds.outstanding > 0; id++ { - os := ds.lookup(id) - ds.stats.Start(os.obj) - - o, err := ds.readObject() - if err != nil { - panic(err) - } + // Actually resolve the object. + ods = ds.lookup(id) + if ods != nil { + // Decode the object. + ds.decodeObject(ods, ods.obj, encoded) + } else { + // If an object hasn't had interest registered + // previously or isn't yet valid, we deferred + // decoding until interest is registered. + ds.deferred[id] = encoded + } - if os != nil { - // Decode the object. - ds.from = &os.path - ds.decodeObject(os, os.obj, o, "", nil) - ds.outstanding-- + // For error handling. + ods = nil + encoded = nil + id++ + } + }); err != nil { + // Include as much information as we can, taking into account + // the possible state transitions above. + if ods != nil { + Failf("error decoding object ID %d (%T) from %#v: %w", id, ods.obj.Interface(), encoded, err) + } else if encoded != nil { + Failf("lookup error decoding object ID %d from %#v: %w", id, encoded, err) } else { - // If an object hasn't had interest registered - // previously, we deferred decoding until interest is - // registered. - ds.deferred[id] = o + Failf("general decoding error: %w", err) } - - ds.stats.Done() - } - - // Check the zero-length header at the end. - length, object, err := ReadHeader(ds.r) - if err != nil { - panic(err) - } - if length != 0 { - panic(fmt.Sprintf("expected zero-length terminal, got %d", length)) - } - if object { - panic("expected non-object terminal") } // Check if we have any deferred objects. - if count := len(ds.deferred); count > 0 { - // Shoud not happen, not propagated as an error. - panic(fmt.Sprintf("still have %d deferred objects", count)) - } - - // Scan and fire all callbacks. - for _, os := range ds.objectsByID { - os.checkComplete(ds.stats) + for id, encoded := range ds.deferred { + // Shoud never happen, the graph was bogus. 
+ Failf("still have deferred objects: one is ID %d, %#v", id, encoded) } - // Check if we have any remaining dependency cycles. - for _, os := range ds.objectsByID { - if !os.complete() { - // This must be the result of a dependency cycle. - cycle := os.findCycle() - var buf bytes.Buffer - buf.WriteString("dependency cycle: {") - for i, cycleOS := range cycle { - if i > 0 { - buf.WriteString(" => ") + // Scan and fire all callbacks. We iterate over the list of incomplete + // objects until all have been finished. We stop iterating if no + // objects become complete (there is a dependency cycle). + // + // Note that we iterate backwards here, because there will be a strong + // tendency for blocking relationships to go from earlier objects to + // later (deeper) objects in the graph. This will reduce the number of + // iterations required to finish all objects. + if err := safely(func() { + for ds.pending.Back() != nil { + thisCycle := false + for ods = ds.pending.Back(); ods != nil; { + if ds.checkComplete(ods) { + thisCycle = true + break } - buf.WriteString(fmt.Sprintf("%s", cycleOS.obj.Type())) + ods = ods.Prev() + } + if !thisCycle { + break } - buf.WriteString("}") - // Panic as an error; propagate to the caller. - panic(errors.New(string(buf.Bytes()))) } - } -} - -type byteReader struct { - io.Reader -} - -// ReadByte implements io.ByteReader. -func (br byteReader) ReadByte() (byte, error) { - var b [1]byte - n, err := br.Reader.Read(b[:]) - if n > 0 { - return b[0], nil - } else if err != nil { - return 0, err - } else { - return 0, io.ErrUnexpectedEOF + }); err != nil { + Failf("error executing callbacks for %#v: %w", ods.obj.Interface(), err) + } + + // Check if we have any remaining dependency cycles. If there are any + // objects left in the pending list, then it must be due to a cycle. + if ods := ds.pending.Front(); ods != nil { + // This must be the result of a dependency cycle.
+ cycle := ods.findCycle() + var buf bytes.Buffer + buf.WriteString("dependency cycle: {") + for i, cycleOS := range cycle { + if i > 0 { + buf.WriteString(" => ") + } + fmt.Fprintf(&buf, "%q", cycleOS.obj.Type()) + } + buf.WriteString("}") + Failf("incomplete graph: %s", string(buf.Bytes())) } } @@ -565,45 +706,20 @@ func (br byteReader) ReadByte() (byte, error) { // Each object written to the statefile is prefixed with a header. See // WriteHeader for more information; these functions are exported to allow // non-state writes to the file to play nice with debugging tools. -func ReadHeader(r io.Reader) (length uint64, object bool, err error) { +func ReadHeader(r wire.Reader) (length uint64, object bool, err error) { // Read the header. - length, err = binary.ReadUvarint(byteReader{r}) + err = safely(func() { + length = wire.LoadUint(r) + }) if err != nil { - return + // On the header, pass raw I/O errors. + if sErr, ok := err.(*ErrState); ok { + return 0, false, sErr.Unwrap() + } } // Decode whether the object is valid. - object = length&0x1 != 0 - length = length >> 1 + object = length&objectFlag != 0 + length &^= objectFlag return } - -// readObject reads an object from the stream. -func (ds *decodeState) readObject() (*pb.Object, error) { - // Read the header. - length, object, err := ReadHeader(ds.r) - if err != nil { - return nil, err - } - if !object { - return nil, fmt.Errorf("invalid object header") - } - - // Read the object. - buf := make([]byte, length) - for done := 0; done < len(buf); { - n, err := ds.r.Read(buf[done:]) - done += n - if n == 0 && err != nil { - return nil, err - } - } - - // Unmarshal. - obj := new(pb.Object) - if err := proto.Unmarshal(buf, obj); err != nil { - return nil, err - } - - return obj, nil -} diff --git a/pkg/state/decode_unsafe.go b/pkg/state/decode_unsafe.go new file mode 100644 index 000000000..d048f61a1 --- /dev/null +++ b/pkg/state/decode_unsafe.go @@ -0,0 +1,27 @@ +// Copyright 2020 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package state + +import ( + "reflect" + "unsafe" +) + +// unsafePointerTo is logically equivalent to reflect.Value.Addr, but works on +// values representing unexported fields. This bypasses visibility, but not +// type safety. +func unsafePointerTo(obj reflect.Value) reflect.Value { + return reflect.NewAt(obj.Type(), unsafe.Pointer(obj.UnsafeAddr())) +} diff --git a/pkg/state/encode.go b/pkg/state/encode.go index c5118d3a9..92fcad4e9 100644 --- a/pkg/state/encode.go +++ b/pkg/state/encode.go @@ -15,437 +15,797 @@ package state import ( - "container/list" "context" - "encoding/binary" - "fmt" - "io" "reflect" - "sort" - "github.com/golang/protobuf/proto" - pb "gvisor.dev/gvisor/pkg/state/object_go_proto" + "gvisor.dev/gvisor/pkg/state/wire" ) -// queuedObject is an object queued for encoding. -type queuedObject struct { - id uint64 - obj reflect.Value - path recoverable +// objectEncodeState holds the type and identity of an object occupying a memory +// address range. This is the value type for addrSet, and the intrusive entry +// for the pending and deferred lists. +type objectEncodeState struct { + // id is the assigned ID for this object. + id objectID + + // obj is the object value. Note that this may be replaced if we + // encounter an object that contains this object. When this happens (in + // resolve), we will update existing references appropriately, below, + // and defer a re-encoding of the object.
+ obj reflect.Value + + // encoded is the encoded value of this object. Note that this may not + // be up to date if this object is still in the deferred list. + encoded wire.Object + + // how indicates whether this object should be encoded as a value. This + // is used only for deferred encoding. + how encodeStrategy + + // refs are the list of reference objects used by other objects + // referring to this object. When the object is updated, these + // references may be updated directly and automatically. + refs []*wire.Ref + + pendingEntry + deferredEntry } // encodeState is state used for encoding. // -// The encoding process is a breadth-first traversal of the object graph. The -// inherent races and dependencies are much simpler than the decode case. +// The encoding process constructs a representation of the in-memory graph of +// objects before a single object is serialized. This is done to ensure that +// all references can be fully disambiguated. See resolve for more details. type encodeState struct { // ctx is the encode context. ctx context.Context - // lastID is the last object ID. - // - // See idsByObject for context. Because of the special zero encoding - // used for reference values, the first ID must be 1. - lastID uint64 + // w is the output stream. + w wire.Writer - // idsByObject is a set of objects, indexed via: - // - // reflect.ValueOf(x).UnsafeAddr - // - // This provides IDs for objects. - idsByObject map[uintptr]uint64 + // types is the type database. + types typeEncodeDatabase + + // lastID is the last allocated object ID. + lastID objectID - // values stores values that span the addresses. + // values tracks the address ranges occupied by objects, along with the + // types of these objects. This is used to locate pointer targets, + // including pointers to fields within another type. // - // addrSet is a a generated type which efficiently stores ranges of - // addresses. 
When encoding pointers, these ranges are filled in and - // used to check for overlapping or conflicting pointers. This would - // indicate a pointer to an field, or a non-type safe value, neither of - // which are currently decodable. + // Multiple objects may overlap in memory iff the larger object fully + // contains the smaller one, and the type of the smaller object matches + // a field or array element's type at the appropriate offset. An + // arbitrary number of objects may be nested in this manner. // - // See the usage of values below for more context. + // Note that this does not track zero-sized objects, those are tracked + // by zeroValues below. values addrSet - // w is the output stream. - w io.Writer + // zeroValues tracks zero-sized objects. + zeroValues map[reflect.Type]*objectEncodeState - // pending is the list of objects to be serialized. - // - // This is a set of queuedObjects. - pending list.List + // deferred is the list of objects to be encoded. + deferred deferredList - // done is the a list of finished objects. - // - // This is kept to prevent garbage collection and address reuse. - done list.List + // pendingTypes is the list of types to be serialized. Serialization + // will occur when all objects have been encoded, but before pending is + // serialized. + pendingTypes []wire.Type - // stats is the passed stats object. - stats *Stats + // pending is the list of objects to be serialized. Serialization does + // not actually occur until the full object graph is computed. + pending pendingList - // recoverable is the panic recover facility. - recoverable + // stats tracks time data. + stats Stats } -// register looks up an ID, registering if necessary. +// isSameSizeParent returns true if child is a field value or element within +// parent. Only a struct or array can have a child value. +// +// isSameSizeParent deals with objects like this: +// +// struct child { +// // fields.. 
+// } // -// If the object was not previously registered, it is enqueued to be serialized. -// See the documentation for idsByObject for more information. -func (es *encodeState) register(obj reflect.Value) uint64 { - // It is not legal to call register for any non-pointer objects (see - // below), so we panic with a recoverable error if this is a mismatch. - if obj.Kind() != reflect.Ptr && obj.Kind() != reflect.Map { - panic(fmt.Errorf("non-pointer %#v registered", obj.Interface())) +// struct parent { +// c child +// } +// +// var p parent +// record(&p.c) +// +// Here, &p and &p.c occupy the exact same address range. +// +// Or like this: +// +// struct child { +// // fields +// } +// +// var arr [1]parent +// record(&arr[0]) +// +// Similarly, &arr[0] and &arr[0].c have the exact same address range. +// +// Precondition: parent and child must occupy the same memory. +func isSameSizeParent(parent reflect.Value, childType reflect.Type) bool { + switch parent.Kind() { + case reflect.Struct: + for i := 0; i < parent.NumField(); i++ { + field := parent.Field(i) + if field.Type() == childType { + return true + } + // Recurse through any intermediate types. + if isSameSizeParent(field, childType) { + return true + } + // Does it make sense to keep going if the first field + // doesn't match? Yes, because there might be an + // arbitrary number of zero-sized fields before we get + // a match, and childType itself can be zero-sized. + } + return false + case reflect.Array: + // The only case where an array with more than one elements can + // return true is if childType is zero-sized. In such cases, + // it's ambiguous which element contains the match since a + // zero-sized child object fully fits in any of the zero-sized + // elements in an array... However since all elements are of + // the same type, we only need to check one element. 
+ // + // For non-zero-sized childTypes, parent.Len() must be 1, but a + // combination of the precondition and an implicit comparison + // between the array element size and childType ensures this. + return parent.Len() > 0 && isSameSizeParent(parent.Index(0), childType) + default: + return false } +} - addr := obj.Pointer() - if obj.Kind() == reflect.Ptr && obj.Elem().Type().Size() == 0 { - // For zero-sized objects, we always provide a unique ID. - // That's because the runtime internally multiplexes pointers - // to the same address. We can't be certain what the intent is - // with pointers to zero-sized objects, so we just give them - // all unique identities. - } else if id, ok := es.idsByObject[addr]; ok { - // Already registered. - return id - } - - // Ensure that the first ID given out is one. See note on lastID. The - // ID zero is used to indicate nil values. +// nextID returns the next valid ID. +func (es *encodeState) nextID() objectID { es.lastID++ - id := es.lastID - es.idsByObject[addr] = id - if obj.Kind() == reflect.Ptr { - // Dereference and treat as a pointer. - es.pending.PushBack(queuedObject{id: id, obj: obj.Elem(), path: es.recoverable.copy()}) - - // Register this object at all addresses. - typ := obj.Elem().Type() - if size := typ.Size(); size > 0 { - r := addrRange{addr, addr + size} - if !es.values.IsEmptyRange(r) { - old := es.values.LowerBoundSegment(addr).Value().Interface().(recoverable) - panic(fmt.Errorf("overlapping objects: [new object] %#v [existing object path] %s", obj.Interface(), old.path())) + return objectID(es.lastID) +} + +// dummyAddr points to the dummy zero-sized address. +var dummyAddr = reflect.ValueOf(new(struct{})).Pointer() + +// resolve records the address range occupied by an object. +func (es *encodeState) resolve(obj reflect.Value, ref *wire.Ref) { + addr := obj.Pointer() + + // Is this a map pointer? Just record the single address. It is not + // possible to take any pointers into the map internals. 
+ if obj.Kind() == reflect.Map { + if addr == 0 { + // Just leave the nil reference alone. This is fine, we + // may need to encode as a reference in this way. We + // return nil for our objectEncodeState so that anyone + // depending on this value knows there's nothing there. + return + } + if seg, _ := es.values.Find(addr); seg.Ok() { + // Ensure the map types match. + existing := seg.Value() + if existing.obj.Type() != obj.Type() { + Failf("overlapping map objects at 0x%x: [new object] %#v [existing object type] %s", addr, obj, existing.obj) } - es.values.Add(r, reflect.ValueOf(es.recoverable.copy())) + + // No sense recording refs, maps may not be replaced by + // covering objects, they are maximal. + ref.Root = wire.Uint(existing.id) + return } + + // Record the map. + oes := &objectEncodeState{ + id: es.nextID(), + obj: obj, + how: encodeMapAsValue, + } + es.values.Add(addrRange{addr, addr + 1}, oes) + es.pending.PushBack(oes) + es.deferred.PushBack(oes) + + // See above: no ref recording. + ref.Root = wire.Uint(oes.id) + return + } + + // If not a map, then the object must be a pointer. + if obj.Kind() != reflect.Ptr { + Failf("attempt to record non-map and non-pointer object %#v", obj) + } + + obj = obj.Elem() // Value from here. + + // Is this a zero-sized type? + typ := obj.Type() + size := typ.Size() + if size == 0 { + if addr == dummyAddr { + // Zero-sized objects point to a dummy byte within the + // runtime. There's no sense recording this in the + // address map. We add this to the dedicated + // zeroValues. + // + // Note that zero-sized objects must be *true* + // zero-sized objects. They cannot be part of some + // larger object. In that case, they are assigned a + // 1-byte address at the end of the object. 
+ oes, ok := es.zeroValues[typ] + if !ok { + oes = &objectEncodeState{ + id: es.nextID(), + obj: obj, + } + es.zeroValues[typ] = oes + es.pending.PushBack(oes) + es.deferred.PushBack(oes) + } + + // There's also no sense tracking back references. We + // know that this is a true zero-sized object, and not + // part of a larger container, so it will not change. + ref.Root = wire.Uint(oes.id) + return + } + size = 1 // See above. + } + + // Calculate the container. + end := addr + size + r := addrRange{addr, end} + if seg, _ := es.values.Find(addr); seg.Ok() { + existing := seg.Value() + switch { + case seg.Start() == addr && seg.End() == end && obj.Type() == existing.obj.Type(): + // The object is a perfect match. Happy path. Avoid the + // traversal and just return directly. We don't need to + // encode the type information or any dots here. + ref.Root = wire.Uint(existing.id) + existing.refs = append(existing.refs, ref) + return + + case (seg.Start() < addr && seg.End() >= end) || (seg.Start() <= addr && seg.End() > end): + // The previously registered object is larger than + // this, no need to update. But we expect some + // traversal below. + + case seg.Start() == addr && seg.End() == end: + if !isSameSizeParent(obj, existing.obj.Type()) { + break // Needs traversal. + } + fallthrough // Needs update. + + case (seg.Start() > addr && seg.End() <= end) || (seg.Start() >= addr && seg.End() < end): + // Update the object and redo the encoding. + old := existing.obj + existing.obj = obj + es.deferred.Remove(existing) + es.deferred.PushBack(existing) + + // The previously registered object is superseded by + // this new object. We are guaranteed to not have any + // mergeable neighbours in this segment set. + if !raceEnabled { + seg.SetRangeUnchecked(r) + } else { + // Add extra paranoid. This will be statically + // removed at compile time unless a race build. 
+ es.values.Remove(seg) + es.values.Add(r, existing) + seg = es.values.LowerBoundSegment(addr) + } + + // Compute the traversal required & update references. + dots := traverse(obj.Type(), old.Type(), addr, seg.Start()) + wt := es.findType(obj.Type()) + for _, ref := range existing.refs { + ref.Dots = append(ref.Dots, dots...) + ref.Type = wt + } + default: + // There is a non-sensical overlap. + Failf("overlapping objects: [new object] %#v [existing object] %#v", obj, existing.obj) + } + + // Compute the new reference, record and return it. + ref.Root = wire.Uint(existing.id) + ref.Dots = traverse(existing.obj.Type(), obj.Type(), seg.Start(), addr) + ref.Type = es.findType(obj.Type()) + existing.refs = append(existing.refs, ref) + return + } + + // The only remaining case is a pointer value that doesn't overlap with + // any registered addresses. Create a new entry for it, and start + // tracking the first reference we just created. + oes := &objectEncodeState{ + id: es.nextID(), + obj: obj, + } + if !raceEnabled { + es.values.AddWithoutMerging(r, oes) } else { - // Push back the map itself; when maps are encoded from the - // top-level, forceMap will be equal to true. - es.pending.PushBack(queuedObject{id: id, obj: obj, path: es.recoverable.copy()}) + // Merges should never happen. This is just enabled extra + // sanity checks because the Merge function below will panic. + es.values.Add(r, oes) + } + es.pending.PushBack(oes) + es.deferred.PushBack(oes) + ref.Root = wire.Uint(oes.id) + oes.refs = append(oes.refs, ref) +} + +// traverse searches for a target object within a root object, where the target +// object is a struct field or array element within root, with potentially +// multiple intervening types. traverse returns the set of field or element +// traversals required to reach the target. +// +// Note that for efficiency, traverse returns the dots in the reverse order. +// That is, the first traversal required will be the last element of the list. 
+// +// Precondition: The target object must lie completely within the range defined +// by [rootAddr, rootAddr + sizeof(rootType)]. +func traverse(rootType, targetType reflect.Type, rootAddr, targetAddr uintptr) []wire.Dot { + // Recursion base case: the types actually match. + if targetType == rootType && targetAddr == rootAddr { + return nil } - return id + switch rootType.Kind() { + case reflect.Struct: + offset := targetAddr - rootAddr + for i := rootType.NumField(); i > 0; i-- { + field := rootType.Field(i - 1) + // The first field from the end with an offset that is + // smaller than or equal to our address offset is where + // the target is located. Traverse from there. + if field.Offset <= offset { + dots := traverse(field.Type, targetType, rootAddr+field.Offset, targetAddr) + fieldName := wire.FieldName(field.Name) + return append(dots, &fieldName) + } + } + // Should never happen; the target should be reachable. + Failf("no field in root type %v contains target type %v", rootType, targetType) + + case reflect.Array: + // Since arrays have homogeneous types, all elements have the + // same size and we can compute where the target lives. This + // does not matter for the purpose of typing, but matters for + // the purpose of computing the address of the given index. + elemSize := int(rootType.Elem().Size()) + n := int(targetAddr-rootAddr) / elemSize // Relies on integer division rounding down. + if rootType.Len() < n { + Failf("traversal target of type %v @%x is beyond the end of the array type %v @%x with %v elements", + targetType, targetAddr, rootType, rootAddr, rootType.Len()) + } + dots := traverse(rootType.Elem(), targetType, rootAddr+uintptr(n*elemSize), targetAddr) + return append(dots, wire.Index(n)) + + default: + // For any other type, there's no possibility of aliasing so if + // the types didn't match earlier then we have an address + // collision which shouldn't be possible at this point.
+ Failf("traverse failed for root type %v and target type %v", rootType, targetType) + } + panic("unreachable") } // encodeMap encodes a map. -func (es *encodeState) encodeMap(obj reflect.Value) *pb.Map { - var ( - keys []*pb.Object - values []*pb.Object - ) +func (es *encodeState) encodeMap(obj reflect.Value, dest *wire.Object) { + if obj.IsNil() { + // Because there is a difference between a nil map and an empty + // map, we need to not decode in the case of a truly nil map. + *dest = wire.Nil{} + return + } + l := obj.Len() + m := &wire.Map{ + Keys: make([]wire.Object, l), + Values: make([]wire.Object, l), + } + *dest = m for i, k := range obj.MapKeys() { v := obj.MapIndex(k) - kp := es.encodeObject(k, false, ".(key %d)", i) - vp := es.encodeObject(v, false, "[%#v]", k.Interface()) - keys = append(keys, kp) - values = append(values, vp) + // Map keys must be encoded using the full value because the + // type will be omitted after the first key. + es.encodeObject(k, encodeAsValue, &m.Keys[i]) + es.encodeObject(v, encodeAsValue, &m.Values[i]) } - return &pb.Map{Keys: keys, Values: values} +} + +// objectEncoder is for encoding structs. +type objectEncoder struct { + // es is encodeState. + es *encodeState + + // encoded is the encoded struct. + encoded *wire.Struct +} + +// save is called by the public methods on Sink. +func (oe *objectEncoder) save(slot int, obj reflect.Value) { + fieldValue := oe.encoded.Field(slot) + oe.es.encodeObject(obj, encodeDefault, fieldValue) } // encodeStruct encodes a composite object. -func (es *encodeState) encodeStruct(obj reflect.Value) *pb.Struct { - // Invoke the save. - m := Map{newInternalMap(es, nil, nil)} - defer internalMapPool.Put(m.internalMap) +func (es *encodeState) encodeStruct(obj reflect.Value, dest *wire.Object) { + // Ensure that the obj is addressable. There are two cases when it is + // not. First, is when this is dispatched via SaveValue. Second, when + // this is a map key as a struct. 
Either way, we need to make a copy to + // obtain an addressable value. if !obj.CanAddr() { - // Force it to a * type of the above; this involves a copy. localObj := reflect.New(obj.Type()) localObj.Elem().Set(obj) obj = localObj.Elem() } - fns, ok := registeredTypes.lookupFns(obj.Addr().Type()) - if ok { - // Invoke the provided saver. - fns.invokeSave(obj.Addr(), m) - } else if obj.NumField() == 0 { - // Allow unregistered anonymous, empty structs. - return &pb.Struct{} - } else { - // Propagate an error. - panic(fmt.Errorf("unregistered type %T", obj.Interface())) - } - - // Sort the underlying slice, and check for duplicates. This is done - // once instead of on each add, because performing this sort once is - // far more efficient. - if len(m.data) > 1 { - sort.Slice(m.data, func(i, j int) bool { - return m.data[i].name < m.data[j].name - }) - for i := range m.data { - if i > 0 && m.data[i-1].name == m.data[i].name { - panic(fmt.Errorf("duplicate name %s", m.data[i].name)) - } + + // Prepare the value. + s := &wire.Struct{} + *dest = s + + // Look the type up in the database. + te, ok := es.types.Lookup(obj.Type()) + if te == nil { + if obj.NumField() == 0 { + // Allow unregistered anonymous, empty structs. This + // will just return success without ever invoking the + // passed function. This uses the immutable EmptyStruct + // variable to prevent an allocation in this case. + // + // Note that this mechanism does *not* work for + // interfaces in general. So you can't dispatch + // non-registered empty structs via interfaces because + // then they can't be restored. + s.Alloc(0) + return } + // We need a SaverLoader for struct types. + Failf("struct %T does not implement SaverLoader", obj.Interface()) } - - // Encode the resulting fields. - fields := make([]*pb.Field, 0, len(m.data)) - for _, e := range m.data { - fields = append(fields, &pb.Field{ - Name: e.name, - Value: e.object, - }) + if !ok { + // Queue the type to be serialized. 
+ es.pendingTypes = append(es.pendingTypes, te.Type) } - // Return the encoded object. - return &pb.Struct{Fields: fields} + // Invoke the provided saver. + s.TypeID = wire.TypeID(te.ID) + s.Alloc(len(te.Fields)) + oe := objectEncoder{ + es: es, + encoded: s, + } + es.stats.start(te.ID) + defer es.stats.done() + if sl, ok := obj.Addr().Interface().(SaverLoader); ok { + // Note: may be a registered empty struct which does not + // implement the saver/loader interfaces. + sl.StateSave(Sink{internal: oe}) + } } // encodeArray encodes an array. -func (es *encodeState) encodeArray(obj reflect.Value) *pb.Array { - var ( - contents []*pb.Object - ) - for i := 0; i < obj.Len(); i++ { - entry := es.encodeObject(obj.Index(i), false, "[%d]", i) - contents = append(contents, entry) - } - return &pb.Array{Contents: contents} +func (es *encodeState) encodeArray(obj reflect.Value, dest *wire.Object) { + l := obj.Len() + a := &wire.Array{ + Contents: make([]wire.Object, l), + } + *dest = a + for i := 0; i < l; i++ { + // We need to encode the full value because arrays are encoded + // using the type information from only the first element. + es.encodeObject(obj.Index(i), encodeAsValue, &a.Contents[i]) + } +} + +// findType recursively finds type information. +func (es *encodeState) findType(typ reflect.Type) wire.TypeSpec { + // First: check if this is a proper type. It's possible for pointers, + // slices, arrays, maps, etc to all have some different type. + te, ok := es.types.Lookup(typ) + if te != nil { + if !ok { + // See encodeStruct. 
+ es.pendingTypes = append(es.pendingTypes, te.Type) + } + return wire.TypeID(te.ID) + } + + switch typ.Kind() { + case reflect.Ptr: + return &wire.TypeSpecPointer{ + Type: es.findType(typ.Elem()), + } + case reflect.Slice: + return &wire.TypeSpecSlice{ + Type: es.findType(typ.Elem()), + } + case reflect.Array: + return &wire.TypeSpecArray{ + Count: wire.Uint(typ.Len()), + Type: es.findType(typ.Elem()), + } + case reflect.Map: + return &wire.TypeSpecMap{ + Key: es.findType(typ.Key()), + Value: es.findType(typ.Elem()), + } + default: + // After potentially chasing many pointers, the + // ultimate type of the object is not known. + Failf("type %q is not known", typ) + } + panic("unreachable") } // encodeInterface encodes an interface. -// -// Precondition: the value is not nil. -func (es *encodeState) encodeInterface(obj reflect.Value) *pb.Interface { - // Check for the nil interface. - obj = reflect.ValueOf(obj.Interface()) +func (es *encodeState) encodeInterface(obj reflect.Value, dest *wire.Object) { + // Dereference the object. + obj = obj.Elem() if !obj.IsValid() { - return &pb.Interface{ - Type: "", // left alone in decode. - Value: &pb.Object{Value: &pb.Object_RefValue{0}}, + // Special case: the nil object. + *dest = &wire.Interface{ + Type: wire.TypeSpecNil{}, + Value: wire.Nil{}, } + return } - // We have an interface value here. How do we save that? We - // resolve the underlying type and save it as a dispatchable. - typName, ok := registeredTypes.lookupName(obj.Type()) - if !ok { - panic(fmt.Errorf("type %s is not registered", obj.Type())) + + // Encode underlying object. + i := &wire.Interface{ + Type: es.findType(obj.Type()), } + *dest = i + es.encodeObject(obj, encodeAsValue, &i.Value) +} - // Encode the object again. - return &pb.Interface{ - Type: typName, - Value: es.encodeObject(obj, false, ".(%s)", typName), +// isPrimitiveZero returns true if this is a primitive object, or a composite +// object composed entirely of primitives.
+func isPrimitiveZero(typ reflect.Type) bool { + switch typ.Kind() { + case reflect.Ptr: + // Pointers are always treated as primitive types because we + // won't encode directly from here. Returning true here won't + // prevent the object from being encoded correctly. + return true + case reflect.Bool: + return true + case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: + return true + case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr: + return true + case reflect.Float32, reflect.Float64: + return true + case reflect.Complex64, reflect.Complex128: + return true + case reflect.String: + return true + case reflect.Slice: + // The slice itself a primitive, but not necessarily the array + // that points to. This is similar to a pointer. + return true + case reflect.Array: + // We cannot treat an array as a primitive, because it may be + // composed of structures or other things with side-effects. + return isPrimitiveZero(typ.Elem()) + case reflect.Interface: + // Since we now that this type is the zero type, the interface + // value must be zero. Therefore this is primitive. + return true + case reflect.Struct: + return false + case reflect.Map: + // The isPrimitiveZero function is called only on zero-types to + // see if it's safe to serialize. Since a zero map has no + // elements, it is safe to treat as a primitive. + return true + default: + Failf("unknown type %q", typ.Name()) } + panic("unreachable") } -// encodeObject encodes an object. -// -// If mapAsValue is true, then a map will be encoded directly. -func (es *encodeState) encodeObject(obj reflect.Value, mapAsValue bool, format string, param interface{}) (object *pb.Object) { - es.push(false, format, param) - es.stats.Add(obj) - es.stats.Start(obj) +// encodeStrategy is the strategy used for encodeObject. +type encodeStrategy int +const ( + // encodeDefault means types are encoded normally as references. 
+ encodeDefault encodeStrategy = iota + + // encodeAsValue means that types will never take short-circuited and + // will always be encoded as a normal value. + encodeAsValue + + // encodeMapAsValue means that even maps will be fully encoded. + encodeMapAsValue +) + +// encodeObject encodes an object. +func (es *encodeState) encodeObject(obj reflect.Value, how encodeStrategy, dest *wire.Object) { + if how == encodeDefault && isPrimitiveZero(obj.Type()) && obj.IsZero() { + *dest = wire.Nil{} + return + } switch obj.Kind() { + case reflect.Ptr: // Fast path: first. + r := new(wire.Ref) + *dest = r + if obj.IsNil() { + // May be in an array or elsewhere such that a value is + // required. So we encode as a reference to the zero + // object, which does not exist. Note that this has to + // be handled correctly in the decode path as well. + return + } + es.resolve(obj, r) case reflect.Bool: - object = &pb.Object{Value: &pb.Object_BoolValue{obj.Bool()}} + *dest = wire.Bool(obj.Bool()) case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: - object = &pb.Object{Value: &pb.Object_Int64Value{obj.Int()}} + *dest = wire.Int(obj.Int()) case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr: - object = &pb.Object{Value: &pb.Object_Uint64Value{obj.Uint()}} - case reflect.Float32, reflect.Float64: - object = &pb.Object{Value: &pb.Object_DoubleValue{obj.Float()}} + *dest = wire.Uint(obj.Uint()) + case reflect.Float32: + *dest = wire.Float32(obj.Float()) + case reflect.Float64: + *dest = wire.Float64(obj.Float()) + case reflect.Complex64: + c := wire.Complex64(obj.Complex()) + *dest = &c // Needs alloc. + case reflect.Complex128: + c := wire.Complex128(obj.Complex()) + *dest = &c // Needs alloc. + case reflect.String: + s := wire.String(obj.String()) + *dest = &s // Needs alloc. 
case reflect.Array: - switch obj.Type().Elem().Kind() { - case reflect.Uint8: - object = &pb.Object{Value: &pb.Object_ByteArrayValue{pbSlice(obj).Interface().([]byte)}} - case reflect.Uint16: - // 16-bit slices are serialized as 32-bit slices. - // See object.proto for details. - s := pbSlice(obj).Interface().([]uint16) - t := make([]uint32, len(s)) - for i := range s { - t[i] = uint32(s[i]) - } - object = &pb.Object{Value: &pb.Object_Uint16ArrayValue{&pb.Uint16S{Values: t}}} - case reflect.Uint32: - object = &pb.Object{Value: &pb.Object_Uint32ArrayValue{&pb.Uint32S{Values: pbSlice(obj).Interface().([]uint32)}}} - case reflect.Uint64: - object = &pb.Object{Value: &pb.Object_Uint64ArrayValue{&pb.Uint64S{Values: pbSlice(obj).Interface().([]uint64)}}} - case reflect.Uintptr: - object = &pb.Object{Value: &pb.Object_UintptrArrayValue{&pb.Uintptrs{Values: pbSlice(obj).Interface().([]uint64)}}} - case reflect.Int8: - object = &pb.Object{Value: &pb.Object_Int8ArrayValue{&pb.Int8S{Values: pbSlice(obj).Interface().([]byte)}}} - case reflect.Int16: - // 16-bit slices are serialized as 32-bit slices. - // See object.proto for details. 
- s := pbSlice(obj).Interface().([]int16) - t := make([]int32, len(s)) - for i := range s { - t[i] = int32(s[i]) - } - object = &pb.Object{Value: &pb.Object_Int16ArrayValue{&pb.Int16S{Values: t}}} - case reflect.Int32: - object = &pb.Object{Value: &pb.Object_Int32ArrayValue{&pb.Int32S{Values: pbSlice(obj).Interface().([]int32)}}} - case reflect.Int64: - object = &pb.Object{Value: &pb.Object_Int64ArrayValue{&pb.Int64S{Values: pbSlice(obj).Interface().([]int64)}}} - case reflect.Bool: - object = &pb.Object{Value: &pb.Object_BoolArrayValue{&pb.Bools{Values: pbSlice(obj).Interface().([]bool)}}} - case reflect.Float32: - object = &pb.Object{Value: &pb.Object_Float32ArrayValue{&pb.Float32S{Values: pbSlice(obj).Interface().([]float32)}}} - case reflect.Float64: - object = &pb.Object{Value: &pb.Object_Float64ArrayValue{&pb.Float64S{Values: pbSlice(obj).Interface().([]float64)}}} - default: - object = &pb.Object{Value: &pb.Object_ArrayValue{es.encodeArray(obj)}} - } + es.encodeArray(obj, dest) case reflect.Slice: - if obj.IsNil() || obj.Cap() == 0 { - // Handled specially in decode; store as nil value. - object = &pb.Object{Value: &pb.Object_RefValue{0}} - } else { - // Serialize a slice as the array plus length and capacity. - object = &pb.Object{Value: &pb.Object_SliceValue{&pb.Slice{ - Capacity: uint32(obj.Cap()), - Length: uint32(obj.Len()), - RefValue: es.register(arrayFromSlice(obj)), - }}} + s := &wire.Slice{ + Capacity: wire.Uint(obj.Cap()), + Length: wire.Uint(obj.Len()), } - case reflect.String: - object = &pb.Object{Value: &pb.Object_StringValue{[]byte(obj.String())}} - case reflect.Ptr: + *dest = s + // Note that we do need to provide a wire.Slice type here as + // how is not encodeDefault. If this were the case, then it + // would have been caught by the IsZero check above and we + // would have just used wire.Nil{}. if obj.IsNil() { - // Handled specially in decode; store as a nil value. 
- object = &pb.Object{Value: &pb.Object_RefValue{0}} - } else { - es.push(true /* dereference */, "", nil) - object = &pb.Object{Value: &pb.Object_RefValue{es.register(obj)}} - es.pop() + return } + // Slices need pointer resolution. + es.resolve(arrayFromSlice(obj), &s.Ref) case reflect.Interface: - // We don't check for IsNil here, as we want to encode type - // information. The case of the empty interface (no type, no - // value) is handled by encodeInteface. - object = &pb.Object{Value: &pb.Object_InterfaceValue{es.encodeInterface(obj)}} + es.encodeInterface(obj, dest) case reflect.Struct: - object = &pb.Object{Value: &pb.Object_StructValue{es.encodeStruct(obj)}} + es.encodeStruct(obj, dest) case reflect.Map: - if obj.IsNil() { - // Handled specially in decode; store as a nil value. - object = &pb.Object{Value: &pb.Object_RefValue{0}} - } else if mapAsValue { - // Encode the map directly. - object = &pb.Object{Value: &pb.Object_MapValue{es.encodeMap(obj)}} - } else { - // Encode a reference to the map. - // - // Remove the map object count here to avoid double - // counting, as this object will be counted again when - // it gets processed later. We do not add a reference - // count as the reference is artificial. - es.stats.Remove(obj) - object = &pb.Object{Value: &pb.Object_RefValue{es.register(obj)}} + if how == encodeMapAsValue { + es.encodeMap(obj, dest) + return } + r := new(wire.Ref) + *dest = r + es.resolve(obj, r) default: - panic(fmt.Errorf("unknown primitive %#v", obj.Interface())) + Failf("unknown object %#v", obj.Interface()) + panic("unreachable") } - - es.stats.Done() - es.pop() - return } -// Serialize serializes the object state. -// -// This function may panic and should be run in safely(). -func (es *encodeState) Serialize(obj reflect.Value) { - es.register(obj.Addr()) - - // Pop off the list until we're done. - for es.pending.Len() > 0 { - e := es.pending.Front() - - // Extract the queued object. 
- qo := e.Value.(queuedObject) - es.stats.Start(qo.obj) +// Save serializes the object graph rooted at obj. +func (es *encodeState) Save(obj reflect.Value) { + es.stats.init() + defer es.stats.fini(func(id typeID) string { + return es.pendingTypes[id-1].Name + }) + + // Resolve the first object, which should queue a pile of additional + // objects on the pending list. All queued objects should be fully + // resolved, and we should be able to serialize after this call. + var root wire.Ref + es.resolve(obj.Addr(), &root) + + // Encode the graph. + var oes *objectEncodeState + if err := safely(func() { + for oes = es.deferred.Front(); oes != nil; oes = es.deferred.Front() { + // Remove and encode the object. Note that as a result + // of this encoding, the object may be enqueued on the + // deferred list yet again. That's expected, and why it + // is removed first. + es.deferred.Remove(oes) + es.encodeObject(oes.obj, oes.how, &oes.encoded) + } + }); err != nil { + // Include the object in the error message. + Failf("encoding error at object %#v: %w", oes.obj.Interface(), err) + } - es.pending.Remove(e) + // Check that items are pending. + if es.pending.Front() == nil { + Failf("pending is empty?") + } - es.from = &qo.path - o := es.encodeObject(qo.obj, true, "", nil) + // Write the header with the number of objects. Note that there is no + // way that es.lastID could conflict with objectID, which would + // indicate that an impossibly large encoding. + if err := WriteHeader(es.w, uint64(es.lastID), true); err != nil { + Failf("error writing header: %w", err) + } - // Emit to our output stream. - if err := es.writeObject(qo.id, o); err != nil { - panic(err) + // Serialize all pending types and pending objects. Note that we don't + // bother removing from this list as we walk it because that just + // wastes time. It will not change after this point. + var id objectID + if err := safely(func() { + for _, wt := range es.pendingTypes { + // Encode the type. 
+ wire.Save(es.w, &wt) } + for oes = es.pending.Front(); oes != nil; oes = oes.pendingEntry.Next() { + id++ // First object is 1. + if oes.id != id { + Failf("expected id %d, got %d", id, oes.id) + } - // Mark as done. - es.done.PushBack(e) - es.stats.Done() + // Marshall the object. + wire.Save(es.w, oes.encoded) + } + }); err != nil { + // Include the object and the error. + Failf("error serializing object %#v: %w", oes.encoded, err) } - // Write a zero-length terminal at the end; this is a sanity check - // applied at decode time as well (see decode.go). - if err := WriteHeader(es.w, 0, false); err != nil { - panic(err) + // Check what we wrote. + if id != es.lastID { + Failf("expected %d objects, wrote %d", es.lastID, id) } } +// objectFlag indicates that the length is a # of objects, rather than a raw +// byte length. When this is set on a length header in the stream, it may be +// decoded appropriately. +const objectFlag uint64 = 1 << 63 + // WriteHeader writes a header. // // Each object written to the statefile should be prefixed with a header. In // order to generate statefiles that play nicely with debugging tools, raw // writes should be prefixed with a header with object set to false and the // appropriate length. This will allow tools to skip these regions. -func WriteHeader(w io.Writer, length uint64, object bool) error { - // The lowest-order bit encodes whether this is a valid object. This is - // a purely internal convention, but allows the object flag to be - // returned from ReadHeader. - length = length << 1 +func WriteHeader(w wire.Writer, length uint64, object bool) error { + // Sanity check the length. + if length&objectFlag != 0 { + Failf("impossibly huge length: %d", length) + } if object { - length |= 0x1 + length |= objectFlag } // Write a header. 
- var hdr [32]byte - encodedLen := binary.PutUvarint(hdr[:], length) - for done := 0; done < encodedLen; { - n, err := w.Write(hdr[done:encodedLen]) - done += n - if n == 0 && err != nil { - return err - } - } - - return nil + return safely(func() { + wire.SaveUint(w, length) + }) } -// writeObject writes an object to the stream. -func (es *encodeState) writeObject(id uint64, obj *pb.Object) error { - // Marshal the proto. - buf, err := proto.Marshal(obj) - if err != nil { - return err - } +// pendingMapper is for the pending list. +type pendingMapper struct{} - // Write the object header. - if err := WriteHeader(es.w, uint64(len(buf)), true); err != nil { - return err - } +func (pendingMapper) linkerFor(oes *objectEncodeState) *pendingEntry { return &oes.pendingEntry } - // Write the object. - for done := 0; done < len(buf); { - n, err := es.w.Write(buf[done:]) - done += n - if n == 0 && err != nil { - return err - } - } +// deferredMapper is for the deferred list. +type deferredMapper struct{} - return nil -} +func (deferredMapper) linkerFor(oes *objectEncodeState) *deferredEntry { return &oes.deferredEntry } // addrSetFunctions is used by addrSet. type addrSetFunctions struct{} @@ -458,13 +818,24 @@ func (addrSetFunctions) MaxKey() uintptr { return ^uintptr(0) } -func (addrSetFunctions) ClearValue(val *reflect.Value) { +func (addrSetFunctions) ClearValue(val **objectEncodeState) { + *val = nil } -func (addrSetFunctions) Merge(_ addrRange, val1 reflect.Value, _ addrRange, val2 reflect.Value) (reflect.Value, bool) { - return val1, val1 == val2 +func (addrSetFunctions) Merge(r1 addrRange, val1 *objectEncodeState, r2 addrRange, val2 *objectEncodeState) (*objectEncodeState, bool) { + if val1.obj == val2.obj { + // This, should never happen. It would indicate that the same + // object exists in two non-contiguous address ranges. Note + // that this assertion can only be triggered if the race + // detector is enabled. 
+ Failf("unexpected merge in addrSet @ %v and %v: %#v and %#v", r1, r2, val1.obj, val2.obj) + } + // Reject the merge. + return val1, false } -func (addrSetFunctions) Split(_ addrRange, val reflect.Value, _ uintptr) (reflect.Value, reflect.Value) { - return val, val +func (addrSetFunctions) Split(r addrRange, val *objectEncodeState, _ uintptr) (*objectEncodeState, *objectEncodeState) { + // A split should never happen: we don't remove ranges. + Failf("unexpected split in addrSet @ %v: %#v", r, val.obj) + panic("unreachable") } diff --git a/pkg/state/encode_unsafe.go b/pkg/state/encode_unsafe.go index 457e6dbb7..e0dad83b4 100644 --- a/pkg/state/encode_unsafe.go +++ b/pkg/state/encode_unsafe.go @@ -31,51 +31,3 @@ func arrayFromSlice(obj reflect.Value) reflect.Value { reflect.ArrayOf(obj.Cap(), obj.Type().Elem()), unsafe.Pointer(obj.Pointer())) } - -// pbSlice returns a protobuf-supported slice of the array and erase the -// original element type (which could be a defined type or non-supported type). 
-func pbSlice(obj reflect.Value) reflect.Value { - var typ reflect.Type - switch obj.Type().Elem().Kind() { - case reflect.Uint8: - typ = reflect.TypeOf(byte(0)) - case reflect.Uint16: - typ = reflect.TypeOf(uint16(0)) - case reflect.Uint32: - typ = reflect.TypeOf(uint32(0)) - case reflect.Uint64: - typ = reflect.TypeOf(uint64(0)) - case reflect.Uintptr: - typ = reflect.TypeOf(uint64(0)) - case reflect.Int8: - typ = reflect.TypeOf(byte(0)) - case reflect.Int16: - typ = reflect.TypeOf(int16(0)) - case reflect.Int32: - typ = reflect.TypeOf(int32(0)) - case reflect.Int64: - typ = reflect.TypeOf(int64(0)) - case reflect.Bool: - typ = reflect.TypeOf(bool(false)) - case reflect.Float32: - typ = reflect.TypeOf(float32(0)) - case reflect.Float64: - typ = reflect.TypeOf(float64(0)) - default: - panic("slice element is not of basic value type") - } - return reflect.NewAt( - reflect.ArrayOf(obj.Len(), typ), - unsafe.Pointer(obj.Slice(0, obj.Len()).Pointer()), - ).Elem().Slice(0, obj.Len()) -} - -func castSlice(obj reflect.Value, elemTyp reflect.Type) reflect.Value { - if obj.Type().Elem().Size() != elemTyp.Size() { - panic("cannot cast slice into other element type of different size") - } - return reflect.NewAt( - reflect.ArrayOf(obj.Len(), elemTyp), - unsafe.Pointer(obj.Slice(0, obj.Len()).Pointer()), - ).Elem() -} diff --git a/pkg/state/map.go b/pkg/state/map.go deleted file mode 100644 index 4f3ebb0da..000000000 --- a/pkg/state/map.go +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package state - -import ( - "context" - "fmt" - "reflect" - "sort" - "sync" - - pb "gvisor.dev/gvisor/pkg/state/object_go_proto" -) - -// entry is a single map entry. -type entry struct { - name string - object *pb.Object -} - -// internalMap is the internal Map state. -// -// These are recycled via a pool to avoid churn. -type internalMap struct { - // es is encodeState. - es *encodeState - - // ds is decodeState. - ds *decodeState - - // os is current object being decoded. - // - // This will always be nil during encode. - os *objectState - - // data stores the encoded values. - data []entry -} - -var internalMapPool = sync.Pool{ - New: func() interface{} { - return new(internalMap) - }, -} - -// newInternalMap returns a cached map. -func newInternalMap(es *encodeState, ds *decodeState, os *objectState) *internalMap { - m := internalMapPool.Get().(*internalMap) - m.es = es - m.ds = ds - m.os = os - if m.data != nil { - m.data = m.data[:0] - } - return m -} - -// Map is a generic state container. -// -// This is the object passed to Save and Load in order to store their state. -// -// Detailed documentation is available in individual methods. -type Map struct { - *internalMap -} - -// Save adds the given object to the map. -// -// You should pass always pointers to the object you are saving. For example: -// -// type X struct { -// A int -// B *int -// } -// -// func (x *X) Save(m Map) { -// m.Save("A", &x.A) -// m.Save("B", &x.B) -// } -// -// func (x *X) Load(m Map) { -// m.Load("A", &x.A) -// m.Load("B", &x.B) -// } -func (m Map) Save(name string, objPtr interface{}) { - m.save(name, reflect.ValueOf(objPtr).Elem(), ".%s") -} - -// SaveValue adds the given object value to the map. -// -// This should be used for values where pointers are not available, or casts -// are required during Save/Load. 
-// -// For example, if we want to cast external package type P.Foo to int64: -// -// type X struct { -// A P.Foo -// } -// -// func (x *X) Save(m Map) { -// m.SaveValue("A", int64(x.A)) -// } -// -// func (x *X) Load(m Map) { -// m.LoadValue("A", new(int64), func(x interface{}) { -// x.A = P.Foo(x.(int64)) -// }) -// } -func (m Map) SaveValue(name string, obj interface{}) { - m.save(name, reflect.ValueOf(obj), ".(value %s)") -} - -// save is helper for the above. It takes the name of value to save the field -// to, the field object (obj), and a format string that specifies how the -// field's saving logic is dispatched from the struct (normal, value, etc.). The -// format string should expect one string parameter, which is the name of the -// field. -func (m Map) save(name string, obj reflect.Value, format string) { - if m.es == nil { - // Not currently encoding. - m.Failf("no encode state for %q", name) - } - - // Attempt the encode. - // - // These are sorted at the end, after all objects are added and will be - // sorted and checked for duplicates (see encodeStruct). - m.data = append(m.data, entry{ - name: name, - object: m.es.encodeObject(obj, false, format, name), - }) -} - -// Load loads the given object from the map. -// -// See Save for an example. -func (m Map) Load(name string, objPtr interface{}) { - m.load(name, reflect.ValueOf(objPtr), false, nil, ".%s") -} - -// LoadWait loads the given objects from the map, and marks it as requiring all -// AfterLoad executions to complete prior to running this object's AfterLoad. -// -// See Save for an example. -func (m Map) LoadWait(name string, objPtr interface{}) { - m.load(name, reflect.ValueOf(objPtr), true, nil, ".(wait %s)") -} - -// LoadValue loads the given object value from the map. -// -// See SaveValue for an example. 
-func (m Map) LoadValue(name string, objPtr interface{}, fn func(interface{})) { - o := reflect.ValueOf(objPtr) - m.load(name, o, true, func() { fn(o.Elem().Interface()) }, ".(value %s)") -} - -// load is helper for the above. It takes the name of value to load the field -// from, the target field pointer (objPtr), whether load completion of the -// struct depends on the field's load completion (wait), the load completion -// logic (fn), and a format string that specifies how the field's loading logic -// is dispatched from the struct (normal, wait, value, etc.). The format string -// should expect one string parameter, which is the name of the field. -func (m Map) load(name string, objPtr reflect.Value, wait bool, fn func(), format string) { - if m.ds == nil { - // Not currently decoding. - m.Failf("no decode state for %q", name) - } - - // Find the object. - // - // These are sorted up front (and should appear in the state file - // sorted as well), so we can do a binary search here to ensure that - // large structs don't behave badly. - i := sort.Search(len(m.data), func(i int) bool { - return m.data[i].name >= name - }) - if i >= len(m.data) || m.data[i].name != name { - // There is no data for this name? - m.Failf("no data found for %q", name) - } - - // Perform the decode. - m.ds.decodeObject(m.os, objPtr.Elem(), m.data[i].object, format, name) - if wait { - // Mark this individual object a blocker. - m.ds.waitObject(m.os, m.data[i].object, fn) - } -} - -// Failf fails the save or restore with the provided message. Processing will -// stop after calling Failf, as the state package uses a panic & recover -// mechanism for state errors. You should defer any cleanup required. -func (m Map) Failf(format string, args ...interface{}) { - panic(fmt.Errorf(format, args...)) -} - -// AfterLoad schedules a function execution when all objects have been allocated -// and their automated loading and customized load logic have been executed. 
fn -// will not be executed until all of current object's dependencies' AfterLoad() -// logic, if exist, have been executed. -func (m Map) AfterLoad(fn func()) { - if m.ds == nil { - // Not currently decoding. - m.Failf("not decoding") - } - - // Queue the local callback; this will execute when all of the above - // data dependencies have been cleared. - m.os.callbacks = append(m.os.callbacks, fn) -} - -// Context returns the current context object. -func (m Map) Context() context.Context { - if m.es != nil { - return m.es.ctx - } else if m.ds != nil { - return m.ds.ctx - } - return context.Background() // No context. -} diff --git a/pkg/state/object.proto b/pkg/state/object.proto deleted file mode 100644 index 5ebcfb151..000000000 --- a/pkg/state/object.proto +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -syntax = "proto3"; - -package gvisor.state.statefile; - -// Slice is a slice value. -message Slice { - uint32 length = 1; - uint32 capacity = 2; - uint64 ref_value = 3; -} - -// Array is an array value. -message Array { - repeated Object contents = 1; -} - -// Map is a map value. -message Map { - repeated Object keys = 1; - repeated Object values = 2; -} - -// Interface is an interface value. -message Interface { - string type = 1; - Object value = 2; -} - -// Struct is a basic composite value. -message Struct { - repeated Field fields = 1; -} - -// Field encodes a single field. 
-message Field { - string name = 1; - Object value = 2; -} - -// Uint16s encodes an uint16 array. To be used inside oneof structure. -message Uint16s { - // There is no 16-bit type in protobuf so we use variable length 32-bit here. - repeated uint32 values = 1; -} - -// Uint32s encodes an uint32 array. To be used inside oneof structure. -message Uint32s { - repeated fixed32 values = 1; -} - -// Uint64s encodes an uint64 array. To be used inside oneof structure. -message Uint64s { - repeated fixed64 values = 1; -} - -// Uintptrs encodes an uintptr array. To be used inside oneof structure. -message Uintptrs { - repeated fixed64 values = 1; -} - -// Int8s encodes an int8 array. To be used inside oneof structure. -message Int8s { - bytes values = 1; -} - -// Int16s encodes an int16 array. To be used inside oneof structure. -message Int16s { - // There is no 16-bit type in protobuf so we use variable length 32-bit here. - repeated int32 values = 1; -} - -// Int32s encodes an int32 array. To be used inside oneof structure. -message Int32s { - repeated sfixed32 values = 1; -} - -// Int64s encodes an int64 array. To be used inside oneof structure. -message Int64s { - repeated sfixed64 values = 1; -} - -// Bools encodes a boolean array. To be used inside oneof structure. -message Bools { - repeated bool values = 1; -} - -// Float64s encodes a float64 array. To be used inside oneof structure. -message Float64s { - repeated double values = 1; -} - -// Float32s encodes a float32 array. To be used inside oneof structure. -message Float32s { - repeated float values = 1; -} - -// Object are primitive encodings. -// -// Note that ref_value references an Object.id, below. 
-message Object { - oneof value { - bool bool_value = 1; - bytes string_value = 2; - int64 int64_value = 3; - uint64 uint64_value = 4; - double double_value = 5; - uint64 ref_value = 6; - Slice slice_value = 7; - Array array_value = 8; - Interface interface_value = 9; - Struct struct_value = 10; - Map map_value = 11; - bytes byte_array_value = 12; - Uint16s uint16_array_value = 13; - Uint32s uint32_array_value = 14; - Uint64s uint64_array_value = 15; - Uintptrs uintptr_array_value = 16; - Int8s int8_array_value = 17; - Int16s int16_array_value = 18; - Int32s int32_array_value = 19; - Int64s int64_array_value = 20; - Bools bool_array_value = 21; - Float64s float64_array_value = 22; - Float32s float32_array_value = 23; - } -} diff --git a/pkg/state/pretty/BUILD b/pkg/state/pretty/BUILD new file mode 100644 index 000000000..d053802f7 --- /dev/null +++ b/pkg/state/pretty/BUILD @@ -0,0 +1,13 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "pretty", + srcs = ["pretty.go"], + visibility = ["//:sandbox"], + deps = [ + "//pkg/state", + "//pkg/state/wire", + ], +) diff --git a/pkg/state/pretty/pretty.go b/pkg/state/pretty/pretty.go new file mode 100644 index 000000000..cf37aaa49 --- /dev/null +++ b/pkg/state/pretty/pretty.go @@ -0,0 +1,273 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pretty is a pretty-printer for state streams. 
+package pretty + +import ( + "fmt" + "io" + "io/ioutil" + "reflect" + "strings" + + "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/state/wire" +) + +func formatRef(x *wire.Ref, graph uint64, html bool) string { + baseRef := fmt.Sprintf("g%dr%d", graph, x.Root) + fullRef := baseRef + if len(x.Dots) > 0 { + // See wire.Ref; Type valid if Dots non-zero. + typ, _ := formatType(x.Type, graph, html) + var buf strings.Builder + buf.WriteString("(*") + buf.WriteString(typ) + buf.WriteString(")(") + buf.WriteString(baseRef) + for _, component := range x.Dots { + switch v := component.(type) { + case *wire.FieldName: + buf.WriteString(".") + buf.WriteString(string(*v)) + case wire.Index: + buf.WriteString(fmt.Sprintf("[%d]", v)) + default: + panic(fmt.Sprintf("unreachable: switch should be exhaustive, unhandled case %v", reflect.TypeOf(component))) + } + } + buf.WriteString(")") + fullRef = buf.String() + } + if html { + return fmt.Sprintf("%s", baseRef, fullRef) + } + return fullRef +} + +func formatType(t wire.TypeSpec, graph uint64, html bool) (string, bool) { + switch x := t.(type) { + case wire.TypeID: + base := fmt.Sprintf("g%dt%d", graph, x) + if html { + return fmt.Sprintf("%s", base, base), true + } + return fmt.Sprintf("%s", base), true + case wire.TypeSpecNil: + return "", false // Only nil type. 
+ case *wire.TypeSpecPointer: + element, _ := formatType(x.Type, graph, html) + return fmt.Sprintf("(*%s)", element), true + case *wire.TypeSpecArray: + element, _ := formatType(x.Type, graph, html) + return fmt.Sprintf("[%d](%s)", x.Count, element), true + case *wire.TypeSpecSlice: + element, _ := formatType(x.Type, graph, html) + return fmt.Sprintf("([]%s)", element), true + case *wire.TypeSpecMap: + key, _ := formatType(x.Key, graph, html) + value, _ := formatType(x.Value, graph, html) + return fmt.Sprintf("(map[%s]%s)", key, value), true + default: + panic(fmt.Sprintf("unreachable: unknown type %T", t)) + } +} + +// format formats a single object, for pretty-printing. It also returns whether +// the value is a non-zero value. +func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bool) { + switch x := encoded.(type) { + case wire.Nil: + return "nil", false + case *wire.String: + return fmt.Sprintf("%q", *x), *x != "" + case *wire.Complex64: + return fmt.Sprintf("%f+%fi", real(*x), imag(*x)), *x != 0.0 + case *wire.Complex128: + return fmt.Sprintf("%f+%fi", real(*x), imag(*x)), *x != 0.0 + case *wire.Ref: + return formatRef(x, graph, html), x.Root != 0 + case *wire.Type: + tabs := "\n" + strings.Repeat("\t", depth) + items := make([]string, 0, len(x.Fields)+2) + items = append(items, fmt.Sprintf("type %s {", x.Name)) + for i := 0; i < len(x.Fields); i++ { + items = append(items, fmt.Sprintf("\t%d: %s,", i, x.Fields[i])) + } + items = append(items, "}") + return strings.Join(items, tabs), true // No zero value. + case *wire.Slice: + return fmt.Sprintf("%s{len:%d,cap:%d}", formatRef(&x.Ref, graph, html), x.Length, x.Capacity), x.Capacity != 0 + case *wire.Array: + if len(x.Contents) == 0 { + return "[]", false + } + items := make([]string, 0, len(x.Contents)+2) + zeros := make([]string, 0) // used to eliminate zero entries. 
+ items = append(items, "[") + tabs := "\n" + strings.Repeat("\t", depth) + for i := 0; i < len(x.Contents); i++ { + item, ok := format(graph, depth+1, x.Contents[i], html) + if !ok { + zeros = append(zeros, fmt.Sprintf("\t%s,", item)) + continue + } + if len(zeros) > 0 { + items = append(items, zeros...) + zeros = nil + } + items = append(items, fmt.Sprintf("\t%s,", item)) + } + if len(zeros) > 0 { + items = append(items, fmt.Sprintf("\t... (%d zeros),", len(zeros))) + } + items = append(items, "]") + return strings.Join(items, tabs), len(zeros) < len(x.Contents) + case *wire.Struct: + typ, _ := formatType(x.TypeID, graph, html) + if x.Fields() == 0 { + return fmt.Sprintf("struct[%s]{}", typ), false + } + items := make([]string, 0, 2) + items = append(items, fmt.Sprintf("struct[%s]{", typ)) + tabs := "\n" + strings.Repeat("\t", depth) + allZero := true + for i := 0; i < x.Fields(); i++ { + element, ok := format(graph, depth+1, *x.Field(i), html) + allZero = allZero && !ok + items = append(items, fmt.Sprintf("\t%d: %s,", i, element)) + i++ + } + items = append(items, "}") + return strings.Join(items, tabs), !allZero + case *wire.Map: + if len(x.Keys) == 0 { + return "map{}", false + } + items := make([]string, 0, len(x.Keys)+2) + items = append(items, "map{") + tabs := "\n" + strings.Repeat("\t", depth) + for i := 0; i < len(x.Keys); i++ { + key, _ := format(graph, depth+1, x.Keys[i], html) + value, _ := format(graph, depth+1, x.Values[i], html) + items = append(items, fmt.Sprintf("\t%s: %s,", key, value)) + } + items = append(items, "}") + return strings.Join(items, tabs), true + case *wire.Interface: + typ, typOk := formatType(x.Type, graph, html) + element, elementOk := format(graph, depth+1, x.Value, html) + return fmt.Sprintf("interface[%s]{%s}", typ, element), typOk || elementOk + default: + // Must be a primitive; use reflection. + return fmt.Sprintf("%v", encoded), true + } +} + +// printStream is the basic print implementation. 
+func printStream(w io.Writer, r wire.Reader, html bool) (err error) { + // current graph ID. + var graph uint64 + + if html { + fmt.Fprintf(w, "
")
+		defer fmt.Fprintf(w, "
") + } + + defer func() { + if r := recover(); r != nil { + if rErr, ok := r.(error); ok { + err = rErr // Override return. + return + } + panic(r) // Propagate. + } + }() + + for { + // Find the first object to begin generation. + length, object, err := state.ReadHeader(r) + if err == io.EOF { + // Nothing else to do. + break + } else if err != nil { + return err + } + if !object { + graph++ // Increment the graph. + if length > 0 { + fmt.Fprintf(w, "(%d bytes non-object data)\n", length) + io.Copy(ioutil.Discard, &io.LimitedReader{ + R: r, + N: int64(length), + }) + } + continue + } + + // Read & unmarshal the object. + // + // Note that this loop must match the general structure of the + // loop in decode.go. But we don't register type information, + // etc. and just print the raw structures. + var ( + oid uint64 = 1 + tid uint64 = 1 + ) + for oid <= length { + // Unmarshal the object. + encoded := wire.Load(r) + + // Is this a type? + if _, ok := encoded.(*wire.Type); ok { + str, _ := format(graph, 0, encoded, html) + tag := fmt.Sprintf("g%dt%d", graph, tid) + if html { + // See below. + tag = fmt.Sprintf("%s", tag, tag, tag) + } + if _, err := fmt.Fprintf(w, "%s = %s\n", tag, str); err != nil { + return err + } + tid++ + continue + } + + // Format the node. + str, _ := format(graph, 0, encoded, html) + tag := fmt.Sprintf("g%dr%d", graph, oid) + if html { + // Create a little tag with an anchor next to it for linking. + tag = fmt.Sprintf("%s", tag, tag, tag) + } + if _, err := fmt.Fprintf(w, "%s = %s\n", tag, str); err != nil { + return err + } + oid++ + } + } + + return nil +} + +// PrintText reads the stream from r and prints text to w. +func PrintText(w io.Writer, r wire.Reader) error { + return printStream(w, r, false /* html */) +} + +// PrintHTML reads the stream from r and prints html to w. 
+func PrintHTML(w io.Writer, r wire.Reader) error { + return printStream(w, r, true /* html */) +} diff --git a/pkg/state/printer.go b/pkg/state/printer.go deleted file mode 100644 index 3ce18242f..000000000 --- a/pkg/state/printer.go +++ /dev/null @@ -1,251 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package state - -import ( - "fmt" - "io" - "io/ioutil" - "reflect" - "strings" - - "github.com/golang/protobuf/proto" - pb "gvisor.dev/gvisor/pkg/state/object_go_proto" -) - -// format formats a single object, for pretty-printing. It also returns whether -// the value is a non-zero value. 
-func format(graph uint64, depth int, object *pb.Object, html bool) (string, bool) { - switch x := object.GetValue().(type) { - case *pb.Object_BoolValue: - return fmt.Sprintf("%t", x.BoolValue), x.BoolValue != false - case *pb.Object_StringValue: - return fmt.Sprintf("\"%s\"", string(x.StringValue)), len(x.StringValue) != 0 - case *pb.Object_Int64Value: - return fmt.Sprintf("%d", x.Int64Value), x.Int64Value != 0 - case *pb.Object_Uint64Value: - return fmt.Sprintf("%du", x.Uint64Value), x.Uint64Value != 0 - case *pb.Object_DoubleValue: - return fmt.Sprintf("%f", x.DoubleValue), x.DoubleValue != 0.0 - case *pb.Object_RefValue: - if x.RefValue == 0 { - return "nil", false - } - ref := fmt.Sprintf("g%dr%d", graph, x.RefValue) - if html { - ref = fmt.Sprintf("%s", ref, ref) - } - return ref, true - case *pb.Object_SliceValue: - if x.SliceValue.RefValue == 0 { - return "nil", false - } - ref := fmt.Sprintf("g%dr%d", graph, x.SliceValue.RefValue) - if html { - ref = fmt.Sprintf("%s", ref, ref) - } - return fmt.Sprintf("%s[:%d:%d]", ref, x.SliceValue.Length, x.SliceValue.Capacity), true - case *pb.Object_ArrayValue: - if len(x.ArrayValue.Contents) == 0 { - return "[]", false - } - items := make([]string, 0, len(x.ArrayValue.Contents)+2) - zeros := make([]string, 0) // used to eliminate zero entries. - items = append(items, "[") - tabs := "\n" + strings.Repeat("\t", depth) - for i := 0; i < len(x.ArrayValue.Contents); i++ { - item, ok := format(graph, depth+1, x.ArrayValue.Contents[i], html) - if ok { - if len(zeros) > 0 { - items = append(items, zeros...) - zeros = nil - } - items = append(items, fmt.Sprintf("\t%s,", item)) - } else { - zeros = append(zeros, fmt.Sprintf("\t%s,", item)) - } - } - if len(zeros) > 0 { - items = append(items, fmt.Sprintf("\t... 
(%d zeros),", len(zeros))) - } - items = append(items, "]") - return strings.Join(items, tabs), len(zeros) < len(x.ArrayValue.Contents) - case *pb.Object_StructValue: - if len(x.StructValue.Fields) == 0 { - return "struct{}", false - } - items := make([]string, 0, len(x.StructValue.Fields)+2) - items = append(items, "struct{") - tabs := "\n" + strings.Repeat("\t", depth) - allZero := true - for _, field := range x.StructValue.Fields { - element, ok := format(graph, depth+1, field.Value, html) - allZero = allZero && !ok - items = append(items, fmt.Sprintf("\t%s: %s,", field.Name, element)) - } - items = append(items, "}") - return strings.Join(items, tabs), !allZero - case *pb.Object_MapValue: - if len(x.MapValue.Keys) == 0 { - return "map{}", false - } - items := make([]string, 0, len(x.MapValue.Keys)+2) - items = append(items, "map{") - tabs := "\n" + strings.Repeat("\t", depth) - for i := 0; i < len(x.MapValue.Keys); i++ { - key, _ := format(graph, depth+1, x.MapValue.Keys[i], html) - value, _ := format(graph, depth+1, x.MapValue.Values[i], html) - items = append(items, fmt.Sprintf("\t%s: %s,", key, value)) - } - items = append(items, "}") - return strings.Join(items, tabs), true - case *pb.Object_InterfaceValue: - if x.InterfaceValue.Type == "" { - return "interface(nil){}", false - } - element, _ := format(graph, depth+1, x.InterfaceValue.Value, html) - return fmt.Sprintf("interface(\"%s\"){%s}", x.InterfaceValue.Type, element), true - case *pb.Object_ByteArrayValue: - return printArray(reflect.ValueOf(x.ByteArrayValue)) - case *pb.Object_Uint16ArrayValue: - return printArray(reflect.ValueOf(x.Uint16ArrayValue.Values)) - case *pb.Object_Uint32ArrayValue: - return printArray(reflect.ValueOf(x.Uint32ArrayValue.Values)) - case *pb.Object_Uint64ArrayValue: - return printArray(reflect.ValueOf(x.Uint64ArrayValue.Values)) - case *pb.Object_UintptrArrayValue: - return printArray(castSlice(reflect.ValueOf(x.UintptrArrayValue.Values), reflect.TypeOf(uintptr(0)))) - case 
*pb.Object_Int8ArrayValue: - return printArray(castSlice(reflect.ValueOf(x.Int8ArrayValue.Values), reflect.TypeOf(int8(0)))) - case *pb.Object_Int16ArrayValue: - return printArray(reflect.ValueOf(x.Int16ArrayValue.Values)) - case *pb.Object_Int32ArrayValue: - return printArray(reflect.ValueOf(x.Int32ArrayValue.Values)) - case *pb.Object_Int64ArrayValue: - return printArray(reflect.ValueOf(x.Int64ArrayValue.Values)) - case *pb.Object_BoolArrayValue: - return printArray(reflect.ValueOf(x.BoolArrayValue.Values)) - case *pb.Object_Float64ArrayValue: - return printArray(reflect.ValueOf(x.Float64ArrayValue.Values)) - case *pb.Object_Float32ArrayValue: - return printArray(reflect.ValueOf(x.Float32ArrayValue.Values)) - } - - // Should not happen, but tolerate. - return fmt.Sprintf("(unknown proto type: %T)", object.GetValue()), true -} - -// PrettyPrint reads the state stream from r, and pretty prints to w. -func PrettyPrint(w io.Writer, r io.Reader, html bool) error { - var ( - // current graph ID. - graph uint64 - - // current object ID. - id uint64 - ) - - if html { - fmt.Fprintf(w, "
")
-		defer fmt.Fprintf(w, "
") - } - - for { - // Find the first object to begin generation. - length, object, err := ReadHeader(r) - if err == io.EOF { - // Nothing else to do. - break - } else if err != nil { - return err - } - if !object { - // Increment the graph number & reset the ID. - graph++ - id = 0 - if length > 0 { - fmt.Fprintf(w, "(%d bytes non-object data)\n", length) - io.Copy(ioutil.Discard, &io.LimitedReader{ - R: r, - N: int64(length), - }) - } - continue - } - - // Read & unmarshal the object. - buf := make([]byte, length) - for done := 0; done < len(buf); { - n, err := r.Read(buf[done:]) - done += n - if n == 0 && err != nil { - return err - } - } - obj := new(pb.Object) - if err := proto.Unmarshal(buf, obj); err != nil { - return err - } - - id++ // First object must be one. - str, _ := format(graph, 0, obj, html) - tag := fmt.Sprintf("g%dr%d", graph, id) - if html { - tag = fmt.Sprintf("%s", tag, tag) - } - if _, err := fmt.Fprintf(w, "%s = %s\n", tag, str); err != nil { - return err - } - } - - return nil -} - -func printArray(s reflect.Value) (string, bool) { - zero := reflect.Zero(s.Type().Elem()).Interface() - z := "0" - switch s.Type().Elem().Kind() { - case reflect.Bool: - z = "false" - case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: - case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr: - case reflect.Float32, reflect.Float64: - default: - return fmt.Sprintf("unexpected non-primitive type array: %#v", s.Interface()), true - } - - zeros := 0 - items := make([]string, 0, s.Len()) - for i := 0; i <= s.Len(); i++ { - if i < s.Len() && reflect.DeepEqual(s.Index(i).Interface(), zero) { - zeros++ - continue - } - if zeros > 0 { - if zeros <= 4 { - for ; zeros > 0; zeros-- { - items = append(items, z) - } - } else { - items = append(items, fmt.Sprintf("(%d %ss)", zeros, z)) - zeros = 0 - } - } - if i < s.Len() { - items = append(items, fmt.Sprintf("%v", s.Index(i).Interface())) - } - } - return "[" 
+ strings.Join(items, ",") + "]", zeros < s.Len() -} diff --git a/pkg/state/state.go b/pkg/state/state.go index 03ae2dbb0..acb629969 100644 --- a/pkg/state/state.go +++ b/pkg/state/state.go @@ -31,210 +31,226 @@ // Uint64 default // Float32 default // Float64 default -// Complex64 custom -// Complex128 custom +// Complex64 default +// Complex128 default // Array default // Chan custom // Func custom -// Interface custom -// Map default (*) +// Interface default +// Map default // Ptr default // Slice default // String default -// Struct custom +// Struct custom (*) Unless zero-sized. // UnsafePointer custom // -// (*) Maps are treated as value types by this package, even if they are -// pointers internally. If you want to save two independent references -// to the same map value, you must explicitly use a pointer to a map. +// See README.md for an overview of how encoding and decoding works. package state import ( "context" "fmt" - "io" "reflect" "runtime" - pb "gvisor.dev/gvisor/pkg/state/object_go_proto" + "gvisor.dev/gvisor/pkg/state/wire" ) +// objectID is a unique identifier assigned to each object to be serialized. +// Each instance of an object is considered separately, i.e. if there are two +// objects of the same type in the object graph being serialized, they'll be +// assigned unique objectIDs. +type objectID uint32 + +// typeID is the identifier for a type. Types are serialized and tracked +// alongside objects in order to avoid the overhead of encoding field names in +// all objects. +type typeID uint32 + // ErrState is returned when an error is encountered during encode/decode. type ErrState struct { // err is the underlying error. err error - // path is the visit path from root to the current object. - path string - // trace is the stack trace. trace string } // Error returns a sensible description of the state error. 
func (e *ErrState) Error() string { - return fmt.Sprintf("%v:\nstate path: %s\n%s", e.err, e.path, e.trace) + return fmt.Sprintf("%v:\n%s", e.err, e.trace) } -// UnwrapErrState returns the underlying error in ErrState. -// -// If err is not *ErrState, err is returned directly. -func UnwrapErrState(err error) error { - if e, ok := err.(*ErrState); ok { - return e.err - } - return err +// Unwrap implements standard unwrapping. +func (e *ErrState) Unwrap() error { + return e.err } // Save saves the given object state. -func Save(ctx context.Context, w io.Writer, rootPtr interface{}, stats *Stats) error { +func Save(ctx context.Context, w wire.Writer, rootPtr interface{}) (Stats, error) { // Create the encoding state. - es := &encodeState{ - ctx: ctx, - idsByObject: make(map[uintptr]uint64), - w: w, - stats: stats, + es := encodeState{ + ctx: ctx, + w: w, + types: makeTypeEncodeDatabase(), + zeroValues: make(map[reflect.Type]*objectEncodeState), } // Perform the encoding. - return es.safely(func() { - es.Serialize(reflect.ValueOf(rootPtr).Elem()) + err := safely(func() { + es.Save(reflect.ValueOf(rootPtr).Elem()) }) + return es.stats, err } // Load loads a checkpoint. -func Load(ctx context.Context, r io.Reader, rootPtr interface{}, stats *Stats) error { +func Load(ctx context.Context, r wire.Reader, rootPtr interface{}) (Stats, error) { // Create the decoding state. - ds := &decodeState{ - ctx: ctx, - objectsByID: make(map[uint64]*objectState), - deferred: make(map[uint64]*pb.Object), - r: r, - stats: stats, + ds := decodeState{ + ctx: ctx, + r: r, + types: makeTypeDecodeDatabase(), + deferred: make(map[objectID]wire.Object), } // Attempt our decode. - return ds.safely(func() { - ds.Deserialize(reflect.ValueOf(rootPtr).Elem()) + err := safely(func() { + ds.Load(reflect.ValueOf(rootPtr).Elem()) }) + return ds.stats, err } -// Fns are the state dispatch functions. -type Fns struct { - // Save is a function like Save(concreteType, Map). 
- Save interface{} - - // Load is a function like Load(concreteType, Map). - Load interface{} +// Sink is used for Type.StateSave. +type Sink struct { + internal objectEncoder } -// Save executes the save function. -func (fns *Fns) invokeSave(obj reflect.Value, m Map) { - reflect.ValueOf(fns.Save).Call([]reflect.Value{obj, reflect.ValueOf(m)}) +// Save adds the given object to the map. +// +// You should pass always pointers to the object you are saving. For example: +// +// type X struct { +// A int +// B *int +// } +// +// func (x *X) StateTypeInfo(m Sink) state.TypeInfo { +// return state.TypeInfo{ +// Name: "pkg.X", +// Fields: []string{ +// "A", +// "B", +// }, +// } +// } +// +// func (x *X) StateSave(m Sink) { +// m.Save(0, &x.A) // Field is A. +// m.Save(1, &x.B) // Field is B. +// } +// +// func (x *X) StateLoad(m Source) { +// m.Load(0, &x.A) // Field is A. +// m.Load(1, &x.B) // Field is B. +// } +func (s Sink) Save(slot int, objPtr interface{}) { + s.internal.save(slot, reflect.ValueOf(objPtr).Elem()) } -// Load executes the load function. -func (fns *Fns) invokeLoad(obj reflect.Value, m Map) { - reflect.ValueOf(fns.Load).Call([]reflect.Value{obj, reflect.ValueOf(m)}) +// SaveValue adds the given object value to the map. +// +// This should be used for values where pointers are not available, or casts +// are required during Save/Load. +// +// For example, if we want to cast external package type P.Foo to int64: +// +// func (x *X) StateSave(m Sink) { +// m.SaveValue(0, "A", int64(x.A)) +// } +// +// func (x *X) StateLoad(m Source) { +// m.LoadValue(0, new(int64), func(x interface{}) { +// x.A = P.Foo(x.(int64)) +// }) +// } +func (s Sink) SaveValue(slot int, obj interface{}) { + s.internal.save(slot, reflect.ValueOf(obj)) } -// validateStateFn ensures types are correct. 
-func validateStateFn(fn interface{}, typ reflect.Type) bool { - fnTyp := reflect.TypeOf(fn) - if fnTyp.Kind() != reflect.Func { - return false - } - if fnTyp.NumIn() != 2 { - return false - } - if fnTyp.NumOut() != 0 { - return false - } - if fnTyp.In(0) != typ { - return false - } - if fnTyp.In(1) != reflect.TypeOf(Map{}) { - return false - } - return true +// Context returns the context object provided at save time. +func (s Sink) Context() context.Context { + return s.internal.es.ctx } -// Validate validates all state functions. -func (fns *Fns) Validate(typ reflect.Type) bool { - return validateStateFn(fns.Save, typ) && validateStateFn(fns.Load, typ) +// Type is an interface that must be implemented by Struct objects. This allows +// these objects to be serialized while minimizing runtime reflection required. +// +// All these methods can be automatically generated by the go_statify tool. +type Type interface { + // StateTypeName returns the type's name. + // + // This is used for matching type information during encoding and + // decoding, as well as dynamic interface dispatch. This should be + // globally unique. + StateTypeName() string + + // StateFields returns information about the type. + // + // Fields is the set of fields for the object. Calls to Sink.Save and + // Source.Load must be made in-order with respect to these fields. + // + // This will be called at most once per serialization. + StateFields() []string } -type typeDatabase struct { - // nameToType is a forward lookup table. - nameToType map[string]reflect.Type - - // typeToName is the reverse lookup table. - typeToName map[reflect.Type]string +// SaverLoader must be implemented by struct types. +type SaverLoader interface { + // StateSave saves the state of the object to the given Map. + StateSave(Sink) - // typeToFns is the function lookup table. - typeToFns map[reflect.Type]Fns + // StateLoad loads the state of the object. 
+ StateLoad(Source) } -// registeredTypes is a database used for SaveInterface and LoadInterface. -var registeredTypes = typeDatabase{ - nameToType: make(map[string]reflect.Type), - typeToName: make(map[reflect.Type]string), - typeToFns: make(map[reflect.Type]Fns), +// Source is used for Type.StateLoad. +type Source struct { + internal objectDecoder } -// register registers a type under the given name. This will generally be -// called via init() methods, and therefore uses panic to propagate errors. -func (t *typeDatabase) register(name string, typ reflect.Type, fns Fns) { - // We can't allow name collisions. - if ot, ok := t.nameToType[name]; ok { - panic(fmt.Sprintf("type %q can't use name %q, already in use by type %q", typ.Name(), name, ot.Name())) - } - - // Or multiple registrations. - if on, ok := t.typeToName[typ]; ok { - panic(fmt.Sprintf("type %q can't be registered as %q, already registered as %q", typ.Name(), name, on)) - } - - t.nameToType[name] = typ - t.typeToName[typ] = name - t.typeToFns[typ] = fns +// Load loads the given object passed as a pointer.. +// +// See Sink.Save for an example. +func (s Source) Load(slot int, objPtr interface{}) { + s.internal.load(slot, reflect.ValueOf(objPtr), false, nil) } -// lookupType finds a type given a name. -func (t *typeDatabase) lookupType(name string) (reflect.Type, bool) { - typ, ok := t.nameToType[name] - return typ, ok +// LoadWait loads the given objects from the map, and marks it as requiring all +// AfterLoad executions to complete prior to running this object's AfterLoad. +// +// See Sink.Save for an example. +func (s Source) LoadWait(slot int, objPtr interface{}) { + s.internal.load(slot, reflect.ValueOf(objPtr), true, nil) } -// lookupName finds a name given a type. -func (t *typeDatabase) lookupName(typ reflect.Type) (string, bool) { - name, ok := t.typeToName[typ] - return name, ok +// LoadValue loads the given object value from the map. +// +// See Sink.SaveValue for an example. 
+func (s Source) LoadValue(slot int, objPtr interface{}, fn func(interface{})) { + o := reflect.ValueOf(objPtr) + s.internal.load(slot, o, true, func() { fn(o.Elem().Interface()) }) } -// lookupFns finds functions given a type. -func (t *typeDatabase) lookupFns(typ reflect.Type) (Fns, bool) { - fns, ok := t.typeToFns[typ] - return fns, ok +// AfterLoad schedules a function execution when all objects have been +// allocated and their automated loading and customized load logic have been +// executed. fn will not be executed until all of current object's +// dependencies' AfterLoad() logic, if exist, have been executed. +func (s Source) AfterLoad(fn func()) { + s.internal.afterLoad(fn) } -// Register must be called for any interface implementation types that -// implements Loader. -// -// Register should be called either immediately after startup or via init() -// methods. Double registration of either names or types will result in a panic. -// -// No synchronization is provided; this should only be called in init. -// -// Example usage: -// -// state.Register("Foo", (*Foo)(nil), state.Fns{ -// Save: (*Foo).Save, -// Load: (*Foo).Load, -// }) -// -func Register(name string, instance interface{}, fns Fns) { - registeredTypes.register(name, reflect.TypeOf(instance), fns) +// Context returns the context object provided at load time. +func (s Source) Context() context.Context { + return s.internal.ds.ctx } // IsZeroValue checks if the given value is the zero value. @@ -244,72 +260,14 @@ func IsZeroValue(val interface{}) bool { return val == nil || reflect.ValueOf(val).Elem().IsZero() } -// step captures one encoding / decoding step. On each step, there is up to one -// choice made, which is captured by non-nil param. We intentionally do not -// eagerly create the final path string, as that will only be needed upon panic. -type step struct { - // dereference indicate if the current object is obtained by - // dereferencing a pointer. 
- dereference bool - - // format is the formatting string that takes param below, if - // non-nil. For example, in array indexing case, we have "[%d]". - format string - - // param stores the choice made at the current encoding / decoding step. - // For eaxmple, in array indexing case, param stores the index. When no - // choice is made, e.g. dereference, param should be nil. - param interface{} -} - -// recoverable is the state encoding / decoding panic recovery facility. It is -// also used to store encoding / decoding steps as well as the reference to the -// original queued object from which the current object is dispatched. The -// complete encoding / decoding path is synthesised from the steps in all queued -// objects leading to the current object. -type recoverable struct { - from *recoverable - steps []step +// Failf is a wrapper around panic that should be used to generate errors that +// can be caught during saving and loading. +func Failf(fmtStr string, v ...interface{}) { + panic(fmt.Errorf(fmtStr, v...)) } -// push enters a new context level. -func (sr *recoverable) push(dereference bool, format string, param interface{}) { - sr.steps = append(sr.steps, step{dereference, format, param}) -} - -// pop exits the current context level. -func (sr *recoverable) pop() { - if len(sr.steps) <= 1 { - return - } - sr.steps = sr.steps[:len(sr.steps)-1] -} - -// path returns the complete encoding / decoding path from root. This is only -// called upon panic. -func (sr *recoverable) path() string { - if sr.from == nil { - return "root" - } - p := sr.from.path() - for _, s := range sr.steps { - if s.dereference { - p = fmt.Sprintf("*(%s)", p) - } - if s.param == nil { - p += s.format - } else { - p += fmt.Sprintf(s.format, s.param) - } - } - return p -} - -func (sr *recoverable) copy() recoverable { - return recoverable{from: sr.from, steps: append([]step(nil), sr.steps...)} -} - -// safely executes the given function, catching a panic and unpacking as an error. 
+// safely executes the given function, catching a panic and unpacking as an +// error. // // The error flow through the state package uses panic and recover. There are // two important reasons for this: @@ -323,9 +281,15 @@ func (sr *recoverable) copy() recoverable { // method doesn't add a lot of value. If there are specific error conditions // that you'd like to handle, you should add appropriate functionality to // objects themselves prior to calling Save() and Load(). -func (sr *recoverable) safely(fn func()) (err error) { +func safely(fn func()) (err error) { defer func() { if r := recover(); r != nil { + if es, ok := r.(*ErrState); ok { + err = es // Propagate. + return + } + + // Build a new state error. es := new(ErrState) if e, ok := r.(error); ok { es.err = e @@ -333,8 +297,6 @@ func (sr *recoverable) safely(fn func()) (err error) { es.err = fmt.Errorf("%v", r) } - es.path = sr.path() - // Make a stack. We don't know how big it will be ahead // of time, but want to make sure we get the whole // thing. So we just do a stupid brute force approach. diff --git a/pkg/state/state_norace.go b/pkg/state/state_norace.go new file mode 100644 index 000000000..4281aed6d --- /dev/null +++ b/pkg/state/state_norace.go @@ -0,0 +1,19 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// +build !race + +package state + +var raceEnabled = false diff --git a/pkg/state/state_race.go b/pkg/state/state_race.go new file mode 100644 index 000000000..8232981ce --- /dev/null +++ b/pkg/state/state_race.go @@ -0,0 +1,19 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build race + +package state + +var raceEnabled = true diff --git a/pkg/state/state_test.go b/pkg/state/state_test.go deleted file mode 100644 index d7221e9e8..000000000 --- a/pkg/state/state_test.go +++ /dev/null @@ -1,721 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package state - -import ( - "bytes" - "context" - "io/ioutil" - "math" - "reflect" - "testing" -) - -// TestCase is used to define a single success/failure testcase of -// serialization of a set of objects. -type TestCase struct { - // Name is the name of the test case. 
- Name string - - // Objects is the list of values to serialize. - Objects []interface{} - - // Fail is whether the test case is supposed to fail or not. - Fail bool -} - -// runTest runs all testcases. -func runTest(t *testing.T, tests []TestCase) { - for _, test := range tests { - t.Logf("TEST %s:", test.Name) - for i, root := range test.Objects { - t.Logf(" case#%d: %#v", i, root) - - // Save the passed object. - saveBuffer := &bytes.Buffer{} - saveObjectPtr := reflect.New(reflect.TypeOf(root)) - saveObjectPtr.Elem().Set(reflect.ValueOf(root)) - if err := Save(context.Background(), saveBuffer, saveObjectPtr.Interface(), nil); err != nil && !test.Fail { - t.Errorf(" FAIL: Save failed unexpectedly: %v", err) - continue - } else if err != nil { - t.Logf(" PASS: Save failed as expected: %v", err) - continue - } - - // Load a new copy of the object. - loadObjectPtr := reflect.New(reflect.TypeOf(root)) - if err := Load(context.Background(), bytes.NewReader(saveBuffer.Bytes()), loadObjectPtr.Interface(), nil); err != nil && !test.Fail { - t.Errorf(" FAIL: Load failed unexpectedly: %v", err) - continue - } else if err != nil { - t.Logf(" PASS: Load failed as expected: %v", err) - continue - } - - // Compare the values. - loadedValue := loadObjectPtr.Elem().Interface() - if eq := reflect.DeepEqual(root, loadedValue); !eq && !test.Fail { - t.Errorf(" FAIL: Objects differs; got %#v", loadedValue) - continue - } else if !eq { - t.Logf(" PASS: Object different as expected.") - continue - } - - // Everything went okay. Is that good? - if test.Fail { - t.Errorf(" FAIL: Unexpected success.") - } else { - t.Logf(" PASS: Success.") - } - } - } -} - -// dumbStruct is a struct which does not implement the loader/saver interface. -// We expect that serialization of this struct will fail. -type dumbStruct struct { - A int - B int -} - -// smartStruct is a struct which does implement the loader/saver interface. -// We expect that serialization of this struct will succeed. 
-type smartStruct struct { - A int - B int -} - -func (s *smartStruct) save(m Map) { - m.Save("A", &s.A) - m.Save("B", &s.B) -} - -func (s *smartStruct) load(m Map) { - m.Load("A", &s.A) - m.Load("B", &s.B) -} - -// valueLoadStruct uses a value load. -type valueLoadStruct struct { - v int -} - -func (v *valueLoadStruct) save(m Map) { - m.SaveValue("v", v.v) -} - -func (v *valueLoadStruct) load(m Map) { - m.LoadValue("v", new(int), func(value interface{}) { - v.v = value.(int) - }) -} - -// afterLoadStruct has an AfterLoad function. -type afterLoadStruct struct { - v int -} - -func (a *afterLoadStruct) save(m Map) { -} - -func (a *afterLoadStruct) load(m Map) { - m.AfterLoad(func() { - a.v++ - }) -} - -// genericContainer is a generic dispatcher. -type genericContainer struct { - v interface{} -} - -func (g *genericContainer) save(m Map) { - m.Save("v", &g.v) -} - -func (g *genericContainer) load(m Map) { - m.Load("v", &g.v) -} - -// sliceContainer is a generic slice. -type sliceContainer struct { - v []interface{} -} - -func (s *sliceContainer) save(m Map) { - m.Save("v", &s.v) -} - -func (s *sliceContainer) load(m Map) { - m.Load("v", &s.v) -} - -// mapContainer is a generic map. -type mapContainer struct { - v map[int]interface{} -} - -func (mc *mapContainer) save(m Map) { - m.Save("v", &mc.v) -} - -func (mc *mapContainer) load(m Map) { - // Some of the test cases below assume legacy behavior wherein maps - // will automatically inherit dependencies. - m.LoadWait("v", &mc.v) -} - -// dumbMap is a map which does not implement the loader/saver interface. -// Serialization of this map will default to the standard encode/decode logic. -type dumbMap map[string]int - -// pointerStruct contains various pointers, shared and non-shared, and pointers -// to pointers. We expect that serialization will respect the structure. 
-type pointerStruct struct { - A *int - B *int - C *int - D *int - - AA **int - BB **int -} - -func (p *pointerStruct) save(m Map) { - m.Save("A", &p.A) - m.Save("B", &p.B) - m.Save("C", &p.C) - m.Save("D", &p.D) - m.Save("AA", &p.AA) - m.Save("BB", &p.BB) -} - -func (p *pointerStruct) load(m Map) { - m.Load("A", &p.A) - m.Load("B", &p.B) - m.Load("C", &p.C) - m.Load("D", &p.D) - m.Load("AA", &p.AA) - m.Load("BB", &p.BB) -} - -// testInterface is a trivial interface example. -type testInterface interface { - Foo() -} - -// testImpl is a trivial implementation of testInterface. -type testImpl struct { -} - -// Foo satisfies testInterface. -func (t *testImpl) Foo() { -} - -// testImpl is trivially serializable. -func (t *testImpl) save(m Map) { -} - -// testImpl is trivially serializable. -func (t *testImpl) load(m Map) { -} - -// testI demonstrates interface dispatching. -type testI struct { - I testInterface -} - -func (t *testI) save(m Map) { - m.Save("I", &t.I) -} - -func (t *testI) load(m Map) { - m.Load("I", &t.I) -} - -// cycleStruct is used to implement basic cycles. -type cycleStruct struct { - c *cycleStruct -} - -func (c *cycleStruct) save(m Map) { - m.Save("c", &c.c) -} - -func (c *cycleStruct) load(m Map) { - m.Load("c", &c.c) -} - -// badCycleStruct actually has deadlocking dependencies. -// -// This should pass if b.b = {nil|b} and fail otherwise. -type badCycleStruct struct { - b *badCycleStruct -} - -func (b *badCycleStruct) save(m Map) { - m.Save("b", &b.b) -} - -func (b *badCycleStruct) load(m Map) { - m.LoadWait("b", &b.b) - m.AfterLoad(func() { - // This is not executable, since AfterLoad requires that the - // object and all dependencies are complete. This should cause - // a deadlock error during load. - }) -} - -// emptyStructPointer points to an empty struct. 
-type emptyStructPointer struct { - nothing *struct{} -} - -func (e *emptyStructPointer) save(m Map) { - m.Save("nothing", &e.nothing) -} - -func (e *emptyStructPointer) load(m Map) { - m.Load("nothing", &e.nothing) -} - -// truncateInteger truncates an integer. -type truncateInteger struct { - v int64 - v2 int32 -} - -func (t *truncateInteger) save(m Map) { - t.v2 = int32(t.v) - m.Save("v", &t.v) -} - -func (t *truncateInteger) load(m Map) { - m.Load("v", &t.v2) - t.v = int64(t.v2) -} - -// truncateUnsignedInteger truncates an unsigned integer. -type truncateUnsignedInteger struct { - v uint64 - v2 uint32 -} - -func (t *truncateUnsignedInteger) save(m Map) { - t.v2 = uint32(t.v) - m.Save("v", &t.v) -} - -func (t *truncateUnsignedInteger) load(m Map) { - m.Load("v", &t.v2) - t.v = uint64(t.v2) -} - -// truncateFloat truncates a floating point number. -type truncateFloat struct { - v float64 - v2 float32 -} - -func (t *truncateFloat) save(m Map) { - t.v2 = float32(t.v) - m.Save("v", &t.v) -} - -func (t *truncateFloat) load(m Map) { - m.Load("v", &t.v2) - t.v = float64(t.v2) -} - -func TestTypes(t *testing.T) { - // x and y are basic integers, while xp points to x. - x := 1 - y := 2 - xp := &x - - // cs is a single object cycle. - cs := cycleStruct{nil} - cs.c = &cs - - // cs1 and cs2 are in a two object cycle. - cs1 := cycleStruct{nil} - cs2 := cycleStruct{nil} - cs1.c = &cs2 - cs2.c = &cs1 - - // bs is a single object cycle. - bs := badCycleStruct{nil} - bs.b = &bs - - // bs2 and bs2 are in a deadlocking cycle. - bs1 := badCycleStruct{nil} - bs2 := badCycleStruct{nil} - bs1.b = &bs2 - bs2.b = &bs1 - - // regular nils. - var ( - nilmap dumbMap - nilslice []byte - ) - - // embed points to embedded fields. - embed1 := pointerStruct{} - embed1.AA = &embed1.A - embed2 := pointerStruct{} - embed2.BB = &embed2.B - - // es1 contains two structs pointing to the same empty struct. 
- es := emptyStructPointer{new(struct{})} - es1 := []emptyStructPointer{es, es} - - tests := []TestCase{ - { - Name: "bool", - Objects: []interface{}{ - true, - false, - }, - }, - { - Name: "integers", - Objects: []interface{}{ - int(0), - int(1), - int(-1), - int8(0), - int8(1), - int8(-1), - int16(0), - int16(1), - int16(-1), - int32(0), - int32(1), - int32(-1), - int64(0), - int64(1), - int64(-1), - }, - }, - { - Name: "unsigned integers", - Objects: []interface{}{ - uint(0), - uint(1), - uint8(0), - uint8(1), - uint16(0), - uint16(1), - uint32(1), - uint64(0), - uint64(1), - }, - }, - { - Name: "strings", - Objects: []interface{}{ - "", - "foo", - "bar", - "\xa0", - }, - }, - { - Name: "slices", - Objects: []interface{}{ - []int{-1, 0, 1}, - []*int{&x, &x, &x}, - []int{1, 2, 3}[0:1], - []int{1, 2, 3}[1:2], - make([]byte, 32), - make([]byte, 32)[:16], - make([]byte, 32)[:16:20], - nilslice, - }, - }, - { - Name: "arrays", - Objects: []interface{}{ - &[1048576]bool{false, true, false, true}, - &[1048576]uint8{0, 1, 2, 3}, - &[1048576]byte{0, 1, 2, 3}, - &[1048576]uint16{0, 1, 2, 3}, - &[1048576]uint{0, 1, 2, 3}, - &[1048576]uint32{0, 1, 2, 3}, - &[1048576]uint64{0, 1, 2, 3}, - &[1048576]uintptr{0, 1, 2, 3}, - &[1048576]int8{0, -1, -2, -3}, - &[1048576]int16{0, -1, -2, -3}, - &[1048576]int32{0, -1, -2, -3}, - &[1048576]int64{0, -1, -2, -3}, - &[1048576]float32{0, 1.1, 2.2, 3.3}, - &[1048576]float64{0, 1.1, 2.2, 3.3}, - }, - }, - { - Name: "pointers", - Objects: []interface{}{ - &pointerStruct{A: &x, B: &x, C: &y, D: &y, AA: &xp, BB: &xp}, - &pointerStruct{}, - }, - }, - { - Name: "empty struct", - Objects: []interface{}{ - struct{}{}, - }, - }, - { - Name: "unenlightened structs", - Objects: []interface{}{ - &dumbStruct{A: 1, B: 2}, - }, - Fail: true, - }, - { - Name: "enlightened structs", - Objects: []interface{}{ - &smartStruct{A: 1, B: 2}, - }, - }, - { - Name: "load-hooks", - Objects: []interface{}{ - &afterLoadStruct{v: 1}, - &valueLoadStruct{v: 1}, - 
&genericContainer{v: &afterLoadStruct{v: 1}}, - &genericContainer{v: &valueLoadStruct{v: 1}}, - &sliceContainer{v: []interface{}{&afterLoadStruct{v: 1}}}, - &sliceContainer{v: []interface{}{&valueLoadStruct{v: 1}}}, - &mapContainer{v: map[int]interface{}{0: &afterLoadStruct{v: 1}}}, - &mapContainer{v: map[int]interface{}{0: &valueLoadStruct{v: 1}}}, - }, - }, - { - Name: "maps", - Objects: []interface{}{ - dumbMap{"a": -1, "b": 0, "c": 1}, - map[smartStruct]int{{}: 0, {A: 1}: 1}, - nilmap, - &mapContainer{v: map[int]interface{}{0: &smartStruct{A: 1}}}, - }, - }, - { - Name: "interfaces", - Objects: []interface{}{ - &testI{&testImpl{}}, - &testI{nil}, - &testI{(*testImpl)(nil)}, - }, - }, - { - Name: "unregistered-interfaces", - Objects: []interface{}{ - &genericContainer{v: afterLoadStruct{v: 1}}, - &genericContainer{v: valueLoadStruct{v: 1}}, - &sliceContainer{v: []interface{}{afterLoadStruct{v: 1}}}, - &sliceContainer{v: []interface{}{valueLoadStruct{v: 1}}}, - &mapContainer{v: map[int]interface{}{0: afterLoadStruct{v: 1}}}, - &mapContainer{v: map[int]interface{}{0: valueLoadStruct{v: 1}}}, - }, - Fail: true, - }, - { - Name: "cycles", - Objects: []interface{}{ - &cs, - &cs1, - &cycleStruct{&cs1}, - &cycleStruct{&cs}, - &badCycleStruct{nil}, - &bs, - }, - }, - { - Name: "deadlock", - Objects: []interface{}{ - &bs1, - }, - Fail: true, - }, - { - Name: "embed", - Objects: []interface{}{ - &embed1, - &embed2, - }, - Fail: true, - }, - { - Name: "empty structs", - Objects: []interface{}{ - new(struct{}), - es, - es1, - }, - }, - { - Name: "truncated okay", - Objects: []interface{}{ - &truncateInteger{v: 1}, - &truncateUnsignedInteger{v: 1}, - &truncateFloat{v: 1.0}, - }, - }, - { - Name: "truncated bad", - Objects: []interface{}{ - &truncateInteger{v: math.MaxInt32 + 1}, - &truncateUnsignedInteger{v: math.MaxUint32 + 1}, - &truncateFloat{v: math.MaxFloat32 * 2}, - }, - Fail: true, - }, - } - - runTest(t, tests) -} - -// benchStruct is used for benchmarking. 
-type benchStruct struct { - b *benchStruct - - // Dummy data is included to ensure that these objects are large. - // This is to detect possible regression when registering objects. - _ [4096]byte -} - -func (b *benchStruct) save(m Map) { - m.Save("b", &b.b) -} - -func (b *benchStruct) load(m Map) { - m.LoadWait("b", &b.b) - m.AfterLoad(b.afterLoad) -} - -func (b *benchStruct) afterLoad() { - // Do nothing, just force scheduling. -} - -// buildObject builds a benchmark object. -func buildObject(n int) (b *benchStruct) { - for i := 0; i < n; i++ { - b = &benchStruct{b: b} - } - return -} - -func BenchmarkEncoding(b *testing.B) { - b.StopTimer() - bs := buildObject(b.N) - var stats Stats - b.StartTimer() - if err := Save(context.Background(), ioutil.Discard, bs, &stats); err != nil { - b.Errorf("save failed: %v", err) - } - b.StopTimer() - if b.N > 1000 { - b.Logf("breakdown (n=%d): %s", b.N, &stats) - } -} - -func BenchmarkDecoding(b *testing.B) { - b.StopTimer() - bs := buildObject(b.N) - var newBS benchStruct - buf := &bytes.Buffer{} - if err := Save(context.Background(), buf, bs, nil); err != nil { - b.Errorf("save failed: %v", err) - } - var stats Stats - b.StartTimer() - if err := Load(context.Background(), buf, &newBS, &stats); err != nil { - b.Errorf("load failed: %v", err) - } - b.StopTimer() - if b.N > 1000 { - b.Logf("breakdown (n=%d): %s", b.N, &stats) - } -} - -func init() { - Register("stateTest.smartStruct", (*smartStruct)(nil), Fns{ - Save: (*smartStruct).save, - Load: (*smartStruct).load, - }) - Register("stateTest.afterLoadStruct", (*afterLoadStruct)(nil), Fns{ - Save: (*afterLoadStruct).save, - Load: (*afterLoadStruct).load, - }) - Register("stateTest.valueLoadStruct", (*valueLoadStruct)(nil), Fns{ - Save: (*valueLoadStruct).save, - Load: (*valueLoadStruct).load, - }) - Register("stateTest.genericContainer", (*genericContainer)(nil), Fns{ - Save: (*genericContainer).save, - Load: (*genericContainer).load, - }) - 
Register("stateTest.sliceContainer", (*sliceContainer)(nil), Fns{ - Save: (*sliceContainer).save, - Load: (*sliceContainer).load, - }) - Register("stateTest.mapContainer", (*mapContainer)(nil), Fns{ - Save: (*mapContainer).save, - Load: (*mapContainer).load, - }) - Register("stateTest.pointerStruct", (*pointerStruct)(nil), Fns{ - Save: (*pointerStruct).save, - Load: (*pointerStruct).load, - }) - Register("stateTest.testImpl", (*testImpl)(nil), Fns{ - Save: (*testImpl).save, - Load: (*testImpl).load, - }) - Register("stateTest.testI", (*testI)(nil), Fns{ - Save: (*testI).save, - Load: (*testI).load, - }) - Register("stateTest.cycleStruct", (*cycleStruct)(nil), Fns{ - Save: (*cycleStruct).save, - Load: (*cycleStruct).load, - }) - Register("stateTest.badCycleStruct", (*badCycleStruct)(nil), Fns{ - Save: (*badCycleStruct).save, - Load: (*badCycleStruct).load, - }) - Register("stateTest.emptyStructPointer", (*emptyStructPointer)(nil), Fns{ - Save: (*emptyStructPointer).save, - Load: (*emptyStructPointer).load, - }) - Register("stateTest.truncateInteger", (*truncateInteger)(nil), Fns{ - Save: (*truncateInteger).save, - Load: (*truncateInteger).load, - }) - Register("stateTest.truncateUnsignedInteger", (*truncateUnsignedInteger)(nil), Fns{ - Save: (*truncateUnsignedInteger).save, - Load: (*truncateUnsignedInteger).load, - }) - Register("stateTest.truncateFloat", (*truncateFloat)(nil), Fns{ - Save: (*truncateFloat).save, - Load: (*truncateFloat).load, - }) - Register("stateTest.benchStruct", (*benchStruct)(nil), Fns{ - Save: (*benchStruct).save, - Load: (*benchStruct).load, - }) -} diff --git a/pkg/state/statefile/BUILD b/pkg/state/statefile/BUILD index e7581c09b..d6c89c7e9 100644 --- a/pkg/state/statefile/BUILD +++ b/pkg/state/statefile/BUILD @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/binary", "//pkg/compressio", + "//pkg/state/wire", ], ) diff --git a/pkg/state/statefile/statefile.go b/pkg/state/statefile/statefile.go index c0f4c4954..bdfb800fb 100644 --- 
a/pkg/state/statefile/statefile.go +++ b/pkg/state/statefile/statefile.go @@ -57,6 +57,7 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/compressio" + "gvisor.dev/gvisor/pkg/state/wire" ) // keySize is the AES-256 key length. @@ -83,10 +84,16 @@ var ErrInvalidMetadataLength = fmt.Errorf("metadata length invalid, maximum size // ErrMetadataInvalid is returned if passed metadata is invalid. var ErrMetadataInvalid = fmt.Errorf("metadata invalid, can't start with _") +// WriteCloser is an io.Closer and wire.Writer. +type WriteCloser interface { + wire.Writer + io.Closer +} + // NewWriter returns a state data writer for a statefile. // // Note that the returned WriteCloser must be closed. -func NewWriter(w io.Writer, key []byte, metadata map[string]string) (io.WriteCloser, error) { +func NewWriter(w io.Writer, key []byte, metadata map[string]string) (WriteCloser, error) { if metadata == nil { metadata = make(map[string]string) } @@ -215,7 +222,7 @@ func metadata(r io.Reader, h hash.Hash) (map[string]string, error) { } // NewReader returns a reader for a statefile. -func NewReader(r io.Reader, key []byte) (io.Reader, map[string]string, error) { +func NewReader(r io.Reader, key []byte) (wire.Reader, map[string]string, error) { // Read the metadata with the hash. h := hmac.New(sha256.New, key) metadata, err := metadata(r, h) @@ -224,9 +231,9 @@ func NewReader(r io.Reader, key []byte) (io.Reader, map[string]string, error) { } // Wrap in compression. - rc, err := compressio.NewReader(r, key) + cr, err := compressio.NewReader(r, key) if err != nil { return nil, nil, err } - return rc, metadata, nil + return cr, metadata, nil } diff --git a/pkg/state/stats.go b/pkg/state/stats.go index eb51cda47..eaec664a1 100644 --- a/pkg/state/stats.go +++ b/pkg/state/stats.go @@ -17,7 +17,6 @@ package state import ( "bytes" "fmt" - "reflect" "sort" "time" ) @@ -35,92 +34,81 @@ type statEntry struct { // All exported receivers accept nil. 
type Stats struct { // byType contains a breakdown of time spent by type. - byType map[reflect.Type]*statEntry + // + // This is indexed *directly* by typeID, including zero. + byType []statEntry // stack contains objects in progress. - stack []reflect.Type + stack []typeID + + // names contains type names. + // + // This is also indexed *directly* by typeID, including zero, which we + // hard-code as "state.default". This is only resolved by calling fini + // on the stats object. + names []string // last is the last start time. last time.Time } -// sample adds the samples to the given object. -func (s *Stats) sample(typ reflect.Type) { - now := time.Now() - s.byType[typ].total += now.Sub(s.last) - s.last = now +// init initializes statistics. +func (s *Stats) init() { + s.last = time.Now() + s.stack = append(s.stack, 0) } -// Add adds a sample count. -func (s *Stats) Add(obj reflect.Value) { - if s == nil { - return - } - if s.byType == nil { - s.byType = make(map[reflect.Type]*statEntry) - } - typ := obj.Type() - entry, ok := s.byType[typ] - if !ok { - entry = new(statEntry) - s.byType[typ] = entry +// fini finalizes statistics. +func (s *Stats) fini(resolve func(id typeID) string) { + s.done() + + // Resolve all type names. + s.names = make([]string, len(s.byType)) + s.names[0] = "state.default" // See above. + for id := typeID(1); int(id) < len(s.names); id++ { + s.names[id] = resolve(id) } - entry.count++ } -// Remove removes a sample count. It should only be called after a previous -// Add(). -func (s *Stats) Remove(obj reflect.Value) { - if s == nil { - return +// sample adds the samples to the given object. +func (s *Stats) sample(id typeID) { + now := time.Now() + if len(s.byType) <= int(id) { + // Allocate all the missing entries in one fell swoop. + s.byType = append(s.byType, make([]statEntry, 1+int(id)-len(s.byType))...) 
} - typ := obj.Type() - entry := s.byType[typ] - entry.count-- + s.byType[id].total += now.Sub(s.last) + s.last = now } -// Start starts a sample. -func (s *Stats) Start(obj reflect.Value) { - if s == nil { - return - } - if len(s.stack) > 0 { - last := s.stack[len(s.stack)-1] - s.sample(last) - } else { - // First time sample. - s.last = time.Now() - } - s.stack = append(s.stack, obj.Type()) +// start starts a sample. +func (s *Stats) start(id typeID) { + last := s.stack[len(s.stack)-1] + s.sample(last) + s.stack = append(s.stack, id) } -// Done finishes the current sample. -func (s *Stats) Done() { - if s == nil { - return - } +// done finishes the current sample. +func (s *Stats) done() { last := s.stack[len(s.stack)-1] s.sample(last) + s.byType[last].count++ s.stack = s.stack[:len(s.stack)-1] } type sliceEntry struct { - typ reflect.Type + name string entry *statEntry } // String returns a table representation of the stats. func (s *Stats) String() string { - if s == nil || len(s.byType) == 0 { - return "(no data)" - } - // Build a list of stat entries. ss := make([]sliceEntry, 0, len(s.byType)) - for typ, entry := range s.byType { + for id := 0; id < len(s.names); id++ { ss = append(ss, sliceEntry{ - typ: typ, - entry: entry, + name: s.names[id], + entry: &s.byType[id], }) } @@ -136,17 +124,22 @@ func (s *Stats) String() string { total time.Duration ) buf.WriteString("\n") - buf.WriteString(fmt.Sprintf("%12s | %8s | %8s | %s\n", "total", "count", "per", "type")) - buf.WriteString("-------------+----------+----------+-------------\n") + buf.WriteString(fmt.Sprintf("% 16s | % 8s | % 16s | %s\n", "total", "count", "per", "type")) + buf.WriteString("-----------------+----------+------------------+----------------\n") for _, se := range ss { + if se.entry.count == 0 { + // Since we store all types linearly, we are not + // guaranteed that any entry actually has time. 
+ continue + } count += se.entry.count total += se.entry.total per := se.entry.total / time.Duration(se.entry.count) - buf.WriteString(fmt.Sprintf("%12s | %8d | %8s | %s\n", - se.entry.total, se.entry.count, per, se.typ.String())) + buf.WriteString(fmt.Sprintf("% 16s | %8d | % 16s | %s\n", + se.entry.total, se.entry.count, per, se.name)) } - buf.WriteString("-------------+----------+----------+-------------\n") - buf.WriteString(fmt.Sprintf("%12s | %8d | %8s | [all]", + buf.WriteString("-----------------+----------+------------------+----------------\n") + buf.WriteString(fmt.Sprintf("% 16s | % 8d | % 16s | [all]", total, count, total/time.Duration(count))) return string(buf.Bytes()) } diff --git a/pkg/state/tests/BUILD b/pkg/state/tests/BUILD new file mode 100644 index 000000000..9297cafbe --- /dev/null +++ b/pkg/state/tests/BUILD @@ -0,0 +1,43 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "tests", + srcs = [ + "array.go", + "bench.go", + "integer.go", + "load.go", + "map.go", + "register.go", + "struct.go", + "tests.go", + ], + deps = [ + "//pkg/state", + "//pkg/state/pretty", + ], +) + +go_test( + name = "tests_test", + size = "small", + srcs = [ + "array_test.go", + "bench_test.go", + "bool_test.go", + "float_test.go", + "integer_test.go", + "load_test.go", + "map_test.go", + "register_test.go", + "string_test.go", + "struct_test.go", + ], + library = ":tests", + deps = [ + "//pkg/state", + "//pkg/state/wire", + ], +) diff --git a/pkg/state/tests/array.go b/pkg/state/tests/array.go new file mode 100644 index 000000000..0972a80e7 --- /dev/null +++ b/pkg/state/tests/array.go @@ -0,0 +1,35 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tests + +// +stateify savable +type arrayContainer struct { + v [1]interface{} +} + +// +stateify savable +type arrayPtrContainer struct { + v *[1]interface{} +} + +// +stateify savable +type sliceContainer struct { + v []interface{} +} + +// +stateify savable +type slicePtrContainer struct { + v *[]interface{} +} diff --git a/pkg/state/tests/array_test.go b/pkg/state/tests/array_test.go new file mode 100644 index 000000000..a347b2947 --- /dev/null +++ b/pkg/state/tests/array_test.go @@ -0,0 +1,134 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tests + +import ( + "reflect" + "testing" +) + +var allArrayPrimitives = []interface{}{ + [1]bool{}, + [1]bool{true}, + [2]bool{false, true}, + [1]int{}, + [1]int{1}, + [2]int{0, 1}, + [1]int8{}, + [1]int8{1}, + [2]int8{0, 1}, + [1]int16{}, + [1]int16{1}, + [2]int16{0, 1}, + [1]int32{}, + [1]int32{1}, + [2]int32{0, 1}, + [1]int64{}, + [1]int64{1}, + [2]int64{0, 1}, + [1]uint{}, + [1]uint{1}, + [2]uint{0, 1}, + [1]uintptr{}, + [1]uintptr{1}, + [2]uintptr{0, 1}, + [1]uint8{}, + [1]uint8{1}, + [2]uint8{0, 1}, + [1]uint16{}, + [1]uint16{1}, + [2]uint16{0, 1}, + [1]uint32{}, + [1]uint32{1}, + [2]uint32{0, 1}, + [1]uint64{}, + [1]uint64{1}, + [2]uint64{0, 1}, + [1]string{}, + [1]string{""}, + [1]string{nonEmptyString}, + [2]string{"", nonEmptyString}, +} + +func TestArrayPrimitives(t *testing.T) { + runTestCases(t, false, "plain", flatten(allArrayPrimitives)) + runTestCases(t, false, "pointers", pointersTo(flatten(allArrayPrimitives))) + runTestCases(t, false, "interfaces", interfacesTo(flatten(allArrayPrimitives))) + runTestCases(t, false, "interfacesToPointers", interfacesTo(pointersTo(flatten(allArrayPrimitives)))) +} + +func TestSlices(t *testing.T) { + var allSlices = flatten( + filter(allArrayPrimitives, func(o interface{}) (interface{}, bool) { + v := reflect.New(reflect.TypeOf(o)).Elem() + v.Set(reflect.ValueOf(o)) + return v.Slice(0, v.Len()).Interface(), true + }), + filter(allArrayPrimitives, func(o interface{}) (interface{}, bool) { + v := reflect.New(reflect.TypeOf(o)).Elem() + v.Set(reflect.ValueOf(o)) + if v.Len() == 0 { + // Return the pure "nil" value for the slice. + return reflect.New(v.Slice(0, 0).Type()).Elem().Interface(), true + } + return v.Slice(1, v.Len()).Interface(), true + }), + filter(allArrayPrimitives, func(o interface{}) (interface{}, bool) { + v := reflect.New(reflect.TypeOf(o)).Elem() + v.Set(reflect.ValueOf(o)) + if v.Len() == 0 { + // Return the zero-valued slice. 
+ return reflect.MakeSlice(v.Slice(0, 0).Type(), 0, 0).Interface(), true + } + return v.Slice(0, v.Len()-1).Interface(), true + }), + ) + runTestCases(t, false, "plain", allSlices) + runTestCases(t, false, "pointers", pointersTo(allSlices)) + runTestCases(t, false, "interfaces", interfacesTo(allSlices)) + runTestCases(t, false, "interfacesToPointers", interfacesTo(pointersTo(allSlices))) +} + +func TestArrayContainers(t *testing.T) { + var ( + emptyArray [1]interface{} + fullArray [1]interface{} + ) + fullArray[0] = &emptyArray + runTestCases(t, false, "", []interface{}{ + arrayContainer{v: emptyArray}, + arrayContainer{v: fullArray}, + arrayPtrContainer{v: nil}, + arrayPtrContainer{v: &emptyArray}, + arrayPtrContainer{v: &fullArray}, + }) +} + +func TestSliceContainers(t *testing.T) { + var ( + nilSlice []interface{} + emptySlice = make([]interface{}, 0) + fullSlice = []interface{}{nil} + ) + runTestCases(t, false, "", []interface{}{ + sliceContainer{v: nilSlice}, + sliceContainer{v: emptySlice}, + sliceContainer{v: fullSlice}, + slicePtrContainer{v: nil}, + slicePtrContainer{v: &nilSlice}, + slicePtrContainer{v: &emptySlice}, + slicePtrContainer{v: &fullSlice}, + }) +} diff --git a/pkg/state/tests/bench.go b/pkg/state/tests/bench.go new file mode 100644 index 000000000..40869cdfb --- /dev/null +++ b/pkg/state/tests/bench.go @@ -0,0 +1,24 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tests + +// +stateify savable +type benchStruct struct { + B *benchStruct // Must be exported for gob. +} + +func (b *benchStruct) afterLoad() { + // Do nothing, just force scheduling. +} diff --git a/pkg/state/tests/bench_test.go b/pkg/state/tests/bench_test.go new file mode 100644 index 000000000..7e102c907 --- /dev/null +++ b/pkg/state/tests/bench_test.go @@ -0,0 +1,153 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tests + +import ( + "bytes" + "context" + "encoding/gob" + "fmt" + "testing" + + "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/state/wire" +) + +// buildPtrObject builds a benchmark object. +func buildPtrObject(n int) interface{} { + b := new(benchStruct) + for i := 0; i < n; i++ { + b = &benchStruct{B: b} + } + return b +} + +// buildMapObject builds a benchmark object. +func buildMapObject(n int) interface{} { + b := new(benchStruct) + m := make(map[int]*benchStruct) + for i := 0; i < n; i++ { + m[i] = b + } + return &m +} + +// buildSliceObject builds a benchmark object. 
+func buildSliceObject(n int) interface{} { + b := new(benchStruct) + s := make([]*benchStruct, 0, n) + for i := 0; i < n; i++ { + s = append(s, b) + } + return &s +} + +var allObjects = map[string]struct { + New func(int) interface{} +}{ + "ptr": { + New: buildPtrObject, + }, + "map": { + New: buildMapObject, + }, + "slice": { + New: buildSliceObject, + }, +} + +func buildObjects(n int, fn func(int) interface{}) (iters int, v interface{}) { + // maxSize is the maximum size of an individual object below. For an N + // larger than this, we start to return multiple objects. + const maxSize = 1024 + if n <= maxSize { + return 1, fn(n) + } + iters = (n + maxSize - 1) / maxSize + return iters, fn(maxSize) +} + +// gobSave is a version of save using gob (no stats available). +func gobSave(_ context.Context, w wire.Writer, v interface{}) (_ state.Stats, err error) { + enc := gob.NewEncoder(w) + err = enc.Encode(v) + return +} + +// gobLoad is a version of load using gob (no stats available). +func gobLoad(_ context.Context, r wire.Reader, v interface{}) (_ state.Stats, err error) { + dec := gob.NewDecoder(r) + err = dec.Decode(v) + return +} + +var allAlgos = map[string]struct { + Save func(context.Context, wire.Writer, interface{}) (state.Stats, error) + Load func(context.Context, wire.Reader, interface{}) (state.Stats, error) + MaxPtr int +}{ + "state": { + Save: state.Save, + Load: state.Load, + }, + "gob": { + Save: gobSave, + Load: gobLoad, + }, +} + +func BenchmarkEncoding(b *testing.B) { + for objName, objInfo := range allObjects { + for algoName, algoInfo := range allAlgos { + b.Run(fmt.Sprintf("%s/%s", objName, algoName), func(b *testing.B) { + b.StopTimer() + n, v := buildObjects(b.N, objInfo.New) + b.ReportAllocs() + b.StartTimer() + for i := 0; i < n; i++ { + if _, err := algoInfo.Save(context.Background(), discard{}, v); err != nil { + b.Errorf("save failed: %v", err) + } + } + b.StopTimer() + }) + } + } +} + +func BenchmarkDecoding(b *testing.B) { + for 
objName, objInfo := range allObjects { + for algoName, algoInfo := range allAlgos { + b.Run(fmt.Sprintf("%s/%s", objName, algoName), func(b *testing.B) { + b.StopTimer() + n, v := buildObjects(b.N, objInfo.New) + buf := new(bytes.Buffer) + if _, err := algoInfo.Save(context.Background(), buf, v); err != nil { + b.Errorf("save failed: %v", err) + } + b.ReportAllocs() + b.StartTimer() + var r bytes.Reader + for i := 0; i < n; i++ { + r.Reset(buf.Bytes()) + if _, err := algoInfo.Load(context.Background(), &r, v); err != nil { + b.Errorf("load failed: %v", err) + } + } + b.StopTimer() + }) + } + } +} diff --git a/pkg/state/tests/bool_test.go b/pkg/state/tests/bool_test.go new file mode 100644 index 000000000..e17cfacf9 --- /dev/null +++ b/pkg/state/tests/bool_test.go @@ -0,0 +1,31 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tests + +import ( + "testing" +) + +var allBools = []bool{ + true, + false, +} + +func TestBool(t *testing.T) { + runTestCases(t, false, "plain", flatten(allBools)) + runTestCases(t, false, "pointers", pointersTo(flatten(allBools))) + runTestCases(t, false, "interfaces", interfacesTo(flatten(allBools))) + runTestCases(t, false, "interfacesToPointers", interfacesTo(pointersTo(flatten(allBools)))) +} diff --git a/pkg/state/tests/float_test.go b/pkg/state/tests/float_test.go new file mode 100644 index 000000000..3e89edd9c --- /dev/null +++ b/pkg/state/tests/float_test.go @@ -0,0 +1,118 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tests + +import ( + "math" + "testing" +) + +var safeFloat32s = []float32{ + float32(0.0), + float32(1.0), + float32(-1.0), + float32(math.Inf(1)), + float32(math.Inf(-1)), +} + +var allFloat32s = append(safeFloat32s, float32(math.NaN())) + +var safeFloat64s = []float64{ + float64(0.0), + float64(1.0), + float64(-1.0), + math.Inf(1), + math.Inf(-1), +} + +var allFloat64s = append(safeFloat64s, math.NaN()) + +func TestFloat(t *testing.T) { + runTestCases(t, false, "plain", flatten( + allFloat32s, + allFloat64s, + )) + // See checkEqual for why NaNs are missing. 
+ runTestCases(t, false, "pointers", pointersTo(flatten( + safeFloat32s, + safeFloat64s, + ))) + runTestCases(t, false, "interfaces", interfacesTo(flatten( + safeFloat32s, + safeFloat64s, + ))) + runTestCases(t, false, "interfacesToPointers", interfacesTo(pointersTo(flatten( + safeFloat32s, + safeFloat64s, + )))) +} + +const onlyDouble float64 = 1.0000000000000002 + +func TestFloatTruncation(t *testing.T) { + runTestCases(t, true, "pass", []interface{}{ + truncatingFloat32{save: onlyDouble}, + }) + runTestCases(t, false, "fail", []interface{}{ + truncatingFloat32{save: 1.0}, + }) +} + +var safeComplex64s = combine(safeFloat32s, safeFloat32s, func(i, j interface{}) interface{} { + return complex(i.(float32), j.(float32)) +}) + +var allComplex64s = combine(allFloat32s, allFloat32s, func(i, j interface{}) interface{} { + return complex(i.(float32), j.(float32)) +}) + +var safeComplex128s = combine(safeFloat64s, safeFloat64s, func(i, j interface{}) interface{} { + return complex(i.(float64), j.(float64)) +}) + +var allComplex128s = combine(allFloat64s, allFloat64s, func(i, j interface{}) interface{} { + return complex(i.(float64), j.(float64)) +}) + +func TestComplex(t *testing.T) { + runTestCases(t, false, "plain", flatten( + allComplex64s, + allComplex128s, + )) + // See TestFloat; same issue. 
+ runTestCases(t, false, "pointers", pointersTo(flatten( + safeComplex64s, + safeComplex128s, + ))) + runTestCases(t, false, "interfaces", interfacesTo(flatten( + safeComplex64s, + safeComplex128s, + ))) + runTestCases(t, false, "interfacesToPointers", interfacesTo(pointersTo(flatten( + safeComplex64s, + safeComplex128s, + )))) +} + +func TestComplexTruncation(t *testing.T) { + runTestCases(t, true, "pass", []interface{}{ + truncatingComplex64{save: complex(onlyDouble, onlyDouble)}, + truncatingComplex64{save: complex(1.0, onlyDouble)}, + truncatingComplex64{save: complex(onlyDouble, 1.0)}, + }) + runTestCases(t, false, "fail", []interface{}{ + truncatingComplex64{save: complex(1.0, 1.0)}, + }) +} diff --git a/pkg/state/tests/integer.go b/pkg/state/tests/integer.go new file mode 100644 index 000000000..ca403eed1 --- /dev/null +++ b/pkg/state/tests/integer.go @@ -0,0 +1,163 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tests + +import ( + "gvisor.dev/gvisor/pkg/state" +) + +// +stateify type +type truncatingUint8 struct { + save uint64 + load uint8 `state:"nosave"` +} + +func (t *truncatingUint8) StateSave(m state.Sink) { + m.Save(0, &t.save) +} + +func (t *truncatingUint8) StateLoad(m state.Source) { + m.Load(0, &t.load) + t.save = uint64(t.load) + t.load = 0 +} + +var _ state.SaverLoader = (*truncatingUint8)(nil) + +// +stateify type +type truncatingUint16 struct { + save uint64 + load uint16 `state:"nosave"` +} + +func (t *truncatingUint16) StateSave(m state.Sink) { + m.Save(0, &t.save) +} + +func (t *truncatingUint16) StateLoad(m state.Source) { + m.Load(0, &t.load) + t.save = uint64(t.load) + t.load = 0 +} + +var _ state.SaverLoader = (*truncatingUint16)(nil) + +// +stateify type +type truncatingUint32 struct { + save uint64 + load uint32 `state:"nosave"` +} + +func (t *truncatingUint32) StateSave(m state.Sink) { + m.Save(0, &t.save) +} + +func (t *truncatingUint32) StateLoad(m state.Source) { + m.Load(0, &t.load) + t.save = uint64(t.load) + t.load = 0 +} + +var _ state.SaverLoader = (*truncatingUint32)(nil) + +// +stateify type +type truncatingInt8 struct { + save int64 + load int8 `state:"nosave"` +} + +func (t *truncatingInt8) StateSave(m state.Sink) { + m.Save(0, &t.save) +} + +func (t *truncatingInt8) StateLoad(m state.Source) { + m.Load(0, &t.load) + t.save = int64(t.load) + t.load = 0 +} + +var _ state.SaverLoader = (*truncatingInt8)(nil) + +// +stateify type +type truncatingInt16 struct { + save int64 + load int16 `state:"nosave"` +} + +func (t *truncatingInt16) StateSave(m state.Sink) { + m.Save(0, &t.save) +} + +func (t *truncatingInt16) StateLoad(m state.Source) { + m.Load(0, &t.load) + t.save = int64(t.load) + t.load = 0 +} + +var _ state.SaverLoader = (*truncatingInt16)(nil) + +// +stateify type +type truncatingInt32 struct { + save int64 + load int32 `state:"nosave"` +} + +func (t *truncatingInt32) StateSave(m state.Sink) { + m.Save(0, &t.save) +} + 
+func (t *truncatingInt32) StateLoad(m state.Source) { + m.Load(0, &t.load) + t.save = int64(t.load) + t.load = 0 +} + +var _ state.SaverLoader = (*truncatingInt32)(nil) + +// +stateify type +type truncatingFloat32 struct { + save float64 + load float32 `state:"nosave"` +} + +func (t *truncatingFloat32) StateSave(m state.Sink) { + m.Save(0, &t.save) +} + +func (t *truncatingFloat32) StateLoad(m state.Source) { + m.Load(0, &t.load) + t.save = float64(t.load) + t.load = 0 +} + +var _ state.SaverLoader = (*truncatingFloat32)(nil) + +// +stateify type +type truncatingComplex64 struct { + save complex128 + load complex64 `state:"nosave"` +} + +func (t *truncatingComplex64) StateSave(m state.Sink) { + m.Save(0, &t.save) +} + +func (t *truncatingComplex64) StateLoad(m state.Source) { + m.Load(0, &t.load) + t.save = complex128(t.load) + t.load = 0 +} + +var _ state.SaverLoader = (*truncatingComplex64)(nil) diff --git a/pkg/state/tests/integer_test.go b/pkg/state/tests/integer_test.go new file mode 100644 index 000000000..d3931c952 --- /dev/null +++ b/pkg/state/tests/integer_test.go @@ -0,0 +1,94 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tests + +import ( + "math" + "testing" +) + +var ( + allIntTs = []int{-1, 0, 1} + allInt8s = []int8{math.MinInt8, -1, 0, 1, math.MaxInt8} + allInt16s = []int16{math.MinInt16, -1, 0, 1, math.MaxInt16} + allInt32s = []int32{math.MinInt32, -1, 0, 1, math.MaxInt32} + allInt64s = []int64{math.MinInt64, -1, 0, 1, math.MaxInt64} + allUintTs = []uint{0, 1} + allUintptrs = []uintptr{0, 1, ^uintptr(0)} + allUint8s = []uint8{0, 1, math.MaxUint8} + allUint16s = []uint16{0, 1, math.MaxUint16} + allUint32s = []uint32{0, 1, math.MaxUint32} + allUint64s = []uint64{0, 1, math.MaxUint64} +) + +var allInts = flatten( + allIntTs, + allInt8s, + allInt16s, + allInt32s, + allInt64s, +) + +var allUints = flatten( + allUintTs, + allUintptrs, + allUint8s, + allUint16s, + allUint32s, + allUint64s, +) + +func TestInt(t *testing.T) { + runTestCases(t, false, "plain", allInts) + runTestCases(t, false, "pointers", pointersTo(allInts)) + runTestCases(t, false, "interfaces", interfacesTo(allInts)) + runTestCases(t, false, "interfacesTo", interfacesTo(pointersTo(allInts))) +} + +func TestIntTruncation(t *testing.T) { + runTestCases(t, true, "pass", []interface{}{ + truncatingInt8{save: math.MinInt8 - 1}, + truncatingInt16{save: math.MinInt16 - 1}, + truncatingInt32{save: math.MinInt32 - 1}, + truncatingInt8{save: math.MaxInt8 + 1}, + truncatingInt16{save: math.MaxInt16 + 1}, + truncatingInt32{save: math.MaxInt32 + 1}, + }) + runTestCases(t, false, "fail", []interface{}{ + truncatingInt8{save: 1}, + truncatingInt16{save: 1}, + truncatingInt32{save: 1}, + }) +} + +func TestUint(t *testing.T) { + runTestCases(t, false, "plain", allUints) + runTestCases(t, false, "pointers", pointersTo(allUints)) + runTestCases(t, false, "interfaces", interfacesTo(allUints)) + runTestCases(t, false, "interfacesTo", interfacesTo(pointersTo(allUints))) +} + +func TestUintTruncation(t *testing.T) { + runTestCases(t, true, "pass", []interface{}{ + truncatingUint8{save: math.MaxUint8 + 1}, + 
truncatingUint16{save: math.MaxUint16 + 1}, + truncatingUint32{save: math.MaxUint32 + 1}, + }) + runTestCases(t, false, "fail", []interface{}{ + truncatingUint8{save: 1}, + truncatingUint16{save: 1}, + truncatingUint32{save: 1}, + }) +} diff --git a/pkg/state/tests/load.go b/pkg/state/tests/load.go new file mode 100644 index 000000000..a8350c0f3 --- /dev/null +++ b/pkg/state/tests/load.go @@ -0,0 +1,61 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tests + +// +stateify savable +type genericContainer struct { + v interface{} +} + +// +stateify savable +type afterLoadStruct struct { + v int `state:"nosave"` +} + +func (a *afterLoadStruct) afterLoad() { + a.v++ +} + +// +stateify savable +type valueLoadStruct struct { + v int `state:".(int64)"` +} + +func (v *valueLoadStruct) saveV() int64 { + return int64(v.v) // Save as int64. +} + +func (v *valueLoadStruct) loadV(value int64) { + v.v = int(value) // Load as int. +} + +// +stateify savable +type cycleStruct struct { + c *cycleStruct +} + +// +stateify savable +type badCycleStruct struct { + b *badCycleStruct `state:"wait"` +} + +func (b *badCycleStruct) afterLoad() { + if b.b != b { + // This is not executable, since AfterLoad requires that the + // object and all dependencies are complete. This should cause + // a deadlock error during load. 
+ panic("badCycleStruct.afterLoad called") + } +} diff --git a/pkg/state/tests/load_test.go b/pkg/state/tests/load_test.go new file mode 100644 index 000000000..1e9794296 --- /dev/null +++ b/pkg/state/tests/load_test.go @@ -0,0 +1,70 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tests + +import ( + "testing" +) + +func TestLoadHooks(t *testing.T) { + runTestCases(t, false, "load-hooks", []interface{}{ + &afterLoadStruct{v: 1}, + &valueLoadStruct{v: 1}, + &genericContainer{v: &afterLoadStruct{v: 1}}, + &genericContainer{v: &valueLoadStruct{v: 1}}, + &sliceContainer{v: []interface{}{&afterLoadStruct{v: 1}}}, + &sliceContainer{v: []interface{}{&valueLoadStruct{v: 1}}}, + &mapContainer{v: map[int]interface{}{0: &afterLoadStruct{v: 1}}}, + &mapContainer{v: map[int]interface{}{0: &valueLoadStruct{v: 1}}}, + }) +} + +func TestCycles(t *testing.T) { + // cs is a single object cycle. + cs := cycleStruct{nil} + cs.c = &cs + + // cs1 and cs2 are in a two object cycle. + cs1 := cycleStruct{nil} + cs2 := cycleStruct{nil} + cs1.c = &cs2 + cs2.c = &cs1 + + runTestCases(t, false, "cycles", []interface{}{ + cs, + cs1, + }) +} + +func TestDeadlock(t *testing.T) { + // bs is a single object cycle. This does not cause deadlock because an + // object cannot wait for itself. + bs := badCycleStruct{nil} + bs.b = &bs + + runTestCases(t, false, "self", []interface{}{ + &bs, + }) + + // bs1 and bs2 are in a deadlocking cycle. 
+ bs1 := badCycleStruct{nil} + bs2 := badCycleStruct{nil} + bs1.b = &bs2 + bs2.b = &bs1 + + runTestCases(t, true, "deadlock", []interface{}{ + &bs1, + }) +} diff --git a/pkg/state/tests/map.go b/pkg/state/tests/map.go new file mode 100644 index 000000000..db4e548f1 --- /dev/null +++ b/pkg/state/tests/map.go @@ -0,0 +1,28 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tests + +// +stateify savable +type mapContainer struct { + v map[int]interface{} +} + +// +stateify savable +type mapPtrContainer struct { + v *map[int]interface{} +} + +// +stateify savable +type registeredMapStruct struct{} diff --git a/pkg/state/tests/map_test.go b/pkg/state/tests/map_test.go new file mode 100644 index 000000000..92bf0fc01 --- /dev/null +++ b/pkg/state/tests/map_test.go @@ -0,0 +1,90 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tests + +import ( + "reflect" + "testing" +) + +var allMapPrimitives = []interface{}{ + bool(true), + int(1), + int8(1), + int16(1), + int32(1), + int64(1), + uint(1), + uintptr(1), + uint8(1), + uint16(1), + uint32(1), + uint64(1), + string(""), + registeredMapStruct{}, +} + +var allMapKeys = flatten(allMapPrimitives, pointersTo(allMapPrimitives)) + +var allMapValues = flatten(allMapPrimitives, pointersTo(allMapPrimitives), interfacesTo(allMapPrimitives)) + +var emptyMaps = combine(allMapKeys, allMapValues, func(v1, v2 interface{}) interface{} { + m := reflect.MakeMap(reflect.MapOf(reflect.TypeOf(v1), reflect.TypeOf(v2))) + return m.Interface() +}) + +var fullMaps = combine(allMapKeys, allMapValues, func(v1, v2 interface{}) interface{} { + m := reflect.MakeMap(reflect.MapOf(reflect.TypeOf(v1), reflect.TypeOf(v2))) + m.SetMapIndex(reflect.Zero(reflect.TypeOf(v1)), reflect.Zero(reflect.TypeOf(v2))) + return m.Interface() +}) + +func TestMapAliasing(t *testing.T) { + v := make(map[int]int) + ptrToV := &v + aliases := []map[int]int{v, v} + runTestCases(t, false, "", []interface{}{ptrToV, aliases}) +} + +func TestMapsEmpty(t *testing.T) { + runTestCases(t, false, "plain", emptyMaps) + runTestCases(t, false, "pointers", pointersTo(emptyMaps)) + runTestCases(t, false, "interfaces", interfacesTo(emptyMaps)) + runTestCases(t, false, "interfacesToPointers", interfacesTo(pointersTo(emptyMaps))) +} + +func TestMapsFull(t *testing.T) { + runTestCases(t, false, "plain", fullMaps) + runTestCases(t, false, "pointers", pointersTo(fullMaps)) + runTestCases(t, false, "interfaces", interfacesTo(fullMaps)) + runTestCases(t, false, "interfacesToPointer", interfacesTo(pointersTo(fullMaps))) +} + +func TestMapContainers(t *testing.T) { + var ( + nilMap map[int]interface{} + emptyMap = make(map[int]interface{}) + fullMap = map[int]interface{}{0: nil} + ) + runTestCases(t, false, "", []interface{}{ + mapContainer{v: nilMap}, + mapContainer{v: emptyMap}, + mapContainer{v: 
fullMap}, + mapPtrContainer{v: nil}, + mapPtrContainer{v: &nilMap}, + mapPtrContainer{v: &emptyMap}, + mapPtrContainer{v: &fullMap}, + }) +} diff --git a/pkg/state/tests/register.go b/pkg/state/tests/register.go new file mode 100644 index 000000000..074d86315 --- /dev/null +++ b/pkg/state/tests/register.go @@ -0,0 +1,21 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tests + +// +stateify savable +type alreadyRegisteredStruct struct{} + +// +stateify savable +type alreadyRegisteredOther int diff --git a/pkg/state/tests/register_test.go b/pkg/state/tests/register_test.go new file mode 100644 index 000000000..c829753cc --- /dev/null +++ b/pkg/state/tests/register_test.go @@ -0,0 +1,167 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tests + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/state" +) + +// faker calls itself whatever is in the name field. +type faker struct { + Name string + Fields []string +} + +func (f *faker) StateTypeName() string { + return f.Name +} + +func (f *faker) StateFields() []string { + return f.Fields +} + +// fakerWithSaverLoader has all it needs. +type fakerWithSaverLoader struct { + faker +} + +func (f *fakerWithSaverLoader) StateSave(m state.Sink) {} + +func (f *fakerWithSaverLoader) StateLoad(m state.Source) {} + +// fakerOther calls itself .. uh, itself? +type fakerOther string + +func (f *fakerOther) StateTypeName() string { + return string(*f) +} + +func (f *fakerOther) StateFields() []string { + return nil +} + +func newFakerOther(name string) *fakerOther { + f := fakerOther(name) + return &f +} + +// fakerOtherBadFields returns non-nil fields. +type fakerOtherBadFields string + +func (f *fakerOtherBadFields) StateTypeName() string { + return string(*f) +} + +func (f *fakerOtherBadFields) StateFields() []string { + return []string{string(*f)} +} + +func newFakerOtherBadFields(name string) *fakerOtherBadFields { + f := fakerOtherBadFields(name) + return &f +} + +// fakerOtherSaverLoader implements SaverLoader methods. 
+type fakerOtherSaverLoader string + +func (f *fakerOtherSaverLoader) StateTypeName() string { + return string(*f) +} + +func (f *fakerOtherSaverLoader) StateFields() []string { + return nil +} + +func (f *fakerOtherSaverLoader) StateSave(m state.Sink) {} + +func (f *fakerOtherSaverLoader) StateLoad(m state.Source) {} + +func newFakerOtherSaverLoader(name string) *fakerOtherSaverLoader { + f := fakerOtherSaverLoader(name) + return &f +} + +func TestRegisterPrimitives(t *testing.T) { + for _, typeName := range []string{ + "int", + "int8", + "int16", + "int32", + "int64", + "uint", + "uintptr", + "uint8", + "uint16", + "uint32", + "uint64", + "float32", + "float64", + "complex64", + "complex128", + "string", + } { + t.Run("struct/"+typeName, func(t *testing.T) { + defer func() { + if r := recover(); r == nil { + t.Errorf("Registering type %q did not panic", typeName) + } + }() + state.Register(&faker{ + Name: typeName, + }) + }) + t.Run("other/"+typeName, func(t *testing.T) { + defer func() { + if r := recover(); r == nil { + t.Errorf("Registering type %q did not panic", typeName) + } + }() + state.Register(newFakerOther(typeName)) + }) + } +} + +func TestRegisterBad(t *testing.T) { + const ( + goodName = "foo" + firstField = "a" + secondField = "b" + ) + for name, object := range map[string]state.Type{ + "non-struct-with-fields": newFakerOtherBadFields(goodName), + "non-struct-with-saverloader": newFakerOtherSaverLoader(goodName), + "struct-without-saverloader": &faker{Name: goodName}, + "non-struct-duplicate-with-struct": newFakerOther((new(alreadyRegisteredStruct)).StateTypeName()), + "non-struct-duplicate-with-non-struct": newFakerOther((new(alreadyRegisteredOther)).StateTypeName()), + "struct-duplicate-with-struct": &fakerWithSaverLoader{faker{Name: (new(alreadyRegisteredStruct)).StateTypeName()}}, + "struct-duplicate-with-non-struct": &fakerWithSaverLoader{faker{Name: (new(alreadyRegisteredOther)).StateTypeName()}}, + "struct-with-empty-field": 
&fakerWithSaverLoader{faker{Name: goodName, Fields: []string{""}}}, + "struct-with-empty-field-and-non-empty": &fakerWithSaverLoader{faker{Name: goodName, Fields: []string{firstField, ""}}}, + "struct-with-duplicate-field": &fakerWithSaverLoader{faker{Name: goodName, Fields: []string{firstField, firstField}}}, + "struct-with-duplicate-field-and-non-dup": &fakerWithSaverLoader{faker{Name: goodName, Fields: []string{firstField, secondField, firstField}}}, + } { + t.Run(name, func(t *testing.T) { + defer func() { + if r := recover(); r == nil { + t.Errorf("Registering object %#v did not panic", object) + } + }() + state.Register(object) + }) + + } +} diff --git a/pkg/state/tests/string_test.go b/pkg/state/tests/string_test.go new file mode 100644 index 000000000..44f5a562c --- /dev/null +++ b/pkg/state/tests/string_test.go @@ -0,0 +1,34 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tests + +import ( + "testing" +) + +const nonEmptyString = "hello world" + +var allStrings = []string{ + "", + nonEmptyString, + "\\0", +} + +func TestString(t *testing.T) { + runTestCases(t, false, "plain", flatten(allStrings)) + runTestCases(t, false, "pointers", pointersTo(flatten(allStrings))) + runTestCases(t, false, "interfaces", interfacesTo(flatten(allStrings))) + runTestCases(t, false, "interfacesToPointers", interfacesTo(pointersTo(flatten(allStrings)))) +} diff --git a/pkg/state/tests/struct.go b/pkg/state/tests/struct.go new file mode 100644 index 000000000..bd2c2b399 --- /dev/null +++ b/pkg/state/tests/struct.go @@ -0,0 +1,65 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tests + +type unregisteredEmptyStruct struct{} + +// typeOnlyEmptyStruct just implements the state.Type interface. 
+type typeOnlyEmptyStruct struct{} + +func (*typeOnlyEmptyStruct) StateTypeName() string { return "registeredEmptyStruct" } + +func (*typeOnlyEmptyStruct) StateFields() []string { return nil } + +// +stateify savable +type savableEmptyStruct struct{} + +// +stateify savable +type emptyStructPointer struct { + nothing *struct{} +} + +// +stateify savable +type outerSame struct { + inner inner +} + +// +stateify savable +type outerFieldFirst struct { + inner inner + v int64 +} + +// +stateify savable +type outerFieldSecond struct { + v int64 + inner inner +} + +// +stateify savable +type outerArray struct { + inner [2]inner +} + +// +stateify savable +type inner struct { + v int64 +} + +// +stateify savable +type system struct { + v1 interface{} + v2 interface{} +} diff --git a/pkg/state/tests/struct_test.go b/pkg/state/tests/struct_test.go new file mode 100644 index 000000000..de9d17aa7 --- /dev/null +++ b/pkg/state/tests/struct_test.go @@ -0,0 +1,89 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tests + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/state" +) + +func TestEmptyStruct(t *testing.T) { + runTestCases(t, false, "plain", []interface{}{ + unregisteredEmptyStruct{}, + typeOnlyEmptyStruct{}, + savableEmptyStruct{}, + }) + runTestCases(t, false, "pointers", pointersTo([]interface{}{ + unregisteredEmptyStruct{}, + typeOnlyEmptyStruct{}, + savableEmptyStruct{}, + })) + runTestCases(t, false, "interfaces-pass", interfacesTo([]interface{}{ + // Only registered types can be dispatched via interfaces. All + // other types should fail, even if it is the empty struct. + savableEmptyStruct{}, + })) + runTestCases(t, true, "interfaces-fail", interfacesTo([]interface{}{ + unregisteredEmptyStruct{}, + typeOnlyEmptyStruct{}, + })) + runTestCases(t, false, "interfacesToPointers-pass", interfacesTo(pointersTo([]interface{}{ + savableEmptyStruct{}, + }))) + runTestCases(t, true, "interfacesToPointers-fail", interfacesTo(pointersTo([]interface{}{ + unregisteredEmptyStruct{}, + typeOnlyEmptyStruct{}, + }))) + + // Ensuring empty struct aliasing works. + es := emptyStructPointer{new(struct{})} + runTestCases(t, false, "empty-struct-pointers", []interface{}{ + emptyStructPointer{}, + es, + []emptyStructPointer{es, es}, // Same pointer. 
+ }) +} + +func TestRegisterTypeOnlyStruct(t *testing.T) { + defer func() { + if r := recover(); r == nil { + t.Errorf("Register did not panic") + } + }() + state.Register((*typeOnlyEmptyStruct)(nil)) +} + +func TestEmbeddedPointers(t *testing.T) { + var ( + ofs outerSame + of1 outerFieldFirst + of2 outerFieldSecond + oa outerArray + ) + + runTestCases(t, false, "embedded-pointers", []interface{}{ + system{&ofs, &ofs.inner}, + system{&ofs.inner, &ofs}, + system{&of1, &of1.inner}, + system{&of1.inner, &of1}, + system{&of2, &of2.inner}, + system{&of2.inner, &of2}, + system{&oa, &oa.inner[0]}, + system{&oa, &oa.inner[1]}, + system{&oa.inner[0], &oa}, + system{&oa.inner[1], &oa}, + }) +} diff --git a/pkg/state/tests/tests.go b/pkg/state/tests/tests.go new file mode 100644 index 000000000..435a0e9db --- /dev/null +++ b/pkg/state/tests/tests.go @@ -0,0 +1,215 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package tests tests the state packages. +package tests + +import ( + "bytes" + "context" + "fmt" + "math" + "reflect" + "testing" + + "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/state/pretty" +) + +// discard is an implementation of wire.Writer. +type discard struct{} + +// Write implements wire.Writer.Write. +func (discard) Write(p []byte) (int, error) { return len(p), nil } + +// WriteByte implements wire.Writer.WriteByte. 
+func (discard) WriteByte(byte) error { return nil } + +// checkEqual checks if two objects are equal. +// +// N.B. This only handles one level of dereferences for NaN. Otherwise we +// would need to fork the entire implementation of reflect.DeepEqual. +func checkEqual(root, loadedValue interface{}) bool { + if reflect.DeepEqual(root, loadedValue) { + return true + } + + // NaN is not equal to itself. We handle the case of raw floating point + // primitives here, but don't handle this case nested. + rf32, ok1 := root.(float32) + lf32, ok2 := loadedValue.(float32) + if ok1 && ok2 && math.IsNaN(float64(rf32)) && math.IsNaN(float64(lf32)) { + return true + } + rf64, ok1 := root.(float64) + lf64, ok2 := loadedValue.(float64) + if ok1 && ok2 && math.IsNaN(rf64) && math.IsNaN(lf64) { + return true + } + + // Same deal for complex numbers: compare real and imaginary parts. + rc64, ok1 := root.(complex64) + lc64, ok2 := loadedValue.(complex64) + if ok1 && ok2 { + return checkEqual(real(rc64), real(lc64)) && checkEqual(imag(rc64), imag(lc64)) + } + rc128, ok1 := root.(complex128) + lc128, ok2 := loadedValue.(complex128) + if ok1 && ok2 { + return checkEqual(real(rc128), real(lc128)) && checkEqual(imag(rc128), imag(lc128)) + } + + return false +} + +// runTestCases runs a test for each object in objects. +func runTestCases(t *testing.T, shouldFail bool, prefix string, objects []interface{}) { + t.Helper() + for i, root := range objects { + t.Run(fmt.Sprintf("%s%d", prefix, i), func(t *testing.T) { + t.Logf("Original object:\n%#v", root) + + // Save the passed object. + saveBuffer := &bytes.Buffer{} + saveObjectPtr := reflect.New(reflect.TypeOf(root)) + saveObjectPtr.Elem().Set(reflect.ValueOf(root)) + saveStats, err := state.Save(context.Background(), saveBuffer, saveObjectPtr.Interface()) + if err != nil { + if shouldFail { + return + } + t.Fatalf("Save failed unexpectedly: %v", err) + } + + // Dump the serialized proto to aid with debugging. 
+ var ppBuf bytes.Buffer + t.Logf("Raw state:\n%v", saveBuffer.Bytes()) + if err := pretty.PrintText(&ppBuf, bytes.NewReader(saveBuffer.Bytes())); err != nil { + // We don't count this as a test failure if we + // have shouldFail set, but we will count as a + // failure if we were not expecting to fail. + if !shouldFail { + t.Errorf("PrettyPrint(html=false) failed unexpected: %v", err) + } + } + if err := pretty.PrintHTML(discard{}, bytes.NewReader(saveBuffer.Bytes())); err != nil { + // See above. + if !shouldFail { + t.Errorf("PrettyPrint(html=true) failed unexpected: %v", err) + } + } + t.Logf("Encoded state:\n%s", ppBuf.String()) + t.Logf("Save stats:\n%s", saveStats.String()) + + // Load a new copy of the object. + loadObjectPtr := reflect.New(reflect.TypeOf(root)) + loadStats, err := state.Load(context.Background(), bytes.NewReader(saveBuffer.Bytes()), loadObjectPtr.Interface()) + if err != nil { + if shouldFail { + return + } + t.Fatalf("Load failed unexpectedly: %v", err) + } + + // Compare the values. + loadedValue := loadObjectPtr.Elem().Interface() + if !checkEqual(root, loadedValue) { + if shouldFail { + return + } + t.Fatalf("Objects differ:\n\toriginal: %#v\n\tloaded: %#v\n", root, loadedValue) + } + + // Everything went okay. Is that good? + if shouldFail { + t.Fatalf("This test was expected to fail, but didn't.") + } + t.Logf("Load stats:\n%s", loadStats.String()) + + // Truncate half the bytes in the byte stream, + // and ensure that we can't restore. Then + // truncate only the final byte and ensure that + // we can't restore. 
+ l := saveBuffer.Len() + halfReader := bytes.NewReader(saveBuffer.Bytes()[:l/2]) + if _, err := state.Load(context.Background(), halfReader, loadObjectPtr.Interface()); err == nil { + t.Errorf("Load with half bytes succeeded unexpectedly.") + } + missingByteReader := bytes.NewReader(saveBuffer.Bytes()[:l-1]) + if _, err := state.Load(context.Background(), missingByteReader, loadObjectPtr.Interface()); err == nil { + t.Errorf("Load with missing byte succeeded unexpectedly.") + } + }) + } +} + +// convert converts the slice to an []interface{}. +func convert(v interface{}) (r []interface{}) { + s := reflect.ValueOf(v) // Must be slice. + for i := 0; i < s.Len(); i++ { + r = append(r, s.Index(i).Interface()) + } + return r +} + +// flatten flattens multiple slices. +func flatten(vs ...interface{}) (r []interface{}) { + for _, v := range vs { + r = append(r, convert(v)...) + } + return r +} + +// filter maps from one slice to another. +func filter(vs interface{}, fn func(interface{}) (interface{}, bool)) (r []interface{}) { + s := reflect.ValueOf(vs) + for i := 0; i < s.Len(); i++ { + v, ok := fn(s.Index(i).Interface()) + if ok { + r = append(r, v) + } + } + return r +} + +// combine combines objects in two slices as specified. +func combine(v1, v2 interface{}, fn func(_, _ interface{}) interface{}) (r []interface{}) { + s1 := reflect.ValueOf(v1) + s2 := reflect.ValueOf(v2) + for i := 0; i < s1.Len(); i++ { + for j := 0; j < s2.Len(); j++ { + // Combine using the given function. + r = append(r, fn(s1.Index(i).Interface(), s2.Index(j).Interface())) + } + } + return r +} + +// pointersTo is a filter function that returns pointers. +func pointersTo(vs interface{}) []interface{} { + return filter(vs, func(o interface{}) (interface{}, bool) { + v := reflect.New(reflect.TypeOf(o)) + v.Elem().Set(reflect.ValueOf(o)) + return v.Interface(), true + }) +} + +// interfacesTo is a filter function that returns interface objects. 
+func interfacesTo(vs interface{}) []interface{} { + return filter(vs, func(o interface{}) (interface{}, bool) { + var v [1]interface{} + v[0] = o + return v, true + }) +} diff --git a/pkg/state/types.go b/pkg/state/types.go new file mode 100644 index 000000000..215ef80f8 --- /dev/null +++ b/pkg/state/types.go @@ -0,0 +1,361 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package state + +import ( + "reflect" + "sort" + + "gvisor.dev/gvisor/pkg/state/wire" +) + +// assertValidType asserts that the type is valid. +func assertValidType(name string, fields []string) { + if name == "" { + Failf("type has empty name") + } + fieldsCopy := make([]string, len(fields)) + for i := 0; i < len(fields); i++ { + if fields[i] == "" { + Failf("field has empty name for type %q", name) + } + fieldsCopy[i] = fields[i] + } + sort.Slice(fieldsCopy, func(i, j int) bool { + return fieldsCopy[i] < fieldsCopy[j] + }) + for i := range fieldsCopy { + if i > 0 && fieldsCopy[i-1] == fieldsCopy[i] { + Failf("duplicate field %q for type %s", fieldsCopy[i], name) + } + } +} + +// typeEntry is an entry in the typeDatabase. +type typeEntry struct { + ID typeID + wire.Type +} + +// reconciledTypeEntry is a reconciled entry in the typeDatabase. +type reconciledTypeEntry struct { + wire.Type + LocalType reflect.Type + FieldOrder []int +} + +// typeEncodeDatabase is an internal TypeInfo database for encoding. 
+type typeEncodeDatabase struct { + // byType maps by type to the typeEntry. + byType map[reflect.Type]*typeEntry + + // lastID is the last used ID. + lastID typeID +} + +// makeTypeEncodeDatabase makes a typeDatabase. +func makeTypeEncodeDatabase() typeEncodeDatabase { + return typeEncodeDatabase{ + byType: make(map[reflect.Type]*typeEntry), + } +} + +// typeDecodeDatabase is an internal TypeInfo database for decoding. +type typeDecodeDatabase struct { + // byID maps by ID to type. + byID []*reconciledTypeEntry + + // pending are entries that are pending validation by Lookup. These + // will be reconciled with actual objects. Note that these will also be + // used to lookup types by name, since they may not be reconciled and + // there's little value to deleting from this map. + pending []*wire.Type +} + +// makeTypeDecodeDatabase makes a typeDatabase. +func makeTypeDecodeDatabase() typeDecodeDatabase { + return typeDecodeDatabase{} +} + +// lookupNameFields extracts the name and fields from an object. +func lookupNameFields(typ reflect.Type) (string, []string, bool) { + v := reflect.Zero(reflect.PtrTo(typ)).Interface() + t, ok := v.(Type) + if !ok { + // Is this a primitive? + if typ.Kind() == reflect.Interface { + return interfaceType, nil, true + } + name := typ.Name() + if _, ok := primitiveTypeDatabase[name]; !ok { + // This is not a known type, and not a primitive. The + // encoder may proceed for anonymous empty structs, or + // it may deference the type pointer and try again. + return "", nil, false + } + return name, nil, true + } + // Extract the name from the object. + name := t.StateTypeName() + fields := t.StateFields() + assertValidType(name, fields) + return name, fields, true +} + +// Lookup looks up or registers the given object. +// +// The bool indicates whether this is an existing entry: false means the entry +// did not exist, and true means the entry did exist. 
If this bool is false and +// the returned typeEntry are nil, then the obj did not implement the Type +// interface. +func (tdb *typeEncodeDatabase) Lookup(typ reflect.Type) (*typeEntry, bool) { + te, ok := tdb.byType[typ] + if !ok { + // Lookup the type information. + name, fields, ok := lookupNameFields(typ) + if !ok { + // Empty structs may still be encoded, so let the + // caller decide what to do from here. + return nil, false + } + + // Register the new type. + tdb.lastID++ + te = &typeEntry{ + ID: tdb.lastID, + Type: wire.Type{ + Name: name, + Fields: fields, + }, + } + + // All done. + tdb.byType[typ] = te + return te, false + } + return te, true +} + +// Register adds a typeID entry. +func (tbd *typeDecodeDatabase) Register(typ *wire.Type) { + assertValidType(typ.Name, typ.Fields) + tbd.pending = append(tbd.pending, typ) +} + +// LookupName looks up the type name by ID. +func (tbd *typeDecodeDatabase) LookupName(id typeID) string { + if len(tbd.pending) < int(id) { + // This is likely an encoder error? + Failf("type ID %d not available", id) + } + return tbd.pending[id-1].Name +} + +// LookupType looks up the type by ID. +func (tbd *typeDecodeDatabase) LookupType(id typeID) reflect.Type { + name := tbd.LookupName(id) + typ, ok := globalTypeDatabase[name] + if !ok { + // If not available, see if it's primitive. + typ, ok = primitiveTypeDatabase[name] + if !ok && name == interfaceType { + // Matches the built-in interface type. + var i interface{} + return reflect.TypeOf(&i).Elem() + } + if !ok { + // The type is perhaps not registered? + Failf("type name %q is not available", name) + } + return typ // Primitive type. + } + return typ // Registered type. +} + +// singleFieldOrder defines the field order for a single field. +var singleFieldOrder = []int{0} + +// Lookup looks up or registers the given object. +// +// First, the typeID is searched to see if this has already been appropriately +// reconciled. 
If no, then a reconcilation will take place that may result in a +// field ordering. If a nil reconciledTypeEntry is returned from this method, +// then the object does not support the Type interface. +// +// This method never returns nil. +func (tbd *typeDecodeDatabase) Lookup(id typeID, typ reflect.Type) *reconciledTypeEntry { + if len(tbd.byID) > int(id) && tbd.byID[id-1] != nil { + // Already reconciled. + return tbd.byID[id-1] + } + // The ID has not been reconciled yet. That's fine. We need to make + // sure it aligns with the current provided object. + if len(tbd.pending) < int(id) { + // This id was never registered. Probably an encoder error? + Failf("typeDatabase does not contain id %d", id) + } + // Extract the pending info. + pending := tbd.pending[id-1] + // Grow the byID list. + if len(tbd.byID) < int(id) { + tbd.byID = append(tbd.byID, make([]*reconciledTypeEntry, int(id)-len(tbd.byID))...) + } + // Reconcile the type. + name, fields, ok := lookupNameFields(typ) + if !ok { + // Empty structs are decoded only when the type is nil. Since + // this isn't the case, we fail here. + Failf("unsupported type %q during decode; can't reconcile", pending.Name) + } + if name != pending.Name { + // Are these the same type? Print a helpful message as this may + // actually happen in practice if types change. + Failf("typeDatabase contains conflicting definitions for id %d: %s->%v (current) and %s->%v (existing)", + id, name, fields, pending.Name, pending.Fields) + } + rte := &reconciledTypeEntry{ + Type: wire.Type{ + Name: name, + Fields: fields, + }, + LocalType: typ, + } + // If there are zero or one fields, then we skip allocating the field + // slice. There is special handling for decoding in this case. If the + // field name does not match, it will be caught in the general purpose + // code below. 
+ if len(fields) != len(pending.Fields) { + Failf("type %q contains different fields: %v (decode) and %v (encode)", + name, fields, pending.Fields) + } + if len(fields) == 0 { + tbd.byID[id-1] = rte // Save. + return rte + } + if len(fields) == 1 && fields[0] == pending.Fields[0] { + tbd.byID[id-1] = rte // Save. + rte.FieldOrder = singleFieldOrder + return rte + } + // For each field in the current object's information, match it to a + // field in the destination object. We know from the assertion above + // and the insertion on insertion to pending that neither field + // contains any duplicates. + fieldOrder := make([]int, len(fields)) + for i, name := range fields { + fieldOrder[i] = -1 // Sentinel. + // Is it an exact match? + if pending.Fields[i] == name { + fieldOrder[i] = i + continue + } + // Find the matching field. + for j, otherName := range pending.Fields { + if name == otherName { + fieldOrder[i] = j + break + } + } + if fieldOrder[i] == -1 { + // The type name matches but we are lacking some common fields. + Failf("type %q has mismatched fields: %v (decode) and %v (encode)", + name, fields, pending.Fields) + } + } + // The type has been reeconciled. + rte.FieldOrder = fieldOrder + tbd.byID[id-1] = rte + return rte +} + +// interfaceType defines all interfaces. +const interfaceType = "interface" + +// primitiveTypeDatabase is a set of fixed types. 
+var primitiveTypeDatabase = func() map[string]reflect.Type { + r := make(map[string]reflect.Type) + for _, t := range []reflect.Type{ + reflect.TypeOf(false), + reflect.TypeOf(int(0)), + reflect.TypeOf(int8(0)), + reflect.TypeOf(int16(0)), + reflect.TypeOf(int32(0)), + reflect.TypeOf(int64(0)), + reflect.TypeOf(uint(0)), + reflect.TypeOf(uintptr(0)), + reflect.TypeOf(uint8(0)), + reflect.TypeOf(uint16(0)), + reflect.TypeOf(uint32(0)), + reflect.TypeOf(uint64(0)), + reflect.TypeOf(""), + reflect.TypeOf(float32(0.0)), + reflect.TypeOf(float64(0.0)), + reflect.TypeOf(complex64(0.0)), + reflect.TypeOf(complex128(0.0)), + } { + r[t.Name()] = t + } + return r +}() + +// globalTypeDatabase is used for dispatching interfaces on decode. +var globalTypeDatabase = map[string]reflect.Type{} + +// Register registers a type. +// +// This must be called on init and only done once. +func Register(t Type) { + name := t.StateTypeName() + fields := t.StateFields() + assertValidType(name, fields) + // Register must always be called on pointers. + typ := reflect.TypeOf(t) + if typ.Kind() != reflect.Ptr { + Failf("Register must be called on pointers") + } + typ = typ.Elem() + if typ.Kind() == reflect.Struct { + // All registered structs must implement SaverLoader. We allow + // the registration is non-struct types with just the Type + // interface, but we need to call StateSave/StateLoad methods + // on aggregate types. + if _, ok := t.(SaverLoader); !ok { + Failf("struct %T does not implement SaverLoader", t) + } + } else { + // Non-structs must not have any fields. We don't support + // calling StateSave/StateLoad methods on any non-struct types. + // If custom behavior is required, these types should be + // wrapped in a structure of some kind. + if len(fields) != 0 { + Failf("non-struct %T has non-zero fields %v", t, fields) + } + // We don't allow non-structs to implement StateSave/StateLoad + // methods, because they won't be called and it's confusing. 
+ if _, ok := t.(SaverLoader); ok { + Failf("non-struct %T implements SaverLoader", t) + } + } + if _, ok := primitiveTypeDatabase[name]; ok { + Failf("conflicting primitiveTypeDatabase entry for %T: used by primitive", t) + } + if _, ok := globalTypeDatabase[name]; ok { + Failf("conflicting globalTypeDatabase entries for %T: name conflict", t) + } + if name == interfaceType { + Failf("conflicting name for %T: matches interfaceType", t) + } + globalTypeDatabase[name] = typ +} diff --git a/pkg/state/wire/BUILD b/pkg/state/wire/BUILD new file mode 100644 index 000000000..311b93dcb --- /dev/null +++ b/pkg/state/wire/BUILD @@ -0,0 +1,12 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "wire", + srcs = ["wire.go"], + marshal = False, + stateify = False, + visibility = ["//:sandbox"], + deps = ["//pkg/gohacks"], +) diff --git a/pkg/state/wire/wire.go b/pkg/state/wire/wire.go new file mode 100644 index 000000000..93dee6740 --- /dev/null +++ b/pkg/state/wire/wire.go @@ -0,0 +1,970 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package wire contains a few basic types that can be composed to serialize +// graph information for the state package. This package defines the wire +// protocol. 
+// +// Note that these types are careful about how they implement the relevant +// interfaces (either value receiver or pointer receiver), so that native-sized +// types, such as integers and simple pointers, can fit inside the interface +// object. +// +// This package also uses panic as control flow, so called should be careful to +// wrap calls in appropriate handlers. +// +// Testing for this package is driven by the state test package. +package wire + +import ( + "fmt" + "io" + "math" + + "gvisor.dev/gvisor/pkg/gohacks" +) + +// Reader is the required reader interface. +type Reader interface { + io.Reader + ReadByte() (byte, error) +} + +// Writer is the required writer interface. +type Writer interface { + io.Writer + WriteByte(byte) error +} + +// readFull is a utility. The equivalent is not needed for Write, but the API +// contract dictates that it must always complete all bytes given or return an +// error. +func readFull(r io.Reader, p []byte) { + for done := 0; done < len(p); { + n, err := r.Read(p[done:]) + done += n + if n == 0 && err != nil { + panic(err) + } + } +} + +// Object is a generic object. +type Object interface { + // save saves the given object. + // + // Panic is used for error control flow. + save(Writer) + + // load loads a new object of the given type. + // + // Panic is used for error control flow. + load(Reader) Object +} + +// Bool is a boolean. +type Bool bool + +// loadBool loads an object of type Bool. +func loadBool(r Reader) Bool { + b := loadUint(r) + return Bool(b == 1) +} + +// save implements Object.save. +func (b Bool) save(w Writer) { + var v Uint + if b { + v = 1 + } else { + v = 0 + } + v.save(w) +} + +// load implements Object.load. +func (Bool) load(r Reader) Object { return loadBool(r) } + +// Int is a signed integer. +// +// This uses varint encoding. +type Int int64 + +// loadInt loads an object of type Int. 
+func loadInt(r Reader) Int { + u := loadUint(r) + x := Int(u >> 1) + if u&1 != 0 { + x = ^x + } + return x +} + +// save implements Object.save. +func (i Int) save(w Writer) { + u := Uint(i) << 1 + if i < 0 { + u = ^u + } + u.save(w) +} + +// load implements Object.load. +func (Int) load(r Reader) Object { return loadInt(r) } + +// Uint is an unsigned integer. +type Uint uint64 + +// loadUint loads an object of type Uint. +func loadUint(r Reader) Uint { + var ( + u Uint + s uint + ) + for i := 0; i <= 9; i++ { + b, err := r.ReadByte() + if err != nil { + panic(err) + } + if b < 0x80 { + if i == 9 && b > 1 { + panic("overflow") + } + u |= Uint(b) << s + return u + } + u |= Uint(b&0x7f) << s + s += 7 + } + panic("unreachable") +} + +// save implements Object.save. +func (u Uint) save(w Writer) { + for u >= 0x80 { + if err := w.WriteByte(byte(u) | 0x80); err != nil { + panic(err) + } + u >>= 7 + } + if err := w.WriteByte(byte(u)); err != nil { + panic(err) + } +} + +// load implements Object.load. +func (Uint) load(r Reader) Object { return loadUint(r) } + +// Float32 is a 32-bit floating point number. +type Float32 float32 + +// loadFloat32 loads an object of type Float32. +func loadFloat32(r Reader) Float32 { + n := loadUint(r) + return Float32(math.Float32frombits(uint32(n))) +} + +// save implements Object.save. +func (f Float32) save(w Writer) { + n := Uint(math.Float32bits(float32(f))) + n.save(w) +} + +// load implements Object.load. +func (Float32) load(r Reader) Object { return loadFloat32(r) } + +// Float64 is a 64-bit floating point number. +type Float64 float64 + +// loadFloat64 loads an object of type Float64. +func loadFloat64(r Reader) Float64 { + n := loadUint(r) + return Float64(math.Float64frombits(uint64(n))) +} + +// save implements Object.save. +func (f Float64) save(w Writer) { + n := Uint(math.Float64bits(float64(f))) + n.save(w) +} + +// load implements Object.load. 
+func (Float64) load(r Reader) Object { return loadFloat64(r) } + +// Complex64 is a 64-bit complex number. +type Complex64 complex128 + +// loadComplex64 loads an object of type Complex64. +func loadComplex64(r Reader) Complex64 { + re := loadFloat32(r) + im := loadFloat32(r) + return Complex64(complex(float32(re), float32(im))) +} + +// save implements Object.save. +func (c *Complex64) save(w Writer) { + re := Float32(real(*c)) + im := Float32(imag(*c)) + re.save(w) + im.save(w) +} + +// load implements Object.load. +func (*Complex64) load(r Reader) Object { + c := loadComplex64(r) + return &c +} + +// Complex128 is a 128-bit complex number. +type Complex128 complex128 + +// loadComplex128 loads an object of type Complex128. +func loadComplex128(r Reader) Complex128 { + re := loadFloat64(r) + im := loadFloat64(r) + return Complex128(complex(float64(re), float64(im))) +} + +// save implements Object.save. +func (c *Complex128) save(w Writer) { + re := Float64(real(*c)) + im := Float64(imag(*c)) + re.save(w) + im.save(w) +} + +// load implements Object.load. +func (*Complex128) load(r Reader) Object { + c := loadComplex128(r) + return &c +} + +// String is a string. +type String string + +// loadString loads an object of type String. +func loadString(r Reader) String { + l := loadUint(r) + p := make([]byte, l) + readFull(r, p) + return String(gohacks.StringFromImmutableBytes(p)) +} + +// save implements Object.save. +func (s *String) save(w Writer) { + l := Uint(len(*s)) + l.save(w) + p := gohacks.ImmutableBytesFromString(string(*s)) + _, err := w.Write(p) // Must write all bytes. + if err != nil { + panic(err) + } +} + +// load implements Object.load. +func (*String) load(r Reader) Object { + s := loadString(r) + return &s +} + +// Dot is a kind of reference: one of Index and FieldName. +type Dot interface { + isDot() +} + +// Index is a reference resolution. +type Index uint32 + +func (Index) isDot() {} + +// FieldName is a reference resolution. 
+type FieldName string + +func (*FieldName) isDot() {} + +// Ref is a reference to an object. +type Ref struct { + // Root is the root object. + Root Uint + + // Dots is the set of traversals required from the Root object above. + // Note that this will be stored in reverse order for efficiency. + Dots []Dot + + // Type is the base type for the root object. This is non-nil iff Dots + // is non-zero length (that is, this is a complex reference). This is + // not *strictly* necessary, but can be used to simplify decoding. + Type TypeSpec +} + +// loadRef loads an object of type Ref (abstract). +func loadRef(r Reader) Ref { + ref := Ref{ + Root: loadUint(r), + } + l := loadUint(r) + ref.Dots = make([]Dot, l) + for i := 0; i < int(l); i++ { + // Disambiguate between an Index (non-negative) and a field + // name (negative). This does some space and avoids a dedicate + // loadDot function. See Ref.save for the other side. + d := loadInt(r) + if d >= 0 { + ref.Dots[i] = Index(d) + continue + } + p := make([]byte, -d) + readFull(r, p) + fieldName := FieldName(gohacks.StringFromImmutableBytes(p)) + ref.Dots[i] = &fieldName + } + if l != 0 { + // Only if dots is non-zero. + ref.Type = loadTypeSpec(r) + } + return ref +} + +// save implements Object.save. +func (r *Ref) save(w Writer) { + r.Root.save(w) + l := Uint(len(r.Dots)) + l.save(w) + for _, d := range r.Dots { + // See LoadRef. We use non-negative numbers to encode Index + // objects and negative numbers to encode field lengths. + switch x := d.(type) { + case Index: + i := Int(x) + i.save(w) + case *FieldName: + d := Int(-len(*x)) + d.save(w) + p := gohacks.ImmutableBytesFromString(string(*x)) + if _, err := w.Write(p); err != nil { + panic(err) + } + default: + panic("unknown dot implementation") + } + } + if l != 0 { + // See above. + saveTypeSpec(w, r.Type) + } +} + +// load implements Object.load. 
+func (*Ref) load(r Reader) Object { + ref := loadRef(r) + return &ref +} + +// Nil is a primitive zero value of any type. +type Nil struct{} + +// loadNil loads an object of type Nil. +func loadNil(r Reader) Nil { + return Nil{} +} + +// save implements Object.save. +func (Nil) save(w Writer) {} + +// load implements Object.load. +func (Nil) load(r Reader) Object { return loadNil(r) } + +// Slice is a slice value. +type Slice struct { + Length Uint + Capacity Uint + Ref Ref +} + +// loadSlice loads an object of type Slice. +func loadSlice(r Reader) Slice { + return Slice{ + Length: loadUint(r), + Capacity: loadUint(r), + Ref: loadRef(r), + } +} + +// save implements Object.save. +func (s *Slice) save(w Writer) { + s.Length.save(w) + s.Capacity.save(w) + s.Ref.save(w) +} + +// load implements Object.load. +func (*Slice) load(r Reader) Object { + s := loadSlice(r) + return &s +} + +// Array is an array value. +type Array struct { + Contents []Object +} + +// loadArray loads an object of type Array. +func loadArray(r Reader) Array { + l := loadUint(r) + if l == 0 { + // Note that there isn't a single object available to encode + // the type of, so we need this additional branch. + return Array{} + } + // All the objects here have the same type, so use dynamic dispatch + // only once. All other objects will automatically take the same type + // as the first object. + contents := make([]Object, l) + v := Load(r) + contents[0] = v + for i := 1; i < int(l); i++ { + contents[i] = v.load(r) + } + return Array{ + Contents: contents, + } +} + +// save implements Object.save. +func (a *Array) save(w Writer) { + l := Uint(len(a.Contents)) + l.save(w) + if l == 0 { + // See LoadArray. + return + } + // See above. + Save(w, a.Contents[0]) + for i := 1; i < int(l); i++ { + a.Contents[i].save(w) + } +} + +// load implements Object.load. +func (*Array) load(r Reader) Object { + a := loadArray(r) + return &a +} + +// Map is a map value. 
+type Map struct { + Keys []Object + Values []Object +} + +// loadMap loads an object of type Map. +func loadMap(r Reader) Map { + l := loadUint(r) + if l == 0 { + // See LoadArray. + return Map{} + } + // See type dispatch notes in Array. + keys := make([]Object, l) + values := make([]Object, l) + k := Load(r) + v := Load(r) + keys[0] = k + values[0] = v + for i := 1; i < int(l); i++ { + keys[i] = k.load(r) + values[i] = v.load(r) + } + return Map{ + Keys: keys, + Values: values, + } +} + +// save implements Object.save. +func (m *Map) save(w Writer) { + l := Uint(len(m.Keys)) + if int(l) != len(m.Values) { + panic(fmt.Sprintf("mismatched keys (%d) Aand values (%d)", len(m.Keys), len(m.Values))) + } + l.save(w) + if l == 0 { + // See LoadArray. + return + } + // See above. + Save(w, m.Keys[0]) + Save(w, m.Values[0]) + for i := 1; i < int(l); i++ { + m.Keys[i].save(w) + m.Values[i].save(w) + } +} + +// load implements Object.load. +func (*Map) load(r Reader) Object { + m := loadMap(r) + return &m +} + +// TypeSpec is a type dereference. +type TypeSpec interface { + isTypeSpec() +} + +// TypeID is a concrete type ID. +type TypeID Uint + +func (TypeID) isTypeSpec() {} + +// TypeSpecPointer is a pointer type. +type TypeSpecPointer struct { + Type TypeSpec +} + +func (*TypeSpecPointer) isTypeSpec() {} + +// TypeSpecArray is an array type. +type TypeSpecArray struct { + Count Uint + Type TypeSpec +} + +func (*TypeSpecArray) isTypeSpec() {} + +// TypeSpecSlice is a slice type. +type TypeSpecSlice struct { + Type TypeSpec +} + +func (*TypeSpecSlice) isTypeSpec() {} + +// TypeSpecMap is a map type. +type TypeSpecMap struct { + Key TypeSpec + Value TypeSpec +} + +func (*TypeSpecMap) isTypeSpec() {} + +// TypeSpecNil is an empty type. +type TypeSpecNil struct{} + +func (TypeSpecNil) isTypeSpec() {} + +// TypeSpec types. +// +// These use a distinct encoding on the wire, as they are used only in the +// interface object. 
They are decoded through the dedicated loadTypeSpec and +// saveTypeSpec functions. +const ( + typeSpecTypeID Uint = iota + typeSpecPointer + typeSpecArray + typeSpecSlice + typeSpecMap + typeSpecNil +) + +// loadTypeSpec loads TypeSpec values. +func loadTypeSpec(r Reader) TypeSpec { + switch hdr := loadUint(r); hdr { + case typeSpecTypeID: + return TypeID(loadUint(r)) + case typeSpecPointer: + return &TypeSpecPointer{ + Type: loadTypeSpec(r), + } + case typeSpecArray: + return &TypeSpecArray{ + Count: loadUint(r), + Type: loadTypeSpec(r), + } + case typeSpecSlice: + return &TypeSpecSlice{ + Type: loadTypeSpec(r), + } + case typeSpecMap: + return &TypeSpecMap{ + Key: loadTypeSpec(r), + Value: loadTypeSpec(r), + } + case typeSpecNil: + return TypeSpecNil{} + default: + // This is not a valid stream? + panic(fmt.Errorf("unknown header: %d", hdr)) + } +} + +// saveTypeSpec saves TypeSpec values. +func saveTypeSpec(w Writer, t TypeSpec) { + switch x := t.(type) { + case TypeID: + typeSpecTypeID.save(w) + Uint(x).save(w) + case *TypeSpecPointer: + typeSpecPointer.save(w) + saveTypeSpec(w, x.Type) + case *TypeSpecArray: + typeSpecArray.save(w) + x.Count.save(w) + saveTypeSpec(w, x.Type) + case *TypeSpecSlice: + typeSpecSlice.save(w) + saveTypeSpec(w, x.Type) + case *TypeSpecMap: + typeSpecMap.save(w) + saveTypeSpec(w, x.Key) + saveTypeSpec(w, x.Value) + case TypeSpecNil: + typeSpecNil.save(w) + default: + // This should not happen? + panic(fmt.Errorf("unknown type %T", t)) + } +} + +// Interface is an interface value. +type Interface struct { + Type TypeSpec + Value Object +} + +// loadInterface loads an object of type Interface. +func loadInterface(r Reader) Interface { + return Interface{ + Type: loadTypeSpec(r), + Value: Load(r), + } +} + +// save implements Object.save. +func (i *Interface) save(w Writer) { + saveTypeSpec(w, i.Type) + Save(w, i.Value) +} + +// load implements Object.load. 
+func (*Interface) load(r Reader) Object { + i := loadInterface(r) + return &i +} + +// Type is type information. +type Type struct { + Name string + Fields []string +} + +// loadType loads an object of type Type. +func loadType(r Reader) Type { + name := string(loadString(r)) + l := loadUint(r) + fields := make([]string, l) + for i := 0; i < int(l); i++ { + fields[i] = string(loadString(r)) + } + return Type{ + Name: name, + Fields: fields, + } +} + +// save implements Object.save. +func (t *Type) save(w Writer) { + s := String(t.Name) + s.save(w) + l := Uint(len(t.Fields)) + l.save(w) + for i := 0; i < int(l); i++ { + s := String(t.Fields[i]) + s.save(w) + } +} + +// load implements Object.load. +func (*Type) load(r Reader) Object { + t := loadType(r) + return &t +} + +// multipleObjects is a special type for serializing multiple objects. +type multipleObjects []Object + +// loadMultipleObjects loads a series of objects. +func loadMultipleObjects(r Reader) multipleObjects { + l := loadUint(r) + m := make(multipleObjects, l) + for i := 0; i < int(l); i++ { + m[i] = Load(r) + } + return m +} + +// save implements Object.save. +func (m *multipleObjects) save(w Writer) { + l := Uint(len(*m)) + l.save(w) + for i := 0; i < int(l); i++ { + Save(w, (*m)[i]) + } +} + +// load implements Object.load. +func (*multipleObjects) load(r Reader) Object { + m := loadMultipleObjects(r) + return &m +} + +// noObjects represents no objects. +type noObjects struct{} + +// loadNoObjects loads a sentinel. +func loadNoObjects(r Reader) noObjects { return noObjects{} } + +// save implements Object.save. +func (noObjects) save(w Writer) {} + +// load implements Object.load. +func (noObjects) load(r Reader) Object { return loadNoObjects(r) } + +// Struct is a basic composite value. +type Struct struct { + TypeID TypeID + fields Object // Optionally noObjects or *multipleObjects. +} + +// Field returns a pointer to the given field slot. +// +// This must be called after Alloc. 
+func (s *Struct) Field(i int) *Object { + if fields, ok := s.fields.(*multipleObjects); ok { + return &((*fields)[i]) + } + if _, ok := s.fields.(noObjects); ok { + // Alloc may be optionally called; can't call twice. + panic("Field called inappropriately, wrong Alloc?") + } + return &s.fields +} + +// Alloc allocates the given number of fields. +// +// This must be called before Add and Save. +// +// Precondition: slots must be positive. +func (s *Struct) Alloc(slots int) { + switch { + case slots == 0: + s.fields = noObjects{} + case slots == 1: + // Leave it alone. + case slots > 1: + fields := make(multipleObjects, slots) + s.fields = &fields + default: + // Violates precondition. + panic(fmt.Sprintf("Alloc called with negative slots %d?", slots)) + } +} + +// Fields returns the number of fields. +func (s *Struct) Fields() int { + switch x := s.fields.(type) { + case *multipleObjects: + return len(*x) + case noObjects: + return 0 + default: + return 1 + } +} + +// loadStruct loads an object of type Struct. +func loadStruct(r Reader) Struct { + return Struct{ + TypeID: TypeID(loadUint(r)), + fields: Load(r), + } +} + +// save implements Object.save. +// +// Precondition: Alloc must have been called, and the fields all filled in +// appropriately. See Alloc and Add for more details. +func (s *Struct) save(w Writer) { + Uint(s.TypeID).save(w) + Save(w, s.fields) +} + +// load implements Object.load. +func (*Struct) load(r Reader) Object { + s := loadStruct(r) + return &s +} + +// Object types. +// +// N.B. Be careful about changing the order or introducing new elements in the +// middle here. This is part of the wire format and shouldn't change. +const ( + typeBool Uint = iota + typeInt + typeUint + typeFloat32 + typeFloat64 + typeNil + typeRef + typeString + typeSlice + typeArray + typeMap + typeStruct + typeNoObjects + typeMultipleObjects + typeInterface + typeComplex64 + typeComplex128 + typeType +) + +// Save saves the given object. 
+// +// +checkescape all +// +// N.B. This function will panic on error. +func Save(w Writer, obj Object) { + switch x := obj.(type) { + case Bool: + typeBool.save(w) + x.save(w) + case Int: + typeInt.save(w) + x.save(w) + case Uint: + typeUint.save(w) + x.save(w) + case Float32: + typeFloat32.save(w) + x.save(w) + case Float64: + typeFloat64.save(w) + x.save(w) + case Nil: + typeNil.save(w) + x.save(w) + case *Ref: + typeRef.save(w) + x.save(w) + case *String: + typeString.save(w) + x.save(w) + case *Slice: + typeSlice.save(w) + x.save(w) + case *Array: + typeArray.save(w) + x.save(w) + case *Map: + typeMap.save(w) + x.save(w) + case *Struct: + typeStruct.save(w) + x.save(w) + case noObjects: + typeNoObjects.save(w) + x.save(w) + case *multipleObjects: + typeMultipleObjects.save(w) + x.save(w) + case *Interface: + typeInterface.save(w) + x.save(w) + case *Type: + typeType.save(w) + x.save(w) + case *Complex64: + typeComplex64.save(w) + x.save(w) + case *Complex128: + typeComplex128.save(w) + x.save(w) + default: + panic(fmt.Errorf("unknown type: %#v", obj)) + } +} + +// Load loads a new object. +// +// +checkescape all +// +// N.B. This function will panic on error. +func Load(r Reader) Object { + switch hdr := loadUint(r); hdr { + case typeBool: + return loadBool(r) + case typeInt: + return loadInt(r) + case typeUint: + return loadUint(r) + case typeFloat32: + return loadFloat32(r) + case typeFloat64: + return loadFloat64(r) + case typeNil: + return loadNil(r) + case typeRef: + return ((*Ref)(nil)).load(r) // Escapes. + case typeString: + return ((*String)(nil)).load(r) // Escapes. + case typeSlice: + return ((*Slice)(nil)).load(r) // Escapes. + case typeArray: + return ((*Array)(nil)).load(r) // Escapes. + case typeMap: + return ((*Map)(nil)).load(r) // Escapes. + case typeStruct: + return ((*Struct)(nil)).load(r) // Escapes. + case typeNoObjects: // Special for struct. + return loadNoObjects(r) + case typeMultipleObjects: // Special for struct. 
+ return ((*multipleObjects)(nil)).load(r) // Escapes. + case typeInterface: + return ((*Interface)(nil)).load(r) // Escapes. + case typeComplex64: + return ((*Complex64)(nil)).load(r) // Escapes. + case typeComplex128: + return ((*Complex128)(nil)).load(r) // Escapes. + case typeType: + return ((*Type)(nil)).load(r) // Escapes. + default: + // This is not a valid stream? + panic(fmt.Errorf("unknown header: %d", hdr)) + } +} + +// LoadUint loads a single unsigned integer. +// +// N.B. This function will panic on error. +func LoadUint(r Reader) uint64 { + return uint64(loadUint(r)) +} + +// SaveUint saves a single unsigned integer. +// +// N.B. This function will panic on error. +func SaveUint(w Writer, v uint64) { + Uint(v).save(w) +} diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index af3538ef0..dae9b3b3e 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -45,7 +45,7 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/platform", - "//pkg/state", + "//pkg/state/pretty", "//pkg/state/statefile", "//pkg/sync", "//pkg/unet", diff --git a/runsc/cmd/statefile.go b/runsc/cmd/statefile.go index e6f1907da..daed9e728 100644 --- a/runsc/cmd/statefile.go +++ b/runsc/cmd/statefile.go @@ -20,7 +20,7 @@ import ( "os" "github.com/google/subcommands" - "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/state/pretty" "gvisor.dev/gvisor/pkg/state/statefile" "gvisor.dev/gvisor/runsc/flag" ) @@ -105,8 +105,14 @@ func (s *Statefile) Execute(_ context.Context, f *flag.FlagSet, args ...interfac if err != nil { Fatalf("error parsing statefile: %v", err) } - if err := state.PrettyPrint(output, rc, s.html); err != nil { - Fatalf("error printing state: %v", err) + if s.html { + if err := pretty.PrintHTML(output, rc); err != nil { + Fatalf("error printing state: %v", err) + } + } else { + if err := pretty.PrintText(output, rc); err != nil { + Fatalf("error printing state: %v", err) + } } return subcommands.ExitSuccess } diff --git 
a/tools/checkescape/checkescape.go b/tools/checkescape/checkescape.go index 571e9a6e6..f8def4823 100644 --- a/tools/checkescape/checkescape.go +++ b/tools/checkescape/checkescape.go @@ -88,7 +88,7 @@ const ( testMagic = "// +mustescape:" // exempt is the exemption annotation. - exempt = "// escapes:" + exempt = "// escapes" ) // escapingBuiltins are builtins known to escape. @@ -546,7 +546,7 @@ func run(pass *analysis.Pass) (interface{}, error) { for _, cg := range f.Comments { for _, c := range cg.List { p := pass.Fset.Position(c.Slash) - if strings.HasPrefix(c.Text, exempt) { + if strings.HasPrefix(strings.ToLower(c.Text), exempt) { exemptions[LinePosition{ Filename: filepath.Base(p.Filename), Line: p.Line, diff --git a/tools/go_stateify/main.go b/tools/go_stateify/main.go index 309ee9c21..4f6ed208a 100644 --- a/tools/go_stateify/main.go +++ b/tools/go_stateify/main.go @@ -103,7 +103,7 @@ type scanFunctions struct { // skipped if nil. // // Fields tagged nosave are skipped. -func scanFields(ss *ast.StructType, fn scanFunctions) { +func scanFields(ss *ast.StructType, prefix string, fn scanFunctions) { if ss.Fields.List == nil { // No fields. return @@ -127,7 +127,16 @@ func scanFields(ss *ast.StructType, fn scanFunctions) { continue } - switch tag := extractStateTag(field.Tag); tag { + // Is this a anonymous struct? If yes, then continue the + // recursion with the given prefix. We don't pay attention to + // any tags on the top-level struct field. + tag := extractStateTag(field.Tag) + if anon, ok := field.Type.(*ast.StructType); ok && tag == "" { + scanFields(anon, name+".", fn) + continue + } + + switch tag { case "zerovalue": if fn.zerovalue != nil { fn.zerovalue(name) @@ -201,28 +210,12 @@ func main() { // initCalls is dumped at the end. var initCalls []string - // Declare our emission closures. + // Common closures. 
emitRegister := func(name string) { - initCalls = append(initCalls, fmt.Sprintf("%sRegister(\"%s.%s\", (*%s)(nil), state.Fns{Save: (*%s).save, Load: (*%s).load})", statePrefix, *fullPkg, name, name, name, name)) + initCalls = append(initCalls, fmt.Sprintf("%sRegister((*%s)(nil))", statePrefix, name)) } emitZeroCheck := func(name string) { - fmt.Fprintf(outputFile, " if !%sIsZeroValue(&x.%s) { m.Failf(\"%s is %%#v, expected zero\", &x.%s) }\n", statePrefix, name, name, name) - } - emitLoadValue := func(name, typName string) { - fmt.Fprintf(outputFile, " m.LoadValue(\"%s\", new(%s), func(y interface{}) { x.load%s(y.(%s)) })\n", name, typName, camelCased(name), typName) - } - emitLoad := func(name string) { - fmt.Fprintf(outputFile, " m.Load(\"%s\", &x.%s)\n", name, name) - } - emitLoadWait := func(name string) { - fmt.Fprintf(outputFile, " m.LoadWait(\"%s\", &x.%s)\n", name, name) - } - emitSaveValue := func(name, typName string) { - fmt.Fprintf(outputFile, " var %s %s = x.save%s()\n", name, typName, camelCased(name)) - fmt.Fprintf(outputFile, " m.SaveValue(\"%s\", %s)\n", name, name) - } - emitSave := func(name string) { - fmt.Fprintf(outputFile, " m.Save(\"%s\", &x.%s)\n", name, name) + fmt.Fprintf(outputFile, " if !%sIsZeroValue(&x.%s) { %sFailf(\"%s is %%#v, expected zero\", &x.%s) }\n", statePrefix, name, statePrefix, name, name) } // Automated warning. @@ -329,87 +322,140 @@ func main() { continue } - // Only generate code for types marked - // "// +stateify savable" in one of the proceeding - // comment lines. + // Only generate code for types marked "// +stateify + // savable" in one of the proceeding comment lines. If + // the line is marked "// +stateify type" then only + // generate type information and register the type. 
if d.Doc == nil { continue } - savable := false + var ( + generateTypeInfo = false + generateSaverLoader = false + ) for _, l := range d.Doc.List { if l.Text == "// +stateify savable" { - savable = true + generateTypeInfo = true + generateSaverLoader = true break } + if l.Text == "// +stateify type" { + generateTypeInfo = true + } } - if !savable { + if !generateTypeInfo && !generateSaverLoader { continue } for _, gs := range d.Specs { ts := gs.(*ast.TypeSpec) - switch ts.Type.(type) { - case *ast.InterfaceType, *ast.ChanType, *ast.FuncType, *ast.ParenExpr, *ast.StarExpr: - // Don't register. - break + switch x := ts.Type.(type) { case *ast.StructType: maybeEmitImports() - ss := ts.Type.(*ast.StructType) + // Record the slot for each field. + fieldCount := 0 + fields := make(map[string]int) + emitField := func(name string) { + fmt.Fprintf(outputFile, " \"%s\",\n", name) + fields[name] = fieldCount + fieldCount++ + } + emitFieldValue := func(name string, _ string) { + emitField(name) + } + emitLoadValue := func(name, typName string) { + fmt.Fprintf(outputFile, " m.LoadValue(%d, new(%s), func(y interface{}) { x.load%s(y.(%s)) })\n", fields[name], typName, camelCased(name), typName) + } + emitLoad := func(name string) { + fmt.Fprintf(outputFile, " m.Load(%d, &x.%s)\n", fields[name], name) + } + emitLoadWait := func(name string) { + fmt.Fprintf(outputFile, " m.LoadWait(%d, &x.%s)\n", fields[name], name) + } + emitSaveValue := func(name, typName string) { + fmt.Fprintf(outputFile, " var %s %s = x.save%s()\n", name, typName, camelCased(name)) + fmt.Fprintf(outputFile, " m.SaveValue(%d, %s)\n", fields[name], name) + } + emitSave := func(name string) { + fmt.Fprintf(outputFile, " m.Save(%d, &x.%s)\n", fields[name], name) + } + + // Generate the type name method. 
+ fmt.Fprintf(outputFile, "func (x *%s) StateTypeName() string {\n", ts.Name.Name) + fmt.Fprintf(outputFile, " return \"%s.%s\"\n", *fullPkg, ts.Name.Name) + fmt.Fprintf(outputFile, "}\n\n") + + // Generate the fields method. + fmt.Fprintf(outputFile, "func (x *%s) StateFields() []string {\n", ts.Name.Name) + fmt.Fprintf(outputFile, " return []string{\n") + scanFields(x, "", scanFunctions{ + normal: emitField, + wait: emitField, + value: emitFieldValue, + }) + fmt.Fprintf(outputFile, " }\n") + fmt.Fprintf(outputFile, "}\n\n") - // Define beforeSave if a definition was not found. This - // prevents the code from compiling if a custom beforeSave - // was defined in a file not provided to this binary and - // prevents inherited methods from being called multiple times - // by overriding them. - if _, ok := simpleMethods[method{ts.Name.Name, "beforeSave"}]; !ok { - fmt.Fprintf(outputFile, "func (x *%s) beforeSave() {}\n", ts.Name.Name) + // Define beforeSave if a definition was not found. This prevents + // the code from compiling if a custom beforeSave was defined in a + // file not provided to this binary and prevents inherited methods + // from being called multiple times by overriding them. + if _, ok := simpleMethods[method{ts.Name.Name, "beforeSave"}]; !ok && generateSaverLoader { + fmt.Fprintf(outputFile, "func (x *%s) beforeSave() {}\n\n", ts.Name.Name) } // Generate the save method. - fmt.Fprintf(outputFile, "func (x *%s) save(m %sMap) {\n", ts.Name.Name, statePrefix) - fmt.Fprintf(outputFile, " x.beforeSave()\n") - scanFields(ss, scanFunctions{zerovalue: emitZeroCheck}) - scanFields(ss, scanFunctions{value: emitSaveValue}) - scanFields(ss, scanFunctions{normal: emitSave, wait: emitSave}) - fmt.Fprintf(outputFile, "}\n\n") + // + // N.B. For historical reasons, we perform the value saves first, + // and perform the value loads last. 
There should be no dependency + // on this specific behavior, but the ability to specify slots + // allows a manual implementation to be order-dependent. + if generateSaverLoader { + fmt.Fprintf(outputFile, "func (x *%s) StateSave(m %sSink) {\n", ts.Name.Name, statePrefix) + fmt.Fprintf(outputFile, " x.beforeSave()\n") + scanFields(x, "", scanFunctions{zerovalue: emitZeroCheck}) + scanFields(x, "", scanFunctions{value: emitSaveValue}) + scanFields(x, "", scanFunctions{normal: emitSave, wait: emitSave}) + fmt.Fprintf(outputFile, "}\n\n") + } - // Define afterLoad if a definition was not found. We do this - // for the same reason that we do it for beforeSave. + // Define afterLoad if a definition was not found. We do this for + // the same reason that we do it for beforeSave. _, hasAfterLoad := simpleMethods[method{ts.Name.Name, "afterLoad"}] - if !hasAfterLoad { - fmt.Fprintf(outputFile, "func (x *%s) afterLoad() {}\n", ts.Name.Name) + if !hasAfterLoad && generateSaverLoader { + fmt.Fprintf(outputFile, "func (x *%s) afterLoad() {}\n\n", ts.Name.Name) } // Generate the load method. // - // Note that the manual loads always follow the - // automated loads. - fmt.Fprintf(outputFile, "func (x *%s) load(m %sMap) {\n", ts.Name.Name, statePrefix) - scanFields(ss, scanFunctions{normal: emitLoad, wait: emitLoadWait}) - scanFields(ss, scanFunctions{value: emitLoadValue}) - if hasAfterLoad { - // The call to afterLoad is made conditionally, because when - // AfterLoad is called, the object encodes a dependency on - // referred objects (i.e. fields). This means that afterLoad - // will not be called until the other afterLoads are called. - fmt.Fprintf(outputFile, " m.AfterLoad(x.afterLoad)\n") + // N.B. See the comment above for the save method. 
+ if generateSaverLoader { + fmt.Fprintf(outputFile, "func (x *%s) StateLoad(m %sSource) {\n", ts.Name.Name, statePrefix) + scanFields(x, "", scanFunctions{normal: emitLoad, wait: emitLoadWait}) + scanFields(x, "", scanFunctions{value: emitLoadValue}) + if hasAfterLoad { + // The call to afterLoad is made conditionally, because when + // AfterLoad is called, the object encodes a dependency on + // referred objects (i.e. fields). This means that afterLoad + // will not be called until the other afterLoads are called. + fmt.Fprintf(outputFile, " m.AfterLoad(x.afterLoad)\n") + } + fmt.Fprintf(outputFile, "}\n\n") } - fmt.Fprintf(outputFile, "}\n\n") // Add to our registration. emitRegister(ts.Name.Name) + case *ast.Ident, *ast.SelectorExpr, *ast.ArrayType: maybeEmitImports() - _, val := resolveTypeName(ts.Name.Name, ts.Type) - - // Dispatch directly. - fmt.Fprintf(outputFile, "func (x *%s) save(m %sMap) {\n", ts.Name.Name, statePrefix) - fmt.Fprintf(outputFile, " m.SaveValue(\"\", (%s)(*x))\n", val) + // Generate the info methods. + fmt.Fprintf(outputFile, "func (x *%s) StateTypeName() string {\n", ts.Name.Name) + fmt.Fprintf(outputFile, " return \"%s.%s\"\n", *fullPkg, ts.Name.Name) fmt.Fprintf(outputFile, "}\n\n") - fmt.Fprintf(outputFile, "func (x *%s) load(m %sMap) {\n", ts.Name.Name, statePrefix) - fmt.Fprintf(outputFile, " m.LoadValue(\"\", new(%s), func(y interface{}) { *x = (%s)(y.(%s)) })\n", val, ts.Name.Name, val) + fmt.Fprintf(outputFile, "func (x *%s) StateFields() []string {\n", ts.Name.Name) + fmt.Fprintf(outputFile, " return nil\n") fmt.Fprintf(outputFile, "}\n\n") // See above. -- cgit v1.2.3 From 10930b0f8c1ff2ac83c7a30cc1f78112a35e3183 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Wed, 24 Jun 2020 13:54:58 -0700 Subject: Remove waiter.Entry.Context This field is redundant since state can be stored in the callback. 
PiperOrigin-RevId: 318134855 --- pkg/sentry/kernel/epoll/epoll.go | 13 +++++++------ pkg/sentry/kernel/epoll/epoll_state.go | 3 +-- pkg/waiter/waiter.go | 18 ++++++------------ 3 files changed, 14 insertions(+), 20 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 3d78cd48f..679ab495d 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -271,11 +271,13 @@ func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent { // readyCallback is called when one of the files we're polling becomes ready. It // moves said file to the readyList if it's currently in the waiting list. -type readyCallback struct{} +type readyCallback struct { + context *pollEntry +} // Callback implements waiter.EntryCallback.Callback. -func (*readyCallback) Callback(w *waiter.Entry) { - entry := w.Context.(*pollEntry) +func (r *readyCallback) Callback(*waiter.Entry) { + entry := r.context e := entry.epoll e.listsMu.Lock() @@ -310,7 +312,7 @@ func (e *EventPoll) initEntryReadiness(entry *pollEntry) { // Check if the file happens to already be in a ready state. ready := f.Readiness(entry.mask) & entry.mask if ready != 0 { - (*readyCallback).Callback(nil, &entry.waiter) + (&readyCallback{context: entry}).Callback(&entry.waiter) } } @@ -380,10 +382,9 @@ func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.Ev userData: data, epoll: e, flags: flags, - waiter: waiter.Entry{Callback: &readyCallback{}}, mask: mask, } - entry.waiter.Context = entry + entry.waiter.Callback = &readyCallback{context: entry} e.files[id] = entry entry.file = refs.NewWeakRef(id.File, entry) diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go index 8e9f200d0..02f9aabfa 100644 --- a/pkg/sentry/kernel/epoll/epoll_state.go +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -21,8 +21,7 @@ import ( // afterLoad is invoked by stateify. 
func (p *pollEntry) afterLoad() { - p.waiter = waiter.Entry{Callback: &readyCallback{}} - p.waiter.Context = p + p.waiter.Callback = &readyCallback{context: p} p.file = refs.NewWeakRef(p.id.File, p) p.id.File.EventRegister(&p.waiter, p.mask) } diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go index 707eb085b..67a950444 100644 --- a/pkg/waiter/waiter.go +++ b/pkg/waiter/waiter.go @@ -128,13 +128,6 @@ type EntryCallback interface { // // +stateify savable type Entry struct { - // Context stores any state the waiter may wish to store in the entry - // itself, which may be used at wake up time. - // - // Note that use of this field is optional and state may alternatively be - // stored in the callback itself. - Context interface{} - Callback EntryCallback // The following fields are protected by the queue lock. @@ -142,13 +135,14 @@ type Entry struct { waiterEntry } -type channelCallback struct{} +type channelCallback struct { + ch chan struct{} +} // Callback implements EntryCallback.Callback. 
-func (*channelCallback) Callback(e *Entry) { - ch := e.Context.(chan struct{}) +func (c *channelCallback) Callback(*Entry) { select { - case ch <- struct{}{}: + case c.ch <- struct{}{}: default: } } @@ -164,7 +158,7 @@ func NewChannelEntry(c chan struct{}) (Entry, chan struct{}) { c = make(chan struct{}, 1) } - return Entry{Context: c, Callback: &channelCallback{}}, c + return Entry{Callback: &channelCallback{ch: c}}, c } // Queue represents the wait queue where waiters can be added and -- cgit v1.2.3 From 4069461877d843654d18db74a5962b332f1226aa Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Thu, 25 Jun 2020 14:17:00 -0700 Subject: Avoid an allocation in epoll PiperOrigin-RevId: 318346153 --- pkg/sentry/kernel/epoll/epoll.go | 32 ++++++++++++++------------------ pkg/sentry/kernel/epoll/epoll_state.go | 2 +- 2 files changed, 15 insertions(+), 19 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 679ab495d..4c0f1e41f 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -107,7 +107,7 @@ type EventPoll struct { // different lock to avoid circular lock acquisition order involving // the wait queue mutexes and mu. The full order is mu, observed file // wait queue mutex, then listsMu; this allows listsMu to be acquired - // when readyCallback is called. + // when (*pollEntry).Callback is called. // // An entry is always in one of the following lists: // readyList -- when there's a chance that it's ready to have @@ -116,7 +116,7 @@ type EventPoll struct { // readEvents() functions always call the entry's file // Readiness() function to confirm it's ready. // waitingList -- when there's no chance that the entry is ready, - // so it's waiting for the readyCallback to be called + // so it's waiting for the (*pollEntry).Callback to be called // on it before it gets moved to the readyList. // disabledList -- when the entry is disabled. 
This happens when // a one-shot entry gets delivered via readEvents(). @@ -269,23 +269,19 @@ func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent { return ret } -// readyCallback is called when one of the files we're polling becomes ready. It -// moves said file to the readyList if it's currently in the waiting list. -type readyCallback struct { - context *pollEntry -} - // Callback implements waiter.EntryCallback.Callback. -func (r *readyCallback) Callback(*waiter.Entry) { - entry := r.context - e := entry.epoll +// +// Callback is called when one of the files we're polling becomes ready. It +// moves said file to the readyList if it's currently in the waiting list. +func (p *pollEntry) Callback(*waiter.Entry) { + e := p.epoll e.listsMu.Lock() - if entry.curList == &e.waitingList { - e.waitingList.Remove(entry) - e.readyList.PushBack(entry) - entry.curList = &e.readyList + if p.curList == &e.waitingList { + e.waitingList.Remove(p) + e.readyList.PushBack(p) + p.curList = &e.readyList e.listsMu.Unlock() e.Notify(waiter.EventIn) @@ -312,7 +308,7 @@ func (e *EventPoll) initEntryReadiness(entry *pollEntry) { // Check if the file happens to already be in a ready state. ready := f.Readiness(entry.mask) & entry.mask if ready != 0 { - (&readyCallback{context: entry}).Callback(&entry.waiter) + entry.Callback(&entry.waiter) } } @@ -384,7 +380,7 @@ func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.Ev flags: flags, mask: mask, } - entry.waiter.Callback = &readyCallback{context: entry} + entry.waiter.Callback = entry e.files[id] = entry entry.file = refs.NewWeakRef(id.File, entry) @@ -407,7 +403,7 @@ func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter } // Unregister the old mask and remove entry from the list it's in, so - // readyCallback is guaranteed to not be called on this entry anymore. + // (*pollEntry).Callback is guaranteed to not be called on this entry anymore. 
entry.id.File.EventUnregister(&entry.waiter) // Remove entry from whatever list it's in. This ensure that no other diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go index 02f9aabfa..7c61e0258 100644 --- a/pkg/sentry/kernel/epoll/epoll_state.go +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -21,7 +21,7 @@ import ( // afterLoad is invoked by stateify. func (p *pollEntry) afterLoad() { - p.waiter.Callback = &readyCallback{context: p} + p.waiter.Callback = p p.file = refs.NewWeakRef(p.id.File, p) p.id.File.EventRegister(&p.waiter, p.mask) } -- cgit v1.2.3 From 9cfc15497581824f1c6ba2b9f9ee653d0be0bc5a Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 26 Jun 2020 16:23:15 -0700 Subject: Require CAP_SYS_ADMIN in the root user namespace for TTY theft PiperOrigin-RevId: 318563543 --- pkg/sentry/kernel/thread_group.go | 3 ++- test/syscalls/linux/pty_root.cc | 22 ++++++++++++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 52849f5b3..4dfd2c990 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -366,7 +366,8 @@ func (tg *ThreadGroup) SetControllingTTY(tty *TTY, arg int32) error { // terminal is stolen, and all processes that had it as controlling // terminal lose it." - tty_ioctl(4) if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session { - if !auth.CredentialsFromContext(tg.leader).HasCapability(linux.CAP_SYS_ADMIN) || arg != 1 { + // Stealing requires CAP_SYS_ADMIN in the root user namespace. + if creds := auth.CredentialsFromContext(tg.leader); !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) || arg != 1 { return syserror.EPERM } // Steal the TTY away. Unlike TIOCNOTTY, don't send signals. 
diff --git a/test/syscalls/linux/pty_root.cc b/test/syscalls/linux/pty_root.cc index 14a4af980..1d7dbefdb 100644 --- a/test/syscalls/linux/pty_root.cc +++ b/test/syscalls/linux/pty_root.cc @@ -25,16 +25,26 @@ namespace gvisor { namespace testing { -// These tests should be run as root. namespace { +// StealTTY tests whether privileged processes can steal controlling terminals. +// If the stealing process has CAP_SYS_ADMIN in the root user namespace, the +// test ensures that stealing works. If it has non-root CAP_SYS_ADMIN, it +// ensures stealing fails. TEST(JobControlRootTest, StealTTY) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); - // Make this a session leader, which also drops the controlling terminal. - // In the gVisor test environment, this test will be run as the session - // leader already (as the sentry init process). + bool true_root = true; if (!IsRunningOnGvisor()) { + // If running in Linux, we may only have CAP_SYS_ADMIN in a non-root user + // namespace (i.e. we are not truly root). We use init_module as a proxy for + // whether we are true root, as it returns EPERM immediately. + ASSERT_THAT(syscall(SYS_init_module, nullptr, 0, nullptr), SyscallFails()); + true_root = errno != EPERM; + + // Make this a session leader, which also drops the controlling terminal. + // In the gVisor test environment, this test will be run as the session + // leader already (as the sentry init process). ASSERT_THAT(setsid(), SyscallSucceeds()); } @@ -53,8 +63,8 @@ TEST(JobControlRootTest, StealTTY) { ASSERT_THAT(setsid(), SyscallSucceeds()); // We shouldn't be able to steal the terminal with the wrong arg value. TEST_PCHECK(ioctl(slave.get(), TIOCSCTTY, 0)); - // We should be able to steal it here. - TEST_PCHECK(!ioctl(slave.get(), TIOCSCTTY, 1)); + // We should be able to steal it if we are true root. 
+ TEST_PCHECK(true_root == !ioctl(slave.get(), TIOCSCTTY, 1)); _exit(0); } -- cgit v1.2.3 From e8f1a5c1f652ba7abb8c4bd842d6afdcab03865a Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Sat, 27 Jun 2020 21:32:16 -0700 Subject: Port GETOWN, SETOWN fcntls to vfs2. Also make some fixes to vfs1's F_SETOWN. The fcntl test now entirely passes on vfs2. Fixes #2920. PiperOrigin-RevId: 318669529 --- pkg/abi/linux/fcntl.go | 2 +- pkg/sentry/kernel/fasync/BUILD | 1 + pkg/sentry/kernel/fasync/fasync.go | 8 +- pkg/sentry/syscalls/linux/sys_file.go | 15 ++-- pkg/sentry/syscalls/linux/vfs2/BUILD | 1 + pkg/sentry/syscalls/linux/vfs2/fd.go | 93 ++++++++++++++++++++++ pkg/sentry/vfs/file_description.go | 74 ++++++++++++++++-- test/syscalls/linux/BUILD | 1 + test/syscalls/linux/fcntl.cc | 140 ++++++++++++++++++++++++++++++---- 9 files changed, 308 insertions(+), 27 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go index 6663a199c..9242e80a5 100644 --- a/pkg/abi/linux/fcntl.go +++ b/pkg/abi/linux/fcntl.go @@ -55,7 +55,7 @@ type Flock struct { _ [4]byte } -// Flags for F_SETOWN_EX and F_GETOWN_EX. +// Owner types for F_SETOWN_EX and F_GETOWN_EX. 
const ( F_OWNER_TID = 0 F_OWNER_PID = 1 diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index b9126e946..2b3955598 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -11,6 +11,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", "//pkg/sync", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index d32c3e90a..323f1dfa5 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -20,15 +20,21 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) -// New creates a new FileAsync. +// New creates a new fs.FileAsync. func New() fs.FileAsync { return &FileAsync{} } +// NewVFS2 creates a new vfs.FileAsync. +func NewVFS2() vfs.FileAsync { + return &FileAsync{} +} + // FileAsync sends signals when the registered file is ready for IO. // // +stateify savable diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 35eba20c5..2797c6a72 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -900,14 +900,20 @@ func fGetOwn(t *kernel.Task, file *fs.File) int32 { // // If who is positive, it represents a PID. If negative, it represents a PGID. // If the PID or PGID is invalid, the owner is silently unset. -func fSetOwn(t *kernel.Task, file *fs.File, who int32) { +func fSetOwn(t *kernel.Task, file *fs.File, who int32) error { a := file.Async(fasync.New).(*fasync.FileAsync) if who < 0 { + // Check for overflow before flipping the sign. 
+ if who-1 > who { + return syserror.EINVAL + } pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(-who)) a.SetOwnerProcessGroup(t, pg) + } else { + tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who)) + a.SetOwnerThreadGroup(t, tg) } - tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who)) - a.SetOwnerThreadGroup(t, tg) + return nil } // Fcntl implements linux syscall fcntl(2). @@ -1042,8 +1048,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_GETOWN: return uintptr(fGetOwn(t, file)), nil, nil case linux.F_SETOWN: - fSetOwn(t, file, args[2].Int()) - return 0, nil, nil + return 0, nil, fSetOwn(t, file, args[2].Int()) case linux.F_GETOWN_EX: addr := args[2].Pointer() owner := fGetOwnEx(t, file) diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index c301a0991..0c740335b 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -54,6 +54,7 @@ go_library( "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/fasync", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index e68b20bed..7e4c6a56e 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/fasync" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -154,6 +155,47 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err } return uintptr(n), nil, nil + case linux.F_GETOWN: + a := file.AsyncHandler() + if a == nil { + return 0, nil, nil + } + owner 
:= getAsyncOwner(t, a.(*fasync.FileAsync)) + if owner.Type == linux.F_OWNER_PGRP { + return uintptr(-owner.PID), nil, nil + } + return uintptr(owner.PID), nil, nil + case linux.F_SETOWN: + who := args[2].Int() + ownerType := int32(linux.F_OWNER_PID) + if who < 0 { + // Check for overflow before flipping the sign. + if who-1 > who { + return 0, nil, syserror.EINVAL + } + ownerType = linux.F_OWNER_PGRP + who = -who + } + a := file.SetAsyncHandler(fasync.NewVFS2).(*fasync.FileAsync) + return 0, nil, setAsyncOwner(t, a, ownerType, who) + case linux.F_GETOWN_EX: + a := file.AsyncHandler() + if a == nil { + return 0, nil, nil + } + addr := args[2].Pointer() + owner := getAsyncOwner(t, a.(*fasync.FileAsync)) + _, err := t.CopyOut(addr, &owner) + return 0, nil, err + case linux.F_SETOWN_EX: + addr := args[2].Pointer() + var owner linux.FOwnerEx + n, err := t.CopyIn(addr, &owner) + if err != nil { + return 0, nil, err + } + a := file.SetAsyncHandler(fasync.NewVFS2).(*fasync.FileAsync) + return uintptr(n), nil, setAsyncOwner(t, a, owner.Type, owner.PID) case linux.F_GETPIPE_SZ: pipefile, ok := file.Impl().(*pipe.VFSPipeFD) if !ok { @@ -177,6 +219,57 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } } +func getAsyncOwner(t *kernel.Task, a *fasync.FileAsync) linux.FOwnerEx { + ot, otg, opg := a.Owner() + switch { + case ot != nil: + return linux.FOwnerEx{ + Type: linux.F_OWNER_TID, + PID: int32(t.PIDNamespace().IDOfTask(ot)), + } + case otg != nil: + return linux.FOwnerEx{ + Type: linux.F_OWNER_PID, + PID: int32(t.PIDNamespace().IDOfThreadGroup(otg)), + } + case opg != nil: + return linux.FOwnerEx{ + Type: linux.F_OWNER_PGRP, + PID: int32(t.PIDNamespace().IDOfProcessGroup(opg)), + } + default: + return linux.FOwnerEx{} + } +} + +func setAsyncOwner(t *kernel.Task, a *fasync.FileAsync, ownerType, pid int32) error { + switch ownerType { + case linux.F_OWNER_TID: + task := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) + if task == nil { + 
return syserror.ESRCH + } + a.SetOwnerTask(t, task) + return nil + case linux.F_OWNER_PID: + tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(pid)) + if tg == nil { + return syserror.ESRCH + } + a.SetOwnerThreadGroup(t, tg) + return nil + case linux.F_OWNER_PGRP: + pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(pid)) + if pg == nil { + return syserror.ESRCH + } + a.SetOwnerProcessGroup(t, pg) + return nil + default: + return syserror.EINVAL + } +} + func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, cmd int32) error { // Copy in the lock request. flockAddr := args[2].Pointer() diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index e0538ea53..cd1db14ac 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -42,11 +42,20 @@ type FileDescription struct { // operations. refs int64 + // flagsMu protects statusFlags and asyncHandler below. + flagsMu sync.Mutex + // statusFlags contains status flags, "initialized by open(2) and possibly - // modified by fcntl()" - fcntl(2). statusFlags is accessed using atomic - // memory operations. + // modified by fcntl()" - fcntl(2). statusFlags can be read using atomic + // memory operations when it does not need to be synchronized with an + // access to asyncHandler. statusFlags uint32 + // asyncHandler handles O_ASYNC signal generation. It is set with the + // F_SETOWN or F_SETOWN_EX fcntls. For asyncHandler to be used, O_ASYNC must + // also be set by fcntl(2). + asyncHandler FileAsync + // epolls is the set of epollInterests registered for this FileDescription. // epolls is protected by epollMu. epollMu sync.Mutex @@ -193,6 +202,13 @@ func (fd *FileDescription) DecRef() { fd.vd.mount.EndWrite() } fd.vd.DecRef() + fd.flagsMu.Lock() + // TODO(gvisor.dev/issue/1663): We may need to unregister during save, as we do in VFS1. 
+ if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil { + fd.asyncHandler.Unregister(fd) + } + fd.asyncHandler = nil + fd.flagsMu.Unlock() } else if refs < 0 { panic("FileDescription.DecRef() called without holding a reference") } @@ -276,7 +292,18 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede } // TODO(jamieliu): FileDescriptionImpl.SetOAsync()? const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK - atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags)) + fd.flagsMu.Lock() + if fd.asyncHandler != nil { + // Use fd.statusFlags instead of oldFlags, which may have become outdated, + // to avoid double registering/unregistering. + if fd.statusFlags&linux.O_ASYNC == 0 && flags&linux.O_ASYNC != 0 { + fd.asyncHandler.Register(fd) + } else if fd.statusFlags&linux.O_ASYNC != 0 && flags&linux.O_ASYNC == 0 { + fd.asyncHandler.Unregister(fd) + } + } + fd.statusFlags = (oldFlags &^ settableFlags) | (flags & settableFlags) + fd.flagsMu.Unlock() return nil } @@ -533,17 +560,23 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { return fd.impl.StatFS(ctx) } -// Readiness returns fd's I/O readiness. +// Readiness implements waiter.Waitable.Readiness. +// +// It returns fd's I/O readiness. func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { return fd.impl.Readiness(mask) } -// EventRegister registers e for I/O readiness events in mask. +// EventRegister implements waiter.Waitable.EventRegister. +// +// It registers e for I/O readiness events in mask. func (fd *FileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { fd.impl.EventRegister(e, mask) } -// EventUnregister unregisters e for I/O readiness events. +// EventUnregister implements waiter.Waitable.EventUnregister. +// +// It unregisters e for I/O readiness events. 
func (fd *FileDescription) EventUnregister(e *waiter.Entry) { fd.impl.EventUnregister(e) } @@ -770,3 +803,32 @@ func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, t l func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, end uint64, whence int16) error { return fd.impl.UnlockPOSIX(ctx, uid, start, end, whence) } + +// A FileAsync sends signals to its owner when w is ready for IO. This is only +// implemented by pkg/sentry/fasync:FileAsync, but we unfortunately need this +// interface to avoid circular dependencies. +type FileAsync interface { + Register(w waiter.Waitable) + Unregister(w waiter.Waitable) +} + +// AsyncHandler returns the FileAsync for fd. +func (fd *FileDescription) AsyncHandler() FileAsync { + fd.flagsMu.Lock() + defer fd.flagsMu.Unlock() + return fd.asyncHandler +} + +// SetAsyncHandler sets fd.asyncHandler if it has not been set before and +// returns it. +func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsync { + fd.flagsMu.Lock() + defer fd.flagsMu.Unlock() + if fd.asyncHandler == nil { + fd.asyncHandler = newHandler() + if fd.statusFlags&linux.O_ASYNC != 0 { + fd.asyncHandler.Register(fd) + } + } + return fd.asyncHandler +} diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 270b9e4c4..e141a86bb 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -805,6 +805,7 @@ cc_binary( "//test/util:save_util", "//test/util:temp_path", "//test/util:test_util", + "//test/util:thread_util", "//test/util:timer_util", ], ) diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc index 25bef2522..9130618fa 100644 --- a/test/syscalls/linux/fcntl.cc +++ b/test/syscalls/linux/fcntl.cc @@ -37,6 +37,7 @@ #include "test/util/save_util.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" +#include "test/util/thread_util.h" #include "test/util/timer_util.h" ABSL_FLAG(std::string, child_setlock_on, "", @@ -953,15 
+954,18 @@ TEST(FcntlTest, DupAfterO_ASYNC) { EXPECT_EQ(after & O_ASYNC, O_ASYNC); } -TEST(FcntlTest, GetOwn) { +TEST(FcntlTest, GetOwnNone) { FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); - EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), 0); + // Use the raw syscall because the glibc wrapper may convert F_{GET,SET}OWN + // into F_{GET,SET}OWN_EX. + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), + SyscallSucceedsWithValue(0)); MaybeSave(); } -TEST(FcntlTest, GetOwnEx) { +TEST(FcntlTest, GetOwnExNone) { FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); @@ -970,6 +974,70 @@ TEST(FcntlTest, GetOwnEx) { SyscallSucceedsWithValue(0)); } +TEST(FcntlTest, SetOwnInvalidPid) { + SKIP_IF(IsRunningWithVFS1()); + + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, 12345678), + SyscallFailsWithErrno(ESRCH)); +} + +TEST(FcntlTest, SetOwnInvalidPgrp) { + SKIP_IF(IsRunningWithVFS1()); + + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, -12345678), + SyscallFailsWithErrno(ESRCH)); +} + +TEST(FcntlTest, SetOwnPid) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + pid_t pid; + EXPECT_THAT(pid = getpid(), SyscallSucceeds()); + + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, pid), SyscallSucceeds()); + + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), + SyscallSucceedsWithValue(pid)); + MaybeSave(); +} + +TEST(FcntlTest, SetOwnPgrp) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + pid_t pgid; + EXPECT_THAT(pgid = getpgrp(), SyscallSucceeds()); + + 
ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, -pgid), SyscallSucceeds()); + + // Verify with F_GETOWN_EX; using F_GETOWN on Linux may incorrectly treat the + // negative return value as an error, converting the return value to -1 and + // setting errno accordingly. + f_owner_ex got_owner = {}; + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(got_owner.type, F_OWNER_PGRP); + EXPECT_EQ(got_owner.pid, pgid); + MaybeSave(); +} + +// F_SETOWN flips the sign of negative values, an operation that is guarded +// against overflow. +TEST(FcntlTest, SetOwnOverflow) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, INT_MIN), + SyscallFailsWithErrno(EINVAL)); +} + TEST(FcntlTest, SetOwnExInvalidType) { FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); @@ -1027,7 +1095,8 @@ TEST(FcntlTest, SetOwnExTid) { ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), SyscallSucceeds()); - EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), owner.pid); + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), + SyscallSucceedsWithValue(owner.pid)); MaybeSave(); } @@ -1042,7 +1111,8 @@ TEST(FcntlTest, SetOwnExPid) { ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), SyscallSucceeds()); - EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), owner.pid); + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), + SyscallSucceedsWithValue(owner.pid)); MaybeSave(); } @@ -1050,18 +1120,21 @@ TEST(FcntlTest, SetOwnExPgrp) { FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); - f_owner_ex owner = {}; - owner.type = F_OWNER_PGRP; - EXPECT_THAT(owner.pid = getpgrp(), SyscallSucceeds()); + f_owner_ex set_owner = {}; + set_owner.type = F_OWNER_PGRP; + EXPECT_THAT(set_owner.pid = 
getpgrp(), SyscallSucceeds()); - ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner), SyscallSucceeds()); - // NOTE(igudger): I don't understand why, but this is flaky on Linux. - // GetOwnExPgrp (below) does not have this issue. - SKIP_IF(!IsRunningOnGvisor()); - - EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), -owner.pid); + // Verify with F_GETOWN_EX; using F_GETOWN on Linux may incorrectly treat the + // negative return value as an error, converting the return value to -1 and + // setting errno accordingly. + f_owner_ex got_owner = {}; + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(got_owner.type, set_owner.type); + EXPECT_EQ(got_owner.pid, set_owner.pid); MaybeSave(); } @@ -1119,6 +1192,45 @@ TEST(FcntlTest, GetOwnExPgrp) { EXPECT_EQ(got_owner.pid, set_owner.pid); } +// Make sure that making multiple concurrent changes to async signal generation +// does not cause any race issues. 
+TEST(FcntlTest, SetFlSetOwnDoNotRace) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + pid_t pid; + EXPECT_THAT(pid = getpid(), SyscallSucceeds()); + + constexpr absl::Duration runtime = absl::Milliseconds(300); + auto setAsync = [&s, &runtime] { + for (auto start = absl::Now(); absl::Now() - start < runtime;) { + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETFL, O_ASYNC), + SyscallSucceeds()); + sched_yield(); + } + }; + auto resetAsync = [&s, &runtime] { + for (auto start = absl::Now(); absl::Now() - start < runtime;) { + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETFL, 0), SyscallSucceeds()); + sched_yield(); + } + }; + auto setOwn = [&s, &pid, &runtime] { + for (auto start = absl::Now(); absl::Now() - start < runtime;) { + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, pid), + SyscallSucceeds()); + sched_yield(); + } + }; + + std::list<ScopedThread> threads; + for (int i = 0; i < 10; i++) { + threads.emplace_back(setAsync); + threads.emplace_back(resetAsync); + threads.emplace_back(setOwn); + } +} + } // namespace } // namespace testing -- cgit v1.2.3 From cda2979b63fad37a33706f8aa430664a9c4d0b3b Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Wed, 1 Jul 2020 08:40:31 -0700 Subject: Complete async signal delivery support in vfs2. - Support FIOASYNC, FIO{SET,GET}OWN, SIOC{G,S}PGRP (refactor getting/setting owner in the process). - Unset signal recipient when setting owner with pid == 0 and valid owner type. Updates #2923. 
PiperOrigin-RevId: 319231420 --- pkg/sentry/kernel/fasync/fasync.go | 10 ++++++ pkg/sentry/syscalls/linux/vfs2/fd.go | 54 +++++++++++++++++++------------ pkg/sentry/syscalls/linux/vfs2/ioctl.go | 43 +++++++++++++++++++++++++ test/syscalls/BUILD | 1 + test/syscalls/linux/fcntl.cc | 57 +++++++++++++++++++++++++++++++++ 5 files changed, 144 insertions(+), 21 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index 323f1dfa5..153d2cd9b 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -176,3 +176,13 @@ func (a *FileAsync) SetOwnerProcessGroup(requester *kernel.Task, recipient *kern a.recipientTG = nil a.recipientPG = recipient } + +// ClearOwner unsets the current signal recipient. +func (a *FileAsync) ClearOwner() { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = nil + a.recipientT = nil + a.recipientTG = nil + a.recipientPG = nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 7e4c6a56e..517394ba9 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -156,11 +156,10 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } return uintptr(n), nil, nil case linux.F_GETOWN: - a := file.AsyncHandler() - if a == nil { + owner, hasOwner := getAsyncOwner(t, file) + if !hasOwner { return 0, nil, nil } - owner := getAsyncOwner(t, a.(*fasync.FileAsync)) if owner.Type == linux.F_OWNER_PGRP { return uintptr(-owner.PID), nil, nil } @@ -176,26 +175,21 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall ownerType = linux.F_OWNER_PGRP who = -who } - a := file.SetAsyncHandler(fasync.NewVFS2).(*fasync.FileAsync) - return 0, nil, setAsyncOwner(t, a, ownerType, who) + return 0, nil, setAsyncOwner(t, file, ownerType, who) case linux.F_GETOWN_EX: - a := file.AsyncHandler() - if a == nil { + owner, hasOwner := 
getAsyncOwner(t, file) + if !hasOwner { return 0, nil, nil } - addr := args[2].Pointer() - owner := getAsyncOwner(t, a.(*fasync.FileAsync)) - _, err := t.CopyOut(addr, &owner) + _, err := t.CopyOut(args[2].Pointer(), &owner) return 0, nil, err case linux.F_SETOWN_EX: - addr := args[2].Pointer() var owner linux.FOwnerEx - n, err := t.CopyIn(addr, &owner) + n, err := t.CopyIn(args[2].Pointer(), &owner) if err != nil { return 0, nil, err } - a := file.SetAsyncHandler(fasync.NewVFS2).(*fasync.FileAsync) - return uintptr(n), nil, setAsyncOwner(t, a, owner.Type, owner.PID) + return uintptr(n), nil, setAsyncOwner(t, file, owner.Type, owner.PID) case linux.F_GETPIPE_SZ: pipefile, ok := file.Impl().(*pipe.VFSPipeFD) if !ok { @@ -219,30 +213,48 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } } -func getAsyncOwner(t *kernel.Task, a *fasync.FileAsync) linux.FOwnerEx { - ot, otg, opg := a.Owner() +func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwnerEx, hasOwner bool) { + a := fd.AsyncHandler() + if a == nil { + return linux.FOwnerEx{}, false + } + + ot, otg, opg := a.(*fasync.FileAsync).Owner() switch { case ot != nil: return linux.FOwnerEx{ Type: linux.F_OWNER_TID, PID: int32(t.PIDNamespace().IDOfTask(ot)), - } + }, true case otg != nil: return linux.FOwnerEx{ Type: linux.F_OWNER_PID, PID: int32(t.PIDNamespace().IDOfThreadGroup(otg)), - } + }, true case opg != nil: return linux.FOwnerEx{ Type: linux.F_OWNER_PGRP, PID: int32(t.PIDNamespace().IDOfProcessGroup(opg)), - } + }, true default: - return linux.FOwnerEx{} + return linux.FOwnerEx{}, true } } -func setAsyncOwner(t *kernel.Task, a *fasync.FileAsync, ownerType, pid int32) error { +func setAsyncOwner(t *kernel.Task, fd *vfs.FileDescription, ownerType, pid int32) error { + switch ownerType { + case linux.F_OWNER_TID, linux.F_OWNER_PID, linux.F_OWNER_PGRP: + // Acceptable type. 
+ default: + return syserror.EINVAL + } + + a := fd.SetAsyncHandler(fasync.NewVFS2).(*fasync.FileAsync) + if pid == 0 { + a.ClearOwner() + return nil + } + switch ownerType { case linux.F_OWNER_TID: task := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go index 0399c0db4..fd6ab94b2 100644 --- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -57,6 +57,49 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall flags &^= linux.O_NONBLOCK } return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags) + + case linux.FIOASYNC: + var set int32 + if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + return 0, nil, err + } + flags := file.StatusFlags() + if set != 0 { + flags |= linux.O_ASYNC + } else { + flags &^= linux.O_ASYNC + } + file.SetStatusFlags(t, t.Credentials(), flags) + return 0, nil, nil + + case linux.FIOGETOWN, linux.SIOCGPGRP: + var who int32 + owner, hasOwner := getAsyncOwner(t, file) + if hasOwner { + if owner.Type == linux.F_OWNER_PGRP { + who = -owner.PID + } else { + who = owner.PID + } + } + _, err := t.CopyOut(args[2].Pointer(), &who) + return 0, nil, err + + case linux.FIOSETOWN, linux.SIOCSPGRP: + var who int32 + if _, err := t.CopyIn(args[2].Pointer(), &who); err != nil { + return 0, nil, err + } + ownerType := int32(linux.F_OWNER_PID) + if who < 0 { + // Check for overflow before flipping the sign. 
+ if who-1 > who { + return 0, nil, syserror.EINVAL + } + ownerType = linux.F_OWNER_PGRP + who = -who + } + return 0, nil, setAsyncOwner(t, file, ownerType, who) } ret, err := file.Ioctl(t, t.MemoryManager(), args) diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 36c178e4a..88ed36b69 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -288,6 +288,7 @@ syscall_test( size = "medium", add_overlay = True, test = "//test/syscalls/linux:ioctl_test", + vfs2 = "True", ) syscall_test( diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc index 4b9b4db99..5467fa2c8 100644 --- a/test/syscalls/linux/fcntl.cc +++ b/test/syscalls/linux/fcntl.cc @@ -1031,6 +1031,30 @@ TEST(FcntlTest, SetOwnPgrp) { MaybeSave(); } +TEST(FcntlTest, SetOwnUnset) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + // Set and unset pid. + pid_t pid; + EXPECT_THAT(pid = getpid(), SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, pid), SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, 0), SyscallSucceeds()); + + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), + SyscallSucceedsWithValue(0)); + + // Set and unset pgid. + pid_t pgid; + EXPECT_THAT(pgid = getpgrp(), SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, -pgid), SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, 0), SyscallSucceeds()); + + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), + SyscallSucceedsWithValue(0)); + MaybeSave(); +} + // F_SETOWN flips the sign of negative values, an operation that is guarded // against overflow. 
TEST(FcntlTest, SetOwnOverflow) { @@ -1141,6 +1165,39 @@ TEST(FcntlTest, SetOwnExPgrp) { MaybeSave(); } +TEST(FcntlTest, SetOwnExUnset) { + SKIP_IF(IsRunningWithVFS1()); + + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + // Set and unset pid. + f_owner_ex owner = {}; + owner.type = F_OWNER_PID; + EXPECT_THAT(owner.pid = getpid(), SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), + SyscallSucceeds()); + owner.pid = 0; + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), + SyscallSucceeds()); + + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), + SyscallSucceedsWithValue(0)); + + // Set and unset pgid. + owner.type = F_OWNER_PGRP; + EXPECT_THAT(owner.pid = getpgrp(), SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), + SyscallSucceeds()); + owner.pid = 0; + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), + SyscallSucceeds()); + + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), + SyscallSucceedsWithValue(0)); + MaybeSave(); +} + TEST(FcntlTest, GetOwnExTid) { FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); -- cgit v1.2.3 From 6a90c88b97481a6d81b05f09d5c8ed7158225dd5 Mon Sep 17 00:00:00 2001 From: Zach Koopmans Date: Wed, 1 Jul 2020 13:13:04 -0700 Subject: Port fallocate to VFS2. 
PiperOrigin-RevId: 319283715 --- pkg/p9/p9.go | 13 ++++++++ pkg/sentry/fsimpl/gofer/regular_file.go | 29 ++++++++++++++++ pkg/sentry/fsimpl/host/host.go | 10 ++++++ pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 5 +++ pkg/sentry/fsimpl/tmpfs/regular_file.go | 15 +++++++++ pkg/sentry/kernel/pipe/vfs.go | 5 +++ pkg/sentry/socket/hostinet/socket_vfs2.go | 7 +++- pkg/sentry/syscalls/linux/vfs2/filesystem.go | 50 ++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/vfs2.go | 2 +- pkg/sentry/vfs/file_description.go | 4 +++ pkg/sentry/vfs/file_description_impl_util.go | 11 ++++++ pkg/sentry/vfs/inotify.go | 5 +++ test/syscalls/BUILD | 1 + test/syscalls/linux/BUILD | 5 +++ test/syscalls/linux/fallocate.cc | 45 +++++++++++++++++++++++++ 15 files changed, 205 insertions(+), 2 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go index 28d851ff5..122c457d2 100644 --- a/pkg/p9/p9.go +++ b/pkg/p9/p9.go @@ -1091,6 +1091,19 @@ type AllocateMode struct { Unshare bool } +// ToAllocateMode returns an AllocateMode from a fallocate(2) mode. +func ToAllocateMode(mode uint64) AllocateMode { + return AllocateMode{ + KeepSize: mode&unix.FALLOC_FL_KEEP_SIZE != 0, + PunchHole: mode&unix.FALLOC_FL_PUNCH_HOLE != 0, + NoHideStale: mode&unix.FALLOC_FL_NO_HIDE_STALE != 0, + CollapseRange: mode&unix.FALLOC_FL_COLLAPSE_RANGE != 0, + ZeroRange: mode&unix.FALLOC_FL_ZERO_RANGE != 0, + InsertRange: mode&unix.FALLOC_FL_INSERT_RANGE != 0, + Unshare: mode&unix.FALLOC_FL_UNSHARE_RANGE != 0, + } +} + // ToLinux converts to a value compatible with fallocate(2)'s mode. 
func (a *AllocateMode) ToLinux() uint32 { rv := uint32(0) diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 404a452c4..faba73af2 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -67,6 +68,34 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { return d.handle.file.flush(ctx) } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + + d := fd.dentry() + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + + size := offset + length + + // Allocating a smaller size is a noop. + if size <= d.size { + return nil + } + + d.handleMu.Lock() + defer d.handleMu.Unlock() + + err := d.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length) + if err != nil { + return err + } + d.size = size + if !d.cachedMetadataAuthoritative() { + d.touchCMtimeLocked() + } + return nil +} + // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if offset < 0 { diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index d5e73ddae..007b3332e 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -543,6 +543,16 @@ func (f *fileDescription) Release() { // noop } +// Allocate implements vfs.FileDescriptionImpl. +func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { + if !f.inode.seekable { + return syserror.ESPIPE + } + + // TODO(gvisor.dev/issue/2923): Implement Allocate for non-pipe hostfds. 
+ return syserror.EOPNOTSUPP +} + // PRead implements FileDescriptionImpl. func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { i := f.inode diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 5f7853a2a..ca8b8c63b 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -236,6 +236,11 @@ func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptio return inode.SetStat(ctx, fd.filesystem(), creds, opts) } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *GenericDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return fd.DirectoryFileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length) +} + // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. func (fd *GenericDirectoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index b805aadd0..6691add96 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -274,6 +274,21 @@ func (fd *regularFileFD) Release() { // noop } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + f := fd.inode().impl.(*regularFile) + + f.inode.mu.Lock() + defer f.inode.mu.Unlock() + oldSize := f.size + size := offset + length + if oldSize >= size { + return nil + } + _, err := f.truncateLocked(size) + return err +} + // PRead implements vfs.FileDescriptionImpl.PRead. 
func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if offset < 0 { diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index a4519363f..45d4c5fc1 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -200,6 +200,11 @@ func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask { } } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *VFSPipeFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.ESPIPE +} + // EventRegister implements waiter.Waitable.EventRegister. func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { fd.pipe.EventRegister(e, mask) diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go index ad5f64799..8f192c62f 100644 --- a/pkg/sentry/socket/hostinet/socket_vfs2.go +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -96,7 +96,12 @@ func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal return ioctl(ctx, s.fd, uio, args) } -// PRead implements vfs.FileDescriptionImpl. +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (s *socketVFS2) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.ENODEV +} + +// PRead implements vfs.FileDescriptionImpl.PRead. 
func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return 0, syserror.ESPIPE } diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go index 5dac77e4d..b12b5967b 100644 --- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -18,6 +18,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -239,6 +240,55 @@ func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd }) } +// Fallocate implements linux system call fallocate(2). +func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + mode := args[1].Uint64() + offset := args[2].Int64() + length := args[3].Int64() + + file := t.GetFileVFS2(fd) + + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + if !file.IsWritable() { + return 0, nil, syserror.EBADF + } + + if mode != 0 { + return 0, nil, syserror.ENOTSUP + } + + if offset < 0 || length <= 0 { + return 0, nil, syserror.EINVAL + } + + size := offset + length + + if size < 0 { + return 0, nil, syserror.EFBIG + } + + limit := limits.FromContext(t).Get(limits.FileSize).Cur + + if uint64(size) >= limit { + t.SendSignal(&arch.SignalInfo{ + Signo: int32(linux.SIGXFSZ), + Code: arch.SignalInfoUser, + }) + return 0, nil, syserror.EFBIG + } + + return 0, nil, file.Impl().Allocate(t, mode, uint64(offset), uint64(length)) + + // File length modified, generate notification. + // TODO(gvisor.dev/issue/1479): Reenable when Inotify is ported. + // file.Dirent.InotifyEvent(linux.IN_MODIFY, 0) +} + // Rmdir implements Linux syscall rmdir(2). 
func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 9e60c4a1c..8f497ecc7 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -138,7 +138,7 @@ func Override() { s.Table[282] = syscalls.Supported("signalfd", Signalfd) s.Table[283] = syscalls.Supported("timerfd_create", TimerfdCreate) s.Table[284] = syscalls.Supported("eventfd", Eventfd) - delete(s.Table, 285) // fallocate + s.Table[285] = syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil) s.Table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime) s.Table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime) s.Table[288] = syscalls.Supported("accept4", Accept4) diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index eb5dfd7e2..0c42574db 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -354,6 +354,10 @@ type FileDescriptionImpl interface { // represented by the FileDescription. StatFS(ctx context.Context) (linux.Statfs, error) + // Allocate grows file represented by FileDescription to offset + length bytes. + // Only mode == 0 is supported currently. + Allocate(ctx context.Context, mode, offset, length uint64) error + // waiter.Waitable methods may be used to poll for I/O events. 
waiter.Waitable diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 3fec0d6d6..6b8b4ad49 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -56,6 +56,12 @@ func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, err return linux.Statfs{}, syserror.ENOSYS } +// Allocate implements FileDescriptionImpl.Allocate analogously to +// fallocate called on regular file, directory or FIFO in Linux. +func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.ENODEV +} + // Readiness implements waiter.Waitable.Readiness analogously to // file_operations::poll == NULL in Linux. func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask { @@ -158,6 +164,11 @@ func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) // implementations of non-directory I/O methods that return EISDIR. type DirectoryFileDescriptionDefaultImpl struct{} +// Allocate implements DirectoryFileDescriptionDefaultImpl.Allocate. +func (DirectoryFileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.EISDIR +} + // PRead implements FileDescriptionImpl.PRead. func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { return 0, syserror.EISDIR diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 509034531..c2e21ac5f 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -148,6 +148,11 @@ func (i *Inotify) Release() { } } +// Allocate implements FileDescription.Allocate. +func (i *Inotify) Allocate(ctx context.Context, mode, offset, length uint64) error { + panic("Allocate should not be called on read-only inotify fds") +} + // EventRegister implements waiter.Waitable. 
func (i *Inotify) EventRegister(e *waiter.Entry, mask waiter.EventMask) { i.queue.EventRegister(e, mask) diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 88ed36b69..67645cc83 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -199,6 +199,7 @@ syscall_test( syscall_test( add_overlay = True, test = "//test/syscalls/linux:fallocate_test", + vfs2 = "True", ) syscall_test( diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 8c7d54b21..9e097c888 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -748,9 +748,14 @@ cc_binary( linkstatic = 1, deps = [ ":file_base", + ":socket_test_util", "//test/util:cleanup", + "//test/util:eventfd_util", "//test/util:file_descriptor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/time", gtest, + "//test/util:posix_error", "//test/util:temp_path", "//test/util:test_main", "//test/util:test_util", diff --git a/test/syscalls/linux/fallocate.cc b/test/syscalls/linux/fallocate.cc index 7819f4ac3..cabc2b751 100644 --- a/test/syscalls/linux/fallocate.cc +++ b/test/syscalls/linux/fallocate.cc @@ -15,16 +15,27 @@ #include #include #include +#include #include +#include +#include #include +#include #include #include #include +#include + #include "gtest/gtest.h" +#include "absl/strings/str_cat.h" +#include "absl/time/time.h" #include "test/syscalls/linux/file_base.h" +#include "test/syscalls/linux/socket_test_util.h" #include "test/util/cleanup.h" +#include "test/util/eventfd_util.h" #include "test/util/file_descriptor.h" +#include "test/util/posix_error.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" @@ -70,6 +81,12 @@ TEST_F(AllocateTest, Fallocate) { ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 39, 1), SyscallSucceeds()); ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); EXPECT_EQ(buf.st_size, 40); + + // Given length 0 should fail with EINVAL. 
+ ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 50, 0), + SyscallFailsWithErrno(EINVAL)); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 40); } TEST_F(AllocateTest, FallocateInvalid) { @@ -136,6 +153,34 @@ TEST_F(AllocateTest, FallocateRlimit) { ASSERT_THAT(sigprocmask(SIG_UNBLOCK, &new_mask, nullptr), SyscallSucceeds()); } +TEST_F(AllocateTest, FallocateOtherFDs) { + int fd; + ASSERT_THAT(fd = timerfd_create(CLOCK_MONOTONIC, 0), SyscallSucceeds()); + auto timer_fd = FileDescriptor(fd); + EXPECT_THAT(fallocate(timer_fd.get(), 0, 0, 10), + SyscallFailsWithErrno(ENODEV)); + + sigset_t mask; + sigemptyset(&mask); + ASSERT_THAT(fd = signalfd(-1, &mask, 0), SyscallSucceeds()); + auto sfd = FileDescriptor(fd); + EXPECT_THAT(fallocate(sfd.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV)); + + auto efd = + ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE)); + EXPECT_THAT(fallocate(efd.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV)); + + auto sockfd = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); + EXPECT_THAT(fallocate(sockfd.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV)); + + int socks[2]; + ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, PF_UNIX, socks), + SyscallSucceeds()); + auto sock0 = FileDescriptor(socks[0]); + auto sock1 = FileDescriptor(socks[1]); + EXPECT_THAT(fallocate(sock0.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV)); +} + } // namespace } // namespace testing } // namespace gvisor -- cgit v1.2.3 From abffebde7be2dcdb4564e45f845d7c150ced0ccb Mon Sep 17 00:00:00 2001 From: Ridwan Sharif Date: Tue, 7 Jul 2020 21:48:25 -0400 Subject: Gate FUSE behind a runsc flag This change gates all FUSE commands (by gating /dev/fuse) behind a runsc flag. In order to use FUSE commands, use the --fuse flag with the --vfs2 flag. Check if FUSE is enabled by running dmesg in the sandbox. 
--- pkg/sentry/fsimpl/fuse/BUILD | 1 + pkg/sentry/fsimpl/fuse/dev.go | 5 +++++ pkg/sentry/kernel/kernel.go | 4 ++++ pkg/sentry/kernel/syslog.go | 9 +++++++++ runsc/boot/config.go | 7 +++++++ runsc/boot/loader.go | 4 ++++ runsc/boot/vfs.go | 14 ++++++++++---- runsc/main.go | 2 ++ test/runner/defs.bzl | 20 +++++++++++++++++++- test/runner/runner.go | 10 ++++++++++ test/syscalls/BUILD | 1 + test/syscalls/linux/dev.cc | 2 +- test/util/test_util.cc | 6 ++++++ test/util/test_util.h | 1 + 14 files changed, 80 insertions(+), 6 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD index 41567967d..3e00c2abb 100644 --- a/pkg/sentry/fsimpl/fuse/BUILD +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -12,6 +12,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/kernel", "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index f6a67d005..dc33268af 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -18,6 +18,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -30,6 +31,10 @@ type fuseDevice struct{} // Open implements vfs.Device.Open. 
func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + if !kernel.FUSEEnabled { + return nil, syserror.ENOENT + } + var fd DeviceFD if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ UseDentryMetadata: true, diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 2177b785a..240cd6fe0 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -81,6 +81,10 @@ import ( // easy access everywhere. To be removed once VFS2 becomes the default. var VFS2Enabled = false +// FUSEEnabled is set to true when FUSE is enabled. Added as a global to allow +// easy access everywhere. To be removed once FUSE is completed. +var FUSEEnabled = false + // Kernel represents an emulated Linux kernel. It must be initialized by calling // Init() or LoadFrom(). // diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index 4607cde2f..a83ce219c 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -98,6 +98,15 @@ func (s *syslog) Log() []byte { s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, selectMessage()))...) } + if VFS2Enabled { + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up VFS2..."))...) + if FUSEEnabled { + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up FUSE..."))...) + } + } + time += rand.Float64() / 2 s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Ready!"))...) diff --git a/runsc/boot/config.go b/runsc/boot/config.go index bb01b8fb5..80da8b3e6 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -274,6 +274,9 @@ type Config struct { // Enables VFS2 (not plumbled through yet). VFS2 bool + + // Enables FUSE usage (not plumbed through yet). + FUSE bool } // ToFlags returns a slice of flags that correspond to the given Config. 
@@ -325,5 +328,9 @@ func (c *Config) ToFlags() []string { f = append(f, "--vfs2=true") } + if c.FUSE { + f = append(f, "--fuse=true") + } + return f } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 0c0423ab2..93ac7ec41 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -205,6 +205,10 @@ func New(args Args) (*Loader, error) { // Is this a VFSv2 kernel? if args.Conf.VFS2 { kernel.VFS2Enabled = true + if args.Conf.FUSE { + kernel.FUSEEnabled = true + } + vfs2.Override() } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 6ee6fae04..56f4ba15d 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -86,9 +86,12 @@ func registerFilesystems(k *kernel.Kernel) error { return fmt.Errorf("registering ttydev: %w", err) } - if err := fuse.Register(vfsObj); err != nil { - return fmt.Errorf("registering fusedev: %w", err) + if kernel.FUSEEnabled { + if err := fuse.Register(vfsObj); err != nil { + return fmt.Errorf("registering fusedev: %w", err) + } } + if err := tundev.Register(vfsObj); err != nil { return fmt.Errorf("registering tundev: %v", err) } @@ -110,8 +113,11 @@ func registerFilesystems(k *kernel.Kernel) error { if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil { return fmt.Errorf("creating tundev devtmpfs files: %v", err) } - if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil { - return fmt.Errorf("creating fusedev devtmpfs files: %w", err) + + if kernel.FUSEEnabled { + if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil { + return fmt.Errorf("creating fusedev devtmpfs files: %w", err) + } } return nil } diff --git a/runsc/main.go b/runsc/main.go index c9f47c579..69cb505fa 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -88,6 +88,7 @@ var ( referenceLeakMode = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.") cpuNumFromQuota = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not 
less than 2)") vfs2Enabled = flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.") + fuseEnabled = flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.") // Test flags, not to be used outside tests, ever. testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") @@ -242,6 +243,7 @@ func main() { OverlayfsStaleRead: *overlayfsStaleRead, CPUNumFromQuota: *cpuNumFromQuota, VFS2: *vfs2Enabled, + FUSE: *fuseEnabled, QDisc: queueingDiscipline, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, TestOnlyTestNameEnv: *testOnlyTestNameEnv, diff --git a/test/runner/defs.bzl b/test/runner/defs.bzl index 921e499be..600cb5192 100644 --- a/test/runner/defs.bzl +++ b/test/runner/defs.bzl @@ -61,7 +61,8 @@ def _syscall_test( file_access = "exclusive", overlay = False, add_uds_tree = False, - vfs2 = False): + vfs2 = False, + fuse = False): # Prepend "runsc" to non-native platform names. full_platform = platform if platform == "native" else "runsc_" + platform @@ -73,6 +74,8 @@ def _syscall_test( name += "_overlay" if vfs2: name += "_vfs2" + if fuse: + name += "_fuse" if network != "none": name += "_" + network + "net" @@ -107,6 +110,7 @@ def _syscall_test( "--overlay=" + str(overlay), "--add-uds-tree=" + str(add_uds_tree), "--vfs2=" + str(vfs2), + "--fuse=" + str(fuse), ] # Call the rule above. @@ -129,6 +133,7 @@ def syscall_test( add_uds_tree = False, add_hostinet = False, vfs2 = False, + fuse = False, tags = None): """syscall_test is a macro that will create targets for all platforms. 
@@ -188,6 +193,19 @@ def syscall_test( vfs2 = True, ) + if vfs2 and fuse: + _syscall_test( + test = test, + shard_count = shard_count, + size = size, + platform = default_platform, + use_tmpfs = use_tmpfs, + add_uds_tree = add_uds_tree, + tags = platforms[default_platform] + vfs2_tags, + vfs2 = True, + fuse = True, + ) + # TODO(gvisor.dev/issue/1487): Enable VFS2 overlay tests. if add_overlay: _syscall_test( diff --git a/test/runner/runner.go b/test/runner/runner.go index 5456e46a6..2296f3a46 100644 --- a/test/runner/runner.go +++ b/test/runner/runner.go @@ -47,6 +47,7 @@ var ( fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay") vfs2 = flag.Bool("vfs2", false, "enable VFS2") + fuse = flag.Bool("fuse", false, "enable FUSE") parallel = flag.Bool("parallel", false, "run tests in parallel") runscPath = flag.String("runsc", "", "path to runsc binary") @@ -149,6 +150,9 @@ func runRunsc(tc gtest.TestCase, spec *specs.Spec) error { } if *vfs2 { args = append(args, "-vfs2") + if *fuse { + args = append(args, "-fuse") + } } if *debug { args = append(args, "-debug", "-log-packets=true") @@ -358,6 +362,12 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) { vfsVar := "GVISOR_VFS" if *vfs2 { env = append(env, vfsVar+"=VFS2") + fuseVar := "FUSE_ENABLED" + if *fuse { + env = append(env, fuseVar+"=TRUE") + } else { + env = append(env, fuseVar+"=FALSE") + } } else { env = append(env, vfsVar+"=VFS1") } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 28ef55945..c06a75ada 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -146,6 +146,7 @@ syscall_test( ) syscall_test( + fuse = "True", test = "//test/syscalls/linux:dev_test", vfs2 = "True", ) diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc index 3c88c4cbd..6fa16208e 100644 --- a/test/syscalls/linux/dev.cc +++ 
b/test/syscalls/linux/dev.cc @@ -156,7 +156,7 @@ TEST(DevTest, TTYExists) { TEST(DevTest, OpenDevFuse) { // Note(gvisor.dev/issue/3076) This won't work in the sentry until the new // device registration is complete. - SKIP_IF(IsRunningWithVFS1() || IsRunningOnGvisor()); + SKIP_IF(IsRunningWithVFS1() || IsRunningOnGvisor() || !IsFUSEEnabled()); ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/fuse", O_RDONLY)); } diff --git a/test/util/test_util.cc b/test/util/test_util.cc index 8a037f45f..d0c1d6426 100644 --- a/test/util/test_util.cc +++ b/test/util/test_util.cc @@ -42,6 +42,7 @@ namespace testing { constexpr char kGvisorNetwork[] = "GVISOR_NETWORK"; constexpr char kGvisorVfs[] = "GVISOR_VFS"; +constexpr char kFuseEnabled[] = "FUSE_ENABLED"; bool IsRunningOnGvisor() { return GvisorPlatform() != Platform::kNative; } @@ -68,6 +69,11 @@ bool IsRunningWithVFS1() { return strcmp(env, "VFS1") == 0; } +bool IsFUSEEnabled() { + const char* env = getenv(kFuseEnabled); + return env && strcmp(env, "TRUE") == 0; +} + // Inline cpuid instruction. Preserve %ebx/%rbx register. In PIC compilations // %ebx contains the address of the global offset table. %rbx is occasionally // used to address stack variables in presence of dynamic allocas. diff --git a/test/util/test_util.h b/test/util/test_util.h index 109078fc7..89ac575bd 100644 --- a/test/util/test_util.h +++ b/test/util/test_util.h @@ -225,6 +225,7 @@ const std::string GvisorPlatform(); bool IsRunningWithHostinet(); // TODO(gvisor.dev/issue/1624): Delete once VFS1 is gone. 
bool IsRunningWithVFS1(); +bool IsFUSEEnabled(); #ifdef __linux__ void SetupGvisorDeathTest(); -- cgit v1.2.3 From 59a5479409094b141a60cfcc65f0a53d7871e2e1 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 13 Jul 2020 15:41:07 -0700 Subject: Disable debug time adjustment logging When --debug is enabled, the following log messages are printed every second filling up the log: D0430 18:04:42.823775 129561 parameters.go:238] Clock(Monotonic): error: 46 ns, adjusted frequency from 3591713733 Hz to 3591714196 Hz D0430 18:04:42.823870 129561 parameters.go:238] Clock(Realtime): error: 36 ns, adjusted frequency from 3591714003 Hz to 3591714169 Hz D0430 18:04:42.823892 129561 timekeeper.go:209] Updating VDSO parameters: {monotonicReady:1 monotonicBaseCycles:15758797714254696 monotonicBaseRef:29000233837 monotonicFrequency:3591714196 realtimeReady:1 realtimeBaseCycles:15758797714610880 realtimeBaseRef:1588269882823867374 realtimeFrequency:3591714169} Info and warning messages for larger changes are kept the same. 
PiperOrigin-RevId: 321048523 --- pkg/sentry/kernel/timekeeper.go | 3 --- pkg/sentry/time/parameters.go | 12 ++++++++---- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index 0adf25691..5f3908d8b 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -210,9 +210,6 @@ func (t *Timekeeper) startUpdater() { p.realtimeBaseRef = int64(realtimeParams.BaseRef) p.realtimeFrequency = realtimeParams.Frequency } - - log.Debugf("Updating VDSO parameters: %+v", p) - return p }); err != nil { log.Warningf("Unable to update VDSO parameter page: %v", err) diff --git a/pkg/sentry/time/parameters.go b/pkg/sentry/time/parameters.go index 65868cb26..cd1b95117 100644 --- a/pkg/sentry/time/parameters.go +++ b/pkg/sentry/time/parameters.go @@ -228,11 +228,15 @@ func errorAdjust(prevParams Parameters, newParams Parameters, now TSCValue) (Par // // The log level is determined by the error severity. func logErrorAdjustment(clock ClockID, errorNS ReferenceNS, orig, adjusted Parameters) { - fn := log.Debugf - if int64(errorNS.Magnitude()) > time.Millisecond.Nanoseconds() { + magNS := int64(errorNS.Magnitude()) + if magNS <= 10*time.Microsecond.Nanoseconds() { + // Don't log small errors. + return + } + fn := log.Infof + if magNS > time.Millisecond.Nanoseconds() { + // Upgrade large errors to warning. fn = log.Warningf - } else if int64(errorNS.Magnitude()) > 10*time.Microsecond.Nanoseconds() { - fn = log.Infof } fn("Clock(%v): error: %v ns, adjusted frequency from %v Hz to %v Hz", clock, errorNS, orig.Frequency, adjusted.Frequency) -- cgit v1.2.3 From 8fed97794edcbaa7069dbd39604030e4fbb6891c Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Thu, 23 Jul 2020 16:22:41 -0700 Subject: Add task work mechanism. Like task_work in Linux, this allows us to register callbacks to be executed before returning to userspace. 
This is needed for kcov support, which requires coverage information to be up-to-date whenever we are in user mode. We will provide coverage data through the kcov interface to enable coverage-directed fuzzing in syzkaller. One difference from Linux is that task work cannot queue work before the transition to userspace that it precedes; queued work will be picked up before the next transition. PiperOrigin-RevId: 322889984 --- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/task.go | 15 +++++++++++++++ pkg/sentry/kernel/task_run.go | 17 ++++++++++++++++- pkg/sentry/kernel/task_work.go | 38 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 pkg/sentry/kernel/task_work.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 25fe1921b..f6886a758 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -132,6 +132,7 @@ go_library( "task_stop.go", "task_syscall.go", "task_usermem.go", + "task_work.go", "thread_group.go", "threads.go", "timekeeper.go", diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index f48247c94..b3d655b6e 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -68,6 +68,21 @@ type Task struct { // runState is exclusive to the task goroutine. runState taskRunState + // taskWorkCount represents the current size of the task work queue. It is + // used to avoid acquiring taskWorkMu when the queue is empty. + // + // Must be accessed with atomic memory operations. + taskWorkCount int32 + + // taskWorkMu protects taskWork. + taskWorkMu sync.Mutex `state:"nosave"` + + // taskWork is a queue of work to be executed before resuming user execution. + // It is similar to the task_work mechanism in Linux. + // + // taskWork is exclusive to the task goroutine. + taskWork []TaskWorker + // haveSyscallReturn is true if tc.Arch().Return() represents a value // returned by a syscall (or set by ptrace after a syscall). 
// diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index d654dd997..7d4f44caf 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -167,7 +167,22 @@ func (app *runApp) execute(t *Task) taskRunState { return (*runInterrupt)(nil) } - // We're about to switch to the application again. If there's still a + // Execute any task work callbacks before returning to user space. + if atomic.LoadInt32(&t.taskWorkCount) > 0 { + t.taskWorkMu.Lock() + queue := t.taskWork + t.taskWork = nil + atomic.StoreInt32(&t.taskWorkCount, 0) + t.taskWorkMu.Unlock() + + // Do not hold taskWorkMu while executing task work, which may register + // more work. + for _, work := range queue { + work.TaskWork(t) + } + } + + // We're about to switch to the application again. If there's still an // unhandled SyscallRestartErrno that wasn't translated to an EINTR, // restart the syscall that was interrupted. If there's a saved signal // mask, restore it. (Note that restoring the saved signal mask may unblock diff --git a/pkg/sentry/kernel/task_work.go b/pkg/sentry/kernel/task_work.go new file mode 100644 index 000000000..dda5a433a --- /dev/null +++ b/pkg/sentry/kernel/task_work.go @@ -0,0 +1,38 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import "sync/atomic" + +// TaskWorker is a deferred task. +// +// This must be savable. 
+type TaskWorker interface { + // TaskWork will be executed prior to returning to user space. Note that + // TaskWork may call RegisterWork again, but this will not be executed until + // the next return to user space, unlike in Linux. This effectively allows + // registration of indefinite user return hooks, but not by default. + TaskWork(t *Task) +} + +// RegisterWork can be used to register additional task work that will be +// performed prior to returning to user space. See TaskWorker.TaskWork for +// semantics regarding registration. +func (t *Task) RegisterWork(work TaskWorker) { + t.taskWorkMu.Lock() + defer t.taskWorkMu.Unlock() + atomic.AddInt32(&t.taskWorkCount, 1) + t.taskWork = append(t.taskWork, work) +} -- cgit v1.2.3 From 4ec351633206fdbd191bc3aef29a007925a731cc Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 23 Jul 2020 17:40:46 -0700 Subject: Implement get/set_robust_list. PiperOrigin-RevId: 322904430 --- pkg/abi/linux/futex.go | 18 +++++ pkg/sentry/kernel/futex/futex.go | 8 +-- pkg/sentry/kernel/task.go | 4 ++ pkg/sentry/kernel/task_exec.go | 3 + pkg/sentry/kernel/task_exit.go | 3 + pkg/sentry/kernel/task_futex.go | 125 +++++++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/linux64.go | 4 +- pkg/sentry/syscalls/linux/sys_futex.go | 48 ++++++++++++- test/syscalls/linux/futex.cc | 92 ++++++++++++++++++++++++ 9 files changed, 298 insertions(+), 7 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/abi/linux/futex.go b/pkg/abi/linux/futex.go index 08bfde3b5..8138088a6 100644 --- a/pkg/abi/linux/futex.go +++ b/pkg/abi/linux/futex.go @@ -60,3 +60,21 @@ const ( FUTEX_WAITERS = 0x80000000 FUTEX_OWNER_DIED = 0x40000000 ) + +// FUTEX_BITSET_MATCH_ANY has all bits set. +const FUTEX_BITSET_MATCH_ANY = 0xffffffff + +// ROBUST_LIST_LIMIT protects against a deliberately circular list. +const ROBUST_LIST_LIMIT = 2048 + +// RobustListHead corresponds to Linux's struct robust_list_head. 
+// +// +marshal +type RobustListHead struct { + List uint64 + FutexOffset uint64 + ListOpPending uint64 +} + +// SizeOfRobustListHead is the size of a RobustListHead struct. +var SizeOfRobustListHead = (*RobustListHead)(nil).SizeBytes() diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 732e66da4..bcc1b29a8 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -717,10 +717,10 @@ func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint3 } } -// UnlockPI unlock the futex following the Priority-inheritance futex -// rules. The address provided must contain the caller's TID. If there are -// waiters, TID of the next waiter (FIFO) is set to the given address, and the -// waiter woken up. If there are no waiters, 0 is set to the address. +// UnlockPI unlocks the futex following the Priority-inheritance futex rules. +// The address provided must contain the caller's TID. If there are waiters, +// TID of the next waiter (FIFO) is set to the given address, and the waiter +// woken up. If there are no waiters, 0 is set to the address. func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error { k, err := getKey(t, addr, private) if err != nil { diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index b3d655b6e..c4db05bd8 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -565,6 +565,10 @@ type Task struct { // futexWaiter is exclusive to the task goroutine. futexWaiter *futex.Waiter `state:"nosave"` + // robustList is a pointer to the head of the task's robust futex + // list. + robustList usermem.Addr + // startTime is the real time at which the task started. It is set when // a Task is created or invokes execve(2). 
// diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 9b69f3cbe..7803b98d0 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -207,6 +207,9 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { return flags.CloseOnExec }) + // Handle the robust futex list. + t.exitRobustList() + // NOTE(b/30815691): We currently do not implement privileged // executables (set-user/group-ID bits and file capabilities). This // allows us to unconditionally enable user dumpability on the new mm. diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index c4ade6e8e..231ac548a 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -253,6 +253,9 @@ func (*runExitMain) execute(t *Task) taskRunState { } } + // Handle the robust futex list. + t.exitRobustList() + // Deactivate the address space and update max RSS before releasing the // task's MM. t.Deactivate() diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index a53e77c9f..4b535c949 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -15,6 +15,7 @@ package kernel import ( + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/usermem" ) @@ -52,3 +53,127 @@ func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) { func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) { return t.MemoryManager().GetSharedFutexKey(t, addr) } + +// GetRobustList returns the robust futex list for the task. +func (t *Task) GetRobustList() usermem.Addr { + t.mu.Lock() + addr := t.robustList + t.mu.Unlock() + return addr +} + +// SetRobustList sets the robust futex list for the task. +func (t *Task) SetRobustList(addr usermem.Addr) { + t.mu.Lock() + t.robustList = addr + t.mu.Unlock() +} + +// exitRobustList walks the robust futex list, marking locks dead and notifying +// waiters. 
It corresponds to Linux's exit_robust_list(). Following Linux, +// errors are silently ignored. +func (t *Task) exitRobustList() { + t.mu.Lock() + addr := t.robustList + t.robustList = 0 + t.mu.Unlock() + + if addr == 0 { + return + } + + var rl linux.RobustListHead + if _, err := rl.CopyIn(t, usermem.Addr(addr)); err != nil { + return + } + + next := rl.List + done := 0 + var pendingLockAddr usermem.Addr + if rl.ListOpPending != 0 { + pendingLockAddr = usermem.Addr(rl.ListOpPending + rl.FutexOffset) + } + + // Wake up normal elements. + for usermem.Addr(next) != addr { + // We traverse to the next element of the list before we + // actually wake anything. This prevents the race where waking + // this futex causes a modification of the list. + thisLockAddr := usermem.Addr(next + rl.FutexOffset) + + // Try to decode the next element in the list before waking the + // current futex. But don't check the error until after we've + // woken the current futex. Linux does it in this order too + _, nextErr := t.CopyIn(usermem.Addr(next), &next) + + // Wakeup the current futex if it's not pending. + if thisLockAddr != pendingLockAddr { + t.wakeRobustListOne(thisLockAddr) + } + + // If there was an error copying the next futex, we must bail. + if nextErr != nil { + break + } + + // This is a user structure, so it could be a massive list, or + // even contain a loop if they are trying to mess with us. We + // cap traversal to prevent that. + done++ + if done >= linux.ROBUST_LIST_LIMIT { + break + } + } + + // Is there a pending entry to wake? + if pendingLockAddr != 0 { + t.wakeRobustListOne(pendingLockAddr) + } +} + +// wakeRobustListOne wakes a single futex from the robust list. +func (t *Task) wakeRobustListOne(addr usermem.Addr) { + // Bit 0 in address signals PI futex. + pi := addr&1 == 1 + addr = addr &^ 1 + + // Load the futex. + f, err := t.LoadUint32(addr) + if err != nil { + // Can't read this single value? Ignore the problem. 
+ // We can wake the other futexes in the list. + return + } + + tid := uint32(t.ThreadID()) + for { + // Is this held by someone else? + if f&linux.FUTEX_TID_MASK != tid { + return + } + + // This thread is dying and it's holding this futex. We need to + // set the owner died bit and wake up any waiters. + newF := (f & linux.FUTEX_WAITERS) | linux.FUTEX_OWNER_DIED + if curF, err := t.CompareAndSwapUint32(addr, f, newF); err != nil { + return + } else if curF != f { + // Futex changed out from under us. Try again... + f = curF + continue + } + + // Wake waiters if there are any. + if f&linux.FUTEX_WAITERS != 0 { + private := f&linux.FUTEX_PRIVATE_FLAG != 0 + if pi { + t.Futex().UnlockPI(t, addr, tid, private) + return + } + t.Futex().Wake(t, addr, private, linux.FUTEX_BITSET_MATCH_ANY, 1) + } + + // Done. + return + } +} diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index ea4f9b1a7..80c65164a 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -325,8 +325,8 @@ var AMD64 = &kernel.SyscallTable{ 270: syscalls.Supported("pselect", Pselect), 271: syscalls.Supported("ppoll", Ppoll), 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. 
Network namespaces supported but must be empty.", nil), - 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), - 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 273: syscalls.Supported("set_robust_list", SetRobustList), + 274: syscalls.Supported("get_robust_list", GetRobustList), 275: syscalls.Supported("splice", Splice), 276: syscalls.Supported("tee", Tee), 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index b68261f72..f04d78856 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -198,7 +198,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall switch cmd { case linux.FUTEX_WAIT: // WAIT uses a relative timeout. - mask = ^uint32(0) + mask = linux.FUTEX_BITSET_MATCH_ANY var timeoutDur time.Duration if !forever { timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond @@ -286,3 +286,49 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, syserror.ENOSYS } } + +// SetRobustList implements linux syscall set_robust_list(2). +func SetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // Despite the syscall using the name 'pid' for this variable, it is + // very much a tid. + head := args[0].Pointer() + length := args[1].SizeT() + + if length != uint(linux.SizeOfRobustListHead) { + return 0, nil, syserror.EINVAL + } + t.SetRobustList(head) + return 0, nil, nil +} + +// GetRobustList implements linux syscall get_robust_list(2). +func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // Despite the syscall using the name 'pid' for this variable, it is + // very much a tid. 
+ tid := args[0].Int() + head := args[1].Pointer() + size := args[2].Pointer() + + if tid < 0 { + return 0, nil, syserror.EINVAL + } + + ot := t + if tid != 0 { + if ot = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)); ot == nil { + return 0, nil, syserror.ESRCH + } + } + + // Copy out head pointer. + if _, err := t.CopyOut(head, uint64(ot.GetRobustList())); err != nil { + return 0, nil, err + } + + // Copy out size, which is a constant. + if _, err := t.CopyOut(size, uint64(linux.SizeOfRobustListHead)); err != nil { + return 0, nil, err + } + + return 0, nil, nil +} diff --git a/test/syscalls/linux/futex.cc b/test/syscalls/linux/futex.cc index 40c80a6e1..90b1f0508 100644 --- a/test/syscalls/linux/futex.cc +++ b/test/syscalls/linux/futex.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -737,6 +738,97 @@ TEST_P(PrivateAndSharedFutexTest, PITryLockConcurrency_NoRandomSave) { } } +int get_robust_list(int pid, struct robust_list_head** head_ptr, + size_t* len_ptr) { + return syscall(__NR_get_robust_list, pid, head_ptr, len_ptr); +} + +int set_robust_list(struct robust_list_head* head, size_t len) { + return syscall(__NR_set_robust_list, head, len); +} + +TEST(RobustFutexTest, BasicSetGet) { + struct robust_list_head hd = {}; + struct robust_list_head* hd_ptr = &hd; + + // Set! + EXPECT_THAT(set_robust_list(hd_ptr, sizeof(hd)), SyscallSucceedsWithValue(0)); + + // Get! + struct robust_list_head* new_hd_ptr = hd_ptr; + size_t len; + EXPECT_THAT(get_robust_list(0, &new_hd_ptr, &len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(new_hd_ptr, hd_ptr); + EXPECT_EQ(len, sizeof(hd)); +} + +TEST(RobustFutexTest, GetFromOtherTid) { + // Get the current tid and list head. + pid_t tid = gettid(); + struct robust_list_head* hd_ptr = {}; + size_t len; + EXPECT_THAT(get_robust_list(0, &hd_ptr, &len), SyscallSucceedsWithValue(0)); + + // Create a new thread. + ScopedThread t([&] { + // Current tid list head should be different from parent tid. 
+ struct robust_list_head* got_hd_ptr = {}; + EXPECT_THAT(get_robust_list(0, &got_hd_ptr, &len), + SyscallSucceedsWithValue(0)); + EXPECT_NE(hd_ptr, got_hd_ptr); + + // Get the parent list head by passing its tid. + EXPECT_THAT(get_robust_list(tid, &got_hd_ptr, &len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(hd_ptr, got_hd_ptr); + }); + + // Wait for thread. + t.Join(); +} + +TEST(RobustFutexTest, InvalidSize) { + struct robust_list_head* hd = {}; + EXPECT_THAT(set_robust_list(hd, sizeof(*hd) + 1), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(RobustFutexTest, PthreadMutexAttr) { + constexpr int kNumMutexes = 3; + + // Create a bunch of robust mutexes. + pthread_mutexattr_t attrs[kNumMutexes]; + pthread_mutex_t mtxs[kNumMutexes]; + for (int i = 0; i < kNumMutexes; i++) { + TEST_PCHECK(pthread_mutexattr_init(&attrs[i]) == 0); + TEST_PCHECK(pthread_mutexattr_setrobust(&attrs[i], PTHREAD_MUTEX_ROBUST) == + 0); + TEST_PCHECK(pthread_mutex_init(&mtxs[i], &attrs[i]) == 0); + } + + // Start thread to lock the mutexes and then exit. + ScopedThread t([&] { + for (int i = 0; i < kNumMutexes; i++) { + TEST_PCHECK(pthread_mutex_lock(&mtxs[i]) == 0); + } + pthread_exit(NULL); + }); + + // Wait for thread. + t.Join(); + + // Now try to take the mutexes. + for (int i = 0; i < kNumMutexes; i++) { + // Should get EOWNERDEAD. + EXPECT_EQ(pthread_mutex_lock(&mtxs[i]), EOWNERDEAD); + // Make the mutex consistent. + EXPECT_EQ(pthread_mutex_consistent(&mtxs[i]), 0); + // Unlock. + EXPECT_EQ(pthread_mutex_unlock(&mtxs[i]), 0); + } +} + } // namespace } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 82a5cada5944390e738a8b7235fb861965ca40f7 Mon Sep 17 00:00:00 2001 From: Sam Balana Date: Thu, 23 Jul 2020 17:59:12 -0700 Subject: Add AfterFunc to tcpip.Clock Changes the API of tcpip.Clock to also provide a method for scheduling and rescheduling work after a specified duration. This change also implements the AfterFunc method for existing implementations of tcpip.Clock. 
This is the groundwork required to mock time within tests. All references to CancellableTimer have been replaced with the tcpip.Job interface, allowing for custom implementations of scheduling work. This is a BREAKING CHANGE for clients that implement their own tcpip.Clock or use tcpip.CancellableTimer. Migration plan: 1. Add AfterFunc(d, f) to tcpip.Clock 2. Replace references of tcpip.CancellableTimer with tcpip.Job 3. Replace calls to tcpip.CancellableTimer#StopLocked with tcpip.Job#Cancel 4. Replace calls to tcpip.CancellableTimer#Reset with tcpip.Job#Schedule 5. Replace calls to tcpip.NewCancellableTimer with tcpip.NewJob. PiperOrigin-RevId: 322906897 --- pkg/sentry/kernel/kernel.go | 5 ++ pkg/sentry/kernel/time/BUILD | 1 + pkg/sentry/kernel/time/tcpip.go | 131 +++++++++++++++++++++++++++++ pkg/tcpip/stack/ndp.go | 140 +++++++++++++++---------------- pkg/tcpip/stack/ndp_test.go | 28 +++---- pkg/tcpip/stack/stack.go | 12 ++- pkg/tcpip/tcpip.go | 27 +++++- pkg/tcpip/time_unsafe.go | 30 ++++++- pkg/tcpip/timer.go | 147 ++++++++++++++++++++------------- pkg/tcpip/timer_test.go | 91 ++++++++++---------- pkg/tcpip/transport/icmp/endpoint.go | 2 +- pkg/tcpip/transport/packet/endpoint.go | 2 +- pkg/tcpip/transport/raw/endpoint.go | 2 +- pkg/tcpip/transport/udp/endpoint.go | 2 +- 14 files changed, 429 insertions(+), 191 deletions(-) create mode 100644 pkg/sentry/kernel/time/tcpip.go (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 240cd6fe0..15dae0f5b 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -1469,6 +1469,11 @@ func (k *Kernel) NowMonotonic() int64 { return now } +// AfterFunc implements tcpip.Clock.AfterFunc. +func (k *Kernel) AfterFunc(d time.Duration, f func()) tcpip.Timer { + return ktime.TcpipAfterFunc(k.realtimeClock, d, f) +} + // SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or // LoadFrom. 
func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) { diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index 7ba7dc50c..2817aa3ba 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -6,6 +6,7 @@ go_library( name = "time", srcs = [ "context.go", + "tcpip.go", "time.go", ], visibility = ["//pkg/sentry:internal"], diff --git a/pkg/sentry/kernel/time/tcpip.go b/pkg/sentry/kernel/time/tcpip.go new file mode 100644 index 000000000..c4474c0cf --- /dev/null +++ b/pkg/sentry/kernel/time/tcpip.go @@ -0,0 +1,131 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package time + +import ( + "sync" + "time" +) + +// TcpipAfterFunc waits for duration to elapse according to clock then runs fn. +// The timer is started immediately and will fire exactly once. +func TcpipAfterFunc(clock Clock, duration time.Duration, fn func()) *TcpipTimer { + timer := &TcpipTimer{ + clock: clock, + } + timer.notifier = functionNotifier{ + fn: func() { + // tcpip.Timer.Stop() explicitly states that the function is called in a + // separate goroutine that Stop() does not synchronize with. + // Timer.Destroy() synchronizes with calls to TimerListener.Notify(). + // This is semantically meaningful because, in the former case, it's + // legal to call tcpip.Timer.Stop() while holding locks that may also be + // taken by the function, but this isn't so in the latter case. 
Most + // immediately, Timer calls TimerListener.Notify() while holding + // Timer.mu. A deadlock occurs without spawning a goroutine: + // T1: (Timer expires) + // => Timer.Tick() <- Timer.mu.Lock() called + // => TimerListener.Notify() + // => Timer.Stop() + // => Timer.Destroy() <- Timer.mu.Lock() called, deadlock! + // + // Spawning a goroutine avoids the deadlock: + // T1: (Timer expires) + // => Timer.Tick() <- Timer.mu.Lock() called + // => TimerListener.Notify() <- Launches T2 + // T2: + // => Timer.Stop() + // => Timer.Destroy() <- Timer.mu.Lock() called, blocks + // T1: + // => (returns) <- Timer.mu.Unlock() called + // T2: + // => (continues) <- No deadlock! + go func() { + timer.Stop() + fn() + }() + }, + } + timer.Reset(duration) + return timer +} + +// TcpipTimer is a resettable timer with variable duration expirations. +// Implements tcpip.Timer, which does not define a Destroy method; instead, all +// resources are released after timer expiration and calls to Timer.Stop. +// +// Must be created by AfterFunc. +type TcpipTimer struct { + // clock is the time source. clock is immutable. + clock Clock + + // notifier is called when the Timer expires. notifier is immutable. + notifier functionNotifier + + // mu protects t. + mu sync.Mutex + + // t stores the latest running Timer. This is replaced whenever Reset is + // called since Timer cannot be restarted once it has been Destroyed by Stop. + // + // This field is nil iff Stop has been called. + t *Timer +} + +// Stop implements tcpip.Timer.Stop. +func (r *TcpipTimer) Stop() bool { + r.mu.Lock() + defer r.mu.Unlock() + + if r.t == nil { + return false + } + _, lastSetting := r.t.Swap(Setting{}) + r.t.Destroy() + r.t = nil + return lastSetting.Enabled +} + +// Reset implements tcpip.Timer.Reset. 
+func (r *TcpipTimer) Reset(d time.Duration) { + r.mu.Lock() + defer r.mu.Unlock() + + if r.t == nil { + r.t = NewTimer(r.clock, &r.notifier) + } + + r.t.Swap(Setting{ + Enabled: true, + Period: 0, + Next: r.clock.Now().Add(d), + }) +} + +// functionNotifier is a TimerListener that runs a function. +// +// functionNotifier cannot be saved or loaded. +type functionNotifier struct { + fn func() +} + +// Notify implements ktime.TimerListener.Notify. +func (f *functionNotifier) Notify(uint64, Setting) (Setting, bool) { + f.fn() + return Setting{}, false +} + +// Destroy implements ktime.TimerListener.Destroy. +func (f *functionNotifier) Destroy() {} diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go index e28c23d66..9dce11a97 100644 --- a/pkg/tcpip/stack/ndp.go +++ b/pkg/tcpip/stack/ndp.go @@ -469,7 +469,7 @@ type ndpState struct { rtrSolicit struct { // The timer used to send the next router solicitation message. - timer *time.Timer + timer tcpip.Timer // Used to let the Router Solicitation timer know that it has been stopped. // @@ -503,7 +503,7 @@ type ndpState struct { // to the DAD goroutine that DAD should stop. type dadState struct { // The DAD timer to send the next NS message, or resolve the address. - timer *time.Timer + timer tcpip.Timer // Used to let the DAD timer know that it has been stopped. // @@ -515,38 +515,38 @@ type dadState struct { // defaultRouterState holds data associated with a default router discovered by // a Router Advertisement (RA). type defaultRouterState struct { - // Timer to invalidate the default router. + // Job to invalidate the default router. // // Must not be nil. - invalidationTimer *tcpip.CancellableTimer + invalidationJob *tcpip.Job } // onLinkPrefixState holds data associated with an on-link prefix discovered by // a Router Advertisement's Prefix Information option (PI) when the NDP // configurations was configured to do so. type onLinkPrefixState struct { - // Timer to invalidate the on-link prefix. 
+ // Job to invalidate the on-link prefix. // // Must not be nil. - invalidationTimer *tcpip.CancellableTimer + invalidationJob *tcpip.Job } // tempSLAACAddrState holds state associated with a temporary SLAAC address. type tempSLAACAddrState struct { - // Timer to deprecate the temporary SLAAC address. + // Job to deprecate the temporary SLAAC address. // // Must not be nil. - deprecationTimer *tcpip.CancellableTimer + deprecationJob *tcpip.Job - // Timer to invalidate the temporary SLAAC address. + // Job to invalidate the temporary SLAAC address. // // Must not be nil. - invalidationTimer *tcpip.CancellableTimer + invalidationJob *tcpip.Job - // Timer to regenerate the temporary SLAAC address. + // Job to regenerate the temporary SLAAC address. // // Must not be nil. - regenTimer *tcpip.CancellableTimer + regenJob *tcpip.Job createdAt time.Time @@ -561,15 +561,15 @@ type tempSLAACAddrState struct { // slaacPrefixState holds state associated with a SLAAC prefix. type slaacPrefixState struct { - // Timer to deprecate the prefix. + // Job to deprecate the prefix. // // Must not be nil. - deprecationTimer *tcpip.CancellableTimer + deprecationJob *tcpip.Job - // Timer to invalidate the prefix. + // Job to invalidate the prefix. // // Must not be nil. - invalidationTimer *tcpip.CancellableTimer + invalidationJob *tcpip.Job // Nonzero only when the address is not valid forever. validUntil time.Time @@ -651,12 +651,12 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref } var done bool - var timer *time.Timer + var timer tcpip.Timer // We initially start a timer to fire immediately because some of the DAD work // cannot be done while holding the NIC's lock. This is effectively the same // as starting a goroutine but we use a timer that fires immediately so we can // reset it for the next DAD iteration. 
- timer = time.AfterFunc(0, func() { + timer = ndp.nic.stack.Clock().AfterFunc(0, func() { ndp.nic.mu.Lock() defer ndp.nic.mu.Unlock() @@ -871,9 +871,9 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) { case ok && rl != 0: // This is an already discovered default router. Update - // the invalidation timer. - rtr.invalidationTimer.StopLocked() - rtr.invalidationTimer.Reset(rl) + // the invalidation job. + rtr.invalidationJob.Cancel() + rtr.invalidationJob.Schedule(rl) ndp.defaultRouters[ip] = rtr case ok && rl == 0: @@ -950,7 +950,7 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) { return } - rtr.invalidationTimer.StopLocked() + rtr.invalidationJob.Cancel() delete(ndp.defaultRouters, ip) // Let the integrator know a discovered default router is invalidated. @@ -979,12 +979,12 @@ func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) { } state := defaultRouterState{ - invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { ndp.invalidateDefaultRouter(ip) }), } - state.invalidationTimer.Reset(rl) + state.invalidationJob.Schedule(rl) ndp.defaultRouters[ip] = state } @@ -1009,13 +1009,13 @@ func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) } state := onLinkPrefixState{ - invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { ndp.invalidateOnLinkPrefix(prefix) }), } if l < header.NDPInfiniteLifetime { - state.invalidationTimer.Reset(l) + state.invalidationJob.Schedule(l) } ndp.onLinkPrefixes[prefix] = state @@ -1033,7 +1033,7 @@ func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) { return } - s.invalidationTimer.StopLocked() + s.invalidationJob.Cancel() delete(ndp.onLinkPrefixes, prefix) // Let the integrator know a discovered on-link prefix is invalidated. 
@@ -1082,14 +1082,14 @@ func (ndp *ndpState) handleOnLinkPrefixInformation(pi header.NDPPrefixInformatio // This is an already discovered on-link prefix with a // new non-zero valid lifetime. // - // Update the invalidation timer. + // Update the invalidation job. - prefixState.invalidationTimer.StopLocked() + prefixState.invalidationJob.Cancel() if vl < header.NDPInfiniteLifetime { - // Prefix is valid for a finite lifetime, reset the timer to expire after + // Prefix is valid for a finite lifetime, schedule the job to execute after // the new valid lifetime. - prefixState.invalidationTimer.Reset(vl) + prefixState.invalidationJob.Schedule(vl) } ndp.onLinkPrefixes[prefix] = prefixState @@ -1154,7 +1154,7 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { } state := slaacPrefixState{ - deprecationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + deprecationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { state, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the deprecated SLAAC prefix %s", prefix)) @@ -1162,7 +1162,7 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { ndp.deprecateSLAACAddress(state.stableAddr.ref) }), - invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { state, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the invalidated SLAAC prefix %s", prefix)) @@ -1184,19 +1184,19 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { if !ndp.generateSLAACAddr(prefix, &state) { // We were unable to generate an address for the prefix, we do not nothing - // further as there is no reason to maintain state or timers for a prefix we + // further as there is no reason to maintain state or jobs for a prefix we // do not have an address for. 
return } - // Setup the initial timers to deprecate and invalidate prefix. + // Setup the initial jobs to deprecate and invalidate prefix. if pl < header.NDPInfiniteLifetime && pl != 0 { - state.deprecationTimer.Reset(pl) + state.deprecationJob.Schedule(pl) } if vl < header.NDPInfiniteLifetime { - state.invalidationTimer.Reset(vl) + state.invalidationJob.Schedule(vl) state.validUntil = now.Add(vl) } @@ -1428,7 +1428,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla } state := tempSLAACAddrState{ - deprecationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + deprecationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to deprecate temporary address %s", prefix, generatedAddr)) @@ -1441,7 +1441,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla ndp.deprecateSLAACAddress(tempAddrState.ref) }), - invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to invalidate temporary address %s", prefix, generatedAddr)) @@ -1454,7 +1454,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla ndp.invalidateTempSLAACAddr(prefixState.tempAddrs, generatedAddr.Address, tempAddrState) }), - regenTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + regenJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to regenerate temporary address after %s", prefix, generatedAddr)) @@ -1481,9 +1481,9 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla ref: ref, } - state.deprecationTimer.Reset(pl) - state.invalidationTimer.Reset(vl) - 
state.regenTimer.Reset(pl - ndp.configs.RegenAdvanceDuration) + state.deprecationJob.Schedule(pl) + state.invalidationJob.Schedule(vl) + state.regenJob.Schedule(pl - ndp.configs.RegenAdvanceDuration) prefixState.generationAttempts++ prefixState.tempAddrs[generatedAddr.Address] = state @@ -1518,16 +1518,16 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat prefixState.stableAddr.ref.deprecated = false } - // If prefix was preferred for some finite lifetime before, stop the - // deprecation timer so it can be reset. - prefixState.deprecationTimer.StopLocked() + // If prefix was preferred for some finite lifetime before, cancel the + // deprecation job so it can be reset. + prefixState.deprecationJob.Cancel() now := time.Now() - // Reset the deprecation timer if prefix has a finite preferred lifetime. + // Schedule the deprecation job if prefix has a finite preferred lifetime. if pl < header.NDPInfiniteLifetime { if !deprecated { - prefixState.deprecationTimer.Reset(pl) + prefixState.deprecationJob.Schedule(pl) } prefixState.preferredUntil = now.Add(pl) } else { @@ -1546,9 +1546,9 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat // 3) Otherwise, reset the valid lifetime of the prefix to 2 hours. if vl >= header.NDPInfiniteLifetime { - // Handle the infinite valid lifetime separately as we do not keep a timer - // in this case. - prefixState.invalidationTimer.StopLocked() + // Handle the infinite valid lifetime separately as we do not schedule a + // job in this case. 
+ prefixState.invalidationJob.Cancel() prefixState.validUntil = time.Time{} } else { var effectiveVl time.Duration @@ -1569,8 +1569,8 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat } if effectiveVl != 0 { - prefixState.invalidationTimer.StopLocked() - prefixState.invalidationTimer.Reset(effectiveVl) + prefixState.invalidationJob.Cancel() + prefixState.invalidationJob.Schedule(effectiveVl) prefixState.validUntil = now.Add(effectiveVl) } } @@ -1582,7 +1582,7 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat } // Note, we do not need to update the entries in the temporary address map - // after updating the timers because the timers are held as pointers. + // after updating the jobs because the jobs are held as pointers. var regenForAddr tcpip.Address allAddressesRegenerated := true for tempAddr, tempAddrState := range prefixState.tempAddrs { @@ -1596,14 +1596,14 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat } // If the address is no longer valid, invalidate it immediately. Otherwise, - // reset the invalidation timer. + // reset the invalidation job. newValidLifetime := validUntil.Sub(now) if newValidLifetime <= 0 { ndp.invalidateTempSLAACAddr(prefixState.tempAddrs, tempAddr, tempAddrState) continue } - tempAddrState.invalidationTimer.StopLocked() - tempAddrState.invalidationTimer.Reset(newValidLifetime) + tempAddrState.invalidationJob.Cancel() + tempAddrState.invalidationJob.Schedule(newValidLifetime) // As per RFC 4941 section 3.3 step 4, the preferred lifetime of a temporary // address is the lower of the preferred lifetime of the stable address or @@ -1616,17 +1616,17 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat } // If the address is no longer preferred, deprecate it immediately. - // Otherwise, reset the deprecation timer. + // Otherwise, schedule the deprecation job again. 
newPreferredLifetime := preferredUntil.Sub(now) - tempAddrState.deprecationTimer.StopLocked() + tempAddrState.deprecationJob.Cancel() if newPreferredLifetime <= 0 { ndp.deprecateSLAACAddress(tempAddrState.ref) } else { tempAddrState.ref.deprecated = false - tempAddrState.deprecationTimer.Reset(newPreferredLifetime) + tempAddrState.deprecationJob.Schedule(newPreferredLifetime) } - tempAddrState.regenTimer.StopLocked() + tempAddrState.regenJob.Cancel() if tempAddrState.regenerated { } else { allAddressesRegenerated = false @@ -1637,7 +1637,7 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat // immediately after we finish iterating over the temporary addresses. regenForAddr = tempAddr } else { - tempAddrState.regenTimer.Reset(newPreferredLifetime - ndp.configs.RegenAdvanceDuration) + tempAddrState.regenJob.Schedule(newPreferredLifetime - ndp.configs.RegenAdvanceDuration) } } } @@ -1717,7 +1717,7 @@ func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPr ndp.cleanupSLAACPrefixResources(prefix, state) } -// cleanupSLAACPrefixResources cleansup a SLAAC prefix's timers and entry. +// cleanupSLAACPrefixResources cleans up a SLAAC prefix's jobs and entry. // // Panics if the SLAAC prefix is not known. // @@ -1729,8 +1729,8 @@ func (ndp *ndpState) cleanupSLAACPrefixResources(prefix tcpip.Subnet, state slaa } state.stableAddr.ref = nil - state.deprecationTimer.StopLocked() - state.invalidationTimer.StopLocked() + state.deprecationJob.Cancel() + state.invalidationJob.Cancel() delete(ndp.slaacPrefixes, prefix) } @@ -1775,13 +1775,13 @@ func (ndp *ndpState) cleanupTempSLAACAddrResourcesAndNotify(addr tcpip.AddressWi } // cleanupTempSLAACAddrResourcesAndNotify cleans up a temporary SLAAC address's -// timers and entry. +// jobs and entry. // // The NIC that ndp belongs to MUST be locked. 
func (ndp *ndpState) cleanupTempSLAACAddrResources(tempAddrs map[tcpip.Address]tempSLAACAddrState, tempAddr tcpip.Address, tempAddrState tempSLAACAddrState) { - tempAddrState.deprecationTimer.StopLocked() - tempAddrState.invalidationTimer.StopLocked() - tempAddrState.regenTimer.StopLocked() + tempAddrState.deprecationJob.Cancel() + tempAddrState.invalidationJob.Cancel() + tempAddrState.regenJob.Cancel() delete(tempAddrs, tempAddr) } @@ -1860,7 +1860,7 @@ func (ndp *ndpState) startSolicitingRouters() { var done bool ndp.rtrSolicit.done = &done - ndp.rtrSolicit.timer = time.AfterFunc(delay, func() { + ndp.rtrSolicit.timer = ndp.nic.stack.Clock().AfterFunc(delay, func() { ndp.nic.mu.Lock() if done { // If we reach this point, it means that the RS timer fired after another diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index 6f86abc98..644ba7c33 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -1254,7 +1254,7 @@ func TestRouterDiscovery(t *testing.T) { default: } - // Wait for lladdr2's router invalidation timer to fire. The lifetime + // Wait for lladdr2's router invalidation job to execute. The lifetime // of the router should have been updated to the most recent (smaller) // lifetime. // @@ -1271,7 +1271,7 @@ func TestRouterDiscovery(t *testing.T) { e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 0)) expectRouterEvent(llAddr2, false) - // Wait for lladdr3's router invalidation timer to fire. The lifetime + // Wait for lladdr3's router invalidation job to execute. The lifetime // of the router should have been updated to the most recent (smaller) // lifetime. // @@ -1502,7 +1502,7 @@ func TestPrefixDiscovery(t *testing.T) { default: } - // Wait for prefix2's most recent invalidation timer plus some buffer to + // Wait for prefix2's most recent invalidation job plus some buffer to // expire. 
select { case e := <-ndpDisp.prefixC: @@ -2395,7 +2395,7 @@ func TestAutoGenTempAddrRegen(t *testing.T) { for _, addr := range tempAddrs { // Wait for a deprecation then invalidation event, or just an invalidation // event. We need to cover both cases but cannot deterministically hit both - // cases because the deprecation and invalidation timers could fire in any + // cases because the deprecation and invalidation jobs could execute in any // order. select { case e := <-ndpDisp.autoGenAddrC: @@ -2432,9 +2432,9 @@ func TestAutoGenTempAddrRegen(t *testing.T) { } } -// TestAutoGenTempAddrRegenTimerUpdates tests that a temporary address's -// regeneration timer gets updated when refreshing the address's lifetimes. -func TestAutoGenTempAddrRegenTimerUpdates(t *testing.T) { +// TestAutoGenTempAddrRegenJobUpdates tests that a temporary address's +// regeneration job gets updated when refreshing the address's lifetimes. +func TestAutoGenTempAddrRegenJobUpdates(t *testing.T) { const ( nicID = 1 regenAfter = 2 * time.Second @@ -2533,7 +2533,7 @@ func TestAutoGenTempAddrRegenTimerUpdates(t *testing.T) { // // A new temporary address should immediately be generated since the // regeneration time has already passed since the last address was generated - // - this regeneration does not depend on a timer. + // - this regeneration does not depend on a job. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100)) expectAutoGenAddrEvent(tempAddr2, newAddr) @@ -2559,11 +2559,11 @@ func TestAutoGenTempAddrRegenTimerUpdates(t *testing.T) { } // Set the maximum lifetimes for temporary addresses such that on the next - // RA, the regeneration timer gets reset. + // RA, the regeneration job gets scheduled again. 
// // The maximum lifetime is the sum of the minimum lifetimes for temporary // addresses + the time that has already passed since the last address was - // generated so that the regeneration timer is needed to generate the next + // generated so that the regeneration job is needed to generate the next // address. newLifetimes := newMinVLDuration + regenAfter + defaultAsyncNegativeEventTimeout ndpConfigs.MaxTempAddrValidLifetime = newLifetimes @@ -2993,9 +2993,9 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) { expectPrimaryAddr(addr2) } -// TestAutoGenAddrTimerDeprecation tests that an address is properly deprecated +// TestAutoGenAddrJobDeprecation tests that an address is properly deprecated // when its preferred lifetime expires. -func TestAutoGenAddrTimerDeprecation(t *testing.T) { +func TestAutoGenAddrJobDeprecation(t *testing.T) { const nicID = 1 const newMinVL = 2 newMinVLDuration := newMinVL * time.Second @@ -3513,8 +3513,8 @@ func TestAutoGenAddrRemoval(t *testing.T) { } expectAutoGenAddrEvent(addr, invalidatedAddr) - // Wait for the original valid lifetime to make sure the original timer - // got stopped/cleaned up. + // Wait for the original valid lifetime to make sure the original job got + // cancelled/cleaned up. select { case <-ndpDisp.autoGenAddrC: t.Fatal("unexpectedly received an auto gen addr event") diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 2b7ece851..a6faa22c2 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -728,6 +728,11 @@ func New(opts Options) *Stack { return s } +// newJob returns a tcpip.Job using the Stack clock. +func (s *Stack) newJob(l sync.Locker, f func()) *tcpip.Job { + return tcpip.NewJob(s.clock, l, f) +} + // UniqueID returns a unique identifier. 
func (s *Stack) UniqueID() uint64 { return s.uniqueIDGenerator.UniqueID() @@ -801,9 +806,10 @@ func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h f } } -// NowNanoseconds implements tcpip.Clock.NowNanoseconds. -func (s *Stack) NowNanoseconds() int64 { - return s.clock.NowNanoseconds() +// Clock returns the Stack's clock for retrieving the current time and +// scheduling work. +func (s *Stack) Clock() tcpip.Clock { + return s.clock } // Stats returns a mutable copy of the current stats. diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index ff14a3b3c..21aafb0a2 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -192,7 +192,7 @@ func (e ErrSaveRejection) Error() string { return "save rejected due to unsupported networking state: " + e.Err.Error() } -// A Clock provides the current time. +// A Clock provides the current time and schedules work for execution. // // Times returned by a Clock should always be used for application-visible // time. Only monotonic times should be used for netstack internal timekeeping. @@ -203,6 +203,31 @@ type Clock interface { // NowMonotonic returns a monotonic time value. NowMonotonic() int64 + + // AfterFunc waits for the duration to elapse and then calls f in its own + // goroutine. It returns a Timer that can be used to cancel the call using + // its Stop method. + AfterFunc(d time.Duration, f func()) Timer +} + +// Timer represents a single event. A Timer must be created with +// Clock.AfterFunc. +type Timer interface { + // Stop prevents the Timer from firing. It returns true if the call stops the + // timer, false if the timer has already expired or been stopped. + // + // If Stop returns false, then the timer has already expired and the function + // f of Clock.AfterFunc(d, f) has been started in its own goroutine; Stop + // does not wait for f to complete before returning. If the caller needs to + // know whether f is completed, it must coordinate with f explicitly. 
+ Stop() bool + + // Reset changes the timer to expire after duration d. + // + // Reset should be invoked only on stopped or expired timers. If the timer is + // known to have expired, Reset can be used directly. Otherwise, the caller + // must coordinate with the function f of Clock.AfterFunc(d, f). + Reset(d time.Duration) } // Address is a byte slice cast as a string that represents the address of a diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go index 7f172f978..f32d58091 100644 --- a/pkg/tcpip/time_unsafe.go +++ b/pkg/tcpip/time_unsafe.go @@ -20,7 +20,7 @@ package tcpip import ( - _ "time" // Used with go:linkname. + "time" // Used with go:linkname. _ "unsafe" // Required for go:linkname. ) @@ -45,3 +45,31 @@ func (*StdClock) NowMonotonic() int64 { _, _, mono := now() return mono } + +// AfterFunc implements Clock.AfterFunc. +func (*StdClock) AfterFunc(d time.Duration, f func()) Timer { + return &stdTimer{ + t: time.AfterFunc(d, f), + } +} + +type stdTimer struct { + t *time.Timer +} + +var _ Timer = (*stdTimer)(nil) + +// Stop implements Timer.Stop. +func (st *stdTimer) Stop() bool { + return st.t.Stop() +} + +// Reset implements Timer.Reset. +func (st *stdTimer) Reset(d time.Duration) { + st.t.Reset(d) +} + +// NewStdTimer returns a Timer implemented with the time package. +func NewStdTimer(t *time.Timer) Timer { + return &stdTimer{t: t} +} diff --git a/pkg/tcpip/timer.go b/pkg/tcpip/timer.go index 5554c573f..f1dd7c310 100644 --- a/pkg/tcpip/timer.go +++ b/pkg/tcpip/timer.go @@ -20,50 +20,49 @@ import ( "gvisor.dev/gvisor/pkg/sync" ) -// cancellableTimerInstance is a specific instance of CancellableTimer. +// jobInstance is a specific instance of Job. // -// Different instances are created each time CancellableTimer is Reset so each -// timer has its own earlyReturn signal. 
This is to address a bug when a -// CancellableTimer is stopped and reset in quick succession resulting in a -// timer instance's earlyReturn signal being affected or seen by another timer -// instance. +// Different instances are created each time Job is scheduled so each timer has +// its own earlyReturn signal. This is to address a bug when a Job is stopped +// and reset in quick succession resulting in a timer instance's earlyReturn +// signal being affected or seen by another timer instance. // // Consider the following sceneario where timer instances share a common // earlyReturn signal (T1 creates, stops and resets a Cancellable timer under a // lock L; T2, T3, T4 and T5 are goroutines that handle the first (A), second // (B), third (C), and fourth (D) instance of the timer firing, respectively): // T1: Obtain L -// T1: Create a new CancellableTimer w/ lock L (create instance A) +// T1: Create a new Job w/ lock L (create instance A) // T2: instance A fires, blocked trying to obtain L. // T1: Attempt to stop instance A (set earlyReturn = true) -// T1: Reset timer (create instance B) +// T1: Schedule timer (create instance B) // T3: instance B fires, blocked trying to obtain L. // T1: Attempt to stop instance B (set earlyReturn = true) -// T1: Reset timer (create instance C) +// T1: Schedule timer (create instance C) // T4: instance C fires, blocked trying to obtain L. // T1: Attempt to stop instance C (set earlyReturn = true) -// T1: Reset timer (create instance D) +// T1: Schedule timer (create instance D) // T5: instance D fires, blocked trying to obtain L. // T1: Release L // -// Now that T1 has released L, any of the 4 timer instances can take L and check -// earlyReturn. If the timers simply check earlyReturn and then do nothing -// further, then instance D will never early return even though it was not -// requested to stop. 
If the timers reset earlyReturn before early returning, -// then all but one of the timers will do work when only one was expected to. -// If CancellableTimer resets earlyReturn when resetting, then all the timers +// Now that T1 has released L, any of the 4 timer instances can take L and +// check earlyReturn. If the timers simply check earlyReturn and then do +// nothing further, then instance D will never early return even though it was +// not requested to stop. If the timers reset earlyReturn before early +// returning, then all but one of the timers will do work when only one was +// expected to. If Job resets earlyReturn when resetting, then all the timers // will fire (again, when only one was expected to). // // To address the above concerns the simplest solution was to give each timer // its own earlyReturn signal. -type cancellableTimerInstance struct { - timer *time.Timer +type jobInstance struct { + timer Timer // Used to inform the timer to early return when it gets stopped while the // lock the timer tries to obtain when fired is held (T1 is a goroutine that // tries to cancel the timer and T2 is the goroutine that handles the timer // firing): - // T1: Obtain the lock, then call StopLocked() + // T1: Obtain the lock, then call Cancel() // T2: timer fires, and gets blocked on obtaining the lock // T1: Releases lock // T2: Obtains lock does unintended work @@ -74,29 +73,33 @@ type cancellableTimerInstance struct { earlyReturn *bool } -// stop stops the timer instance t from firing if it hasn't fired already. If it +// stop stops the job instance j from firing if it hasn't fired already. If it // has fired and is blocked at obtaining the lock, earlyReturn will be set to // true so that it will early return when it obtains the lock. 
-func (t *cancellableTimerInstance) stop() { - if t.timer != nil { - t.timer.Stop() - *t.earlyReturn = true +func (j *jobInstance) stop() { + if j.timer != nil { + j.timer.Stop() + *j.earlyReturn = true } } -// CancellableTimer is a timer that does some work and can be safely cancelled -// when it fires at the same time some "related work" is being done. +// Job represents some work that can be scheduled for execution. The work can +// be safely cancelled when it fires at the same time some "related work" is +// being done. // // The term "related work" is defined as some work that needs to be done while // holding some lock that the timer must also hold while doing some work. // -// Note, it is not safe to copy a CancellableTimer as its timer instance creates -// a closure over the address of the CancellableTimer. -type CancellableTimer struct { +// Note, it is not safe to copy a Job as its timer instance creates +// a closure over the address of the Job. +type Job struct { _ sync.NoCopy + // The clock used to schedule the backing timer + clock Clock + // The active instance of a cancellable timer. - instance cancellableTimerInstance + instance jobInstance // locker is the lock taken by the timer immediately after it fires and must // be held when attempting to stop the timer. @@ -113,59 +116,91 @@ type CancellableTimer struct { fn func() } -// StopLocked prevents the Timer from firing if it has not fired already. +// Cancel prevents the Job from executing if it has not executed already. // -// If the timer is blocked on obtaining the t.locker lock when StopLocked is -// called, it will early return instead of calling t.fn. +// Cancel requires appropriate locking to be in place for any resources managed +// by the Job. If the Job is blocked on obtaining the lock when Cancel is +// called, it will early return. // // Note, t will be modified. // -// t.locker MUST be locked. -func (t *CancellableTimer) StopLocked() { - t.instance.stop() +// j.locker MUST be locked. 
+func (j *Job) Cancel() { + j.instance.stop() // Nothing to do with the stopped instance anymore. - t.instance = cancellableTimerInstance{} + j.instance = jobInstance{} } -// Reset changes the timer to expire after duration d. +// Schedule schedules the Job for execution after duration d. This can be +// called on cancelled or completed Jobs to schedule them again. // -// Note, t will be modified. +// Schedule should be invoked only on unscheduled, cancelled, or completed +// Jobs. To be safe, callers should always call Cancel before calling Schedule. // -// Reset should only be called on stopped or expired timers. To be safe, callers -// should always call StopLocked before calling Reset. -func (t *CancellableTimer) Reset(d time.Duration) { +// Note, j will be modified. +func (j *Job) Schedule(d time.Duration) { // Create a new instance. earlyReturn := false // Capture the locker so that updating the timer does not cause a data race // when a timer fires and tries to obtain the lock (read the timer's locker). - locker := t.locker - t.instance = cancellableTimerInstance{ - timer: time.AfterFunc(d, func() { + locker := j.locker + j.instance = jobInstance{ + timer: j.clock.AfterFunc(d, func() { locker.Lock() defer locker.Unlock() if earlyReturn { // If we reach this point, it means that the timer fired while another - // goroutine called StopLocked while it had the lock. Simply return - // here and do nothing further. + // goroutine called Cancel while it had the lock. Simply return here + // and do nothing further. earlyReturn = false return } - t.fn() + j.fn() }), earlyReturn: &earlyReturn, } } -// NewCancellableTimer returns an unscheduled CancellableTimer with the given -// locker and fn. -// -// fn MUST NOT attempt to lock locker. -// -// Callers must call Reset to schedule the timer to fire. 
-func NewCancellableTimer(locker sync.Locker, fn func()) *CancellableTimer { - return &CancellableTimer{locker: locker, fn: fn} +// NewJob returns a new Job that can be used to schedule f to run in its own +// goroutine. l will be locked before calling f then unlocked after f returns. +// +// var clock tcpip.StdClock +// var mu sync.Mutex +// message := "foo" +// job := tcpip.NewJob(&clock, &mu, func() { +// fmt.Println(message) +// }) +// job.Schedule(time.Second) +// +// mu.Lock() +// message = "bar" +// mu.Unlock() +// +// // Output: bar +// +// f MUST NOT attempt to lock l. +// +// l MUST be locked prior to calling the returned job's Cancel(). +// +// var clock tcpip.StdClock +// var mu sync.Mutex +// message := "foo" +// job := tcpip.NewJob(&clock, &mu, func() { +// fmt.Println(message) +// }) +// job.Schedule(time.Second) +// +// mu.Lock() +// job.Cancel() +// mu.Unlock() +func NewJob(c Clock, l sync.Locker, f func()) *Job { + return &Job{ + clock: c, + locker: l, + fn: f, + } } diff --git a/pkg/tcpip/timer_test.go b/pkg/tcpip/timer_test.go index b4940e397..a82384c49 100644 --- a/pkg/tcpip/timer_test.go +++ b/pkg/tcpip/timer_test.go @@ -28,8 +28,8 @@ const ( longDuration = 1 * time.Second ) -func TestCancellableTimerReassignment(t *testing.T) { - var timer tcpip.CancellableTimer +func TestJobReschedule(t *testing.T) { + var clock tcpip.StdClock var wg sync.WaitGroup var lock sync.Mutex @@ -43,26 +43,27 @@ func TestCancellableTimerReassignment(t *testing.T) { // that has an active timer (even if it has been stopped as a stopped // timer may be blocked on a lock before it can check if it has been // stopped while another goroutine holds the same lock). 
- timer = *tcpip.NewCancellableTimer(&lock, func() { + job := tcpip.NewJob(&clock, &lock, func() { wg.Done() }) - timer.Reset(shortDuration) + job.Schedule(shortDuration) lock.Unlock() }() } wg.Wait() } -func TestCancellableTimerFire(t *testing.T) { +func TestJobExecution(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) - timer := tcpip.NewCancellableTimer(&lock, func() { + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) + job.Schedule(shortDuration) // Wait for timer to fire. select { @@ -82,17 +83,18 @@ func TestCancellableTimerFire(t *testing.T) { func TestCancellableTimerResetFromLongDuration(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(middleDuration) + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(middleDuration) lock.Lock() - timer.StopLocked() + job.Cancel() lock.Unlock() - timer.Reset(shortDuration) + job.Schedule(shortDuration) // Wait for timer to fire. select { @@ -109,16 +111,17 @@ func TestCancellableTimerResetFromLongDuration(t *testing.T) { } } -func TestCancellableTimerResetFromShortDuration(t *testing.T) { +func TestJobRescheduleFromShortDuration(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) lock.Lock() - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) - timer.StopLocked() + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(shortDuration) + job.Cancel() lock.Unlock() // Wait for timer to fire if it wasn't correctly stopped. 
@@ -128,7 +131,7 @@ func TestCancellableTimerResetFromShortDuration(t *testing.T) { case <-time.After(middleDuration): } - timer.Reset(shortDuration) + job.Schedule(shortDuration) // Wait for timer to fire. select { @@ -145,17 +148,18 @@ func TestCancellableTimerResetFromShortDuration(t *testing.T) { } } -func TestCancellableTimerImmediatelyStop(t *testing.T) { +func TestJobImmediatelyCancel(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) for i := 0; i < 1000; i++ { lock.Lock() - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) - timer.StopLocked() + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(shortDuration) + job.Cancel() lock.Unlock() } @@ -167,25 +171,26 @@ func TestCancellableTimerImmediatelyStop(t *testing.T) { } } -func TestCancellableTimerStoppedResetWithoutLock(t *testing.T) { +func TestJobCancelledRescheduleWithoutLock(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) lock.Lock() - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) - timer.StopLocked() + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(shortDuration) + job.Cancel() lock.Unlock() for i := 0; i < 10; i++ { - timer.Reset(middleDuration) + job.Schedule(middleDuration) lock.Lock() // Sleep until the timer fires and gets blocked trying to take the lock. 
time.Sleep(middleDuration * 2) - timer.StopLocked() + job.Cancel() lock.Unlock() } @@ -201,17 +206,18 @@ func TestCancellableTimerStoppedResetWithoutLock(t *testing.T) { func TestManyCancellableTimerResetAfterBlockedOnLock(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) lock.Lock() - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(shortDuration) for i := 0; i < 10; i++ { // Sleep until the timer fires and gets blocked trying to take the lock. time.Sleep(middleDuration) - timer.StopLocked() - timer.Reset(shortDuration) + job.Cancel() + job.Schedule(shortDuration) } lock.Unlock() @@ -230,18 +236,19 @@ func TestManyCancellableTimerResetAfterBlockedOnLock(t *testing.T) { } } -func TestManyCancellableTimerResetUnderLock(t *testing.T) { +func TestManyJobReschedulesUnderLock(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) lock.Lock() - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(shortDuration) for i := 0; i < 10; i++ { - timer.StopLocked() - timer.Reset(shortDuration) + job.Cancel() + job.Schedule(shortDuration) } lock.Unlock() diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index 678f4e016..4612be4e7 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -797,7 +797,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk e.rcvList.PushBack(packet) e.rcvBufSize += packet.data.Size() - packet.timestamp = e.stack.NowNanoseconds() + packet.timestamp = e.stack.Clock().NowNanoseconds() e.rcvMu.Unlock() e.stats.PacketsReceived.Increment() 
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go index 8f167391f..0e46e6355 100644 --- a/pkg/tcpip/transport/packet/endpoint.go +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -499,7 +499,7 @@ func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, combinedVV.Append(pkt.Data) packet.data = combinedVV } - packet.timestampNS = ep.stack.NowNanoseconds() + packet.timestampNS = ep.stack.Clock().NowNanoseconds() ep.rcvList.PushBack(&packet) ep.rcvBufSize += packet.data.Size() diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index aefe0e2b2..f85a68554 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -700,7 +700,7 @@ func (e *endpoint) HandlePacket(route *stack.Route, pkt *stack.PacketBuffer) { } combinedVV.Append(pkt.Data) packet.data = combinedVV - packet.timestampNS = e.stack.NowNanoseconds() + packet.timestampNS = e.stack.Clock().NowNanoseconds() e.rcvList.PushBack(packet) e.rcvBufSize += packet.data.Size() diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index a14643ae8..6e692da07 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -1451,7 +1451,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk packet.tos, _ = header.IPv6(pkt.NetworkHeader).TOS() } - packet.timestamp = e.stack.NowNanoseconds() + packet.timestamp = e.stack.Clock().NowNanoseconds() e.rcvMu.Unlock() -- cgit v1.2.3 From f347a578b79c96c13ed492b2cf9aec1cb3e60f3f Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 27 Jul 2020 11:57:11 -0700 Subject: Move platform.File in memmap The subsequent systrap changes will need to import memmap from the platform package. 
PiperOrigin-RevId: 323409486 --- pkg/sentry/fs/fsutil/BUILD | 7 +--- pkg/sentry/fs/fsutil/dirty_set.go | 7 ++-- pkg/sentry/fs/fsutil/file_range_set.go | 15 ++++---- pkg/sentry/fs/fsutil/frame_ref_set.go | 10 ++--- pkg/sentry/fs/fsutil/host_file_mapper.go | 5 +-- pkg/sentry/fs/fsutil/host_mappable.go | 19 +++++---- pkg/sentry/fs/fsutil/inode_cached.go | 25 ++++++------ pkg/sentry/fsimpl/gofer/regular_file.go | 27 +++++++------ pkg/sentry/fsimpl/host/BUILD | 1 - pkg/sentry/fsimpl/host/mmap.go | 21 +++++----- pkg/sentry/kernel/shm/BUILD | 1 - pkg/sentry/kernel/shm/shm.go | 3 +- pkg/sentry/kernel/timekeeper.go | 4 +- pkg/sentry/kernel/vdso.go | 6 +-- pkg/sentry/memmap/BUILD | 14 ++++++- pkg/sentry/memmap/memmap.go | 60 +++++++++++++++++++++++++---- pkg/sentry/mm/BUILD | 4 +- pkg/sentry/mm/aio_context.go | 3 +- pkg/sentry/mm/mm.go | 10 ++--- pkg/sentry/mm/pma.go | 25 ++++++------ pkg/sentry/mm/special_mappable.go | 7 ++-- pkg/sentry/pgalloc/BUILD | 10 ++--- pkg/sentry/pgalloc/pgalloc.go | 66 ++++++++++++++++---------------- pkg/sentry/platform/BUILD | 20 +--------- pkg/sentry/platform/kvm/BUILD | 1 + pkg/sentry/platform/kvm/address_space.go | 3 +- pkg/sentry/platform/platform.go | 50 +----------------------- pkg/sentry/platform/ptrace/BUILD | 1 + pkg/sentry/platform/ptrace/subprocess.go | 3 +- 29 files changed, 205 insertions(+), 223 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 789369220..5fb419bcd 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -8,7 +8,6 @@ go_template_instance( out = "dirty_set_impl.go", imports = { "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", }, package = "fsutil", prefix = "Dirty", @@ -25,14 +24,14 @@ go_template_instance( name = "frame_ref_set_impl", out = "frame_ref_set_impl.go", imports = { - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", + "memmap": 
"gvisor.dev/gvisor/pkg/sentry/memmap", }, package = "fsutil", prefix = "FrameRef", template = "//pkg/segment:generic_set", types = { "Key": "uint64", - "Range": "platform.FileRange", + "Range": "memmap.FileRange", "Value": "uint64", "Functions": "FrameRefSetFunctions", }, @@ -43,7 +42,6 @@ go_template_instance( out = "file_range_set_impl.go", imports = { "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", }, package = "fsutil", prefix = "FileRange", @@ -86,7 +84,6 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", - "//pkg/sentry/platform", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/state", diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go index c6cd45087..2c9446c1d 100644 --- a/pkg/sentry/fs/fsutil/dirty_set.go +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/usermem" ) @@ -159,7 +158,7 @@ func (ds *DirtySet) AllowClean(mr memmap.MappableRange) { // repeatedly until all bytes have been written. max is the true size of the // cached object; offsets beyond max will not be passed to writeAt, even if // they are marked dirty. 
-func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { +func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { var changedDirty bool defer func() { if changedDirty { @@ -194,7 +193,7 @@ func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet // successful partial write, SyncDirtyAll will call it repeatedly until all // bytes have been written. max is the true size of the cached object; offsets // beyond max will not be passed to writeAt, even if they are marked dirty. -func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { +func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { dseg := dirty.FirstSegment() for dseg.Ok() { if err := syncDirtyRange(ctx, dseg.Range(), cache, max, mem, writeAt); err != nil { @@ -210,7 +209,7 @@ func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max } // Preconditions: mr must be page-aligned. 
-func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { +func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { for cseg := cache.LowerBoundSegment(mr.Start); cseg.Ok() && cseg.Start() < mr.End; cseg = cseg.NextSegment() { wbr := cseg.Range().Intersect(mr) if max < wbr.Start { diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go index 5643cdac9..bbafebf03 100644 --- a/pkg/sentry/fs/fsutil/file_range_set.go +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -23,13 +23,12 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/usermem" ) // FileRangeSet maps offsets into a memmap.Mappable to offsets into a -// platform.File. It is used to implement Mappables that store data in +// memmap.File. It is used to implement Mappables that store data in // sparsely-allocated memory. // // type FileRangeSet @@ -65,20 +64,20 @@ func (FileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, spli } // FileRange returns the FileRange mapped by seg. -func (seg FileRangeIterator) FileRange() platform.FileRange { +func (seg FileRangeIterator) FileRange() memmap.FileRange { return seg.FileRangeOf(seg.Range()) } // FileRangeOf returns the FileRange mapped by mr. // // Preconditions: seg.Range().IsSupersetOf(mr). mr.Length() != 0. 
-func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) platform.FileRange { +func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRange { frstart := seg.Value() + (mr.Start - seg.Start()) - return platform.FileRange{frstart, frstart + mr.Length()} + return memmap.FileRange{frstart, frstart + mr.Length()} } // Fill attempts to ensure that all memmap.Mappable offsets in required are -// mapped to a platform.File offset, by allocating from mf with the given +// mapped to a memmap.File offset, by allocating from mf with the given // memory usage kind and invoking readAt to store data into memory. (If readAt // returns a successful partial read, Fill will call it repeatedly until all // bytes have been read.) EOF is handled consistently with the requirements of @@ -141,7 +140,7 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map } // Drop removes segments for memmap.Mappable offsets in mr, freeing the -// corresponding platform.FileRanges. +// corresponding memmap.FileRanges. // // Preconditions: mr must be page-aligned. func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) { @@ -154,7 +153,7 @@ func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) { } // DropAll removes all segments in mr, freeing the corresponding -// platform.FileRanges. +// memmap.FileRanges. 
func (frs *FileRangeSet) DropAll(mf *pgalloc.MemoryFile) { for seg := frs.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { mf.DecRef(seg.FileRange()) diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go index dd6f5aba6..a808894df 100644 --- a/pkg/sentry/fs/fsutil/frame_ref_set.go +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -17,7 +17,7 @@ package fsutil import ( "math" - "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usage" ) @@ -39,7 +39,7 @@ func (FrameRefSetFunctions) ClearValue(val *uint64) { } // Merge implements segment.Functions.Merge. -func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) { +func (FrameRefSetFunctions) Merge(_ memmap.FileRange, val1 uint64, _ memmap.FileRange, val2 uint64) (uint64, bool) { if val1 != val2 { return 0, false } @@ -47,13 +47,13 @@ func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform. } // Split implements segment.Functions.Split. -func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) { +func (FrameRefSetFunctions) Split(_ memmap.FileRange, val uint64, _ uint64) (uint64, uint64) { return val, val } // IncRefAndAccount adds a reference on the range fr. All newly inserted segments // are accounted as host page cache memory mappings. -func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) { +func (refs *FrameRefSet) IncRefAndAccount(fr memmap.FileRange) { seg, gap := refs.Find(fr.Start) for { switch { @@ -74,7 +74,7 @@ func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) { // DecRefAndAccount removes a reference on the range fr and untracks segments // that are removed from memory accounting. 
-func (refs *FrameRefSet) DecRefAndAccount(fr platform.FileRange) { +func (refs *FrameRefSet) DecRefAndAccount(fr memmap.FileRange) { seg := refs.FindSegment(fr.Start) for seg.Ok() && seg.Start() < fr.End { diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index e82afd112..ef0113b52 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) @@ -126,7 +125,7 @@ func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) { // offsets in fr or until the next call to UnmapAll. // // Preconditions: The caller must hold a reference on all offsets in fr. -func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool) (safemem.BlockSeq, error) { +func (f *HostFileMapper) MapInternal(fr memmap.FileRange, fd int, write bool) (safemem.BlockSeq, error) { chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) f.mapsMu.Lock() defer f.mapsMu.Unlock() @@ -146,7 +145,7 @@ func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool) } // Preconditions: f.mapsMu must be locked. 
-func (f *HostFileMapper) forEachMappingBlockLocked(fr platform.FileRange, fd int, write bool, fn func(safemem.Block)) error { +func (f *HostFileMapper) forEachMappingBlockLocked(fr memmap.FileRange, fd int, write bool, fn func(safemem.Block)) error { prot := syscall.PROT_READ if write { prot |= syscall.PROT_WRITE diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go index 78fec553e..c15d8a946 100644 --- a/pkg/sentry/fs/fsutil/host_mappable.go +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -21,18 +21,17 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) -// HostMappable implements memmap.Mappable and platform.File over a +// HostMappable implements memmap.Mappable and memmap.File over a // CachedFileObject. // // Lock order (compare the lock order model in mm/mm.go): // truncateMu ("fs locks") // mu ("memmap.Mappable locks not taken by Translate") -// ("platform.File locks") +// ("memmap.File locks") // backingFile ("CachedFileObject locks") // // +stateify savable @@ -124,24 +123,24 @@ func (h *HostMappable) NotifyChangeFD() error { return nil } -// MapInternal implements platform.File.MapInternal. -func (h *HostMappable) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +// MapInternal implements memmap.File.MapInternal. +func (h *HostMappable) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { return h.hostFileMapper.MapInternal(fr, h.backingFile.FD(), at.Write) } -// FD implements platform.File.FD. +// FD implements memmap.File.FD. func (h *HostMappable) FD() int { return h.backingFile.FD() } -// IncRef implements platform.File.IncRef. -func (h *HostMappable) IncRef(fr platform.FileRange) { +// IncRef implements memmap.File.IncRef. 
+func (h *HostMappable) IncRef(fr memmap.FileRange) { mr := memmap.MappableRange{Start: fr.Start, End: fr.End} h.hostFileMapper.IncRefOn(mr) } -// DecRef implements platform.File.DecRef. -func (h *HostMappable) DecRef(fr platform.FileRange) { +// DecRef implements memmap.File.DecRef. +func (h *HostMappable) DecRef(fr memmap.FileRange) { mr := memmap.MappableRange{Start: fr.Start, End: fr.End} h.hostFileMapper.DecRefOn(mr) } diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 800c8b4e1..fe8b0b6ac 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -26,7 +26,6 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" @@ -934,7 +933,7 @@ func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error { - // Whether we have a host fd (and consequently what platform.File is + // Whether we have a host fd (and consequently what memmap.File is // mapped) can change across save/restore, so invalidate all translations // unconditionally. c.mapsMu.Lock() @@ -999,10 +998,10 @@ func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.Evictable } } -// IncRef implements platform.File.IncRef. This is used when we directly map an -// underlying host fd and CachingInodeOperations is used as the platform.File +// IncRef implements memmap.File.IncRef. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the memmap.File // during translation. 
-func (c *CachingInodeOperations) IncRef(fr platform.FileRange) { +func (c *CachingInodeOperations) IncRef(fr memmap.FileRange) { // Hot path. Avoid defers. c.dataMu.Lock() seg, gap := c.refs.Find(fr.Start) @@ -1024,10 +1023,10 @@ func (c *CachingInodeOperations) IncRef(fr platform.FileRange) { } } -// DecRef implements platform.File.DecRef. This is used when we directly map an -// underlying host fd and CachingInodeOperations is used as the platform.File +// DecRef implements memmap.File.DecRef. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the memmap.File // during translation. -func (c *CachingInodeOperations) DecRef(fr platform.FileRange) { +func (c *CachingInodeOperations) DecRef(fr memmap.FileRange) { // Hot path. Avoid defers. c.dataMu.Lock() seg := c.refs.FindSegment(fr.Start) @@ -1046,15 +1045,15 @@ func (c *CachingInodeOperations) DecRef(fr platform.FileRange) { c.dataMu.Unlock() } -// MapInternal implements platform.File.MapInternal. This is used when we +// MapInternal implements memmap.File.MapInternal. This is used when we // directly map an underlying host fd and CachingInodeOperations is used as the -// platform.File during translation. -func (c *CachingInodeOperations) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +// memmap.File during translation. +func (c *CachingInodeOperations) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { return c.hostFileMapper.MapInternal(fr, c.backingFile.FD(), at.Write) } -// FD implements platform.File.FD. This is used when we directly map an -// underlying host fd and CachingInodeOperations is used as the platform.File +// FD implements memmap.File.FD. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the memmap.File // during translation. 
func (c *CachingInodeOperations) FD() int { return c.backingFile.FD() diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 02317a133..09f142cfc 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -29,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" @@ -221,12 +220,12 @@ func (fd *regularFileFD) pwriteLocked(ctx context.Context, src usermem.IOSequenc return 0, syserror.EINVAL } mr := memmap.MappableRange{pgstart, pgend} - var freed []platform.FileRange + var freed []memmap.FileRange d.dataMu.Lock() cseg := d.cache.LowerBoundSegment(mr.Start) for cseg.Ok() && cseg.Start() < mr.End { cseg = d.cache.Isolate(cseg, mr) - freed = append(freed, platform.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) + freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) cseg = d.cache.Remove(cseg).NextSegment() } d.dataMu.Unlock() @@ -821,7 +820,7 @@ func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (d *dentry) InvalidateUnsavable(ctx context.Context) error { - // Whether we have a host fd (and consequently what platform.File is + // Whether we have a host fd (and consequently what memmap.File is // mapped) can change across save/restore, so invalidate all translations // unconditionally. d.mapsMu.Lock() @@ -869,8 +868,8 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { } } -// dentryPlatformFile implements platform.File. It exists solely because dentry -// cannot implement both vfs.DentryImpl.IncRef and platform.File.IncRef. +// dentryPlatformFile implements memmap.File. 
It exists solely because dentry +// cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef. // // dentryPlatformFile is only used when a host FD representing the remote file // is available (i.e. dentry.handle.fd >= 0), and that FD is used for @@ -878,7 +877,7 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { type dentryPlatformFile struct { *dentry - // fdRefs counts references on platform.File offsets. fdRefs is protected + // fdRefs counts references on memmap.File offsets. fdRefs is protected // by dentry.dataMu. fdRefs fsutil.FrameRefSet @@ -890,29 +889,29 @@ type dentryPlatformFile struct { hostFileMapperInitOnce sync.Once } -// IncRef implements platform.File.IncRef. -func (d *dentryPlatformFile) IncRef(fr platform.FileRange) { +// IncRef implements memmap.File.IncRef. +func (d *dentryPlatformFile) IncRef(fr memmap.FileRange) { d.dataMu.Lock() d.fdRefs.IncRefAndAccount(fr) d.dataMu.Unlock() } -// DecRef implements platform.File.DecRef. -func (d *dentryPlatformFile) DecRef(fr platform.FileRange) { +// DecRef implements memmap.File.DecRef. +func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) { d.dataMu.Lock() d.fdRefs.DecRefAndAccount(fr) d.dataMu.Unlock() } -// MapInternal implements platform.File.MapInternal. -func (d *dentryPlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +// MapInternal implements memmap.File.MapInternal. +func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { d.handleMu.RLock() bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write) d.handleMu.RUnlock() return bs, err } -// FD implements platform.File.FD. +// FD implements memmap.File.FD. 
func (d *dentryPlatformFile) FD() int { d.handleMu.RLock() fd := d.handle.fd diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index e86fbe2d5..bd701bbc7 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -34,7 +34,6 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", - "//pkg/sentry/platform", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go index 8545a82f0..65d3af38c 100644 --- a/pkg/sentry/fsimpl/host/mmap.go +++ b/pkg/sentry/fsimpl/host/mmap.go @@ -19,13 +19,12 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) -// inodePlatformFile implements platform.File. It exists solely because inode -// cannot implement both kernfs.Inode.IncRef and platform.File.IncRef. +// inodePlatformFile implements memmap.File. It exists solely because inode +// cannot implement both kernfs.Inode.IncRef and memmap.File.IncRef. // // inodePlatformFile should only be used if inode.canMap is true. type inodePlatformFile struct { @@ -34,7 +33,7 @@ type inodePlatformFile struct { // fdRefsMu protects fdRefs. fdRefsMu sync.Mutex - // fdRefs counts references on platform.File offsets. It is used solely for + // fdRefs counts references on memmap.File offsets. It is used solely for // memory accounting. fdRefs fsutil.FrameRefSet @@ -45,32 +44,32 @@ type inodePlatformFile struct { fileMapperInitOnce sync.Once } -// IncRef implements platform.File.IncRef. +// IncRef implements memmap.File.IncRef. // // Precondition: i.inode.canMap must be true. 
-func (i *inodePlatformFile) IncRef(fr platform.FileRange) { +func (i *inodePlatformFile) IncRef(fr memmap.FileRange) { i.fdRefsMu.Lock() i.fdRefs.IncRefAndAccount(fr) i.fdRefsMu.Unlock() } -// DecRef implements platform.File.DecRef. +// DecRef implements memmap.File.DecRef. // // Precondition: i.inode.canMap must be true. -func (i *inodePlatformFile) DecRef(fr platform.FileRange) { +func (i *inodePlatformFile) DecRef(fr memmap.FileRange) { i.fdRefsMu.Lock() i.fdRefs.DecRefAndAccount(fr) i.fdRefsMu.Unlock() } -// MapInternal implements platform.File.MapInternal. +// MapInternal implements memmap.File.MapInternal. // // Precondition: i.inode.canMap must be true. -func (i *inodePlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +func (i *inodePlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { return i.fileMapper.MapInternal(fr, i.hostFD, at.Write) } -// FD implements platform.File.FD. +// FD implements memmap.File.FD. 
func (i *inodePlatformFile) FD() int { return i.hostFD } diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index bfd779837..c211fc8d0 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -20,7 +20,6 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", - "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index f66cfcc7f..55b4c2cdb 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -45,7 +45,6 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -370,7 +369,7 @@ type Shm struct { // fr is the offset into mfp.MemoryFile() that backs this contents of this // segment. Immutable. - fr platform.FileRange + fr memmap.FileRange // mu protects all fields below. mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index 5f3908d8b..7c4fefb16 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -21,8 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/log" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sync" ) @@ -90,7 +90,7 @@ type Timekeeper struct { // NewTimekeeper does not take ownership of paramPage. // // SetClocks must be called on the returned Timekeeper before it is usable. 
-func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage platform.FileRange) (*Timekeeper, error) { +func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage memmap.FileRange) (*Timekeeper, error) { return &Timekeeper{ params: NewVDSOParamPage(mfp, paramPage), }, nil diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index f1b3c212c..290c32466 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -19,8 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/usermem" ) @@ -58,7 +58,7 @@ type vdsoParams struct { type VDSOParamPage struct { // The parameter page is fr, allocated from mfp.MemoryFile(). mfp pgalloc.MemoryFileProvider - fr platform.FileRange + fr memmap.FileRange // seq is the current sequence count written to the page. // @@ -81,7 +81,7 @@ type VDSOParamPage struct { // * VDSOParamPage must be the only writer to fr. // // * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block. 
-func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *VDSOParamPage { +func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *VDSOParamPage { return &VDSOParamPage{mfp: mfp, fr: fr} } diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index a98b66de1..2c95669cd 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -28,9 +28,21 @@ go_template_instance( }, ) +go_template_instance( + name = "file_range", + out = "file_range.go", + package = "memmap", + prefix = "File", + template = "//pkg/segment:generic_range", + types = { + "T": "uint64", + }, +) + go_library( name = "memmap", srcs = [ + "file_range.go", "mappable_range.go", "mapping_set.go", "mapping_set_impl.go", @@ -40,7 +52,7 @@ go_library( deps = [ "//pkg/context", "//pkg/log", - "//pkg/sentry/platform", + "//pkg/safemem", "//pkg/syserror", "//pkg/usermem", ], diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index c6db9fc8f..c188f6c29 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -19,12 +19,12 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/usermem" ) // Mappable represents a memory-mappable object, a mutable mapping from uint64 -// offsets to (platform.File, uint64 File offset) pairs. +// offsets to (File, uint64 File offset) pairs. // // See mm/mm.go for Mappable's place in the lock order. // @@ -74,7 +74,7 @@ type Mappable interface { // Translations are valid until invalidated by a callback to // MappingSpace.Invalidate or until the caller removes its mapping of the // translated range. Mappable implementations must ensure that at least one - // reference is held on all pages in a platform.File that may be the result + // reference is held on all pages in a File that may be the result // of a valid Translation. // // Preconditions: required.Length() > 0. 
optional.IsSupersetOf(required). @@ -100,7 +100,7 @@ type Translation struct { Source MappableRange // File is the mapped file. - File platform.File + File File // Offset is the offset into File at which this Translation begins. Offset uint64 @@ -110,9 +110,9 @@ type Translation struct { Perms usermem.AccessType } -// FileRange returns the platform.FileRange represented by t. -func (t Translation) FileRange() platform.FileRange { - return platform.FileRange{t.Offset, t.Offset + t.Source.Length()} +// FileRange returns the FileRange represented by t. +func (t Translation) FileRange() FileRange { + return FileRange{t.Offset, t.Offset + t.Source.Length()} } // CheckTranslateResult returns an error if (ts, terr) does not satisfy all @@ -361,3 +361,49 @@ type MMapOpts struct { // TODO(jamieliu): Replace entirely with MappingIdentity? Hint string } + +// File represents a host file that may be mapped into an platform.AddressSpace. +type File interface { + // All pages in a File are reference-counted. + + // IncRef increments the reference count on all pages in fr. + // + // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > + // 0. At least one reference must be held on all pages in fr. (The File + // interface does not provide a way to acquire an initial reference; + // implementors may define mechanisms for doing so.) + IncRef(fr FileRange) + + // DecRef decrements the reference count on all pages in fr. + // + // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > + // 0. At least one reference must be held on all pages in fr. + DecRef(fr FileRange) + + // MapInternal returns a mapping of the given file offsets in the invoking + // process' address space for reading and writing. + // + // Note that fr.Start and fr.End need not be page-aligned. + // + // Preconditions: fr.Length() > 0. At least one reference must be held on + // all pages in fr. 
+ // + // Postconditions: The returned mapping is valid as long as at least one + // reference is held on the mapped pages. + MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error) + + // FD returns the file descriptor represented by the File. + // + // The only permitted operation on the returned file descriptor is to map + // pages from it consistent with the requirements of AddressSpace.MapFile. + FD() int +} + +// FileRange represents a range of uint64 offsets into a File. +// +// type FileRange + +// String implements fmt.Stringer.String. +func (fr FileRange) String() string { + return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End) +} diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index a036ce53c..f9d0837a1 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -7,14 +7,14 @@ go_template_instance( name = "file_refcount_set", out = "file_refcount_set.go", imports = { - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", + "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", }, package = "mm", prefix = "fileRefcount", template = "//pkg/segment:generic_set", types = { "Key": "uint64", - "Range": "platform.FileRange", + "Range": "memmap.FileRange", "Value": "int32", "Functions": "fileRefcountSetFunctions", }, diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 379148903..1999ec706 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -243,7 +242,7 @@ type aioMappable struct { refs.AtomicRefCount mfp pgalloc.MemoryFileProvider - fr platform.FileRange + fr memmap.FileRange } var aioRingBufferSize = uint64(usermem.Addr(linux.AIORingSize).MustRoundUp()) diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 
6db7c3d40..3e85964e4 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -25,7 +25,7 @@ // Locks taken by memmap.Mappable.Translate // mm.privateRefs.mu // platform.AddressSpace locks -// platform.File locks +// memmap.File locks // mm.aioManager.mu // mm.AIOContext.mu // @@ -396,7 +396,7 @@ type pma struct { // file is the file mapped by this pma. Only pmas for which file == // MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to // the corresponding file range while they exist. - file platform.File `state:"nosave"` + file memmap.File `state:"nosave"` // off is the offset into file at which this pma begins. // @@ -436,7 +436,7 @@ type pma struct { private bool // If internalMappings is not empty, it is the cached return value of - // file.MapInternal for the platform.FileRange mapped by this pma. + // file.MapInternal for the memmap.FileRange mapped by this pma. internalMappings safemem.BlockSeq `state:"nosave"` } @@ -469,10 +469,10 @@ func (fileRefcountSetFunctions) MaxKey() uint64 { func (fileRefcountSetFunctions) ClearValue(_ *int32) { } -func (fileRefcountSetFunctions) Merge(_ platform.FileRange, rc1 int32, _ platform.FileRange, rc2 int32) (int32, bool) { +func (fileRefcountSetFunctions) Merge(_ memmap.FileRange, rc1 int32, _ memmap.FileRange, rc2 int32) (int32, bool) { return rc1, rc1 == rc2 } -func (fileRefcountSetFunctions) Split(_ platform.FileRange, rc int32, _ uint64) (int32, int32) { +func (fileRefcountSetFunctions) Split(_ memmap.FileRange, rc int32, _ uint64) (int32, int32) { return rc, rc } diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index 62e4c20af..930ec895f 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/safecopy" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -604,7 +603,7 @@ 
func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivat } } -// Pin returns the platform.File ranges currently mapped by addresses in ar in +// Pin returns the memmap.File ranges currently mapped by addresses in ar in // mm, acquiring a reference on the returned ranges which the caller must // release by calling Unpin. If not all addresses are mapped, Pin returns a // non-nil error. Note that Pin may return both a non-empty slice of @@ -674,15 +673,15 @@ type PinnedRange struct { Source usermem.AddrRange // File is the mapped file. - File platform.File + File memmap.File // Offset is the offset into File at which this PinnedRange begins. Offset uint64 } -// FileRange returns the platform.File offsets mapped by pr. -func (pr PinnedRange) FileRange() platform.FileRange { - return platform.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())} +// FileRange returns the memmap.File offsets mapped by pr. +func (pr PinnedRange) FileRange() memmap.FileRange { + return memmap.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())} } // Unpin releases the reference held by prs. @@ -857,7 +856,7 @@ func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) saf } // incPrivateRef acquires a reference on private pages in fr. -func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) { +func (mm *MemoryManager) incPrivateRef(fr memmap.FileRange) { mm.privateRefs.mu.Lock() defer mm.privateRefs.mu.Unlock() refSet := &mm.privateRefs.refs @@ -878,8 +877,8 @@ func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) { } // decPrivateRef releases a reference on private pages in fr. 
-func (mm *MemoryManager) decPrivateRef(fr platform.FileRange) { - var freed []platform.FileRange +func (mm *MemoryManager) decPrivateRef(fr memmap.FileRange) { + var freed []memmap.FileRange mm.privateRefs.mu.Lock() refSet := &mm.privateRefs.refs @@ -951,7 +950,7 @@ func (pmaSetFunctions) Merge(ar1 usermem.AddrRange, pma1 pma, ar2 usermem.AddrRa // Discard internal mappings instead of trying to merge them, since merging // them requires an allocation and getting them again from the - // platform.File might not. + // memmap.File might not. pma1.internalMappings = safemem.BlockSeq{} return pma1, true } @@ -1012,12 +1011,12 @@ func (pseg pmaIterator) getInternalMappingsLocked() error { return nil } -func (pseg pmaIterator) fileRange() platform.FileRange { +func (pseg pmaIterator) fileRange() memmap.FileRange { return pseg.fileRangeOf(pseg.Range()) } // Preconditions: pseg.Range().IsSupersetOf(ar). ar.Length != 0. -func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange { +func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) memmap.FileRange { if checkInvariants { if !pseg.Ok() { panic("terminal pma iterator") @@ -1032,5 +1031,5 @@ func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange { pma := pseg.ValuePtr() pstart := pseg.Start() - return platform.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)} + return memmap.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)} } diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 9ad52082d..0e142fb11 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -35,7 +34,7 @@ type 
SpecialMappable struct { refs.AtomicRefCount mfp pgalloc.MemoryFileProvider - fr platform.FileRange + fr memmap.FileRange name string } @@ -44,7 +43,7 @@ type SpecialMappable struct { // SpecialMappable will use the given name in /proc/[pid]/maps. // // Preconditions: fr.Length() != 0. -func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *SpecialMappable { +func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *SpecialMappable { m := SpecialMappable{mfp: mfp, fr: fr, name: name} m.EnableLeakCheck("mm.SpecialMappable") return &m @@ -126,7 +125,7 @@ func (m *SpecialMappable) MemoryFileProvider() pgalloc.MemoryFileProvider { // FileRange returns the offsets into MemoryFileProvider().MemoryFile() that // store the SpecialMappable's contents. -func (m *SpecialMappable) FileRange() platform.FileRange { +func (m *SpecialMappable) FileRange() memmap.FileRange { return m.fr } diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index e1fcb175f..7a3311a70 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -36,14 +36,14 @@ go_template_instance( "trackGaps": "1", }, imports = { - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", + "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", }, package = "pgalloc", prefix = "usage", template = "//pkg/segment:generic_set", types = { "Key": "uint64", - "Range": "platform.FileRange", + "Range": "memmap.FileRange", "Value": "usageInfo", "Functions": "usageSetFunctions", }, @@ -56,14 +56,14 @@ go_template_instance( "minDegree": "10", }, imports = { - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", + "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", }, package = "pgalloc", prefix = "reclaim", template = "//pkg/segment:generic_set", types = { "Key": "uint64", - "Range": "platform.FileRange", + "Range": "memmap.FileRange", "Value": "reclaimSetValue", "Functions": "reclaimSetFunctions", }, @@ -89,7 +89,7 @@ go_library( 
"//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/hostmm", - "//pkg/sentry/platform", + "//pkg/sentry/memmap", "//pkg/sentry/usage", "//pkg/state", "//pkg/state/wire", diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index afab97c0a..3243d7214 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -33,14 +33,14 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/hostmm" - "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) -// MemoryFile is a platform.File whose pages may be allocated to arbitrary +// MemoryFile is a memmap.File whose pages may be allocated to arbitrary // users. type MemoryFile struct { // opts holds options passed to NewMemoryFile. opts is immutable. @@ -372,7 +372,7 @@ func (f *MemoryFile) Destroy() { // to Allocate. // // Preconditions: length must be page-aligned and non-zero. -func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.FileRange, error) { +func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (memmap.FileRange, error) { if length == 0 || length%usermem.PageSize != 0 { panic(fmt.Sprintf("invalid allocation length: %#x", length)) } @@ -390,7 +390,7 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi // Find a range in the underlying file. fr, ok := findAvailableRange(&f.usage, f.fileSize, length, alignment) if !ok { - return platform.FileRange{}, syserror.ENOMEM + return memmap.FileRange{}, syserror.ENOMEM } // Expand the file if needed. @@ -398,7 +398,7 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi // Round the new file size up to be chunk-aligned. 
newFileSize := (int64(fr.End) + chunkMask) &^ chunkMask if err := f.file.Truncate(newFileSize); err != nil { - return platform.FileRange{}, err + return memmap.FileRange{}, err } f.fileSize = newFileSize f.mappingsMu.Lock() @@ -416,7 +416,7 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi bs[i] = 0 } }); err != nil { - return platform.FileRange{}, err + return memmap.FileRange{}, err } } if !f.usage.Add(fr, usageInfo{ @@ -439,7 +439,7 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi // space for mappings to be allocated downwards. // // Precondition: alignment must be a power of 2. -func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint64) (platform.FileRange, bool) { +func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint64) (memmap.FileRange, bool) { alignmentMask := alignment - 1 // Search for space in existing gaps, starting at the current end of the @@ -461,7 +461,7 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6 break } if start := unalignedStart &^ alignmentMask; start >= gap.Start() { - return platform.FileRange{start, start + length}, true + return memmap.FileRange{start, start + length}, true } gap = gap.PrevLargeEnoughGap(length) @@ -475,7 +475,7 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6 min = (min + alignmentMask) &^ alignmentMask if min+length < min { // Overflow: allocation would exceed the range of uint64. - return platform.FileRange{}, false + return memmap.FileRange{}, false } // Determine the minimum file size required to fit this allocation at its end. @@ -484,7 +484,7 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6 if newFileSize <= fileSize { if fileSize != 0 { // Overflow: allocation would exceed the range of int64. 
- return platform.FileRange{}, false + return memmap.FileRange{}, false } newFileSize = chunkSize } @@ -496,7 +496,7 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6 continue } if start := unalignedStart &^ alignmentMask; start >= min { - return platform.FileRange{start, start + length}, true + return memmap.FileRange{start, start + length}, true } } } @@ -508,22 +508,22 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6 // by r.ReadToBlocks(), it returns that error. // // Preconditions: length > 0. length must be page-aligned. -func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (platform.FileRange, error) { +func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (memmap.FileRange, error) { fr, err := f.Allocate(length, kind) if err != nil { - return platform.FileRange{}, err + return memmap.FileRange{}, err } dsts, err := f.MapInternal(fr, usermem.Write) if err != nil { f.DecRef(fr) - return platform.FileRange{}, err + return memmap.FileRange{}, err } n, err := safemem.ReadFullToBlocks(r, dsts) un := uint64(usermem.Addr(n).RoundDown()) if un < length { // Free unused memory and update fr to contain only the memory that is // still allocated. - f.DecRef(platform.FileRange{fr.Start + un, fr.End}) + f.DecRef(memmap.FileRange{fr.Start + un, fr.End}) fr.End = fr.Start + un } return fr, err @@ -540,7 +540,7 @@ const ( // will read zeroes. // // Preconditions: fr.Length() > 0. 
-func (f *MemoryFile) Decommit(fr platform.FileRange) error { +func (f *MemoryFile) Decommit(fr memmap.FileRange) error { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } @@ -560,7 +560,7 @@ func (f *MemoryFile) Decommit(fr platform.FileRange) error { return nil } -func (f *MemoryFile) markDecommitted(fr platform.FileRange) { +func (f *MemoryFile) markDecommitted(fr memmap.FileRange) { f.mu.Lock() defer f.mu.Unlock() // Since we're changing the knownCommitted attribute, we need to merge @@ -581,8 +581,8 @@ func (f *MemoryFile) markDecommitted(fr platform.FileRange) { f.usage.MergeRange(fr) } -// IncRef implements platform.File.IncRef. -func (f *MemoryFile) IncRef(fr platform.FileRange) { +// IncRef implements memmap.File.IncRef. +func (f *MemoryFile) IncRef(fr memmap.FileRange) { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } @@ -600,8 +600,8 @@ func (f *MemoryFile) IncRef(fr platform.FileRange) { f.usage.MergeAdjacent(fr) } -// DecRef implements platform.File.DecRef. -func (f *MemoryFile) DecRef(fr platform.FileRange) { +// DecRef implements memmap.File.DecRef. +func (f *MemoryFile) DecRef(fr memmap.FileRange) { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } @@ -637,8 +637,8 @@ func (f *MemoryFile) DecRef(fr platform.FileRange) { } } -// MapInternal implements platform.File.MapInternal. -func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +// MapInternal implements memmap.File.MapInternal. 
+func (f *MemoryFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { if !fr.WellFormed() || fr.Length() == 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } @@ -664,7 +664,7 @@ func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) ( // forEachMappingSlice invokes fn on a sequence of byte slices that // collectively map all bytes in fr. -func (f *MemoryFile) forEachMappingSlice(fr platform.FileRange, fn func([]byte)) error { +func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) error { mappings := f.mappings.Load().([]uintptr) for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { chunk := int(chunkStart >> chunkShift) @@ -944,7 +944,7 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( continue case !populated && populatedRun: // Finish the run by changing this segment. - runRange := platform.FileRange{ + runRange := memmap.FileRange{ Start: r.Start + uint64(populatedRunStart*usermem.PageSize), End: r.Start + uint64(i*usermem.PageSize), } @@ -1009,7 +1009,7 @@ func (f *MemoryFile) File() *os.File { return f.file } -// FD implements platform.File.FD. +// FD implements memmap.File.FD. func (f *MemoryFile) FD() int { return int(f.file.Fd()) } @@ -1090,13 +1090,13 @@ func (f *MemoryFile) runReclaim() { // // Note that there returned range will be removed from tracking. It // must be reclaimed (removed from f.usage) at this point. 
-func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { +func (f *MemoryFile) findReclaimable() (memmap.FileRange, bool) { f.mu.Lock() defer f.mu.Unlock() for { for { if f.destroyed { - return platform.FileRange{}, false + return memmap.FileRange{}, false } if f.reclaimable { break @@ -1120,7 +1120,7 @@ func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { } } -func (f *MemoryFile) markReclaimed(fr platform.FileRange) { +func (f *MemoryFile) markReclaimed(fr memmap.FileRange) { f.mu.Lock() defer f.mu.Unlock() seg := f.usage.FindSegment(fr.Start) @@ -1222,11 +1222,11 @@ func (usageSetFunctions) MaxKey() uint64 { func (usageSetFunctions) ClearValue(val *usageInfo) { } -func (usageSetFunctions) Merge(_ platform.FileRange, val1 usageInfo, _ platform.FileRange, val2 usageInfo) (usageInfo, bool) { +func (usageSetFunctions) Merge(_ memmap.FileRange, val1 usageInfo, _ memmap.FileRange, val2 usageInfo) (usageInfo, bool) { return val1, val1 == val2 } -func (usageSetFunctions) Split(_ platform.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) { +func (usageSetFunctions) Split(_ memmap.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) { return val, val } @@ -1270,10 +1270,10 @@ func (reclaimSetFunctions) MaxKey() uint64 { func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) { } -func (reclaimSetFunctions) Merge(_ platform.FileRange, _ reclaimSetValue, _ platform.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) { +func (reclaimSetFunctions) Merge(_ memmap.FileRange, _ reclaimSetValue, _ memmap.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) { return reclaimSetValue{}, true } -func (reclaimSetFunctions) Split(_ platform.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) { +func (reclaimSetFunctions) Split(_ memmap.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) { return reclaimSetValue{}, reclaimSetValue{} } diff --git a/pkg/sentry/platform/BUILD 
b/pkg/sentry/platform/BUILD index 453241eca..209b28053 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -1,39 +1,21 @@ load("//tools:defs.bzl", "go_library") -load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) -go_template_instance( - name = "file_range", - out = "file_range.go", - package = "platform", - prefix = "File", - template = "//pkg/segment:generic_range", - types = { - "T": "uint64", - }, -) - go_library( name = "platform", srcs = [ "context.go", - "file_range.go", "mmap_min_addr.go", "platform.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/atomicbitops", "//pkg/context", - "//pkg/log", - "//pkg/safecopy", - "//pkg/safemem", "//pkg/seccomp", "//pkg/sentry/arch", - "//pkg/sentry/usage", - "//pkg/syserror", + "//pkg/sentry/memmap", "//pkg/usermem", ], ) diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 10a10bfe2..b5d27a72a 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -47,6 +47,7 @@ go_library( "//pkg/safecopy", "//pkg/seccomp", "//pkg/sentry/arch", + "//pkg/sentry/memmap", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", "//pkg/sentry/platform/ring0", diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index faf1d5e1c..98a3e539d 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -18,6 +18,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/atomicbitops" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/sync" @@ -150,7 +151,7 @@ func (as *addressSpace) mapLocked(addr usermem.Addr, m hostMapEntry, at usermem. } // MapFile implements platform.AddressSpace.MapFile. 
-func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error { +func (as *addressSpace) MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error { as.mu.Lock() defer as.mu.Unlock() diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index 171513f3f..4b13eec30 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -22,9 +22,9 @@ import ( "os" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/usermem" ) @@ -207,7 +207,7 @@ type AddressSpace interface { // Preconditions: addr and fr must be page-aligned. fr.Length() > 0. // at.Any() == true. At least one reference must be held on all pages in // fr, and must continue to be held as long as pages are mapped. - MapFile(addr usermem.Addr, f File, fr FileRange, at usermem.AccessType, precommit bool) error + MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error // Unmap unmaps the given range. // @@ -310,52 +310,6 @@ func (f SegmentationFault) Error() string { return fmt.Sprintf("segmentation fault at %#x", f.Addr) } -// File represents a host file that may be mapped into an AddressSpace. -type File interface { - // All pages in a File are reference-counted. - - // IncRef increments the reference count on all pages in fr. - // - // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > - // 0. At least one reference must be held on all pages in fr. (The File - // interface does not provide a way to acquire an initial reference; - // implementors may define mechanisms for doing so.) - IncRef(fr FileRange) - - // DecRef decrements the reference count on all pages in fr. 
- // - // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > - // 0. At least one reference must be held on all pages in fr. - DecRef(fr FileRange) - - // MapInternal returns a mapping of the given file offsets in the invoking - // process' address space for reading and writing. - // - // Note that fr.Start and fr.End need not be page-aligned. - // - // Preconditions: fr.Length() > 0. At least one reference must be held on - // all pages in fr. - // - // Postconditions: The returned mapping is valid as long as at least one - // reference is held on the mapped pages. - MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error) - - // FD returns the file descriptor represented by the File. - // - // The only permitted operation on the returned file descriptor is to map - // pages from it consistent with the requirements of AddressSpace.MapFile. - FD() int -} - -// FileRange represents a range of uint64 offsets into a File. -// -// type FileRange - -// String implements fmt.Stringer.String. -func (fr FileRange) String() string { - return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End) -} - // Requirements is used to specify platform specific requirements. 
type Requirements struct { // RequiresCurrentPIDNS indicates that the sandbox has to be started in the diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index 30402c2df..29fd23cc3 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -30,6 +30,7 @@ go_library( "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/hostcpu", + "//pkg/sentry/memmap", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", "//pkg/sync", diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 2389423b0..c990f3454 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/procid" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" @@ -616,7 +617,7 @@ func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintp } // MapFile implements platform.AddressSpace.MapFile. -func (s *subprocess) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error { +func (s *subprocess) MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error { var flags int if precommit { flags |= syscall.MAP_POPULATE -- cgit v1.2.3 From b2ae7ea1bb207eddadd7962080e7bd0b8634db96 Mon Sep 17 00:00:00 2001 From: Nayana Bidari Date: Mon, 3 Aug 2020 13:33:47 -0700 Subject: Plumbing context.Context to DecRef() and Release(). context is passed to DecRef() and Release() which is needed for SO_LINGER implementation. 
PiperOrigin-RevId: 324672584 --- pkg/refs/BUILD | 6 +- pkg/refs/refcounter.go | 19 +-- pkg/refs/refcounter_test.go | 38 ++--- pkg/sentry/control/proc.go | 2 +- pkg/sentry/devices/memdev/full.go | 2 +- pkg/sentry/devices/memdev/null.go | 2 +- pkg/sentry/devices/memdev/random.go | 2 +- pkg/sentry/devices/memdev/zero.go | 2 +- pkg/sentry/devices/ttydev/ttydev.go | 2 +- pkg/sentry/devices/tundev/tundev.go | 4 +- pkg/sentry/fdimport/fdimport.go | 8 +- pkg/sentry/fs/copy_up.go | 12 +- pkg/sentry/fs/copy_up_test.go | 4 +- pkg/sentry/fs/dev/net_tun.go | 4 +- pkg/sentry/fs/dirent.go | 110 +++++++------- pkg/sentry/fs/dirent_cache.go | 3 +- pkg/sentry/fs/dirent_refs_test.go | 16 +-- pkg/sentry/fs/dirent_state.go | 3 +- pkg/sentry/fs/fdpipe/pipe.go | 2 +- pkg/sentry/fs/fdpipe/pipe_opener_test.go | 16 +-- pkg/sentry/fs/fdpipe/pipe_test.go | 18 +-- pkg/sentry/fs/file.go | 10 +- pkg/sentry/fs/file_operations.go | 2 +- pkg/sentry/fs/file_overlay.go | 22 +-- pkg/sentry/fs/fsutil/file.go | 4 +- pkg/sentry/fs/gofer/file.go | 4 +- pkg/sentry/fs/gofer/gofer_test.go | 8 +- pkg/sentry/fs/gofer/handles.go | 5 +- pkg/sentry/fs/gofer/inode.go | 5 +- pkg/sentry/fs/gofer/path.go | 6 +- pkg/sentry/fs/gofer/session.go | 16 +-- pkg/sentry/fs/gofer/session_state.go | 3 +- pkg/sentry/fs/gofer/socket.go | 6 +- pkg/sentry/fs/host/control.go | 2 +- pkg/sentry/fs/host/file.go | 4 +- pkg/sentry/fs/host/inode_test.go | 2 +- pkg/sentry/fs/host/socket.go | 10 +- pkg/sentry/fs/host/socket_test.go | 38 ++--- pkg/sentry/fs/host/tty.go | 4 +- pkg/sentry/fs/host/wait_test.go | 2 +- pkg/sentry/fs/inode.go | 11 +- pkg/sentry/fs/inode_inotify.go | 5 +- pkg/sentry/fs/inode_overlay.go | 30 ++-- pkg/sentry/fs/inode_overlay_test.go | 8 +- pkg/sentry/fs/inotify.go | 8 +- pkg/sentry/fs/inotify_watch.go | 9 +- pkg/sentry/fs/mount.go | 12 +- pkg/sentry/fs/mount_overlay.go | 6 +- pkg/sentry/fs/mount_test.go | 29 ++-- pkg/sentry/fs/mounts.go | 30 ++-- pkg/sentry/fs/mounts_test.go | 2 +- pkg/sentry/fs/overlay.go | 10 +- 
pkg/sentry/fs/proc/fds.go | 18 +-- pkg/sentry/fs/proc/mounts.go | 8 +- pkg/sentry/fs/proc/net.go | 12 +- pkg/sentry/fs/proc/proc.go | 2 +- pkg/sentry/fs/proc/task.go | 4 +- pkg/sentry/fs/ramfs/dir.go | 18 +-- pkg/sentry/fs/ramfs/tree_test.go | 2 +- pkg/sentry/fs/timerfd/timerfd.go | 4 +- pkg/sentry/fs/tmpfs/file_test.go | 2 +- pkg/sentry/fs/tty/dir.go | 8 +- pkg/sentry/fs/tty/fs.go | 2 +- pkg/sentry/fs/tty/master.go | 8 +- pkg/sentry/fs/tty/slave.go | 4 +- pkg/sentry/fs/user/path.go | 8 +- pkg/sentry/fs/user/user.go | 8 +- pkg/sentry/fs/user/user_test.go | 8 +- pkg/sentry/fsbridge/bridge.go | 2 +- pkg/sentry/fsbridge/fs.go | 8 +- pkg/sentry/fsbridge/vfs.go | 6 +- pkg/sentry/fsimpl/devpts/devpts.go | 4 +- pkg/sentry/fsimpl/devpts/master.go | 6 +- pkg/sentry/fsimpl/devpts/slave.go | 6 +- pkg/sentry/fsimpl/devtmpfs/devtmpfs.go | 6 +- pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go | 8 +- pkg/sentry/fsimpl/eventfd/eventfd.go | 6 +- pkg/sentry/fsimpl/eventfd/eventfd_test.go | 12 +- pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 6 +- pkg/sentry/fsimpl/ext/dentry.go | 7 +- pkg/sentry/fsimpl/ext/directory.go | 2 +- pkg/sentry/fsimpl/ext/ext.go | 10 +- pkg/sentry/fsimpl/ext/ext_test.go | 4 +- pkg/sentry/fsimpl/ext/filesystem.go | 68 ++++----- pkg/sentry/fsimpl/ext/regular_file.go | 2 +- pkg/sentry/fsimpl/ext/symlink.go | 2 +- pkg/sentry/fsimpl/fuse/dev.go | 2 +- pkg/sentry/fsimpl/fuse/dev_test.go | 4 +- pkg/sentry/fsimpl/fuse/fusefs.go | 4 +- pkg/sentry/fsimpl/gofer/directory.go | 4 +- pkg/sentry/fsimpl/gofer/filesystem.go | 90 ++++++------ pkg/sentry/fsimpl/gofer/gofer.go | 40 +++--- pkg/sentry/fsimpl/gofer/gofer_test.go | 6 +- pkg/sentry/fsimpl/gofer/regular_file.go | 2 +- pkg/sentry/fsimpl/gofer/socket.go | 6 +- pkg/sentry/fsimpl/gofer/special_file.go | 4 +- pkg/sentry/fsimpl/host/control.go | 2 +- pkg/sentry/fsimpl/host/host.go | 14 +- pkg/sentry/fsimpl/host/socket.go | 12 +- pkg/sentry/fsimpl/host/tty.go | 4 +- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 2 +- 
pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 2 +- pkg/sentry/fsimpl/kernfs/filesystem.go | 68 ++++----- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 10 +- pkg/sentry/fsimpl/kernfs/kernfs.go | 26 ++-- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 18 +-- pkg/sentry/fsimpl/overlay/copy_up.go | 8 +- pkg/sentry/fsimpl/overlay/directory.go | 8 +- pkg/sentry/fsimpl/overlay/filesystem.go | 64 ++++----- pkg/sentry/fsimpl/overlay/non_directory.go | 18 +-- pkg/sentry/fsimpl/overlay/overlay.go | 48 +++---- pkg/sentry/fsimpl/pipefs/pipefs.go | 6 +- pkg/sentry/fsimpl/proc/filesystem.go | 4 +- pkg/sentry/fsimpl/proc/task_fds.go | 18 +-- pkg/sentry/fsimpl/proc/task_files.go | 14 +- pkg/sentry/fsimpl/proc/task_net.go | 12 +- pkg/sentry/fsimpl/proc/tasks_test.go | 10 +- pkg/sentry/fsimpl/signalfd/signalfd.go | 4 +- pkg/sentry/fsimpl/sockfs/sockfs.go | 4 +- pkg/sentry/fsimpl/sys/sys.go | 4 +- pkg/sentry/fsimpl/sys/sys_test.go | 2 +- pkg/sentry/fsimpl/testutil/kernel.go | 2 +- pkg/sentry/fsimpl/testutil/testutil.go | 6 +- pkg/sentry/fsimpl/timerfd/timerfd.go | 6 +- pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 50 +++---- pkg/sentry/fsimpl/tmpfs/directory.go | 4 +- pkg/sentry/fsimpl/tmpfs/filesystem.go | 106 +++++++------- pkg/sentry/fsimpl/tmpfs/pipe_test.go | 20 +-- pkg/sentry/fsimpl/tmpfs/regular_file.go | 2 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 34 ++--- pkg/sentry/fsimpl/tmpfs/tmpfs_test.go | 6 +- pkg/sentry/kernel/abstract_socket_namespace.go | 13 +- pkg/sentry/kernel/epoll/epoll.go | 14 +- pkg/sentry/kernel/epoll/epoll_test.go | 5 +- pkg/sentry/kernel/eventfd/eventfd.go | 4 +- pkg/sentry/kernel/fd_table.go | 55 +++---- pkg/sentry/kernel/fd_table_test.go | 6 +- pkg/sentry/kernel/fs_context.go | 31 ++-- pkg/sentry/kernel/futex/BUILD | 1 + pkg/sentry/kernel/futex/futex.go | 35 ++--- pkg/sentry/kernel/futex/futex_test.go | 66 +++++---- pkg/sentry/kernel/kernel.go | 57 ++++---- pkg/sentry/kernel/pipe/node.go | 6 +- pkg/sentry/kernel/pipe/node_test.go | 2 +- pkg/sentry/kernel/pipe/pipe.go | 2 
+- pkg/sentry/kernel/pipe/pipe_test.go | 16 +-- pkg/sentry/kernel/pipe/pipe_util.go | 2 +- pkg/sentry/kernel/pipe/reader.go | 3 +- pkg/sentry/kernel/pipe/vfs.go | 8 +- pkg/sentry/kernel/pipe/writer.go | 3 +- pkg/sentry/kernel/sessions.go | 5 +- pkg/sentry/kernel/shm/shm.go | 10 +- pkg/sentry/kernel/signalfd/signalfd.go | 2 +- pkg/sentry/kernel/task.go | 8 +- pkg/sentry/kernel/task_clone.go | 10 +- pkg/sentry/kernel/task_exec.go | 6 +- pkg/sentry/kernel/task_exit.go | 8 +- pkg/sentry/kernel/task_log.go | 2 +- pkg/sentry/kernel/task_start.go | 6 +- pkg/sentry/kernel/thread_group.go | 4 +- pkg/sentry/loader/elf.go | 4 +- pkg/sentry/loader/loader.go | 6 +- pkg/sentry/memmap/memmap.go | 2 +- pkg/sentry/mm/aio_context.go | 6 +- pkg/sentry/mm/lifecycle.go | 2 +- pkg/sentry/mm/metadata.go | 5 +- pkg/sentry/mm/special_mappable.go | 4 +- pkg/sentry/mm/syscalls.go | 4 +- pkg/sentry/mm/vma.go | 4 +- pkg/sentry/socket/control/control.go | 8 +- pkg/sentry/socket/control/control_vfs2.go | 8 +- pkg/sentry/socket/hostinet/socket.go | 8 +- pkg/sentry/socket/netlink/provider.go | 2 +- pkg/sentry/socket/netlink/socket.go | 14 +- pkg/sentry/socket/netlink/socket_vfs2.go | 4 +- pkg/sentry/socket/netstack/netstack.go | 6 +- pkg/sentry/socket/netstack/netstack_vfs2.go | 2 +- pkg/sentry/socket/socket.go | 4 +- pkg/sentry/socket/unix/transport/connectioned.go | 10 +- pkg/sentry/socket/unix/transport/connectionless.go | 12 +- pkg/sentry/socket/unix/transport/queue.go | 13 +- pkg/sentry/socket/unix/transport/unix.go | 48 +++---- pkg/sentry/socket/unix/unix.go | 38 ++--- pkg/sentry/socket/unix/unix_vfs2.go | 18 +-- pkg/sentry/strace/strace.go | 12 +- pkg/sentry/syscalls/epoll.go | 18 +-- pkg/sentry/syscalls/linux/sys_aio.go | 8 +- pkg/sentry/syscalls/linux/sys_eventfd.go | 2 +- pkg/sentry/syscalls/linux/sys_file.go | 78 +++++----- pkg/sentry/syscalls/linux/sys_futex.go | 6 +- pkg/sentry/syscalls/linux/sys_getdents.go | 2 +- pkg/sentry/syscalls/linux/sys_inotify.go | 10 +- 
pkg/sentry/syscalls/linux/sys_lseek.go | 2 +- pkg/sentry/syscalls/linux/sys_mmap.go | 4 +- pkg/sentry/syscalls/linux/sys_mount.go | 2 +- pkg/sentry/syscalls/linux/sys_pipe.go | 6 +- pkg/sentry/syscalls/linux/sys_poll.go | 10 +- pkg/sentry/syscalls/linux/sys_prctl.go | 4 +- pkg/sentry/syscalls/linux/sys_read.go | 12 +- pkg/sentry/syscalls/linux/sys_shm.go | 10 +- pkg/sentry/syscalls/linux/sys_signal.go | 4 +- pkg/sentry/syscalls/linux/sys_socket.go | 46 +++--- pkg/sentry/syscalls/linux/sys_splice.go | 12 +- pkg/sentry/syscalls/linux/sys_stat.go | 8 +- pkg/sentry/syscalls/linux/sys_sync.go | 8 +- pkg/sentry/syscalls/linux/sys_thread.go | 6 +- pkg/sentry/syscalls/linux/sys_timerfd.go | 6 +- pkg/sentry/syscalls/linux/sys_write.go | 10 +- pkg/sentry/syscalls/linux/sys_xattr.go | 8 +- pkg/sentry/syscalls/linux/vfs2/aio.go | 8 +- pkg/sentry/syscalls/linux/vfs2/epoll.go | 14 +- pkg/sentry/syscalls/linux/vfs2/eventfd.go | 4 +- pkg/sentry/syscalls/linux/vfs2/execve.go | 12 +- pkg/sentry/syscalls/linux/vfs2/fd.go | 12 +- pkg/sentry/syscalls/linux/vfs2/filesystem.go | 22 +-- pkg/sentry/syscalls/linux/vfs2/fscontext.go | 22 +-- pkg/sentry/syscalls/linux/vfs2/getdents.go | 2 +- pkg/sentry/syscalls/linux/vfs2/inotify.go | 14 +- pkg/sentry/syscalls/linux/vfs2/ioctl.go | 2 +- pkg/sentry/syscalls/linux/vfs2/lock.go | 2 +- pkg/sentry/syscalls/linux/vfs2/memfd.go | 2 +- pkg/sentry/syscalls/linux/vfs2/mmap.go | 4 +- pkg/sentry/syscalls/linux/vfs2/mount.go | 4 +- pkg/sentry/syscalls/linux/vfs2/path.go | 12 +- pkg/sentry/syscalls/linux/vfs2/pipe.go | 6 +- pkg/sentry/syscalls/linux/vfs2/poll.go | 10 +- pkg/sentry/syscalls/linux/vfs2/read_write.go | 48 +++---- pkg/sentry/syscalls/linux/vfs2/setstat.go | 20 +-- pkg/sentry/syscalls/linux/vfs2/signal.go | 4 +- pkg/sentry/syscalls/linux/vfs2/socket.go | 46 +++--- pkg/sentry/syscalls/linux/vfs2/splice.go | 20 +-- pkg/sentry/syscalls/linux/vfs2/stat.go | 30 ++-- pkg/sentry/syscalls/linux/vfs2/sync.go | 6 +- 
pkg/sentry/syscalls/linux/vfs2/timerfd.go | 8 +- pkg/sentry/syscalls/linux/vfs2/xattr.go | 16 +-- pkg/sentry/vfs/anonfs.go | 8 +- pkg/sentry/vfs/dentry.go | 37 ++--- pkg/sentry/vfs/epoll.go | 6 +- pkg/sentry/vfs/file_description.go | 24 ++-- pkg/sentry/vfs/file_description_impl_util_test.go | 18 +-- pkg/sentry/vfs/filesystem.go | 6 +- pkg/sentry/vfs/inotify.go | 34 ++--- pkg/sentry/vfs/mount.go | 54 +++---- pkg/sentry/vfs/pathname.go | 18 +-- pkg/sentry/vfs/resolving_path.go | 43 +++--- pkg/sentry/vfs/vfs.go | 160 ++++++++++----------- pkg/tcpip/link/tun/BUILD | 1 + pkg/tcpip/link/tun/device.go | 9 +- runsc/boot/fs.go | 12 +- runsc/boot/loader.go | 16 +-- runsc/boot/loader_test.go | 8 +- runsc/boot/vfs.go | 10 +- 252 files changed, 1711 insertions(+), 1668 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD index 74affc887..9888cce9c 100644 --- a/pkg/refs/BUILD +++ b/pkg/refs/BUILD @@ -24,6 +24,7 @@ go_library( ], visibility = ["//:sandbox"], deps = [ + "//pkg/context", "//pkg/log", "//pkg/sync", ], @@ -34,5 +35,8 @@ go_test( size = "small", srcs = ["refcounter_test.go"], library = ":refs", - deps = ["//pkg/sync"], + deps = [ + "//pkg/context", + "//pkg/sync", + ], ) diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index c45ba8200..61790221b 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -23,6 +23,7 @@ import ( "runtime" "sync/atomic" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" ) @@ -38,7 +39,7 @@ type RefCounter interface { // Note that AtomicRefCounter.DecRef() does not support destructors. // If a type has a destructor, it must implement its own DecRef() // method and call AtomicRefCounter.DecRefWithDestructor(destructor). - DecRef() + DecRef(ctx context.Context) // TryIncRef attempts to increase the reference counter on the object, // but may fail if all references have already been dropped. 
This @@ -57,7 +58,7 @@ type RefCounter interface { // A WeakRefUser is notified when the last non-weak reference is dropped. type WeakRefUser interface { // WeakRefGone is called when the last non-weak reference is dropped. - WeakRefGone() + WeakRefGone(ctx context.Context) } // WeakRef is a weak reference. @@ -123,7 +124,7 @@ func (w *WeakRef) Get() RefCounter { // Drop drops this weak reference. You should always call drop when you are // finished with the weak reference. You may not use this object after calling // drop. -func (w *WeakRef) Drop() { +func (w *WeakRef) Drop(ctx context.Context) { rc, ok := w.get() if !ok { // We've been zapped already. When the refcounter has called @@ -145,7 +146,7 @@ func (w *WeakRef) Drop() { // And now aren't on the object's list of weak references. So it won't // zap us if this causes the reference count to drop to zero. - rc.DecRef() + rc.DecRef(ctx) // Return to the pool. weakRefPool.Put(w) @@ -427,7 +428,7 @@ func (r *AtomicRefCount) dropWeakRef(w *WeakRef) { // A: TryIncRef [transform speculative to real] // //go:nosplit -func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { +func (r *AtomicRefCount) DecRefWithDestructor(ctx context.Context, destroy func(context.Context)) { switch v := atomic.AddInt64(&r.refCount, -1); { case v < -1: panic("Decrementing non-positive ref count") @@ -448,7 +449,7 @@ func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { if user != nil { r.mu.Unlock() - user.WeakRefGone() + user.WeakRefGone(ctx) r.mu.Lock() } } @@ -456,7 +457,7 @@ func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { // Call the destructor. if destroy != nil { - destroy() + destroy(ctx) } } } @@ -464,6 +465,6 @@ func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { // DecRef decrements this object's reference count. 
// //go:nosplit -func (r *AtomicRefCount) DecRef() { - r.DecRefWithDestructor(nil) +func (r *AtomicRefCount) DecRef(ctx context.Context) { + r.DecRefWithDestructor(ctx, nil) } diff --git a/pkg/refs/refcounter_test.go b/pkg/refs/refcounter_test.go index 1ab4a4440..6d0dd1018 100644 --- a/pkg/refs/refcounter_test.go +++ b/pkg/refs/refcounter_test.go @@ -18,6 +18,7 @@ import ( "reflect" "testing" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) @@ -31,11 +32,11 @@ type testCounter struct { destroyed bool } -func (t *testCounter) DecRef() { - t.AtomicRefCount.DecRefWithDestructor(t.destroy) +func (t *testCounter) DecRef(ctx context.Context) { + t.AtomicRefCount.DecRefWithDestructor(ctx, t.destroy) } -func (t *testCounter) destroy() { +func (t *testCounter) destroy(context.Context) { t.mu.Lock() defer t.mu.Unlock() t.destroyed = true @@ -53,7 +54,7 @@ func newTestCounter() *testCounter { func TestOneRef(t *testing.T) { tc := newTestCounter() - tc.DecRef() + tc.DecRef(context.Background()) if !tc.IsDestroyed() { t.Errorf("object should have been destroyed") @@ -63,8 +64,9 @@ func TestOneRef(t *testing.T) { func TestTwoRefs(t *testing.T) { tc := newTestCounter() tc.IncRef() - tc.DecRef() - tc.DecRef() + ctx := context.Background() + tc.DecRef(ctx) + tc.DecRef(ctx) if !tc.IsDestroyed() { t.Errorf("object should have been destroyed") @@ -74,12 +76,13 @@ func TestTwoRefs(t *testing.T) { func TestMultiRefs(t *testing.T) { tc := newTestCounter() tc.IncRef() - tc.DecRef() + ctx := context.Background() + tc.DecRef(ctx) tc.IncRef() - tc.DecRef() + tc.DecRef(ctx) - tc.DecRef() + tc.DecRef(ctx) if !tc.IsDestroyed() { t.Errorf("object should have been destroyed") @@ -89,19 +92,20 @@ func TestMultiRefs(t *testing.T) { func TestWeakRef(t *testing.T) { tc := newTestCounter() w := NewWeakRef(tc, nil) + ctx := context.Background() // Try resolving. 
if x := w.Get(); x == nil { t.Errorf("weak reference didn't resolve: expected %v, got nil", tc) } else { - x.DecRef() + x.DecRef(ctx) } // Try resolving again. if x := w.Get(); x == nil { t.Errorf("weak reference didn't resolve: expected %v, got nil", tc) } else { - x.DecRef() + x.DecRef(ctx) } // Shouldn't be destroyed yet. (Can't continue if this fails.) @@ -110,7 +114,7 @@ func TestWeakRef(t *testing.T) { } // Drop the original reference. - tc.DecRef() + tc.DecRef(ctx) // Assert destroyed. if !tc.IsDestroyed() { @@ -126,7 +130,8 @@ func TestWeakRef(t *testing.T) { func TestWeakRefDrop(t *testing.T) { tc := newTestCounter() w := NewWeakRef(tc, nil) - w.Drop() + ctx := context.Background() + w.Drop(ctx) // Just assert the list is empty. if !tc.weakRefs.Empty() { @@ -134,14 +139,14 @@ func TestWeakRefDrop(t *testing.T) { } // Drop the original reference. - tc.DecRef() + tc.DecRef(ctx) } type testWeakRefUser struct { weakRefGone func() } -func (u *testWeakRefUser) WeakRefGone() { +func (u *testWeakRefUser) WeakRefGone(ctx context.Context) { u.weakRefGone() } @@ -165,7 +170,8 @@ func TestCallback(t *testing.T) { }}) // Drop the original reference, this must trigger the callback. - tc.DecRef() + ctx := context.Background() + tc.DecRef(ctx) if !called { t.Fatalf("Callback not called") diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 1bae7cfaf..dfa936563 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -139,7 +139,6 @@ func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { // Import file descriptors. 
fdTable := proc.Kernel.NewFDTable() - defer fdTable.DecRef() creds := auth.NewUserCredentials( args.KUID, @@ -177,6 +176,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI initArgs.MountNamespaceVFS2.IncRef() } ctx := initArgs.NewContext(proc.Kernel) + defer fdTable.DecRef(ctx) if kernel.VFS2Enabled { // Get the full path to the filename from the PATH env variable. diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go index af66fe4dc..511179e31 100644 --- a/pkg/sentry/devices/memdev/full.go +++ b/pkg/sentry/devices/memdev/full.go @@ -46,7 +46,7 @@ type fullFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *fullFD) Release() { +func (fd *fullFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go index 92d3d71be..4918dbeeb 100644 --- a/pkg/sentry/devices/memdev/null.go +++ b/pkg/sentry/devices/memdev/null.go @@ -47,7 +47,7 @@ type nullFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *nullFD) Release() { +func (fd *nullFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go index 6b81da5ef..5e7fe0280 100644 --- a/pkg/sentry/devices/memdev/random.go +++ b/pkg/sentry/devices/memdev/random.go @@ -56,7 +56,7 @@ type randomFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *randomFD) Release() { +func (fd *randomFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go index c6f15054d..2e631a252 100644 --- a/pkg/sentry/devices/memdev/zero.go +++ b/pkg/sentry/devices/memdev/zero.go @@ -48,7 +48,7 @@ type zeroFD struct { } // Release implements vfs.FileDescriptionImpl.Release. 
-func (fd *zeroFD) Release() { +func (fd *zeroFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/devices/ttydev/ttydev.go b/pkg/sentry/devices/ttydev/ttydev.go index fbb7fd92c..fd4b79c46 100644 --- a/pkg/sentry/devices/ttydev/ttydev.go +++ b/pkg/sentry/devices/ttydev/ttydev.go @@ -55,7 +55,7 @@ type ttyFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *ttyFD) Release() {} +func (fd *ttyFD) Release(context.Context) {} // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *ttyFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { diff --git a/pkg/sentry/devices/tundev/tundev.go b/pkg/sentry/devices/tundev/tundev.go index dfbd069af..852ec3c5c 100644 --- a/pkg/sentry/devices/tundev/tundev.go +++ b/pkg/sentry/devices/tundev/tundev.go @@ -108,8 +108,8 @@ func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArg } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *tunFD) Release() { - fd.device.Release() +func (fd *tunFD) Release(ctx context.Context) { + fd.device.Release(ctx) } // PRead implements vfs.FileDescriptionImpl.PRead. diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go index b8686adb4..1b7cb94c0 100644 --- a/pkg/sentry/fdimport/fdimport.go +++ b/pkg/sentry/fdimport/fdimport.go @@ -50,7 +50,7 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds [] if err != nil { return nil, err } - defer appFile.DecRef() + defer appFile.DecRef(ctx) // Remember this in the TTY file, as we will // use it for the other stdio FDs. @@ -69,7 +69,7 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds [] if err != nil { return nil, err } - defer appFile.DecRef() + defer appFile.DecRef(ctx) } // Add the file to the FD map. 
@@ -102,7 +102,7 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi if err != nil { return nil, err } - defer appFile.DecRef() + defer appFile.DecRef(ctx) // Remember this in the TTY file, as we will use it for the other stdio // FDs. @@ -119,7 +119,7 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi if err != nil { return nil, err } - defer appFile.DecRef() + defer appFile.DecRef(ctx) } if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index ab1424c95..735452b07 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -201,7 +201,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { parentUpper := parent.Inode.overlay.upper root := RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } // Create the file in the upper filesystem and get an Inode for it. 
@@ -212,7 +212,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { log.Warningf("copy up failed to create file: %v", err) return syserror.EIO } - defer childFile.DecRef() + defer childFile.DecRef(ctx) childUpperInode = childFile.Dirent.Inode case Directory: @@ -226,7 +226,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } - defer childUpper.DecRef() + defer childUpper.DecRef(ctx) childUpperInode = childUpper.Inode case Symlink: @@ -246,7 +246,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } - defer childUpper.DecRef() + defer childUpper.DecRef(ctx) childUpperInode = childUpper.Inode default: @@ -352,14 +352,14 @@ func copyContentsLocked(ctx context.Context, upper *Inode, lower *Inode, size in if err != nil { return err } - defer upperFile.DecRef() + defer upperFile.DecRef(ctx) // Get a handle to the lower filesystem, which we will read from. lowerFile, err := overlayFile(ctx, lower, FileFlags{Read: true}) if err != nil { return err } - defer lowerFile.DecRef() + defer lowerFile.DecRef(ctx) // Use a buffer pool to minimize allocations. 
buf := copyUpBuffers.Get().([]byte) diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index 91792d9fe..c7a11eec1 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -126,7 +126,7 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile { if err != nil { t.Fatalf("failed to create file %q: %v", name, err) } - defer f.DecRef() + defer f.DecRef(ctx) relname, _ := f.Dirent.FullName(lowerRoot) @@ -171,7 +171,7 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile { if err != nil { t.Fatalf("failed to find %q: %v", f.name, err) } - defer d.DecRef() + defer d.DecRef(ctx) f.File, err = d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) if err != nil { diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go index dc7ad075a..ec474e554 100644 --- a/pkg/sentry/fs/dev/net_tun.go +++ b/pkg/sentry/fs/dev/net_tun.go @@ -80,8 +80,8 @@ type netTunFileOperations struct { var _ fs.FileOperations = (*netTunFileOperations)(nil) // Release implements fs.FileOperations.Release. -func (fops *netTunFileOperations) Release() { - fops.device.Release() +func (fops *netTunFileOperations) Release(ctx context.Context) { + fops.device.Release(ctx) } // Ioctl implements fs.FileOperations.Ioctl. diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 65be12175..a2f751068 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -325,7 +325,7 @@ func (d *Dirent) SyncAll(ctx context.Context) { for _, w := range d.children { if child := w.Get(); child != nil { child.(*Dirent).SyncAll(ctx) - child.DecRef() + child.DecRef(ctx) } } } @@ -451,7 +451,7 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // which don't hold a hard reference on their parent (their parent holds a // hard reference on them, and they contain virtually no state). But this is // good house-keeping. 
- child.DecRef() + child.DecRef(ctx) return nil, syscall.ENOENT } @@ -468,20 +468,20 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // their pins on the child. Inotify doesn't properly support filesystems that // revalidate dirents (since watches are lost on revalidation), but if we fail // to unpin the watches child will never be GCed. - cd.Inode.Watches.Unpin(cd) + cd.Inode.Watches.Unpin(ctx, cd) // This child needs to be revalidated, fallthrough to unhash it. Make sure // to not leak a reference from Get(). // // Note that previous lookups may still have a reference to this stale child; // this can't be helped, but we can ensure that *new* lookups are up-to-date. - child.DecRef() + child.DecRef(ctx) } // Either our weak reference expired or we need to revalidate it. Unhash child first, we're // about to replace it. delete(d.children, name) - w.Drop() + w.Drop(ctx) } // Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be @@ -512,12 +512,12 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // There are active references to the existing child, prefer it to the one we // retrieved from Lookup. Likely the Lookup happened very close to the insertion // of child, so considering one stale over the other is fairly arbitrary. - c.DecRef() + c.DecRef(ctx) // The child that was installed could be negative. if cd.IsNegative() { // If so, don't leak a reference and short circuit. - child.DecRef() + child.DecRef(ctx) return nil, syscall.ENOENT } @@ -531,7 +531,7 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // we did the Inode.Lookup. Fully drop the weak reference and fallback to using the child // we looked up. delete(d.children, name) - w.Drop() + w.Drop(ctx) } // Give the looked up child a parent. 
We cannot kick out entries, since we just checked above @@ -587,7 +587,7 @@ func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool { return false } // Child exists. - child.DecRef() + child.DecRef(ctx) return true } @@ -622,7 +622,7 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi } child := file.Dirent - d.finishCreate(child, name) + d.finishCreate(ctx, child, name) // Return the reference and the new file. When the last reference to // the file is dropped, file.Dirent may no longer be cached. @@ -631,7 +631,7 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi // finishCreate validates the created file, adds it as a child of this dirent, // and notifies any watchers. -func (d *Dirent) finishCreate(child *Dirent, name string) { +func (d *Dirent) finishCreate(ctx context.Context, child *Dirent, name string) { // Sanity check c, its name must be consistent. if child.name != name { panic(fmt.Sprintf("create from %q to %q returned unexpected name %q", d.name, name, child.name)) @@ -650,14 +650,14 @@ func (d *Dirent) finishCreate(child *Dirent, name string) { panic(fmt.Sprintf("hashed child %q over a positive child", child.name)) } // Don't leak a reference. - old.DecRef() + old.DecRef(ctx) // Drop d's reference. - old.DecRef() + old.DecRef(ctx) } // Finally drop the useless weak reference on the floor. - w.Drop() + w.Drop(ctx) } d.Inode.Watches.Notify(name, linux.IN_CREATE, 0) @@ -686,17 +686,17 @@ func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, c panic(fmt.Sprintf("hashed over a positive child %q", old.(*Dirent).name)) } // Don't leak a reference. - old.DecRef() + old.DecRef(ctx) // Drop d's reference. - old.DecRef() + old.DecRef(ctx) } // Unhash the negative Dirent, name needs to exist now. delete(d.children, name) // Finally drop the useless weak reference on the floor. - w.Drop() + w.Drop(ctx) } // Execute the create operation. 
@@ -756,7 +756,7 @@ func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data trans if e != nil { return e } - d.finishCreate(childDir, name) + d.finishCreate(ctx, childDir, name) return nil }) if err == syscall.EEXIST { @@ -901,7 +901,7 @@ func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, // references to children. // // Preconditions: d.mu must be held. -func (d *Dirent) flush() { +func (d *Dirent) flush(ctx context.Context) { expired := make(map[string]*refs.WeakRef) for n, w := range d.children { // Call flush recursively on each child before removing our @@ -912,7 +912,7 @@ func (d *Dirent) flush() { if !cd.IsNegative() { // Flush the child. cd.mu.Lock() - cd.flush() + cd.flush(ctx) cd.mu.Unlock() // Allow the file system to drop extra references on child. @@ -920,13 +920,13 @@ func (d *Dirent) flush() { } // Don't leak a reference. - child.DecRef() + child.DecRef(ctx) } // Check if the child dirent is closed, and mark it as expired if it is. // We must call w.Get() again here, since the child could have been closed // by the calls to flush() and cache.Remove() in the above if-block. if child := w.Get(); child != nil { - child.DecRef() + child.DecRef(ctx) } else { expired[n] = w } @@ -935,7 +935,7 @@ func (d *Dirent) flush() { // Remove expired entries. for n, w := range expired { delete(d.children, n) - w.Drop() + w.Drop(ctx) } } @@ -977,7 +977,7 @@ func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err if !ok { panic("mount must mount over an existing dirent") } - weakRef.Drop() + weakRef.Drop(ctx) // Note that even though `d` is now hidden, it still holds a reference // to its parent. @@ -1002,13 +1002,13 @@ func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error { if !ok { panic("mount must mount over an existing dirent") } - weakRef.Drop() + weakRef.Drop(ctx) // d is not reachable anymore, and hence not mounted anymore. d.mounted = false // Drop mount reference. 
- d.DecRef() + d.DecRef(ctx) return nil } @@ -1029,7 +1029,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath // Child does not exist. return err } - defer child.DecRef() + defer child.DecRef(ctx) // Remove cannot remove directories. if IsDir(child.Inode.StableAttr) { @@ -1055,7 +1055,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath atomic.StoreInt32(&child.deleted, 1) if w, ok := d.children[name]; ok { delete(d.children, name) - w.Drop() + w.Drop(ctx) } // Allow the file system to drop extra references on child. @@ -1067,7 +1067,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath // inode may have other links. If this was the last link, the events for the // watch removal will be queued by the inode destructor. child.Inode.Watches.MarkUnlinked() - child.Inode.Watches.Unpin(child) + child.Inode.Watches.Unpin(ctx, child) d.Inode.Watches.Notify(name, linux.IN_DELETE, 0) return nil @@ -1100,7 +1100,7 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) // Child does not exist. return err } - defer child.DecRef() + defer child.DecRef(ctx) // RemoveDirectory can only remove directories. if !IsDir(child.Inode.StableAttr) { @@ -1121,7 +1121,7 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) atomic.StoreInt32(&child.deleted, 1) if w, ok := d.children[name]; ok { delete(d.children, name) - w.Drop() + w.Drop(ctx) } // Allow the file system to drop extra references on child. @@ -1130,14 +1130,14 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) // Finally, let inotify know the child is being unlinked. Drop any extra // refs from inotify to this child dirent. 
child.Inode.Watches.MarkUnlinked() - child.Inode.Watches.Unpin(child) + child.Inode.Watches.Unpin(ctx, child) d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_DELETE, 0) return nil } // destroy closes this node and all children. -func (d *Dirent) destroy() { +func (d *Dirent) destroy(ctx context.Context) { if d.IsNegative() { // Nothing to tear-down and no parent references to drop, since a negative // Dirent does not take a references on its parent, has no Inode and no children. @@ -1153,19 +1153,19 @@ func (d *Dirent) destroy() { if c.(*Dirent).IsNegative() { // The parent holds both weak and strong refs in the case of // negative dirents. - c.DecRef() + c.DecRef(ctx) } // Drop the reference we just acquired in WeakRef.Get. - c.DecRef() + c.DecRef(ctx) } - w.Drop() + w.Drop(ctx) } d.children = nil allDirents.remove(d) // Drop our reference to the Inode. - d.Inode.DecRef() + d.Inode.DecRef(ctx) // Allow the Dirent to be GC'ed after this point, since the Inode may still // be referenced after the Dirent is destroyed (for instance by filesystem @@ -1175,7 +1175,7 @@ func (d *Dirent) destroy() { // Drop the reference we have on our parent if we took one. renameMu doesn't need to be // held because d can't be reparented without any references to it left. if d.parent != nil { - d.parent.DecRef() + d.parent.DecRef(ctx) } } @@ -1201,14 +1201,14 @@ func (d *Dirent) TryIncRef() bool { // DecRef decreases the Dirent's refcount and drops its reference on its mount. // // DecRef implements RefCounter.DecRef with destructor d.destroy. -func (d *Dirent) DecRef() { +func (d *Dirent) DecRef(ctx context.Context) { if d.Inode != nil { // Keep mount around, since DecRef may destroy d.Inode. 
msrc := d.Inode.MountSource - d.DecRefWithDestructor(d.destroy) + d.DecRefWithDestructor(ctx, d.destroy) msrc.DecDirentRefs() } else { - d.DecRefWithDestructor(d.destroy) + d.DecRefWithDestructor(ctx, d.destroy) } } @@ -1359,7 +1359,7 @@ func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error if err != nil { return err } - defer victim.DecRef() + defer victim.DecRef(ctx) return d.mayDelete(ctx, victim) } @@ -1411,7 +1411,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string if err != nil { return err } - defer renamed.DecRef() + defer renamed.DecRef(ctx) // Check that the renamed dirent is deletable. if err := oldParent.mayDelete(ctx, renamed); err != nil { @@ -1453,13 +1453,13 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // Check that we can delete replaced. if err := newParent.mayDelete(ctx, replaced); err != nil { - replaced.DecRef() + replaced.DecRef(ctx) return err } // Target should not be an ancestor of source. if oldParent.descendantOf(replaced) { - replaced.DecRef() + replaced.DecRef(ctx) // Note that Linux returns EINVAL if the source is an // ancestor of target, but ENOTEMPTY if the target is @@ -1470,7 +1470,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // Check that replaced is not a mount point. 
if replaced.isMountPointLocked() { - replaced.DecRef() + replaced.DecRef(ctx) return syscall.EBUSY } @@ -1478,11 +1478,11 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string oldIsDir := IsDir(renamed.Inode.StableAttr) newIsDir := IsDir(replaced.Inode.StableAttr) if !newIsDir && oldIsDir { - replaced.DecRef() + replaced.DecRef(ctx) return syscall.ENOTDIR } if !oldIsDir && newIsDir { - replaced.DecRef() + replaced.DecRef(ctx) return syscall.EISDIR } @@ -1493,13 +1493,13 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // open across renames is currently broken for multiple // reasons, so we flush all references on the replaced node and // its children. - replaced.Inode.Watches.Unpin(replaced) + replaced.Inode.Watches.Unpin(ctx, replaced) replaced.mu.Lock() - replaced.flush() + replaced.flush(ctx) replaced.mu.Unlock() // Done with replaced. - replaced.DecRef() + replaced.DecRef(ctx) } if err := renamed.Inode.Rename(ctx, oldParent, renamed, newParent, newName, replaced != nil); err != nil { @@ -1513,14 +1513,14 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // can't destroy oldParent (and try to retake its lock) because // Rename's caller must be holding a reference. newParent.IncRef() - oldParent.DecRef() + oldParent.DecRef(ctx) } if w, ok := newParent.children[newName]; ok { - w.Drop() + w.Drop(ctx) delete(newParent.children, newName) } if w, ok := oldParent.children[oldName]; ok { - w.Drop() + w.Drop(ctx) delete(oldParent.children, oldName) } @@ -1551,7 +1551,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // Same as replaced.flush above. 
renamed.mu.Lock() - renamed.flush() + renamed.flush(ctx) renamed.mu.Unlock() return nil diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index 33de32c69..7d9dd717e 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -17,6 +17,7 @@ package fs import ( "fmt" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) @@ -101,7 +102,7 @@ func (c *DirentCache) remove(d *Dirent) { panic(fmt.Sprintf("trying to remove %v, which is not in the dirent cache", d)) } c.list.Remove(d) - d.DecRef() + d.DecRef(context.Background()) c.currentSize-- if c.limit != nil { c.limit.dec() diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go index 98d69c6f2..176b894ba 100644 --- a/pkg/sentry/fs/dirent_refs_test.go +++ b/pkg/sentry/fs/dirent_refs_test.go @@ -51,7 +51,7 @@ func TestWalkPositive(t *testing.T) { t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 1) } - d.DecRef() + d.DecRef(ctx) if got := root.ReadRefs(); got != 1 { t.Fatalf("root has a ref count of %d, want %d", got, 1) @@ -61,7 +61,7 @@ func TestWalkPositive(t *testing.T) { t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 0) } - root.flush() + root.flush(ctx) if got := len(root.children); got != 0 { t.Fatalf("root has %d children, want %d", got, 0) @@ -114,7 +114,7 @@ func TestWalkNegative(t *testing.T) { t.Fatalf("child has a ref count of %d, want %d", got, 2) } - child.DecRef() + child.DecRef(ctx) if got := child.(*Dirent).ReadRefs(); got != 1 { t.Fatalf("child has a ref count of %d, want %d", got, 1) @@ -124,7 +124,7 @@ func TestWalkNegative(t *testing.T) { t.Fatalf("root has %d children, want %d", got, 1) } - root.DecRef() + root.DecRef(ctx) if got := root.ReadRefs(); got != 0 { t.Fatalf("root has a ref count of %d, want %d", got, 0) @@ -351,9 +351,9 @@ func TestRemoveExtraRefs(t *testing.T) { t.Fatalf("dirent has a ref count of %d, want %d", got, 1) } - d.DecRef() + d.DecRef(ctx) - 
test.root.flush() + test.root.flush(ctx) if got := len(test.root.children); got != 0 { t.Errorf("root has %d children, want %d", got, 0) @@ -403,8 +403,8 @@ func TestRenameExtraRefs(t *testing.T) { t.Fatalf("Rename got error %v, want nil", err) } - oldParent.flush() - newParent.flush() + oldParent.flush(ctx) + newParent.flush(ctx) // Expect to have only active references. if got := renamed.ReadRefs(); got != 1 { diff --git a/pkg/sentry/fs/dirent_state.go b/pkg/sentry/fs/dirent_state.go index f623d6c0e..67a35f0b2 100644 --- a/pkg/sentry/fs/dirent_state.go +++ b/pkg/sentry/fs/dirent_state.go @@ -18,6 +18,7 @@ import ( "fmt" "sync/atomic" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" ) @@ -48,7 +49,7 @@ func (d *Dirent) saveChildren() map[string]*Dirent { for name, w := range d.children { if rc := w.Get(); rc != nil { // Drop the reference count obtain in w.Get() - rc.DecRef() + rc.DecRef(context.Background()) cd := rc.(*Dirent) if cd.IsNegative() { diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 9fce177ad..b99199798 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -115,7 +115,7 @@ func (p *pipeOperations) Readiness(mask waiter.EventMask) (eventMask waiter.Even } // Release implements fs.FileOperations.Release. -func (p *pipeOperations) Release() { +func (p *pipeOperations) Release(context.Context) { fdnotifier.RemoveFD(int32(p.file.FD())) p.file.Close() p.file = nil diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go index e556da48a..b9cec4b13 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go @@ -182,7 +182,7 @@ func TestTryOpen(t *testing.T) { // Cleanup the state of the pipe, and remove the fd from the // fdnotifier. Sadly this needed to maintain the correctness // of other tests because the fdnotifier is global. 
- pipeOps.Release() + pipeOps.Release(ctx) } continue } @@ -191,7 +191,7 @@ func TestTryOpen(t *testing.T) { } if pipeOps != nil { // Same as above. - pipeOps.Release() + pipeOps.Release(ctx) } } } @@ -279,7 +279,7 @@ func TestPipeOpenUnblocksEventually(t *testing.T) { pipeOps, err := Open(ctx, opener, flags) if pipeOps != nil { // Same as TestTryOpen. - pipeOps.Release() + pipeOps.Release(ctx) } // Check that the partner opened the file successfully. @@ -325,7 +325,7 @@ func TestCopiedReadAheadBuffer(t *testing.T) { ctx := contexttest.Context(t) pipeOps, err := pipeOpenState.TryOpen(ctx, opener, fs.FileFlags{Read: true}) if pipeOps != nil { - pipeOps.Release() + pipeOps.Release(ctx) t.Fatalf("open(%s, %o) got file, want nil", name, syscall.O_RDONLY) } if err != syserror.ErrWouldBlock { @@ -351,7 +351,7 @@ func TestCopiedReadAheadBuffer(t *testing.T) { if pipeOps == nil { t.Fatalf("open(%s, %o) got nil file, want not nil", name, syscall.O_RDONLY) } - defer pipeOps.Release() + defer pipeOps.Release(ctx) if err != nil { t.Fatalf("open(%s, %o) got error %v, want nil", name, syscall.O_RDONLY, err) @@ -471,14 +471,14 @@ func TestPipeHangup(t *testing.T) { f := <-fdchan if f < 0 { t.Errorf("%s: partner routine got fd %d, want > 0", test.desc, f) - pipeOps.Release() + pipeOps.Release(ctx) continue } if test.hangupSelf { // Hangup self and assert that our partner got the expected hangup // error. - pipeOps.Release() + pipeOps.Release(ctx) if test.flags.Read { // Partner is writer. @@ -490,7 +490,7 @@ func TestPipeHangup(t *testing.T) { } else { // Hangup our partner and expect us to get the hangup error. 
syscall.Close(f) - defer pipeOps.Release() + defer pipeOps.Release(ctx) if test.flags.Read { assertReaderHungup(t, test.desc, pipeOps.(*pipeOperations).file) diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go index a0082ecca..1c9e82562 100644 --- a/pkg/sentry/fs/fdpipe/pipe_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -98,10 +98,11 @@ func TestNewPipe(t *testing.T) { } f := fd.New(gfd) - p, err := newPipeOperations(contexttest.Context(t), nil, test.flags, f, test.readAheadBuffer) + ctx := contexttest.Context(t) + p, err := newPipeOperations(ctx, nil, test.flags, f, test.readAheadBuffer) if p != nil { // This is necessary to remove the fd from the global fd notifier. - defer p.Release() + defer p.Release(ctx) } else { // If there is no p to DecRef on, because newPipeOperations failed, then the // file still needs to be closed. @@ -153,13 +154,14 @@ func TestPipeDestruction(t *testing.T) { syscall.Close(fds[1]) // Test the read end, but it doesn't really matter which. - p, err := newPipeOperations(contexttest.Context(t), nil, fs.FileFlags{Read: true}, f, nil) + ctx := contexttest.Context(t) + p, err := newPipeOperations(ctx, nil, fs.FileFlags{Read: true}, f, nil) if err != nil { f.Close() t.Fatalf("newPipeOperations got error %v, want nil", err) } // Drop our only reference, which should trigger the destructor. 
- p.Release() + p.Release(ctx) if fdnotifier.HasFD(int32(fds[0])) { t.Fatalf("after DecRef fdnotifier has fd %d, want no longer registered", fds[0]) @@ -282,7 +284,7 @@ func TestPipeRequest(t *testing.T) { if err != nil { t.Fatalf("%s: newPipeOperations got error %v, want nil", test.desc, err) } - defer p.Release() + defer p.Release(ctx) inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe}) file := fs.NewFile(ctx, fs.NewDirent(ctx, inode, "pipe"), fs.FileFlags{Read: true}, p) @@ -334,7 +336,7 @@ func TestPipeReadAheadBuffer(t *testing.T) { rfile.Close() t.Fatalf("newPipeOperations got error %v, want nil", err) } - defer p.Release() + defer p.Release(ctx) inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ Type: fs.Pipe, @@ -380,7 +382,7 @@ func TestPipeReadsAccumulate(t *testing.T) { } // Don't forget to remove the fd from the fd notifier. Otherwise other tests will // likely be borked, because it's global :( - defer p.Release() + defer p.Release(ctx) inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ Type: fs.Pipe, @@ -448,7 +450,7 @@ func TestPipeWritesAccumulate(t *testing.T) { } // Don't forget to remove the fd from the fd notifier. Otherwise other tests // will likely be borked, because it's global :( - defer p.Release() + defer p.Release(ctx) inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ Type: fs.Pipe, diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index ca41520b4..72ea70fcf 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -142,17 +142,17 @@ func NewFile(ctx context.Context, dirent *Dirent, flags FileFlags, fops FileOper } // DecRef destroys the File when it is no longer referenced. -func (f *File) DecRef() { - f.DecRefWithDestructor(func() { +func (f *File) DecRef(ctx context.Context) { + f.DecRefWithDestructor(ctx, func(context.Context) { // Drop BSD style locks. 
lockRng := lock.LockRange{Start: 0, End: lock.LockEOF} f.Dirent.Inode.LockCtx.BSD.UnlockRegion(f, lockRng) // Release resources held by the FileOperations. - f.FileOperations.Release() + f.FileOperations.Release(ctx) // Release a reference on the Dirent. - f.Dirent.DecRef() + f.Dirent.DecRef(ctx) // Only unregister if we are currently registered. There is nothing // to register if f.async is nil (this happens when async mode is @@ -460,7 +460,7 @@ func (f *File) UnstableAttr(ctx context.Context) (UnstableAttr, error) { func (f *File) MappedName(ctx context.Context) string { root := RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } name, _ := f.Dirent.FullName(root) return name diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index f5537411e..305c0f840 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -67,7 +67,7 @@ type SpliceOpts struct { // - File.Flags(): This value may change during the operation. type FileOperations interface { // Release release resources held by FileOperations. - Release() + Release(ctx context.Context) // Waitable defines how this File can be waited on for read and // write readiness. diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index dcc1df38f..9dc58d5ff 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -54,7 +54,7 @@ func overlayFile(ctx context.Context, inode *Inode, flags FileFlags) (*File, err // Drop the extra reference on the Dirent. Now there's only one reference // on the dirent, either owned by f (if non-nil), or the Dirent is about // to be destroyed (if GetFile failed). - dirent.DecRef() + dirent.DecRef(ctx) return f, err } @@ -89,12 +89,12 @@ type overlayFileOperations struct { } // Release implements FileOperations.Release. 
-func (f *overlayFileOperations) Release() { +func (f *overlayFileOperations) Release(ctx context.Context) { if f.upper != nil { - f.upper.DecRef() + f.upper.DecRef(ctx) } if f.lower != nil { - f.lower.DecRef() + f.lower.DecRef(ctx) } } @@ -164,7 +164,7 @@ func (f *overlayFileOperations) Seek(ctx context.Context, file *File, whence See func (f *overlayFileOperations) Readdir(ctx context.Context, file *File, serializer DentrySerializer) (int64, error) { root := RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &DirCtx{ @@ -497,7 +497,7 @@ func readdirOne(ctx context.Context, d *Dirent) (map[string]DentAttr, error) { if err != nil { return nil, err } - defer dir.DecRef() + defer dir.DecRef(ctx) // Use a stub serializer to read the entries into memory. stubSerializer := &CollectEntriesSerializer{} @@ -521,10 +521,10 @@ type overlayMappingIdentity struct { } // DecRef implements AtomicRefCount.DecRef. -func (omi *overlayMappingIdentity) DecRef() { - omi.AtomicRefCount.DecRefWithDestructor(func() { - omi.overlayFile.DecRef() - omi.id.DecRef() +func (omi *overlayMappingIdentity) DecRef(ctx context.Context) { + omi.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { + omi.overlayFile.DecRef(ctx) + omi.id.DecRef(ctx) }) } @@ -544,7 +544,7 @@ func (omi *overlayMappingIdentity) InodeID() uint64 { func (omi *overlayMappingIdentity) MappedName(ctx context.Context) string { root := RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } name, _ := omi.overlayFile.Dirent.FullName(root) return name diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index 08695391c..dc9efa5df 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -31,7 +31,7 @@ import ( type FileNoopRelease struct{} // Release is a no-op. 
-func (FileNoopRelease) Release() {} +func (FileNoopRelease) Release(context.Context) {} // SeekWithDirCursor is used to implement fs.FileOperations.Seek. If dirCursor // is not nil and the seek was on a directory, the cursor will be updated. @@ -296,7 +296,7 @@ func (sdfo *StaticDirFileOperations) IterateDir(ctx context.Context, d *fs.Diren func (sdfo *StaticDirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ Serializer: serializer, diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index b2fcab127..c0bc63a32 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -114,7 +114,7 @@ func NewFile(ctx context.Context, dirent *fs.Dirent, name string, flags fs.FileF } // Release implements fs.FileOpeations.Release. -func (f *fileOperations) Release() { +func (f *fileOperations) Release(context.Context) { f.handles.DecRef() } @@ -122,7 +122,7 @@ func (f *fileOperations) Release() { func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 2df2fe889..326fed954 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -232,7 +232,7 @@ func TestRevalidation(t *testing.T) { // We must release the dirent, of the test will fail // with a reference leak. This is tracked by p9test. - defer dirent.DecRef() + defer dirent.DecRef(ctx) // Walk again. Depending on the cache policy, we may // get a new dirent. 
@@ -246,7 +246,7 @@ func TestRevalidation(t *testing.T) { if !test.preModificationWantReload && dirent != newDirent { t.Errorf("Lookup with cachePolicy=%s got new dirent %+v, wanted old dirent %+v", test.cachePolicy, newDirent, dirent) } - newDirent.DecRef() // See above. + newDirent.DecRef(ctx) // See above. // Modify the underlying mocked file's modification // time for the next walk that occurs. @@ -287,7 +287,7 @@ func TestRevalidation(t *testing.T) { if test.postModificationWantUpdatedAttrs && gotModTimeSeconds != nowSeconds { t.Fatalf("Lookup with cachePolicy=%s got new modification time %v, wanted %v", test.cachePolicy, gotModTimeSeconds, nowSeconds) } - newDirent.DecRef() // See above. + newDirent.DecRef(ctx) // See above. // Remove the file from the remote fs, subsequent walks // should now fail to find anything. @@ -303,7 +303,7 @@ func TestRevalidation(t *testing.T) { t.Errorf("Lookup with cachePolicy=%s got new dirent and error %v, wanted old dirent and nil error", test.cachePolicy, err) } if err == nil { - newDirent.DecRef() // See above. + newDirent.DecRef(ctx) // See above. } }) } diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index fc14249be..f324dbf26 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -47,7 +47,8 @@ type handles struct { // DecRef drops a reference on handles. 
func (h *handles) DecRef() { - h.DecRefWithDestructor(func() { + ctx := context.Background() + h.DecRefWithDestructor(ctx, func(context.Context) { if h.Host != nil { if h.isHostBorrowed { h.Host.Release() @@ -57,7 +58,7 @@ func (h *handles) DecRef() { } } } - if err := h.File.close(context.Background()); err != nil { + if err := h.File.close(ctx); err != nil { log.Warningf("error closing p9 file: %v", err) } }) diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 51d7368a1..3a225fd39 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -441,8 +441,9 @@ func (i *inodeOperations) Release(ctx context.Context) { // asynchronously. // // We use AsyncWithContext to avoid needing to allocate an extra - // anonymous function on the heap. - fs.AsyncWithContext(ctx, i.fileState.Release) + // anonymous function on the heap. We must use background context + // because the async work cannot happen on the task context. + fs.AsyncWithContext(context.Background(), i.fileState.Release) } // Mappable implements fs.InodeOperations.Mappable. diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index cf9800100..3c66dc3c2 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -168,7 +168,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string // Construct the positive Dirent. d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) - defer d.DecRef() + defer d.DecRef(ctx) // Construct the new file, caching the handles if allowed. h := handles{ @@ -371,7 +371,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string // Find out if file being deleted is a socket or pipe that needs to be // removed from endpoint map. 
if d, err := i.Lookup(ctx, dir, name); err == nil { - defer d.DecRef() + defer d.DecRef(ctx) if fs.IsSocket(d.Inode.StableAttr) || fs.IsPipe(d.Inode.StableAttr) { switch iops := d.Inode.InodeOperations.(type) { @@ -392,7 +392,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string return err } if key != nil { - i.session().overrides.remove(*key) + i.session().overrides.remove(ctx, *key) } i.touchModificationAndStatusChangeTime(ctx, dir) diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index b5efc86f2..7cf3522ff 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -89,10 +89,10 @@ func (e *overrideMaps) addPipe(key device.MultiDeviceKey, d *fs.Dirent, inode *f // remove deletes the key from the maps. // // Precondition: maps must have been locked with 'lock'. -func (e *overrideMaps) remove(key device.MultiDeviceKey) { +func (e *overrideMaps) remove(ctx context.Context, key device.MultiDeviceKey) { endpoint := e.keyMap[key] delete(e.keyMap, key) - endpoint.dirent.DecRef() + endpoint.dirent.DecRef(ctx) } // lock blocks other addition and removal operations from happening while @@ -197,7 +197,7 @@ type session struct { } // Destroy tears down the session. -func (s *session) Destroy() { +func (s *session) Destroy(ctx context.Context) { s.client.Close() } @@ -329,7 +329,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF s.client, err = p9.NewClient(conn, s.msize, s.version) if err != nil { // Drop our reference on the session, it needs to be torn down. - s.DecRef() + s.DecRef(ctx) return nil, err } @@ -340,7 +340,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF ctx.UninterruptibleSleepFinish(false) if err != nil { // Same as above. 
- s.DecRef() + s.DecRef(ctx) return nil, err } @@ -348,7 +348,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF if err != nil { s.attach.close(ctx) // Same as above, but after we execute the Close request. - s.DecRef() + s.DecRef(ctx) return nil, err } @@ -393,13 +393,13 @@ func (s *session) fillKeyMap(ctx context.Context) error { // fillPathMap populates paths for overrides from dirents in direntMap // before save. -func (s *session) fillPathMap() error { +func (s *session) fillPathMap(ctx context.Context) error { unlock := s.overrides.lock() defer unlock() for _, endpoint := range s.overrides.keyMap { mountRoot := endpoint.dirent.MountRoot() - defer mountRoot.DecRef() + defer mountRoot.DecRef(ctx) dirPath, _ := endpoint.dirent.FullName(mountRoot) if dirPath == "" { return fmt.Errorf("error getting path from dirent") diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 2d398b753..48b423dd8 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -26,7 +26,8 @@ import ( // beforeSave is invoked by stateify. func (s *session) beforeSave() { if s.overrides != nil { - if err := s.fillPathMap(); err != nil { + ctx := &dummyClockContext{context.Background()} + if err := s.fillPathMap(ctx); err != nil { panic("failed to save paths to override map before saving" + err.Error()) } } diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 40f2c1cad..8a1c69ac2 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -134,14 +134,14 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect // We don't need the receiver. c.CloseRecv() - c.Release() + c.Release(ctx) return c, nil } // Release implements transport.BoundEndpoint.Release. 
-func (e *endpoint) Release() { - e.inode.DecRef() +func (e *endpoint) Release(ctx context.Context) { + e.inode.DecRef(ctx) } // Passcred implements transport.BoundEndpoint.Passcred. diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index 39299b7e4..0d8d36afa 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -57,7 +57,7 @@ func (c *scmRights) Clone() transport.RightsControlMessage { } // Release implements transport.RightsControlMessage.Release. -func (c *scmRights) Release() { +func (c *scmRights) Release(ctx context.Context) { for _, fd := range c.fds { syscall.Close(fd) } diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 3e48b8b2c..86d1a87f0 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -110,7 +110,7 @@ func newFileFromDonatedFD(ctx context.Context, donated int, saveable, isTTY bool name := fmt.Sprintf("host:[%d]", inode.StableAttr.InodeID) dirent := fs.NewDirent(ctx, inode, name) - defer dirent.DecRef() + defer dirent.DecRef(ctx) if isTTY { return newTTYFile(ctx, dirent, flags, iops), nil @@ -169,7 +169,7 @@ func (f *fileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ Serializer: serializer, diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go index c507f57eb..41a23b5da 100644 --- a/pkg/sentry/fs/host/inode_test.go +++ b/pkg/sentry/fs/host/inode_test.go @@ -36,7 +36,7 @@ func TestCloseFD(t *testing.T) { if err != nil { t.Fatalf("Failed to create File: %v", err) } - file.DecRef() + file.DecRef(ctx) s := make([]byte, 10) if c, err := syscall.Read(p[0], s); c != 0 || err != nil { diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index cfb089e43..a2f3d5918 
100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -194,7 +194,7 @@ func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error) } // Send implements transport.ConnectedEndpoint.Send. -func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { +func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() @@ -271,7 +271,7 @@ func (c *ConnectedEndpoint) EventUpdate() { } // Recv implements transport.Receiver.Recv. -func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { +func (c *ConnectedEndpoint) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() @@ -318,7 +318,7 @@ func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek } // close releases all resources related to the endpoint. -func (c *ConnectedEndpoint) close() { +func (c *ConnectedEndpoint) close(context.Context) { fdnotifier.RemoveFD(int32(c.file.FD())) c.file.Close() c.file = nil @@ -374,8 +374,8 @@ func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 { } // Release implements transport.ConnectedEndpoint.Release and transport.Receiver.Release. -func (c *ConnectedEndpoint) Release() { - c.ref.DecRefWithDestructor(c.close) +func (c *ConnectedEndpoint) Release(ctx context.Context) { + c.ref.DecRefWithDestructor(ctx, c.close) } // CloseUnread implements transport.ConnectedEndpoint.CloseUnread. 
diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index affdbcacb..9d58ea448 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -67,11 +67,12 @@ func TestSocketIsBlocking(t *testing.T) { if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK { t.Fatalf("Expected socket %v to be blocking", pair[1]) } - sock, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + sock, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) failed => %v", pair[0], err) } - defer sock.DecRef() + defer sock.DecRef(ctx) // Test that the socket now is non-blocking. if fl, err = getFl(pair[0]); err != nil { t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[0], err) @@ -93,11 +94,12 @@ func TestSocketWritev(t *testing.T) { if err != nil { t.Fatalf("host socket creation failed: %v", err) } - socket, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + socket, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[0], err) } - defer socket.DecRef() + defer socket.DecRef(ctx) buf := []byte("hello world\n") n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(buf)) if err != nil { @@ -115,11 +117,12 @@ func TestSocketWritevLen0(t *testing.T) { if err != nil { t.Fatalf("host socket creation failed: %v", err) } - socket, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + socket, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[0], err) } - defer socket.DecRef() + defer socket.DecRef(ctx) n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(nil)) if err != nil { t.Fatalf("socket writev failed: %v", err) @@ -136,11 +139,12 @@ func TestSocketSendMsgLen0(t *testing.T) { if err != nil { t.Fatalf("host socket creation failed: %v", err) } - sfile, err := newSocket(contexttest.Context(t), 
pair[0], false) + ctx := contexttest.Context(t) + sfile, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[0], err) } - defer sfile.DecRef() + defer sfile.DecRef(ctx) s := sfile.FileOperations.(socket.Socket) n, terr := s.SendMsg(nil, usermem.BytesIOSequence(nil), []byte{}, 0, false, ktime.Time{}, socket.ControlMessages{}) @@ -158,18 +162,19 @@ func TestListen(t *testing.T) { if err != nil { t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err) } - sfile1, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + sfile1, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[0], err) } - defer sfile1.DecRef() + defer sfile1.DecRef(ctx) socket1 := sfile1.FileOperations.(socket.Socket) - sfile2, err := newSocket(contexttest.Context(t), pair[1], false) + sfile2, err := newSocket(ctx, pair[1], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[1], err) } - defer sfile2.DecRef() + defer sfile2.DecRef(ctx) socket2 := sfile2.FileOperations.(socket.Socket) // Socketpairs can not be listened to. @@ -185,11 +190,11 @@ func TestListen(t *testing.T) { if err != nil { t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err) } - sfile3, err := newSocket(contexttest.Context(t), sock, false) + sfile3, err := newSocket(ctx, sock, false) if err != nil { t.Fatalf("newSocket(%v) => %v", sock, err) } - defer sfile3.DecRef() + defer sfile3.DecRef(ctx) socket3 := sfile3.FileOperations.(socket.Socket) // This socket is not bound so we can't listen on it. 
@@ -237,9 +242,10 @@ func TestRelease(t *testing.T) { } c := &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} want := &ConnectedEndpoint{queue: c.queue} - want.ref.DecRef() + ctx := contexttest.Context(t) + want.ref.DecRef(ctx) fdnotifier.AddFD(int32(c.file.FD()), nil) - c.Release() + c.Release(ctx) if !reflect.DeepEqual(c, want) { t.Errorf("got = %#v, want = %#v", c, want) } diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index 82a02fcb2..b5229098c 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -113,12 +113,12 @@ func (t *TTYFileOperations) Write(ctx context.Context, file *fs.File, src userme } // Release implements fs.FileOperations.Release. -func (t *TTYFileOperations) Release() { +func (t *TTYFileOperations) Release(ctx context.Context) { t.mu.Lock() t.fgProcessGroup = nil t.mu.Unlock() - t.fileOperations.Release() + t.fileOperations.Release(ctx) } // Ioctl implements fs.FileOperations.Ioctl. diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go index ce397a5e3..c143f4ce2 100644 --- a/pkg/sentry/fs/host/wait_test.go +++ b/pkg/sentry/fs/host/wait_test.go @@ -39,7 +39,7 @@ func TestWait(t *testing.T) { t.Fatalf("NewFile failed: %v", err) } - defer file.DecRef() + defer file.DecRef(ctx) r := file.Readiness(waiter.EventIn) if r != 0 { diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index a34fbc946..b79cd9877 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -96,13 +96,12 @@ func NewInode(ctx context.Context, iops InodeOperations, msrc *MountSource, satt } // DecRef drops a reference on the Inode. -func (i *Inode) DecRef() { - i.DecRefWithDestructor(i.destroy) +func (i *Inode) DecRef(ctx context.Context) { + i.DecRefWithDestructor(ctx, i.destroy) } // destroy releases the Inode and releases the msrc reference taken. 
-func (i *Inode) destroy() { - ctx := context.Background() +func (i *Inode) destroy(ctx context.Context) { if err := i.WriteOut(ctx); err != nil { // FIXME(b/65209558): Mark as warning again once noatime is // properly supported. @@ -122,12 +121,12 @@ func (i *Inode) destroy() { i.Watches.targetDestroyed() if i.overlay != nil { - i.overlay.release() + i.overlay.release(ctx) } else { i.InodeOperations.Release(ctx) } - i.MountSource.DecRef() + i.MountSource.DecRef(ctx) } // Mappable calls i.InodeOperations.Mappable. diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go index efd3c962b..9911a00c2 100644 --- a/pkg/sentry/fs/inode_inotify.go +++ b/pkg/sentry/fs/inode_inotify.go @@ -17,6 +17,7 @@ package fs import ( "fmt" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) @@ -136,11 +137,11 @@ func (w *Watches) Notify(name string, events, cookie uint32) { } // Unpin unpins dirent from all watches in this set. -func (w *Watches) Unpin(d *Dirent) { +func (w *Watches) Unpin(ctx context.Context, d *Dirent) { w.mu.RLock() defer w.mu.RUnlock() for _, watch := range w.ws { - watch.Unpin(d) + watch.Unpin(ctx, d) } } diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 537c8d257..dc2e353d9 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -85,7 +85,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name upperInode = child.Inode upperInode.IncRef() } - child.DecRef() + child.DecRef(ctx) } // Are we done? @@ -108,7 +108,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name entry, err := newOverlayEntry(ctx, upperInode, nil, false) if err != nil { // Don't leak resources. 
- upperInode.DecRef() + upperInode.DecRef(ctx) parent.copyMu.RUnlock() return nil, false, err } @@ -129,7 +129,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name if err != nil && err != syserror.ENOENT { // Don't leak resources. if upperInode != nil { - upperInode.DecRef() + upperInode.DecRef(ctx) } parent.copyMu.RUnlock() return nil, false, err @@ -152,7 +152,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name } } } - child.DecRef() + child.DecRef(ctx) } } @@ -183,7 +183,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name // unnecessary because we don't need to copy-up and we will always // operate (e.g. read/write) on the upper Inode. if !IsDir(upperInode.StableAttr) { - lowerInode.DecRef() + lowerInode.DecRef(ctx) lowerInode = nil } } @@ -194,10 +194,10 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name // Well, not quite, we failed at the last moment, how depressing. // Be sure not to leak resources. if upperInode != nil { - upperInode.DecRef() + upperInode.DecRef(ctx) } if lowerInode != nil { - lowerInode.DecRef() + lowerInode.DecRef(ctx) } parent.copyMu.RUnlock() return nil, false, err @@ -248,7 +248,7 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st // user) will clobber the real path for the underlying Inode. upperFile.Dirent.Inode.IncRef() upperDirent := NewTransientDirent(upperFile.Dirent.Inode) - upperFile.Dirent.DecRef() + upperFile.Dirent.DecRef(ctx) upperFile.Dirent = upperDirent // Create the overlay inode and dirent. We need this to construct the @@ -259,7 +259,7 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st // The overlay file created below with NewFile will take a reference on // the overlayDirent, and it should be the only thing holding a // reference at the time of creation, so we must drop this reference. 
- defer overlayDirent.DecRef() + defer overlayDirent.DecRef(ctx) // Create a new overlay file that wraps the upper file. flags.Pread = upperFile.Flags().Pread @@ -399,7 +399,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena if !replaced.IsNegative() && IsDir(replaced.Inode.StableAttr) { children, err := readdirOne(ctx, replaced) if err != nil { - replaced.DecRef() + replaced.DecRef(ctx) return err } @@ -407,12 +407,12 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena // included among the returned children, so we don't // need to bother checking for them. if len(children) > 0 { - replaced.DecRef() + replaced.DecRef(ctx) return syserror.ENOTEMPTY } } - replaced.DecRef() + replaced.DecRef(ctx) } } @@ -455,12 +455,12 @@ func overlayBind(ctx context.Context, o *overlayEntry, parent *Dirent, name stri // Grab the inode and drop the dirent, we don't need it. inode := d.Inode inode.IncRef() - d.DecRef() + d.DecRef(ctx) // Create a new overlay entry and dirent for the socket. entry, err := newOverlayEntry(ctx, inode, nil, false) if err != nil { - inode.DecRef() + inode.DecRef(ctx) return nil, err } // Use the parent's MountSource, since that corresponds to the overlay, @@ -672,7 +672,7 @@ func overlayGetlink(ctx context.Context, o *overlayEntry) (*Dirent, error) { // ground and claim that jumping around the filesystem like this // is not supported. name, _ := dirent.FullName(nil) - dirent.DecRef() + dirent.DecRef(ctx) // Claim that the path is not accessible. 
err = syserror.EACCES diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index 389c219d6..aa9851b26 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -316,7 +316,7 @@ func TestCacheFlush(t *testing.T) { t.Fatalf("NewMountNamespace failed: %v", err) } root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) ctx = &rootContext{ Context: ctx, @@ -345,7 +345,7 @@ func TestCacheFlush(t *testing.T) { } // Drop the file reference. - file.DecRef() + file.DecRef(ctx) // Dirent should have 2 refs left. if got, want := dirent.ReadRefs(), 2; int(got) != want { @@ -361,7 +361,7 @@ func TestCacheFlush(t *testing.T) { } // Drop our ref. - dirent.DecRef() + dirent.DecRef(ctx) // We should be back to zero refs. if got, want := dirent.ReadRefs(), 0; int(got) != want { @@ -398,7 +398,7 @@ func (d *dir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags if err != nil { return nil, err } - defer file.DecRef() + defer file.DecRef(ctx) // Wrap the file's FileOperations in a dirFile. fops := &dirFile{ FileOperations: file.FileOperations, diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index e3a715c1f..c5c07d564 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -80,7 +80,7 @@ func NewInotify(ctx context.Context) *Inotify { // Release implements FileOperations.Release. Release removes all watches and // frees all resources for an inotify instance. -func (i *Inotify) Release() { +func (i *Inotify) Release(ctx context.Context) { // We need to hold i.mu to avoid a race with concurrent calls to // Inotify.targetDestroyed from Watches. There's no risk of Watches // accessing this Inotify after the destructor ends, because we remove all @@ -93,7 +93,7 @@ func (i *Inotify) Release() { // the owner's destructor. w.target.Watches.Remove(w.ID()) // Don't leak any references to the target, held by pins in the watch. 
- w.destroy() + w.destroy(ctx) } } @@ -321,7 +321,7 @@ func (i *Inotify) AddWatch(target *Dirent, mask uint32) int32 { // // RmWatch looks up an inotify watch for the given 'wd' and configures the // target dirent to stop sending events to this inotify instance. -func (i *Inotify) RmWatch(wd int32) error { +func (i *Inotify) RmWatch(ctx context.Context, wd int32) error { i.mu.Lock() // Find the watch we were asked to remove. @@ -346,7 +346,7 @@ func (i *Inotify) RmWatch(wd int32) error { i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0)) // Remove all pins. - watch.destroy() + watch.destroy(ctx) return nil } diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go index 900cba3ca..605423d22 100644 --- a/pkg/sentry/fs/inotify_watch.go +++ b/pkg/sentry/fs/inotify_watch.go @@ -18,6 +18,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) @@ -105,12 +106,12 @@ func (w *Watch) Pin(d *Dirent) { // Unpin drops any extra refs held on dirent due to a previous Pin // call. Calling Unpin multiple times for the same dirent, or on a dirent // without a corresponding Pin call is a no-op. -func (w *Watch) Unpin(d *Dirent) { +func (w *Watch) Unpin(ctx context.Context, d *Dirent) { w.mu.Lock() defer w.mu.Unlock() if w.pins[d] { delete(w.pins, d) - d.DecRef() + d.DecRef(ctx) } } @@ -125,11 +126,11 @@ func (w *Watch) TargetDestroyed() { // this watch. Destroy does not cause any new events to be generated. The caller // is responsible for ensuring there are no outstanding references to this // watch. 
-func (w *Watch) destroy() { +func (w *Watch) destroy(ctx context.Context) { w.mu.Lock() defer w.mu.Unlock() for d := range w.pins { - d.DecRef() + d.DecRef(ctx) } w.pins = nil } diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 37bae6810..ee69b10e8 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -51,7 +51,7 @@ type MountSourceOperations interface { DirentOperations // Destroy destroys the MountSource. - Destroy() + Destroy(ctx context.Context) // Below are MountSourceOperations that do not conform to Linux. @@ -165,16 +165,16 @@ func (msrc *MountSource) DecDirentRefs() { } } -func (msrc *MountSource) destroy() { +func (msrc *MountSource) destroy(ctx context.Context) { if c := msrc.DirentRefs(); c != 0 { panic(fmt.Sprintf("MountSource with non-zero direntRefs is being destroyed: %d", c)) } - msrc.MountSourceOperations.Destroy() + msrc.MountSourceOperations.Destroy(ctx) } // DecRef drops a reference on the MountSource. -func (msrc *MountSource) DecRef() { - msrc.DecRefWithDestructor(msrc.destroy) +func (msrc *MountSource) DecRef(ctx context.Context) { + msrc.DecRefWithDestructor(ctx, msrc.destroy) } // FlushDirentRefs drops all references held by the MountSource on Dirents. @@ -264,7 +264,7 @@ func (*SimpleMountSourceOperations) ResetInodeMappings() {} func (*SimpleMountSourceOperations) SaveInodeMapping(*Inode, string) {} // Destroy implements MountSourceOperations.Destroy. -func (*SimpleMountSourceOperations) Destroy() {} +func (*SimpleMountSourceOperations) Destroy(context.Context) {} // Info defines attributes of a filesystem. type Info struct { diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index 78e35b1e6..7badc75d6 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -115,9 +115,9 @@ func (o *overlayMountSourceOperations) SaveInodeMapping(inode *Inode, path strin } // Destroy drops references on the upper and lower MountSource. 
-func (o *overlayMountSourceOperations) Destroy() { - o.upper.DecRef() - o.lower.DecRef() +func (o *overlayMountSourceOperations) Destroy(ctx context.Context) { + o.upper.DecRef(ctx) + o.lower.DecRef(ctx) } // type overlayFilesystem is the filesystem for overlay mounts. diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index a3d10770b..6c296f5d0 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -18,6 +18,7 @@ import ( "fmt" "testing" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/contexttest" ) @@ -32,13 +33,13 @@ func cacheReallyContains(cache *DirentCache, d *Dirent) bool { return false } -func mountPathsAre(root *Dirent, got []*Mount, want ...string) error { +func mountPathsAre(ctx context.Context, root *Dirent, got []*Mount, want ...string) error { gotPaths := make(map[string]struct{}, len(got)) gotStr := make([]string, len(got)) for i, g := range got { if groot := g.Root(); groot != nil { name, _ := groot.FullName(root) - groot.DecRef() + groot.DecRef(ctx) gotStr[i] = name gotPaths[name] = struct{}{} } @@ -69,7 +70,7 @@ func TestMountSourceOnlyCachedOnce(t *testing.T) { t.Fatalf("NewMountNamespace failed: %v", err) } rootDirent := mm.Root() - defer rootDirent.DecRef() + defer rootDirent.DecRef(ctx) // Get a child of the root which we will mount over. Note that the // MockInodeOperations causes Walk to always succeed. @@ -125,7 +126,7 @@ func TestAllMountsUnder(t *testing.T) { t.Fatalf("NewMountNamespace failed: %v", err) } rootDirent := mm.Root() - defer rootDirent.DecRef() + defer rootDirent.DecRef(ctx) // Add mounts at the following paths: paths := []string{ @@ -150,14 +151,14 @@ func TestAllMountsUnder(t *testing.T) { if err := mm.Mount(ctx, d, submountInode); err != nil { t.Fatalf("could not mount at %q: %v", p, err) } - d.DecRef() + d.DecRef(ctx) } // mm root should contain all submounts (and does not include the root mount). 
rootMnt := mm.FindMount(rootDirent) submounts := mm.AllMountsUnder(rootMnt) allPaths := append(paths, "/") - if err := mountPathsAre(rootDirent, submounts, allPaths...); err != nil { + if err := mountPathsAre(ctx, rootDirent, submounts, allPaths...); err != nil { t.Error(err) } @@ -181,9 +182,9 @@ func TestAllMountsUnder(t *testing.T) { if err != nil { t.Fatalf("could not find path %q in mount manager: %v", "/foo", err) } - defer d.DecRef() + defer d.DecRef(ctx) submounts = mm.AllMountsUnder(mm.FindMount(d)) - if err := mountPathsAre(rootDirent, submounts, "/foo", "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil { + if err := mountPathsAre(ctx, rootDirent, submounts, "/foo", "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil { t.Error(err) } @@ -193,9 +194,9 @@ func TestAllMountsUnder(t *testing.T) { if err != nil { t.Fatalf("could not find path %q in mount manager: %v", "/waldo", err) } - defer waldo.DecRef() + defer waldo.DecRef(ctx) submounts = mm.AllMountsUnder(mm.FindMount(waldo)) - if err := mountPathsAre(rootDirent, submounts, "/waldo"); err != nil { + if err := mountPathsAre(ctx, rootDirent, submounts, "/waldo"); err != nil { t.Error(err) } } @@ -212,7 +213,7 @@ func TestUnmount(t *testing.T) { t.Fatalf("NewMountNamespace failed: %v", err) } rootDirent := mm.Root() - defer rootDirent.DecRef() + defer rootDirent.DecRef(ctx) // Add mounts at the following paths: paths := []string{ @@ -240,7 +241,7 @@ func TestUnmount(t *testing.T) { if err := mm.Mount(ctx, d, submountInode); err != nil { t.Fatalf("could not mount at %q: %v", p, err) } - d.DecRef() + d.DecRef(ctx) } allPaths := make([]string, len(paths)+1) @@ -259,13 +260,13 @@ func TestUnmount(t *testing.T) { if err := mm.Unmount(ctx, d, false); err != nil { t.Fatalf("could not unmount at %q: %v", p, err) } - d.DecRef() + d.DecRef(ctx) // Remove the path that has been unmounted and then check that the remaining // mounts are still there. 
allPaths = allPaths[:len(allPaths)-1] submounts := mm.AllMountsUnder(rootMnt) - if err := mountPathsAre(rootDirent, submounts, allPaths...); err != nil { + if err := mountPathsAre(ctx, rootDirent, submounts, allPaths...); err != nil { t.Error(err) } } diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 3f2bd0e87..d741c4339 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -234,7 +234,7 @@ func (mns *MountNamespace) flushMountSourceRefsLocked() { // After destroy is called, the MountNamespace may continue to be referenced (for // example via /proc/mounts), but should free all resources and shouldn't have // Find* methods called. -func (mns *MountNamespace) destroy() { +func (mns *MountNamespace) destroy(ctx context.Context) { mns.mu.Lock() defer mns.mu.Unlock() @@ -247,13 +247,13 @@ func (mns *MountNamespace) destroy() { for _, mp := range mns.mounts { // Drop the mount reference on all mounted dirents. for ; mp != nil; mp = mp.previous { - mp.root.DecRef() + mp.root.DecRef(ctx) } } mns.mounts = nil // Drop reference on the root. - mns.root.DecRef() + mns.root.DecRef(ctx) // Ensure that root cannot be accessed via this MountNamespace any // more. @@ -265,8 +265,8 @@ func (mns *MountNamespace) destroy() { } // DecRef implements RefCounter.DecRef with destructor mns.destroy. -func (mns *MountNamespace) DecRef() { - mns.DecRefWithDestructor(mns.destroy) +func (mns *MountNamespace) DecRef(ctx context.Context) { + mns.DecRefWithDestructor(ctx, mns.destroy) } // withMountLocked prevents further walks to `node`, because `node` is about to @@ -312,7 +312,7 @@ func (mns *MountNamespace) Mount(ctx context.Context, mountPoint *Dirent, inode if err != nil { return err } - defer replacement.DecRef() + defer replacement.DecRef(ctx) // Set the mount's root dirent and id. 
parentMnt := mns.findMountLocked(mountPoint) @@ -394,7 +394,7 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly panic(fmt.Sprintf("Last mount in the chain must be a undo mount: %+v", prev)) } // Drop mount reference taken at the end of MountNamespace.Mount. - prev.root.DecRef() + prev.root.DecRef(ctx) } else { mns.mounts[prev.root] = prev } @@ -496,11 +496,11 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path // non-directory root is hopeless. if current != root { if !IsDir(current.Inode.StableAttr) { - current.DecRef() // Drop reference from above. + current.DecRef(ctx) // Drop reference from above. return nil, syserror.ENOTDIR } if err := current.Inode.CheckPermission(ctx, PermMask{Execute: true}); err != nil { - current.DecRef() // Drop reference from above. + current.DecRef(ctx) // Drop reference from above. return nil, err } } @@ -511,12 +511,12 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path // Allow failed walks to cache the dirent, because no // children will acquire a reference at the end. current.maybeExtendReference() - current.DecRef() + current.DecRef(ctx) return nil, err } // Drop old reference. - current.DecRef() + current.DecRef(ctx) if remainder != "" { // Ensure it's resolved, unless it's the last level. @@ -570,11 +570,11 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema case nil: // Make sure we didn't exhaust the traversal budget. if *remainingTraversals == 0 { - target.DecRef() + target.DecRef(ctx) return nil, syscall.ELOOP } - node.DecRef() // Drop the original reference. + node.DecRef(ctx) // Drop the original reference. return target, nil case syscall.ENOLINK: @@ -582,7 +582,7 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema return node, nil case ErrResolveViaReadlink: - defer node.DecRef() // See above. + defer node.DecRef(ctx) // See above. 
// First, check if we should traverse. if *remainingTraversals == 0 { @@ -608,7 +608,7 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema return d, err default: - node.DecRef() // Drop for err; see above. + node.DecRef(ctx) // Drop for err; see above. // Propagate the error. return nil, err diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go index a69b41468..975d6cbc9 100644 --- a/pkg/sentry/fs/mounts_test.go +++ b/pkg/sentry/fs/mounts_test.go @@ -51,7 +51,7 @@ func TestFindLink(t *testing.T) { } root := mm.Root() - defer root.DecRef() + defer root.DecRef(ctx) foo, err := root.Walk(ctx, root, "foo") if err != nil { t.Fatalf("Error walking to foo: %v", err) diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index a8ae7d81d..35013a21b 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -107,7 +107,7 @@ func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags Mount msrc := newOverlayMountSource(ctx, upper.MountSource, lower.MountSource, flags) overlay, err := newOverlayEntry(ctx, upper, lower, true) if err != nil { - msrc.DecRef() + msrc.DecRef(ctx) return nil, err } @@ -130,7 +130,7 @@ func NewOverlayRootFile(ctx context.Context, upperMS *MountSource, lower *Inode, msrc := newOverlayMountSource(ctx, upperMS, lower.MountSource, flags) overlay, err := newOverlayEntry(ctx, nil, lower, true) if err != nil { - msrc.DecRef() + msrc.DecRef(ctx) return nil, err } return newOverlayInode(ctx, overlay, msrc), nil @@ -230,16 +230,16 @@ func newOverlayEntry(ctx context.Context, upper *Inode, lower *Inode, lowerExist }, nil } -func (o *overlayEntry) release() { +func (o *overlayEntry) release(ctx context.Context) { // We drop a reference on upper and lower file system Inodes // rather than releasing them, because in-memory filesystems // may hold an extra reference to these Inodes so that they // stay in memory. 
if o.upper != nil { - o.upper.DecRef() + o.upper.DecRef(ctx) } if o.lower != nil { - o.lower.DecRef() + o.lower.DecRef(ctx) } } diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index 35972e23c..45523adf8 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -56,11 +56,11 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF // readDescriptors reads fds in the task starting at offset, and calls the // toDentAttr callback for each to get a DentAttr, which it then emits. This is // a helper for implementing fs.InodeOperations.Readdir. -func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) { +func readDescriptors(ctx context.Context, t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) { var fds []int32 t.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { - fds = fdTable.GetFDs() + fds = fdTable.GetFDs(ctx) } }) @@ -116,7 +116,7 @@ func (f *fd) GetFile(context.Context, *fs.Dirent, fs.FileFlags) (*fs.File, error func (f *fd) Readlink(ctx context.Context, _ *fs.Inode) (string, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } n, _ := f.file.Dirent.FullName(root) return n, nil @@ -135,13 +135,7 @@ func (f *fd) Truncate(context.Context, *fs.Inode, int64) error { func (f *fd) Release(ctx context.Context) { f.Symlink.Release(ctx) - f.file.DecRef() -} - -// Close releases the reference on the file. -func (f *fd) Close() error { - f.file.DecRef() - return nil + f.file.DecRef(ctx) } // fdDir is an InodeOperations for /proc/TID/fd. 
@@ -227,7 +221,7 @@ func (f *fdDirFile) Readdir(ctx context.Context, file *fs.File, ser fs.DentrySer if f.isInfoFile { typ = fs.Symlink } - return readDescriptors(f.t, dirCtx, file.Offset(), func(fd int) fs.DentAttr { + return readDescriptors(ctx, f.t, dirCtx, file.Offset(), func(fd int) fs.DentAttr { return fs.GenericDentAttr(typ, device.ProcDevice) }) } @@ -261,7 +255,7 @@ func (fdid *fdInfoDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs // locks, and other data. For now we only have flags. // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt flags := file.Flags().ToLinux() | fdFlags.ToLinuxFileFlags() - file.DecRef() + file.DecRef(ctx) contents := []byte(fmt.Sprintf("flags:\t0%o\n", flags)) return newStaticProcInode(ctx, dir.MountSource, contents) }) diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 1fc9c703c..6a63c47b3 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -47,7 +47,7 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) { // The task has been destroyed. Nothing to show here. return } - defer rootDir.DecRef() + defer rootDir.DecRef(t) mnt := t.MountNamespace().FindMount(rootDir) if mnt == nil { @@ -64,7 +64,7 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) { continue // No longer valid. } mountPath, desc := mroot.FullName(rootDir) - mroot.DecRef() + mroot.DecRef(t) if !desc { // MountSources that are not descendants of the chroot jail are ignored. continue @@ -97,7 +97,7 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se if mroot == nil { return // No longer valid. } - defer mroot.DecRef() + defer mroot.DecRef(ctx) // Format: // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue @@ -216,7 +216,7 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan if root == nil { return // No longer valid. 
} - defer root.DecRef() + defer root.DecRef(ctx) flags := root.Inode.MountSource.Flags opts := "rw" diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index bd18177d4..83a43aa26 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -419,7 +419,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s } sfile := s.(*fs.File) if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX { - s.DecRef() + s.DecRef(ctx) // Not a unix socket. continue } @@ -479,7 +479,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s } fmt.Fprintf(&buf, "\n") - s.DecRef() + s.DecRef(ctx) } data := []seqfile.SeqData{ @@ -574,7 +574,7 @@ func commonReadSeqFileDataTCP(ctx context.Context, n seqfile.SeqHandle, k *kerne panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) } if family, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { - s.DecRef() + s.DecRef(ctx) // Not tcp4 sockets. continue } @@ -664,7 +664,7 @@ func commonReadSeqFileDataTCP(ctx context.Context, n seqfile.SeqHandle, k *kerne fmt.Fprintf(&buf, "\n") - s.DecRef() + s.DecRef(ctx) } data := []seqfile.SeqData{ @@ -752,7 +752,7 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) } if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { - s.DecRef() + s.DecRef(ctx) // Not udp4 socket. 
continue } @@ -822,7 +822,7 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se fmt.Fprintf(&buf, "\n") - s.DecRef() + s.DecRef(ctx) } data := []seqfile.SeqData{ diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index c659224a7..77e0e1d26 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -213,7 +213,7 @@ func (rpf *rootProcFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dent // Add dot and dotdot. root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dot, dotdot := file.Dirent.GetDotAttrs(root) names = append(names, ".", "..") diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 4bbe90198..9cf7f2a62 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -185,7 +185,7 @@ func (f *subtasksFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dentry // Serialize "." and "..". root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dot, dotdot := file.Dirent.GetDotAttrs(root) if err := dirCtx.DirEmit(".", dot); err != nil { @@ -295,7 +295,7 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { if err != nil { return "", err } - defer exec.DecRef() + defer exec.DecRef(ctx) return exec.PathnameWithDeleted(ctx), nil } diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index bfa304552..f4fcddecb 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -219,7 +219,7 @@ func (d *Dir) Remove(ctx context.Context, _ *fs.Inode, name string) error { } // Remove our reference on the inode. - inode.DecRef() + inode.DecRef(ctx) return nil } @@ -250,7 +250,7 @@ func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) err } // Remove our reference on the inode. 
- inode.DecRef() + inode.DecRef(ctx) return nil } @@ -326,7 +326,7 @@ func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.F // Create the Dirent and corresponding file. created := fs.NewDirent(ctx, inode, name) - defer created.DecRef() + defer created.DecRef(ctx) return created.Inode.GetFile(ctx, created, flags) } @@ -412,11 +412,11 @@ func (*Dir) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, ol } // Release implements fs.InodeOperation.Release. -func (d *Dir) Release(_ context.Context) { +func (d *Dir) Release(ctx context.Context) { // Drop references on all children. d.mu.Lock() for _, i := range d.children { - i.DecRef() + i.DecRef(ctx) } d.mu.Unlock() } @@ -456,7 +456,7 @@ func (dfo *dirFileOperations) IterateDir(ctx context.Context, d *fs.Dirent, dirC func (dfo *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ Serializer: serializer, @@ -473,13 +473,13 @@ func hasChildren(ctx context.Context, inode *fs.Inode) (bool, error) { // dropped when that dirent is destroyed. inode.IncRef() d := fs.NewTransientDirent(inode) - defer d.DecRef() + defer d.DecRef(ctx) file, err := inode.GetFile(ctx, d, fs.FileFlags{Read: true}) if err != nil { return false, err } - defer file.DecRef() + defer file.DecRef(ctx) ser := &fs.CollectEntriesSerializer{} if err := file.Readdir(ctx, ser); err != nil { @@ -530,7 +530,7 @@ func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, n if err != nil { return err } - inode.DecRef() + inode.DecRef(ctx) } // Be careful, we may have already grabbed this mutex above. 
diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go index a6ed8b2c5..3e0d1e07e 100644 --- a/pkg/sentry/fs/ramfs/tree_test.go +++ b/pkg/sentry/fs/ramfs/tree_test.go @@ -67,7 +67,7 @@ func TestMakeDirectoryTree(t *testing.T) { continue } root := mm.Root() - defer mm.DecRef() + defer mm.DecRef(ctx) for _, p := range test.subdirs { maxTraversals := uint(0) diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index 88c344089..f362ca9b6 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -55,7 +55,7 @@ type TimerOperations struct { func NewFile(ctx context.Context, c ktime.Clock) *fs.File { dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[timerfd]") // Release the initial dirent reference after NewFile takes a reference. - defer dirent.DecRef() + defer dirent.DecRef(ctx) tops := &TimerOperations{} tops.timer = ktime.NewTimer(c, tops) // Timerfds reject writes, but the Write flag must be set in order to @@ -65,7 +65,7 @@ func NewFile(ctx context.Context, c ktime.Clock) *fs.File { } // Release implements fs.FileOperations.Release. 
-func (t *TimerOperations) Release() { +func (t *TimerOperations) Release(context.Context) { t.timer.Destroy() } diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go index aaba35502..d4d613ea9 100644 --- a/pkg/sentry/fs/tmpfs/file_test.go +++ b/pkg/sentry/fs/tmpfs/file_test.go @@ -46,7 +46,7 @@ func newFile(ctx context.Context) *fs.File { func TestGrow(t *testing.T) { ctx := contexttest.Context(t) f := newFile(ctx) - defer f.DecRef() + defer f.DecRef(ctx) abuf := bytes.Repeat([]byte{'a'}, 68) n, err := f.Pwritev(ctx, usermem.BytesIOSequence(abuf), 0) diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 108654827..463f6189e 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -132,7 +132,7 @@ func (d *dirInodeOperations) Release(ctx context.Context) { d.mu.Lock() defer d.mu.Unlock() - d.master.DecRef() + d.master.DecRef(ctx) if len(d.slaves) != 0 { panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d)) } @@ -263,7 +263,7 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e } // masterClose is called when the master end of t is closed. 
-func (d *dirInodeOperations) masterClose(t *Terminal) { +func (d *dirInodeOperations) masterClose(ctx context.Context, t *Terminal) { d.mu.Lock() defer d.mu.Unlock() @@ -277,7 +277,7 @@ func (d *dirInodeOperations) masterClose(t *Terminal) { panic(fmt.Sprintf("Terminal %+v doesn't exist in %+v?", t, d)) } - s.DecRef() + s.DecRef(ctx) delete(d.slaves, t.n) d.dentryMap.Remove(strconv.FormatUint(uint64(t.n), 10)) } @@ -322,7 +322,7 @@ func (df *dirFileOperations) IterateDir(ctx context.Context, d *fs.Dirent, dirCt func (df *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ Serializer: serializer, diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index 8fe05ebe5..2d4d44bf3 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -108,4 +108,4 @@ func (superOperations) ResetInodeMappings() {} func (superOperations) SaveInodeMapping(*fs.Inode, string) {} // Destroy implements MountSourceOperations.Destroy. -func (superOperations) Destroy() {} +func (superOperations) Destroy(context.Context) {} diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index fe07fa929..e00746017 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -75,7 +75,7 @@ func newMasterInode(ctx context.Context, d *dirInodeOperations, owner fs.FileOwn } // Release implements fs.InodeOperations.Release. -func (mi *masterInodeOperations) Release(ctx context.Context) { +func (mi *masterInodeOperations) Release(context.Context) { } // Truncate implements fs.InodeOperations.Truncate. @@ -120,9 +120,9 @@ type masterFileOperations struct { var _ fs.FileOperations = (*masterFileOperations)(nil) // Release implements fs.FileOperations.Release. 
-func (mf *masterFileOperations) Release() { - mf.d.masterClose(mf.t) - mf.t.DecRef() +func (mf *masterFileOperations) Release(ctx context.Context) { + mf.d.masterClose(ctx, mf.t) + mf.t.DecRef(ctx) } // EventRegister implements waiter.Waitable.EventRegister. diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 9871f6fc6..7c7292687 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -71,7 +71,7 @@ func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owne // Release implements fs.InodeOperations.Release. func (si *slaveInodeOperations) Release(ctx context.Context) { - si.t.DecRef() + si.t.DecRef(ctx) } // Truncate implements fs.InodeOperations.Truncate. @@ -106,7 +106,7 @@ type slaveFileOperations struct { var _ fs.FileOperations = (*slaveFileOperations)(nil) // Release implements fs.FileOperations.Release. -func (sf *slaveFileOperations) Release() { +func (sf *slaveFileOperations) Release(context.Context) { } // EventRegister implements waiter.Waitable.EventRegister. diff --git a/pkg/sentry/fs/user/path.go b/pkg/sentry/fs/user/path.go index 397e96045..2f5a43b84 100644 --- a/pkg/sentry/fs/user/path.go +++ b/pkg/sentry/fs/user/path.go @@ -82,7 +82,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s // Caller has no root. Don't bother traversing anything. return "", syserror.ENOENT } - defer root.DecRef() + defer root.DecRef(ctx) for _, p := range paths { if !path.IsAbs(p) { // Relative paths aren't safe, no one should be using them. @@ -100,7 +100,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s if err != nil { return "", err } - defer d.DecRef() + defer d.DecRef(ctx) // Check that it is a regular file. 
if !fs.IsRegular(d.Inode.StableAttr) { @@ -121,7 +121,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, paths []string, name string) (string, error) { root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) for _, p := range paths { if !path.IsAbs(p) { // Relative paths aren't safe, no one should be using them. @@ -148,7 +148,7 @@ func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNam if err != nil { return "", err } - dentry.DecRef() + dentry.DecRef(ctx) return binPath, nil } diff --git a/pkg/sentry/fs/user/user.go b/pkg/sentry/fs/user/user.go index f4d525523..936fd3932 100644 --- a/pkg/sentry/fs/user/user.go +++ b/pkg/sentry/fs/user/user.go @@ -62,7 +62,7 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K // doesn't exist we will return the default home directory. return defaultHome, nil } - defer dirent.DecRef() + defer dirent.DecRef(ctx) // Check read permissions on the file. if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Read: true}); err != nil { @@ -81,7 +81,7 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K if err != nil { return "", err } - defer f.DecRef() + defer f.DecRef(ctx) r := &fileReader{ Ctx: ctx, @@ -105,7 +105,7 @@ func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth. const defaultHome = "/" root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) creds := auth.CredentialsFromContext(ctx) @@ -123,7 +123,7 @@ func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth. 
if err != nil { return defaultHome, nil } - defer fd.DecRef() + defer fd.DecRef(ctx) r := &fileReaderVFS2{ ctx: ctx, diff --git a/pkg/sentry/fs/user/user_test.go b/pkg/sentry/fs/user/user_test.go index 7d8e9ac7c..12b786224 100644 --- a/pkg/sentry/fs/user/user_test.go +++ b/pkg/sentry/fs/user/user_test.go @@ -39,7 +39,7 @@ func createEtcPasswd(ctx context.Context, root *fs.Dirent, contents string, mode if err != nil { return err } - defer etc.DecRef() + defer etc.DecRef(ctx) switch mode.FileType() { case 0: // Don't create anything. @@ -49,7 +49,7 @@ func createEtcPasswd(ctx context.Context, root *fs.Dirent, contents string, mode if err != nil { return err } - defer passwd.DecRef() + defer passwd.DecRef(ctx) if _, err := passwd.Writev(ctx, usermem.BytesIOSequence([]byte(contents))); err != nil { return err } @@ -110,9 +110,9 @@ func TestGetExecUserHome(t *testing.T) { if err != nil { t.Fatalf("NewMountNamespace failed: %v", err) } - defer mns.DecRef() + defer mns.DecRef(ctx) root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) ctx = fs.WithRoot(ctx, root) if err := createEtcPasswd(ctx, root, tc.passwdContents, tc.passwdMode); err != nil { diff --git a/pkg/sentry/fsbridge/bridge.go b/pkg/sentry/fsbridge/bridge.go index 8e7590721..7e61209ee 100644 --- a/pkg/sentry/fsbridge/bridge.go +++ b/pkg/sentry/fsbridge/bridge.go @@ -44,7 +44,7 @@ type File interface { IncRef() // DecRef decrements reference. - DecRef() + DecRef(ctx context.Context) } // Lookup provides a common interface to open files. diff --git a/pkg/sentry/fsbridge/fs.go b/pkg/sentry/fsbridge/fs.go index 093ce1fb3..9785fd62a 100644 --- a/pkg/sentry/fsbridge/fs.go +++ b/pkg/sentry/fsbridge/fs.go @@ -49,7 +49,7 @@ func (f *fsFile) PathnameWithDeleted(ctx context.Context) string { // global there. return "" } - defer root.DecRef() + defer root.DecRef(ctx) name, _ := f.file.Dirent.FullName(root) return name @@ -87,8 +87,8 @@ func (f *fsFile) IncRef() { } // DecRef implements File. 
-func (f *fsFile) DecRef() { - f.file.DecRef() +func (f *fsFile) DecRef(ctx context.Context) { + f.file.DecRef(ctx) } // fsLookup implements Lookup interface using fs.File. @@ -124,7 +124,7 @@ func (l *fsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptio if err != nil { return nil, err } - defer d.DecRef() + defer d.DecRef(ctx) if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) { return nil, syserror.ELOOP diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go index 89168220a..323506d33 100644 --- a/pkg/sentry/fsbridge/vfs.go +++ b/pkg/sentry/fsbridge/vfs.go @@ -43,7 +43,7 @@ func NewVFSFile(file *vfs.FileDescription) File { // PathnameWithDeleted implements File. func (f *VFSFile) PathnameWithDeleted(ctx context.Context) string { root := vfs.RootFromContext(ctx) - defer root.DecRef() + defer root.DecRef(ctx) vfsObj := f.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem() name, _ := vfsObj.PathnameWithDeleted(ctx, root, f.file.VirtualDentry()) @@ -86,8 +86,8 @@ func (f *VFSFile) IncRef() { } // DecRef implements File. -func (f *VFSFile) DecRef() { - f.file.DecRef() +func (f *VFSFile) DecRef(ctx context.Context) { + f.file.DecRef(ctx) } // FileDescription returns the FileDescription represented by f. It does not diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index e6fda2b4f..7169e91af 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -103,9 +103,9 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // rootInode is the root directory inode for the devpts mounts. 
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 1081fff52..3bb397f71 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -60,7 +60,7 @@ func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vf } fd.LockFD.Init(&mi.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { - mi.DecRef() + mi.DecRef(ctx) return nil, err } return &fd.vfsfd, nil @@ -98,9 +98,9 @@ type masterFileDescription struct { var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil) // Release implements vfs.FileDescriptionImpl.Release. -func (mfd *masterFileDescription) Release() { +func (mfd *masterFileDescription) Release(ctx context.Context) { mfd.inode.root.masterClose(mfd.t) - mfd.inode.DecRef() + mfd.inode.DecRef(ctx) } // EventRegister implements waiter.Waitable.EventRegister. diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go index a91cae3ef..32e4e1908 100644 --- a/pkg/sentry/fsimpl/devpts/slave.go +++ b/pkg/sentry/fsimpl/devpts/slave.go @@ -56,7 +56,7 @@ func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs } fd.LockFD.Init(&si.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { - si.DecRef() + si.DecRef(ctx) return nil, err } return &fd.vfsfd, nil @@ -103,8 +103,8 @@ type slaveFileDescription struct { var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil) // Release implements fs.FileOperations.Release. -func (sfd *slaveFileDescription) Release() { - sfd.inode.DecRef() +func (sfd *slaveFileDescription) Release(ctx context.Context) { + sfd.inode.DecRef(ctx) } // EventRegister implements waiter.Waitable.EventRegister. 
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go index d0e06cdc0..2ed5fa8a9 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -92,9 +92,9 @@ func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth } // Release must be called when a is no longer in use. -func (a *Accessor) Release() { - a.root.DecRef() - a.mntns.DecRef() +func (a *Accessor) Release(ctx context.Context) { + a.root.DecRef(ctx) + a.mntns.DecRef(ctx) } // accessorContext implements context.Context by extending an existing diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go index b6d52c015..747867cca 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go @@ -30,7 +30,7 @@ func TestDevtmpfs(t *testing.T) { creds := auth.CredentialsFromContext(ctx) vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } // Register tmpfs just so that we can have a root filesystem that isn't @@ -48,9 +48,9 @@ func TestDevtmpfs(t *testing.T) { if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) root := mntns.Root() - defer root.DecRef() + defer root.DecRef(ctx) devpop := vfs.PathOperation{ Root: root, Start: root, @@ -69,7 +69,7 @@ func TestDevtmpfs(t *testing.T) { if err != nil { t.Fatalf("failed to create devtmpfs.Accessor: %v", err) } - defer a.Release() + defer a.Release(ctx) // Create "userspace-initialized" files using a devtmpfs.Accessor. 
if err := a.UserspaceInit(ctx); err != nil { diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go index d12d78b84..812171fa3 100644 --- a/pkg/sentry/fsimpl/eventfd/eventfd.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd.go @@ -59,9 +59,9 @@ type EventFileDescription struct { var _ vfs.FileDescriptionImpl = (*EventFileDescription)(nil) // New creates a new event fd. -func New(vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) { +func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) { vd := vfsObj.NewAnonVirtualDentry("[eventfd]") - defer vd.DecRef() + defer vd.DecRef(ctx) efd := &EventFileDescription{ val: initVal, semMode: semMode, @@ -107,7 +107,7 @@ func (efd *EventFileDescription) HostFD() (int, error) { } // Release implements FileDescriptionImpl.Release() -func (efd *EventFileDescription) Release() { +func (efd *EventFileDescription) Release(context.Context) { efd.mu.Lock() defer efd.mu.Unlock() if efd.hostfd >= 0 { diff --git a/pkg/sentry/fsimpl/eventfd/eventfd_test.go b/pkg/sentry/fsimpl/eventfd/eventfd_test.go index 20e3adffc..49916fa81 100644 --- a/pkg/sentry/fsimpl/eventfd/eventfd_test.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd_test.go @@ -36,16 +36,16 @@ func TestEventFD(t *testing.T) { for _, initVal := range initVals { ctx := contexttest.Context(t) vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } // Make a new eventfd that is writable. - eventfd, err := New(vfsObj, initVal, false, linux.O_RDWR) + eventfd, err := New(ctx, vfsObj, initVal, false, linux.O_RDWR) if err != nil { t.Fatalf("New() failed: %v", err) } - defer eventfd.DecRef() + defer eventfd.DecRef(ctx) // Register a callback for a write event. 
w, ch := waiter.NewChannelEntry(nil) @@ -74,16 +74,16 @@ func TestEventFD(t *testing.T) { func TestEventFDStat(t *testing.T) { ctx := contexttest.Context(t) vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } // Make a new eventfd that is writable. - eventfd, err := New(vfsObj, 0, false, linux.O_RDWR) + eventfd, err := New(ctx, vfsObj, 0, false, linux.O_RDWR) if err != nil { t.Fatalf("New() failed: %v", err) } - defer eventfd.DecRef() + defer eventfd.DecRef(ctx) statx, err := eventfd.Stat(ctx, vfs.StatOptions{ Mask: linux.STATX_BASIC_STATS, diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index 89caee3df..8f7d5a9bb 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -53,7 +53,7 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys // Create VFS. vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { return nil, nil, nil, nil, err } vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ @@ -68,7 +68,7 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys root := mntns.Root() tearDown := func() { - root.DecRef() + root.DecRef(ctx) if err := f.Close(); err != nil { b.Fatalf("tearDown failed: %v", err) @@ -169,7 +169,7 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount point: %v", err) } - defer mountPoint.DecRef() + defer mountPoint.DecRef(ctx) // Create extfs submount. 
mountTearDown := mount(b, fmt.Sprintf("/tmp/image-%d.ext4", depth), vfsfs, &pop) diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go index 55902322a..7a1b4219f 100644 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -15,6 +15,7 @@ package ext import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -55,7 +56,7 @@ func (d *dentry) TryIncRef() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef() { +func (d *dentry) DecRef(ctx context.Context) { // FIXME(b/134676337): filesystem.mu may not be locked as required by // inode.decRef(). d.inode.decRef() @@ -64,7 +65,7 @@ func (d *dentry) DecRef() { // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. // // TODO(b/134676337): Implement inotify. -func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {} +func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {} // Watches implements vfs.DentryImpl.Watches. // @@ -76,4 +77,4 @@ func (d *dentry) Watches() *vfs.Watches { // OnZeroWatches implements vfs.Dentry.OnZeroWatches. // // TODO(b/134676337): Implement inotify. -func (d *dentry) OnZeroWatches() {} +func (d *dentry) OnZeroWatches(context.Context) {} diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index 357512c7e..0fc01668d 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -142,7 +142,7 @@ type directoryFD struct { var _ vfs.FileDescriptionImpl = (*directoryFD)(nil) // Release implements vfs.FileDescriptionImpl.Release. 
-func (fd *directoryFD) Release() { +func (fd *directoryFD) Release(ctx context.Context) { if fd.iter == nil { return } diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go index dac6effbf..08ffc2834 100644 --- a/pkg/sentry/fsimpl/ext/ext.go +++ b/pkg/sentry/fsimpl/ext/ext.go @@ -123,32 +123,32 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs.vfsfs.Init(vfsObj, &fsType, &fs) fs.sb, err = readSuperBlock(dev) if err != nil { - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, err } if fs.sb.Magic() != linux.EXT_SUPER_MAGIC { // mount(2) specifies that EINVAL should be returned if the superblock is // invalid. - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, syserror.EINVAL } // Refuse to mount if the filesystem is incompatible. if !isCompatible(fs.sb) { - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, syserror.EINVAL } fs.bgs, err = readBlockGroups(dev, fs.sb) if err != nil { - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, err } rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode) if err != nil { - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, err } rootInode.incRef() diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 64e9a579f..2dbaee287 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -65,7 +65,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys // Create VFS. 
vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ @@ -80,7 +80,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys root := mntns.Root() tearDown := func() { - root.DecRef() + root.DecRef(ctx) if err := f.Close(); err != nil { t.Fatalf("tearDown failed: %v", err) diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 557963e03..c714ddf73 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -84,7 +84,7 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil) // - filesystem.mu must be locked (for writing if write param is true). // - !rp.Done(). // - inode == vfsd.Impl().(*Dentry).inode. -func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) { +func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) { if !inode.isDir() { return nil, nil, syserror.ENOTDIR } @@ -100,7 +100,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo } d := vfsd.Impl().(*dentry) if name == ".." 
{ - isRoot, err := rp.CheckRoot(vfsd) + isRoot, err := rp.CheckRoot(ctx, vfsd) if err != nil { return nil, nil, err } @@ -108,7 +108,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo rp.Advance() return vfsd, inode, nil } - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, nil, err } rp.Advance() @@ -143,7 +143,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo child.name = name dir.childCache[name] = child } - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, nil, err } if child.inode.isSymlink() && rp.ShouldFollowSymlink() { @@ -167,12 +167,12 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo // // Preconditions: // - filesystem.mu must be locked (for writing if write param is true). -func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { +func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { vfsd := rp.Start() inode := vfsd.Impl().(*dentry).inode for !rp.Done() { var err error - vfsd, inode, err = stepLocked(rp, vfsd, inode, write) + vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write) if err != nil { return nil, nil, err } @@ -196,12 +196,12 @@ func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) // Preconditions: // - filesystem.mu must be locked (for writing if write param is true). // - !rp.Done(). 
-func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { +func walkParentLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { vfsd := rp.Start() inode := vfsd.Impl().(*dentry).inode for !rp.Final() { var err error - vfsd, inode, err = stepLocked(rp, vfsd, inode, write) + vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write) if err != nil { return nil, nil, err } @@ -216,7 +216,7 @@ func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, e // the rp till the parent of the last component which should be an existing // directory. If parent is false then resolves rp entirely. Attemps to resolve // the path as far as it can with a read lock and upgrades the lock if needed. -func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) { +func (fs *filesystem) walk(ctx context.Context, rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) { var ( vfsd *vfs.Dentry inode *inode @@ -227,9 +227,9 @@ func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *in // of disk. This reduces congestion (allows concurrent walks). fs.mu.RLock() if parent { - vfsd, inode, err = walkParentLocked(rp, false) + vfsd, inode, err = walkParentLocked(ctx, rp, false) } else { - vfsd, inode, err = walkLocked(rp, false) + vfsd, inode, err = walkLocked(ctx, rp, false) } fs.mu.RUnlock() @@ -238,9 +238,9 @@ func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *in // walk is fine as this is a read only filesystem. fs.mu.Lock() if parent { - vfsd, inode, err = walkParentLocked(rp, true) + vfsd, inode, err = walkParentLocked(ctx, rp, true) } else { - vfsd, inode, err = walkLocked(rp, true) + vfsd, inode, err = walkLocked(ctx, rp, true) } fs.mu.Unlock() } @@ -283,7 +283,7 @@ func (fs *filesystem) statTo(stat *linux.Statfs) { // AccessAt implements vfs.Filesystem.Impl.AccessAt. 
func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -292,7 +292,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { - vfsd, inode, err := fs.walk(rp, false) + vfsd, inode, err := fs.walk(ctx, rp, false) if err != nil { return nil, err } @@ -312,7 +312,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { - vfsd, inode, err := fs.walk(rp, true) + vfsd, inode, err := fs.walk(ctx, rp, true) if err != nil { return nil, err } @@ -322,7 +322,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - vfsd, inode, err := fs.walk(rp, false) + vfsd, inode, err := fs.walk(ctx, rp, false) if err != nil { return nil, err } @@ -336,7 +336,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return "", err } @@ -349,7 +349,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st // StatAt implements vfs.FilesystemImpl.StatAt. 
func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return linux.Statx{}, err } @@ -360,7 +360,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { - if _, _, err := fs.walk(rp, false); err != nil { + if _, _, err := fs.walk(ctx, rp, false); err != nil { return linux.Statfs{}, err } @@ -370,7 +370,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } @@ -390,7 +390,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. 
return syserror.EEXIST } - if _, _, err := fs.walk(rp, true); err != nil { + if _, _, err := fs.walk(ctx, rp, true); err != nil { return err } @@ -403,7 +403,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EEXIST } - if _, _, err := fs.walk(rp, true); err != nil { + if _, _, err := fs.walk(ctx, rp, true); err != nil { return err } @@ -416,7 +416,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EEXIST } - _, _, err := fs.walk(rp, true) + _, _, err := fs.walk(ctx, rp, true) if err != nil { return err } @@ -430,7 +430,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return syserror.ENOENT } - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -440,7 +440,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -454,7 +454,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -468,7 +468,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return syserror.EEXIST } - _, _, err := fs.walk(rp, true) + _, _, err := fs.walk(ctx, rp, true) if err != nil { return err } @@ -478,7 +478,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. 
func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -492,7 +492,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return nil, err } @@ -506,7 +506,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return nil, err } @@ -515,7 +515,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return "", err } @@ -524,7 +524,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -533,7 +533,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. 
func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return err } diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go index 66d14bb95..e73e740d6 100644 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -79,7 +79,7 @@ type regularFileFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *regularFileFD) Release() {} +func (fd *regularFileFD) Release(context.Context) {} // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go index 62efd4095..2fd0d1fa8 100644 --- a/pkg/sentry/fsimpl/ext/symlink.go +++ b/pkg/sentry/fsimpl/ext/symlink.go @@ -73,7 +73,7 @@ type symlinkFD struct { var _ vfs.FileDescriptionImpl = (*symlinkFD)(nil) // Release implements vfs.FileDescriptionImpl.Release. -func (fd *symlinkFD) Release() {} +func (fd *symlinkFD) Release(context.Context) {} // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *symlinkFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index 2225076bc..e522ff9a0 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -99,7 +99,7 @@ type DeviceFD struct { } // Release implements vfs.FileDescriptionImpl.Release. 
-func (fd *DeviceFD) Release() { +func (fd *DeviceFD) Release(context.Context) { fd.fs.conn.connected = false } diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go index 84c222ad6..1ffe7ccd2 100644 --- a/pkg/sentry/fsimpl/fuse/dev_test.go +++ b/pkg/sentry/fsimpl/fuse/dev_test.go @@ -356,12 +356,12 @@ func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveReque vfsObj := &vfs.VirtualFilesystem{} fuseDev := &DeviceFD{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(system.Ctx); err != nil { return nil, nil, err } vd := vfsObj.NewAnonVirtualDentry("genCountFD") - defer vd.DecRef() + defer vd.DecRef(system.Ctx) if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, nil, err } diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 200a93bbf..a1405f7c3 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -191,9 +191,9 @@ func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // inode implements kernfs.Inode. diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 8c7c8e1b3..1679066ba 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -122,7 +122,7 @@ type directoryFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *directoryFD) Release() { +func (fd *directoryFD) Release(context.Context) { } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. 
@@ -139,7 +139,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fd.dirents = ds } - d.InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + d.InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent) if d.cachedMetadataAuthoritative() { d.touchAtime(fd.vfsfd.Mount()) } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 00e3c99cd..e6af37d0d 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -55,7 +55,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync regular files. for _, d := range ds { err := d.syncSharedHandle(ctx) - d.DecRef() + d.DecRef(ctx) if err != nil && retErr == nil { retErr = err } @@ -65,7 +65,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { // handles (so they won't be synced by the above). for _, sffd := range sffds { err := sffd.Sync(ctx) - sffd.vfsfd.DecRef() + sffd.vfsfd.DecRef(ctx) if err != nil && retErr == nil { retErr = err } @@ -133,7 +133,7 @@ afterSymlink: return d, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() @@ -146,7 +146,7 @@ afterSymlink: // // Call rp.CheckMount() before updating d.parent's metadata, since if // we traverse to another mount then d.parent's metadata is irrelevant. 
- if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, err } if d != d.parent && !d.cachedMetadataAuthoritative() { @@ -164,7 +164,7 @@ afterSymlink: if child == nil { return nil, syserror.ENOENT } - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err } if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { @@ -239,7 +239,7 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // has 0 references, drop it). Wait to update parent.children until we // know what to replace the existing dentry with (i.e. one of the // returns below), to avoid a redundant map access. - vfsObj.InvalidateDentry(&child.vfsd) + vfsObj.InvalidateDentry(ctx, &child.vfsd) if child.isSynthetic() { // Normally we don't mark invalidated dentries as deleted since // they may still exist (but at a different path), and also for @@ -332,7 +332,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by @@ -384,7 +384,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if dir { ev |= linux.IN_ISDIR } - parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } if fs.opts.interop == InteropModeShared { @@ -405,7 +405,7 @@ func (fs *filesystem) doCreateAt(ctx 
context.Context, rp *vfs.ResolvingPath, dir if dir { ev |= linux.IN_ISDIR } - parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } if child := parent.children[name]; child != nil { @@ -426,7 +426,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if dir { ev |= linux.IN_ISDIR } - parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } @@ -434,7 +434,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by @@ -470,7 +470,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) parent.dirMu.Lock() defer parent.dirMu.Unlock() @@ -600,17 +600,17 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // Generate inotify events for rmdir or unlink. 
if dir { - parent.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) + parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) } else { var cw *vfs.Watches if child != nil { cw = &child.watches } - vfs.InotifyRemoveChild(cw, &parent.watches, name) + vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name) } if child != nil { - vfsObj.CommitDeleteDentry(&child.vfsd) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) child.setDeleted() if child.isSynthetic() { parent.syntheticChildren-- @@ -637,7 +637,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. -func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) { +func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { fs.renameMu.RUnlock() if *ds == nil { return @@ -645,20 +645,20 @@ func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) { if len(**ds) != 0 { fs.renameMu.Lock() for _, d := range **ds { - d.checkCachingLocked() + d.checkCachingLocked(ctx) } fs.renameMu.Unlock() } putDentrySlice(*ds) } -func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) { +func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() return } for _, d := range **ds { - d.checkCachingLocked() + d.checkCachingLocked(ctx) } fs.renameMu.Unlock() putDentrySlice(*ds) @@ -668,7 +668,7 @@ func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) { func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer 
fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err @@ -680,7 +680,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -701,7 +701,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by @@ -812,7 +812,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if !start.cachedMetadataAuthoritative() { @@ -1126,7 +1126,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } childVFSFD = &fd.vfsfd } - d.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) + d.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) return childVFSFD, nil } @@ -1134,7 +1134,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) 
d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err @@ -1154,7 +1154,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa var ds *[]*dentry fs.renameMu.Lock() - defer fs.renameMuUnlockAndCheckCaching(&ds) + defer fs.renameMuUnlockAndCheckCaching(ctx, &ds) newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds) if err != nil { return err @@ -1244,7 +1244,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return nil } mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { return err } @@ -1269,7 +1269,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } // Update the dentry tree. - vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) + vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD) if replaced != nil { replaced.setDeleted() if replaced.isSynthetic() { @@ -1331,17 +1331,17 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } if err := d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()); err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - d.InotifyWithParent(ev, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } @@ -1350,7 +1350,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, 
error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statx{}, err @@ -1367,7 +1367,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statfs{}, err @@ -1417,7 +1417,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -1443,7 +1443,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -1455,7 +1455,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err @@ -1469,16 +1469,16 @@ func 
(fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } if err := d.setxattr(ctx, rp.Credentials(), &opts); err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } @@ -1488,16 +1488,16 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } if err := d.removexattr(ctx, rp.Credentials(), name); err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index e20de84b5..2e5575d8d 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -482,7 +482,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr) if err != nil { attachFile.close(ctx) - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, err } // Set the root's reference count to 2. One reference is returned to the @@ -495,8 +495,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } // Release implements vfs.FilesystemImpl.Release. 
-func (fs *filesystem) Release() { - ctx := context.Background() +func (fs *filesystem) Release(ctx context.Context) { mf := fs.mfp.MemoryFile() fs.syncMu.Lock() @@ -1089,10 +1088,10 @@ func (d *dentry) TryIncRef() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef() { +func (d *dentry) DecRef(ctx context.Context) { if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { d.fs.renameMu.Lock() - d.checkCachingLocked() + d.checkCachingLocked(ctx) d.fs.renameMu.Unlock() } else if refs < 0 { panic("gofer.dentry.DecRef() called without holding a reference") @@ -1109,7 +1108,7 @@ func (d *dentry) decRefLocked() { } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. -func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { +func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { if d.isDir() { events |= linux.IN_ISDIR } @@ -1117,9 +1116,9 @@ func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { d.fs.renameMu.RLock() // The ordering below is important, Linux always notifies the parent first. if d.parent != nil { - d.parent.watches.Notify(d.name, events, cookie, et, d.isDeleted()) + d.parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted()) } - d.watches.Notify("", events, cookie, et, d.isDeleted()) + d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted()) d.fs.renameMu.RUnlock() } @@ -1131,10 +1130,10 @@ func (d *dentry) Watches() *vfs.Watches { // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. // // If no watches are left on this dentry and it has no references, cache it. -func (d *dentry) OnZeroWatches() { +func (d *dentry) OnZeroWatches(ctx context.Context) { if atomic.LoadInt64(&d.refs) == 0 { d.fs.renameMu.Lock() - d.checkCachingLocked() + d.checkCachingLocked(ctx) d.fs.renameMu.Unlock() } } @@ -1149,7 +1148,7 @@ func (d *dentry) OnZeroWatches() { // do nothing. 
// // Preconditions: d.fs.renameMu must be locked for writing. -func (d *dentry) checkCachingLocked() { +func (d *dentry) checkCachingLocked(ctx context.Context) { // Dentries with a non-zero reference count must be retained. (The only way // to obtain a reference on a dentry with zero references is via path // resolution, which requires renameMu, so if d.refs is zero then it will @@ -1171,14 +1170,14 @@ func (d *dentry) checkCachingLocked() { // reachable by path resolution and should be dropped immediately. if d.vfsd.IsDead() { if d.isDeleted() { - d.watches.HandleDeletion() + d.watches.HandleDeletion(ctx) } if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- d.cached = false } - d.destroyLocked() + d.destroyLocked(ctx) return } // If d still has inotify watches and it is not deleted or invalidated, we @@ -1213,7 +1212,7 @@ func (d *dentry) checkCachingLocked() { if !victim.vfsd.IsDead() { // Note that victim can't be a mount point (in any mount // namespace), since VFS holds references on mount points. - d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(&victim.vfsd) + d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd) delete(victim.parent.children, victim.name) // We're only deleting the dentry, not the file it // represents, so we don't need to update @@ -1221,7 +1220,7 @@ func (d *dentry) checkCachingLocked() { } victim.parent.dirMu.Unlock() } - victim.destroyLocked() + victim.destroyLocked(ctx) } // Whether or not victim was destroyed, we brought fs.cachedDentriesLen // back down to fs.opts.maxCachedDentries, so we don't loop. @@ -1233,7 +1232,7 @@ func (d *dentry) checkCachingLocked() { // // Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is // not a child dentry. -func (d *dentry) destroyLocked() { +func (d *dentry) destroyLocked(ctx context.Context) { switch atomic.LoadInt64(&d.refs) { case 0: // Mark the dentry destroyed. 
@@ -1244,7 +1243,6 @@ func (d *dentry) destroyLocked() { panic("dentry.destroyLocked() called with references on the dentry") } - ctx := context.Background() d.handleMu.Lock() if !d.handle.file.isNil() { mf := d.fs.mfp.MemoryFile() @@ -1276,7 +1274,7 @@ func (d *dentry) destroyLocked() { // d.fs.renameMu. if d.parent != nil { if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 { - d.parent.checkCachingLocked() + d.parent.checkCachingLocked(ctx) } else if refs < 0 { panic("gofer.dentry.DecRef() called without holding a reference") } @@ -1514,7 +1512,7 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) return err } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - fd.dentry().InotifyWithParent(ev, 0, vfs.InodeEvent) + fd.dentry().InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } @@ -1535,7 +1533,7 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption if err := d.setxattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil { return err } - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } @@ -1545,7 +1543,7 @@ func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { if err := d.removexattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil { return err } - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go index adff39490..56d80bcf8 100644 --- a/pkg/sentry/fsimpl/gofer/gofer_test.go +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -50,7 +50,7 @@ func TestDestroyIdempotent(t *testing.T) { } parent.cacheNewChildLocked(child, "child") - child.checkCachingLocked() + child.checkCachingLocked(ctx) if got := atomic.LoadInt64(&child.refs); got != -1 { t.Fatalf("child.refs=%d, want: -1", got) } @@ 
-58,6 +58,6 @@ func TestDestroyIdempotent(t *testing.T) { if got := atomic.LoadInt64(&parent.refs); got != -1 { t.Fatalf("parent.refs=%d, want: -1", got) } - child.checkCachingLocked() - child.checkCachingLocked() + child.checkCachingLocked(ctx) + child.checkCachingLocked(ctx) } diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 09f142cfc..420e8efe2 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -48,7 +48,7 @@ type regularFileFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *regularFileFD) Release() { +func (fd *regularFileFD) Release(context.Context) { } // OnClose implements vfs.FileDescriptionImpl.OnClose. diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go index d6dbe9092..85d2bee72 100644 --- a/pkg/sentry/fsimpl/gofer/socket.go +++ b/pkg/sentry/fsimpl/gofer/socket.go @@ -108,7 +108,7 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect // We don't need the receiver. c.CloseRecv() - c.Release() + c.Release(ctx) return c, nil } @@ -136,8 +136,8 @@ func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFla } // Release implements transport.BoundEndpoint.Release. -func (e *endpoint) Release() { - e.dentry.DecRef() +func (e *endpoint) Release(ctx context.Context) { + e.dentry.DecRef(ctx) } // Passcred implements transport.BoundEndpoint.Passcred. diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 811528982..fc269ef2b 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -80,11 +80,11 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, } // Release implements vfs.FileDescriptionImpl.Release. 
-func (fd *specialFileFD) Release() { +func (fd *specialFileFD) Release(ctx context.Context) { if fd.haveQueue { fdnotifier.RemoveFD(fd.handle.fd) } - fd.handle.close(context.Background()) + fd.handle.close(ctx) fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) fs.syncMu.Lock() delete(fs.specialFileFDs, fd) diff --git a/pkg/sentry/fsimpl/host/control.go b/pkg/sentry/fsimpl/host/control.go index b9082a20f..0135e4428 100644 --- a/pkg/sentry/fsimpl/host/control.go +++ b/pkg/sentry/fsimpl/host/control.go @@ -58,7 +58,7 @@ func (c *scmRights) Clone() transport.RightsControlMessage { } // Release implements transport.RightsControlMessage.Release. -func (c *scmRights) Release() { +func (c *scmRights) Release(ctx context.Context) { for _, fd := range c.fds { syscall.Close(fd) } diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index c894f2ca0..bf922c566 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -117,7 +117,7 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) d.Init(i) // i.open will take a reference on d. - defer d.DecRef() + defer d.DecRef(ctx) // For simplicity, fileDescription.offset is set to 0. Technically, we // should only set to 0 on files that are not seekable (sockets, pipes, @@ -168,9 +168,9 @@ type filesystem struct { devMinor uint32 } -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { @@ -431,12 +431,12 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre } // DecRef implements kernfs.Inode. 
-func (i *inode) DecRef() { - i.AtomicRefCount.DecRefWithDestructor(i.Destroy) +func (i *inode) DecRef(ctx context.Context) { + i.AtomicRefCount.DecRefWithDestructor(ctx, i.Destroy) } // Destroy implements kernfs.Inode. -func (i *inode) Destroy() { +func (i *inode) Destroy(context.Context) { if i.wouldBlock { fdnotifier.RemoveFD(int32(i.hostFD)) } @@ -542,7 +542,7 @@ func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux } // Release implements vfs.FileDescriptionImpl. -func (f *fileDescription) Release() { +func (f *fileDescription) Release(context.Context) { // noop } diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go index fd16bd92d..4979dd0a9 100644 --- a/pkg/sentry/fsimpl/host/socket.go +++ b/pkg/sentry/fsimpl/host/socket.go @@ -139,7 +139,7 @@ func NewConnectedEndpoint(ctx context.Context, hostFD int, addr string, saveable } // Send implements transport.ConnectedEndpoint.Send. -func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { +func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() @@ -216,7 +216,7 @@ func (c *ConnectedEndpoint) EventUpdate() { } // Recv implements transport.Receiver.Recv. -func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { +func (c *ConnectedEndpoint) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() @@ -317,8 +317,8 @@ func (c *ConnectedEndpoint) destroyLocked() { // Release implements transport.ConnectedEndpoint.Release and // transport.Receiver.Release. 
-func (c *ConnectedEndpoint) Release() { - c.ref.DecRefWithDestructor(func() { +func (c *ConnectedEndpoint) Release(ctx context.Context) { + c.ref.DecRefWithDestructor(ctx, func(context.Context) { c.mu.Lock() c.destroyLocked() c.mu.Unlock() @@ -347,8 +347,8 @@ func (e *SCMConnectedEndpoint) Init() error { // Release implements transport.ConnectedEndpoint.Release and // transport.Receiver.Release. -func (e *SCMConnectedEndpoint) Release() { - e.ref.DecRefWithDestructor(func() { +func (e *SCMConnectedEndpoint) Release(ctx context.Context) { + e.ref.DecRefWithDestructor(ctx, func(context.Context) { e.mu.Lock() if err := syscall.Close(e.fd); err != nil { log.Warningf("Failed to close host fd %d: %v", err) diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go index 4ee9270cc..d372c60cb 100644 --- a/pkg/sentry/fsimpl/host/tty.go +++ b/pkg/sentry/fsimpl/host/tty.go @@ -67,12 +67,12 @@ func (t *TTYFileDescription) ForegroundProcessGroup() *kernel.ProcessGroup { } // Release implements fs.FileOperations.Release. -func (t *TTYFileDescription) Release() { +func (t *TTYFileDescription) Release(ctx context.Context) { t.mu.Lock() t.fgProcessGroup = nil t.mu.Unlock() - t.fileDescription.Release() + t.fileDescription.Release(ctx) } // PRead implements vfs.FileDescriptionImpl. diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index c6c4472e7..12adf727a 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -122,7 +122,7 @@ func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, of } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *DynamicBytesFD) Release() {} +func (fd *DynamicBytesFD) Release(context.Context) {} // Stat implements vfs.FileDescriptionImpl.Stat. 
func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 1d37ccb98..fcee6200a 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -113,7 +113,7 @@ func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *GenericDirectoryFD) Release() {} +func (fd *GenericDirectoryFD) Release(context.Context) {} func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem { return fd.vfsfd.VirtualDentry().Mount().Filesystem() diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 61a36cff9..d7edb6342 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -56,13 +56,13 @@ afterSymlink: return vfsd, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, vfsd); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() return vfsd, nil } - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, err } rp.Advance() @@ -77,7 +77,7 @@ afterSymlink: if err != nil { return nil, err } - if err := rp.CheckMount(&next.vfsd); err != nil { + if err := rp.CheckMount(ctx, &next.vfsd); err != nil { return nil, err } // Resolve any symlink at current path component. @@ -88,7 +88,7 @@ afterSymlink: } if targetVD.Ok() { err := rp.HandleJump(targetVD) - targetVD.DecRef() + targetVD.DecRef(ctx) if err != nil { return nil, err } @@ -116,7 +116,7 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // Cached dentry exists, revalidate. 
if !child.inode.Valid(ctx) { delete(parent.children, name) - vfsObj.InvalidateDentry(&child.vfsd) + vfsObj.InvalidateDentry(ctx, &child.vfsd) fs.deferDecRef(&child.vfsd) // Reference from Lookup. child = nil } @@ -234,7 +234,7 @@ func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Den } // Release implements vfs.FilesystemImpl.Release. -func (fs *Filesystem) Release() { +func (fs *Filesystem) Release(context.Context) { } // Sync implements vfs.FilesystemImpl.Sync. @@ -246,7 +246,7 @@ func (fs *Filesystem) Sync(ctx context.Context) error { // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { fs.mu.RLock() - defer fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() _, inode, err := fs.walkExistingLocked(ctx, rp) @@ -259,7 +259,7 @@ func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { fs.mu.RLock() - defer fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() vfsd, inode, err := fs.walkExistingLocked(ctx, rp) if err != nil { @@ -282,7 +282,7 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { fs.mu.RLock() - defer fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() vfsd, _, err := fs.walkParentDirLocked(ctx, rp) if err != nil { @@ -300,7 +300,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. 
fs.mu.Lock() defer fs.mu.Unlock() parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -337,7 +337,7 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v fs.mu.Lock() defer fs.mu.Unlock() parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -365,7 +365,7 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v fs.mu.Lock() defer fs.mu.Unlock() parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -397,7 +397,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // Do not create new file. if opts.Flags&linux.O_CREAT == 0 { fs.mu.RLock() - defer fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() vfsd, inode, err := fs.walkExistingLocked(ctx, rp) if err != nil { @@ -429,7 +429,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf } afterTrailingSymlink: parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return nil, err } @@ -483,7 +483,7 @@ afterTrailingSymlink: } if targetVD.Ok() { err := rp.HandleJump(targetVD) - targetVD.DecRef() + targetVD.DecRef(ctx) if err != nil { return nil, err } @@ -507,7 +507,7 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st fs.mu.RLock() d, inode, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return "", err } @@ -526,7 +526,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa 
noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 fs.mu.Lock() - defer fs.processDeferredDecRefsLocked() + defer fs.processDeferredDecRefsLocked(ctx) defer fs.mu.Unlock() // Resolve the destination directory first to verify that it's on this @@ -584,7 +584,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) virtfs := rp.VirtualFilesystem() // We can't deadlock here due to lock ordering because we're protected from @@ -615,7 +615,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa dstDir.children = make(map[string]*Dentry) } dstDir.children[pc] = src - virtfs.CommitRenameReplaceDentry(srcVFSD, replaced) + virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaced) return nil } @@ -624,7 +624,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error fs.mu.Lock() defer fs.mu.Unlock() vfsd, inode, err := fs.walkExistingLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -648,7 +648,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error defer parentDentry.dirMu.Unlock() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } @@ -656,7 +656,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error virtfs.AbortDeleteDentry(vfsd) return err } - virtfs.CommitDeleteDentry(vfsd) + virtfs.CommitDeleteDentry(ctx, vfsd) return nil } @@ -665,7 +665,7 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.mu.RLock() _, inode, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return err } @@ -680,7 +680,7 @@ func (fs *Filesystem) StatAt(ctx 
context.Context, rp *vfs.ResolvingPath, opts vf fs.mu.RLock() _, inode, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return linux.Statx{}, err } @@ -692,7 +692,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return linux.Statfs{}, err } @@ -708,7 +708,7 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ fs.mu.Lock() defer fs.mu.Unlock() parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -733,7 +733,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error fs.mu.Lock() defer fs.mu.Unlock() vfsd, _, err := fs.walkExistingLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -753,7 +753,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } @@ -761,7 +761,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error virtfs.AbortDeleteDentry(vfsd) return err } - virtfs.CommitDeleteDentry(vfsd) + virtfs.CommitDeleteDentry(ctx, vfsd) return nil } @@ -770,7 +770,7 @@ func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath fs.mu.RLock() _, inode, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return nil, err } @@ -785,7 +785,7 @@ func (fs *Filesystem) 
ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return nil, err } @@ -798,7 +798,7 @@ func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return "", err } @@ -811,7 +811,7 @@ func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return err } @@ -824,7 +824,7 @@ func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return err } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 579e627f0..c3efcf3ec 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -40,7 +40,7 @@ func (InodeNoopRefCount) IncRef() { } // DecRef implements Inode.DecRef. -func (InodeNoopRefCount) DecRef() { +func (InodeNoopRefCount) DecRef(context.Context) { } // TryIncRef implements Inode.TryIncRef. @@ -49,7 +49,7 @@ func (InodeNoopRefCount) TryIncRef() bool { } // Destroy implements Inode.Destroy. -func (InodeNoopRefCount) Destroy() { +func (InodeNoopRefCount) Destroy(context.Context) { } // InodeDirectoryNoNewChildren partially implements the Inode interface. @@ -366,12 +366,12 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) { } // DecRef implements Inode.DecRef. 
-func (o *OrderedChildren) DecRef() { - o.AtomicRefCount.DecRefWithDestructor(o.Destroy) +func (o *OrderedChildren) DecRef(ctx context.Context) { + o.AtomicRefCount.DecRefWithDestructor(ctx, o.Destroy) } // Destroy cleans up resources referenced by this OrderedChildren. -func (o *OrderedChildren) Destroy() { +func (o *OrderedChildren) Destroy(context.Context) { o.mu.Lock() defer o.mu.Unlock() o.order.Reset() diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 46f207664..080118841 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -116,17 +116,17 @@ func (fs *Filesystem) deferDecRef(d *vfs.Dentry) { // processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the // droppedDentries list. See comment on Filesystem.mu. -func (fs *Filesystem) processDeferredDecRefs() { +func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) { fs.mu.Lock() - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) fs.mu.Unlock() } // Precondition: fs.mu must be held for writing. -func (fs *Filesystem) processDeferredDecRefsLocked() { +func (fs *Filesystem) processDeferredDecRefsLocked(ctx context.Context) { fs.droppedDentriesMu.Lock() for _, d := range fs.droppedDentries { - d.DecRef() + d.DecRef(ctx) } fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse. fs.droppedDentriesMu.Unlock() @@ -212,16 +212,16 @@ func (d *Dentry) isSymlink() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *Dentry) DecRef() { - d.AtomicRefCount.DecRefWithDestructor(d.destroy) +func (d *Dentry) DecRef(ctx context.Context) { + d.AtomicRefCount.DecRefWithDestructor(ctx, d.destroy) } // Precondition: Dentry must be removed from VFS' dentry cache. -func (d *Dentry) destroy() { - d.inode.DecRef() // IncRef from Init. +func (d *Dentry) destroy(ctx context.Context) { + d.inode.DecRef(ctx) // IncRef from Init. 
d.inode = nil if d.parent != nil { - d.parent.DecRef() // IncRef from Dentry.InsertChild. + d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild. } } @@ -230,7 +230,7 @@ func (d *Dentry) destroy() { // Although Linux technically supports inotify on pseudo filesystems (inotify // is implemented at the vfs layer), it is not particularly useful. It is left // unimplemented until someone actually needs it. -func (d *Dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {} +func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {} // Watches implements vfs.DentryImpl.Watches. func (d *Dentry) Watches() *vfs.Watches { @@ -238,7 +238,7 @@ func (d *Dentry) Watches() *vfs.Watches { } // OnZeroWatches implements vfs.Dentry.OnZeroWatches. -func (d *Dentry) OnZeroWatches() {} +func (d *Dentry) OnZeroWatches(context.Context) {} // InsertChild inserts child into the vfs dentry cache with the given name under // this dentry. This does not update the directory inode, so calling this on @@ -326,12 +326,12 @@ type Inode interface { type inodeRefs interface { IncRef() - DecRef() + DecRef(ctx context.Context) TryIncRef() bool // Destroy is called when the inode reaches zero references. Destroy release // all resources (references) on objects referenced by the inode, including // any child dentries. 
- Destroy() + Destroy(ctx context.Context) } type inodeMetadata interface { diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index dc407eb1d..c5d5afedf 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -46,7 +46,7 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System { ctx := contexttest.Context(t) creds := auth.CredentialsFromContext(ctx) v := &vfs.VirtualFilesystem{} - if err := v.Init(); err != nil { + if err := v.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{ @@ -163,7 +163,7 @@ func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (* dir := d.fs.newDir(creds, opts.Mode, nil) dirVFSD := dir.VFSDentry() if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil { - dir.DecRef() + dir.DecRef(ctx) return nil, err } d.IncLinks(1) @@ -175,7 +175,7 @@ func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (* f := d.fs.newFile(creds, "") fVFSD := f.VFSDentry() if err := d.OrderedChildren.Insert(name, fVFSD); err != nil { - f.DecRef() + f.DecRef(ctx) return nil, err } return fVFSD, nil @@ -213,7 +213,7 @@ func TestBasic(t *testing.T) { }) }) defer sys.Destroy() - sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef() + sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef(sys.Ctx) } func TestMkdirGetDentry(t *testing.T) { @@ -228,7 +228,7 @@ func TestMkdirGetDentry(t *testing.T) { if err := sys.VFS.MkdirAt(sys.Ctx, sys.Creds, pop, &vfs.MkdirOptions{Mode: 0755}); err != nil { t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err) } - sys.GetDentryOrDie(pop).DecRef() + sys.GetDentryOrDie(pop).DecRef(sys.Ctx) } func TestReadStaticFile(t *testing.T) { @@ -246,7 +246,7 @@ func TestReadStaticFile(t *testing.T) { if err != nil { t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } - 
defer fd.DecRef() + defer fd.DecRef(sys.Ctx) content, err := sys.ReadToEnd(fd) if err != nil { @@ -273,7 +273,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) { } // Close the file. The file should persist. - fd.DecRef() + fd.DecRef(sys.Ctx) fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{ Flags: linux.O_RDONLY, @@ -281,7 +281,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) { if err != nil { t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) } - fd.DecRef() + fd.DecRef(sys.Ctx) } func TestDirFDReadWrite(t *testing.T) { @@ -297,7 +297,7 @@ func TestDirFDReadWrite(t *testing.T) { if err != nil { t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } - defer fd.DecRef() + defer fd.DecRef(sys.Ctx) // Read/Write should fail for directory FDs. if _, err := fd.Read(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR { diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go index 8f8dcfafe..b3d19ff82 100644 --- a/pkg/sentry/fsimpl/overlay/copy_up.go +++ b/pkg/sentry/fsimpl/overlay/copy_up.go @@ -98,7 +98,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { if err != nil { return err } - defer oldFD.DecRef() + defer oldFD.DecRef(ctx) newFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &newpop, &vfs.OpenOptions{ Flags: linux.O_WRONLY | linux.O_CREAT | linux.O_EXCL, Mode: linux.FileMode(d.mode &^ linux.S_IFMT), @@ -106,7 +106,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { if err != nil { return err } - defer newFD.DecRef() + defer newFD.DecRef(ctx) bufIOSeq := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size for { readN, readErr := oldFD.Read(ctx, bufIOSeq, vfs.ReadOptions{}) @@ -241,13 +241,13 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { Mask: linux.STATX_INO, }) if err != nil { - d.upperVD.DecRef() + d.upperVD.DecRef(ctx) d.upperVD = vfs.VirtualDentry{} cleanupUndoCopyUp() return err } if 
upperStat.Mask&linux.STATX_INO == 0 { - d.upperVD.DecRef() + d.upperVD.DecRef(ctx) d.upperVD = vfs.VirtualDentry{} cleanupUndoCopyUp() return syserror.EREMOTE diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go index f5c2462a5..fccb94105 100644 --- a/pkg/sentry/fsimpl/overlay/directory.go +++ b/pkg/sentry/fsimpl/overlay/directory.go @@ -46,7 +46,7 @@ func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string readdirErr = err return false } - defer layerFD.DecRef() + defer layerFD.DecRef(ctx) // Reuse slice allocated for maybeWhiteouts from a previous layer to // reduce allocations. @@ -108,7 +108,7 @@ type directoryFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *directoryFD) Release() { +func (fd *directoryFD) Release(ctx context.Context) { } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. @@ -177,7 +177,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { readdirErr = err return false } - defer layerFD.DecRef() + defer layerFD.DecRef(ctx) // Reuse slice allocated for maybeWhiteouts from a previous layer to // reduce allocations. @@ -282,6 +282,6 @@ func (fd *directoryFD) Sync(ctx context.Context) error { return err } err = upperFD.Sync(ctx) - upperFD.DecRef() + upperFD.DecRef(ctx) return err } diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index 6b705e955..986b36ead 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -77,7 +77,7 @@ func putDentrySlice(ds *[]*dentry) { // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. 
-func (fs *filesystem) renameMuRUnlockAndCheckDrop(ds **[]*dentry) { +func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { fs.renameMu.RUnlock() if *ds == nil { return @@ -85,20 +85,20 @@ func (fs *filesystem) renameMuRUnlockAndCheckDrop(ds **[]*dentry) { if len(**ds) != 0 { fs.renameMu.Lock() for _, d := range **ds { - d.checkDropLocked() + d.checkDropLocked(ctx) } fs.renameMu.Unlock() } putDentrySlice(*ds) } -func (fs *filesystem) renameMuUnlockAndCheckDrop(ds **[]*dentry) { +func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() return } for _, d := range **ds { - d.checkDropLocked() + d.checkDropLocked(ctx) } fs.renameMu.Unlock() putDentrySlice(*ds) @@ -126,13 +126,13 @@ afterSymlink: return d, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() return d, nil } - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, err } rp.Advance() @@ -142,7 +142,7 @@ afterSymlink: if err != nil { return nil, err } - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err } if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { @@ -272,11 +272,11 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str }) if lookupErr != nil { - child.destroyLocked() + child.destroyLocked(ctx) return nil, lookupErr } if !existsOnAnyLayer { - child.destroyLocked() + child.destroyLocked(ctx) return nil, syserror.ENOENT } @@ -430,7 +430,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, 
haveUpperWhiteout bool) error) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { @@ -501,7 +501,7 @@ func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.V func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err @@ -513,7 +513,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -532,7 +532,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -553,7 +553,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) d, err := 
fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { @@ -720,7 +720,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) if rp.Done() { @@ -825,7 +825,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf fd.LockFD.Init(&d.locks) layerFDOpts := layerFD.Options() if err := fd.vfsfd.Init(fd, layerFlags, mnt, &d.vfsd, &layerFDOpts); err != nil { - layerFD.DecRef() + layerFD.DecRef(ctx) return nil, err } return &fd.vfsfd, nil @@ -920,7 +920,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving fd.LockFD.Init(&child.locks) upperFDOpts := upperFD.Options() if err := fd.vfsfd.Init(fd, upperFlags, mnt, &child.vfsd, &upperFDOpts); err != nil { - upperFD.DecRef() + upperFD.DecRef(ctx) // Don't bother with cleanup; the file was created successfully, we // just can't open it anymore for some reason. 
return nil, err @@ -932,7 +932,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err @@ -952,7 +952,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa var ds *[]*dentry fs.renameMu.Lock() - defer fs.renameMuUnlockAndCheckDrop(&ds) + defer fs.renameMuUnlockAndCheckDrop(ctx, &ds) newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds) if err != nil { return err @@ -979,7 +979,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { @@ -1001,7 +1001,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) parent.dirMu.Lock() defer parent.dirMu.Unlock() @@ -1086,7 +1086,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } - vfsObj.CommitDeleteDentry(&child.vfsd) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) delete(parent.children, name) ds = appendDentry(ds, child) parent.dirents = nil @@ -1097,7 +1097,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { var ds *[]*dentry fs.renameMu.RLock() - defer 
fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err @@ -1132,7 +1132,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statx{}, err @@ -1160,7 +1160,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) _, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statfs{}, err @@ -1211,7 +1211,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { @@ -1233,7 +1233,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) parent.dirMu.Lock() defer parent.dirMu.Unlock() @@ -1298,7 +1298,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } if child != nil { - vfsObj.CommitDeleteDentry(&child.vfsd) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) delete(parent.children, name) ds = appendDentry(ds, child) } @@ -1310,7 
+1310,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) _, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -1324,7 +1324,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) _, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err @@ -1336,7 +1336,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) _, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err @@ -1348,7 +1348,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) _, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go index c0749e711..d3060a481 100644 --- a/pkg/sentry/fsimpl/overlay/non_directory.go +++ b/pkg/sentry/fsimpl/overlay/non_directory.go @@ -81,11 +81,11 @@ func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescrip oldOff, 
oldOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR) if oldOffErr == nil { if _, err := upperFD.Seek(ctx, oldOff, linux.SEEK_SET); err != nil { - upperFD.DecRef() + upperFD.DecRef(ctx) return nil, err } } - fd.cachedFD.DecRef() + fd.cachedFD.DecRef(ctx) fd.copiedUp = true fd.cachedFD = upperFD fd.cachedFlags = statusFlags @@ -99,8 +99,8 @@ func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescrip } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *nonDirectoryFD) Release() { - fd.cachedFD.DecRef() +func (fd *nonDirectoryFD) Release(ctx context.Context) { + fd.cachedFD.DecRef(ctx) fd.cachedFD = nil } @@ -138,7 +138,7 @@ func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux Mask: layerMask, Sync: opts.Sync, }) - wrappedFD.DecRef() + wrappedFD.DecRef(ctx) if err != nil { return linux.Statx{}, err } @@ -187,7 +187,7 @@ func (fd *nonDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, off if err != nil { return 0, err } - defer wrappedFD.DecRef() + defer wrappedFD.DecRef(ctx) return wrappedFD.PRead(ctx, dst, offset, opts) } @@ -209,7 +209,7 @@ func (fd *nonDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, of if err != nil { return 0, err } - defer wrappedFD.DecRef() + defer wrappedFD.DecRef(ctx) return wrappedFD.PWrite(ctx, src, offset, opts) } @@ -250,7 +250,7 @@ func (fd *nonDirectoryFD) Sync(ctx context.Context) error { return err } wrappedFD.IncRef() - defer wrappedFD.DecRef() + defer wrappedFD.DecRef(ctx) fd.mu.Unlock() return wrappedFD.Sync(ctx) } @@ -261,6 +261,6 @@ func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOp if err != nil { return err } - defer wrappedFD.DecRef() + defer wrappedFD.DecRef(ctx) return wrappedFD.ConfigureMMap(ctx, opts) } diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index e720d4825..75cc006bf 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ 
b/pkg/sentry/fsimpl/overlay/overlay.go @@ -123,7 +123,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // filesystem with any number of lower layers. } else { vfsroot := vfs.RootFromContext(ctx) - defer vfsroot.DecRef() + defer vfsroot.DecRef(ctx) upperPathname, ok := mopts["upperdir"] if ok { delete(mopts, "upperdir") @@ -147,13 +147,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) return nil, nil, err } - defer upperRoot.DecRef() + defer upperRoot.DecRef(ctx) privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */) if err != nil { ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) return nil, nil, err } - defer privateUpperRoot.DecRef() + defer privateUpperRoot.DecRef(ctx) fsopts.UpperRoot = privateUpperRoot } lowerPathnamesStr, ok := mopts["lowerdir"] @@ -190,13 +190,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) return nil, nil, err } - defer lowerRoot.DecRef() + defer lowerRoot.DecRef(ctx) privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */) if err != nil { ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) return nil, nil, err } - defer privateLowerRoot.DecRef() + defer privateLowerRoot.DecRef(ctx) fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot) } } @@ -264,19 +264,19 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt Mask: rootStatMask, }) if err != nil { - root.destroyLocked() - fs.vfsfs.DecRef() + root.destroyLocked(ctx) + fs.vfsfs.DecRef(ctx) return nil, nil, err } if 
rootStat.Mask&rootStatMask != rootStatMask { - root.destroyLocked() - fs.vfsfs.DecRef() + root.destroyLocked(ctx) + fs.vfsfs.DecRef(ctx) return nil, nil, syserror.EREMOTE } if isWhiteout(&rootStat) { ctx.Warningf("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") - root.destroyLocked() - fs.vfsfs.DecRef() + root.destroyLocked(ctx) + fs.vfsfs.DecRef(ctx) return nil, nil, syserror.EINVAL } root.mode = uint32(rootStat.Mode) @@ -319,17 +319,17 @@ func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forc } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { vfsObj := fs.vfsfs.VirtualFilesystem() vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor) for _, lowerDevMinor := range fs.lowerDevMinors { vfsObj.PutAnonBlockDevMinor(lowerDevMinor) } if fs.opts.UpperRoot.Ok() { - fs.opts.UpperRoot.DecRef() + fs.opts.UpperRoot.DecRef(ctx) } for _, lowerRoot := range fs.opts.LowerRoots { - lowerRoot.DecRef() + lowerRoot.DecRef(ctx) } } @@ -452,10 +452,10 @@ func (d *dentry) TryIncRef() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef() { +func (d *dentry) DecRef(ctx context.Context) { if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { d.fs.renameMu.Lock() - d.checkDropLocked() + d.checkDropLocked(ctx) d.fs.renameMu.Unlock() } else if refs < 0 { panic("overlay.dentry.DecRef() called without holding a reference") @@ -466,7 +466,7 @@ func (d *dentry) DecRef() { // becomes deleted. // // Preconditions: d.fs.renameMu must be locked for writing. -func (d *dentry) checkDropLocked() { +func (d *dentry) checkDropLocked(ctx context.Context) { // Dentries with a positive reference count must be retained. 
(The only way // to obtain a reference on a dentry with zero references is via path // resolution, which requires renameMu, so if d.refs is zero then it will @@ -476,14 +476,14 @@ func (d *dentry) checkDropLocked() { return } // Refs is still zero; destroy it. - d.destroyLocked() + d.destroyLocked(ctx) return } // destroyLocked destroys the dentry. // // Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. -func (d *dentry) destroyLocked() { +func (d *dentry) destroyLocked(ctx context.Context) { switch atomic.LoadInt64(&d.refs) { case 0: // Mark the dentry destroyed. @@ -495,10 +495,10 @@ func (d *dentry) destroyLocked() { } if d.upperVD.Ok() { - d.upperVD.DecRef() + d.upperVD.DecRef(ctx) } for _, lowerVD := range d.lowerVDs { - lowerVD.DecRef() + lowerVD.DecRef(ctx) } if d.parent != nil { @@ -510,7 +510,7 @@ func (d *dentry) destroyLocked() { // Drop the reference held by d on its parent without recursively // locking d.fs.renameMu. if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 { - d.parent.checkDropLocked() + d.parent.checkDropLocked(ctx) } else if refs < 0 { panic("overlay.dentry.DecRef() called without holding a reference") } @@ -518,7 +518,7 @@ func (d *dentry) destroyLocked() { } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. -func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) { +func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) { // TODO(gvisor.dev/issue/1479): Implement inotify. } @@ -531,7 +531,7 @@ func (d *dentry) Watches() *vfs.Watches { // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. // // TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *dentry) OnZeroWatches() {} +func (d *dentry) OnZeroWatches(context.Context) {} // iterLayers invokes yield on each layer comprising d, from top to bottom. If // any call to yield returns false, iterLayer stops iteration. 
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index 811f80a5f..2ca793db9 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -63,9 +63,9 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // PrependPath implements vfs.FilesystemImpl.PrependPath. @@ -160,6 +160,6 @@ func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vf inode := newInode(ctx, fs) var d kernfs.Dentry d.Init(inode) - defer d.DecRef() + defer d.DecRef(ctx) return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags) } diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index 609210253..2463d51cd 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -77,9 +77,9 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF } // Release implements vfs.FilesystemImpl.Release. 
-func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // dynamicInode is an overfitted interface for common Inodes with diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index fea29e5f0..f0d3f7f5e 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -43,12 +43,12 @@ func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) return file, flags } -func taskFDExists(t *kernel.Task, fd int32) bool { +func taskFDExists(ctx context.Context, t *kernel.Task, fd int32) bool { file, _ := getTaskFD(t, fd) if file == nil { return false } - file.DecRef() + file.DecRef(ctx) return true } @@ -68,7 +68,7 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, off var fds []int32 i.task.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { - fds = fdTable.GetFDs() + fds = fdTable.GetFDs(ctx) } }) @@ -135,7 +135,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro return nil, syserror.ENOENT } fd := int32(fdInt) - if !taskFDExists(i.task, fd) { + if !taskFDExists(ctx, i.task, fd) { return nil, syserror.ENOENT } taskDentry := i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()) @@ -204,9 +204,9 @@ func (s *fdSymlink) Readlink(ctx context.Context) (string, error) { if file == nil { return "", syserror.ENOENT } - defer file.DecRef() + defer file.DecRef(ctx) root := vfs.RootFromContext(ctx) - defer root.DecRef() + defer root.DecRef(ctx) return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry()) } @@ -215,7 +215,7 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen if file == nil { return vfs.VirtualDentry{}, "", syserror.ENOENT } - defer file.DecRef() + defer file.DecRef(ctx) vd := 
file.VirtualDentry() vd.IncRef() return vd, "", nil @@ -258,7 +258,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, return nil, syserror.ENOENT } fd := int32(fdInt) - if !taskFDExists(i.task, fd) { + if !taskFDExists(ctx, i.task, fd) { return nil, syserror.ENOENT } data := &fdInfoData{ @@ -297,7 +297,7 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { if file == nil { return syserror.ENOENT } - defer file.DecRef() + defer file.DecRef(ctx) // TODO(b/121266871): Include pos, locks, and other data. For now we only // have flags. // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 859b7d727..830b78949 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -677,7 +677,7 @@ func (s *exeSymlink) Readlink(ctx context.Context) (string, error) { if err != nil { return "", err } - defer exec.DecRef() + defer exec.DecRef(ctx) return exec.PathnameWithDeleted(ctx), nil } @@ -692,7 +692,7 @@ func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent if err != nil { return vfs.VirtualDentry{}, "", err } - defer exec.DecRef() + defer exec.DecRef(ctx) vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry() vd.IncRef() @@ -748,7 +748,7 @@ func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Root has been destroyed. Don't try to read mounts. return nil } - defer rootDir.DecRef() + defer rootDir.DecRef(ctx) i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf) return nil } @@ -779,7 +779,7 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Root has been destroyed. Don't try to read mounts. 
return nil } - defer rootDir.DecRef() + defer rootDir.DecRef(ctx) i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) return nil } @@ -825,7 +825,7 @@ func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.Vir dentry.Init(&namespaceInode{}) vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry()) vd.IncRef() - dentry.DecRef() + dentry.DecRef(ctx) return vd, "", nil } @@ -887,8 +887,8 @@ func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) err } // Release implements FileDescriptionImpl. -func (fd *namespaceFD) Release() { - fd.inode.DecRef() +func (fd *namespaceFD) Release(ctx context.Context) { + fd.inode.DecRef(ctx) } // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 6bde27376..a4c884bf9 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -212,7 +212,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { continue } if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX { - s.DecRef() + s.DecRef(ctx) // Not a unix socket. continue } @@ -281,7 +281,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { } fmt.Fprintf(buf, "\n") - s.DecRef() + s.DecRef(ctx) } return nil } @@ -359,7 +359,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { - s.DecRef() + s.DecRef(ctx) // Not tcp4 sockets. 
continue } @@ -455,7 +455,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, fmt.Fprintf(buf, "\n") - s.DecRef() + s.DecRef(ctx) } return nil @@ -524,7 +524,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { - s.DecRef() + s.DecRef(ctx) // Not udp4 socket. continue } @@ -600,7 +600,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "\n") - s.DecRef() + s.DecRef(ctx) } return nil } diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 19abb5034..3c9297dee 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -218,7 +218,7 @@ func TestTasks(t *testing.T) { if err != nil { t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err) } - defer fd.DecRef() + defer fd.DecRef(s.Ctx) buf := make([]byte, 1) bufIOSeq := usermem.BytesIOSequence(buf) if _, err := fd.Read(s.Ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR { @@ -336,7 +336,7 @@ func TestTasksOffset(t *testing.T) { if err != nil { t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) } - defer fd.DecRef() + defer fd.DecRef(s.Ctx) if _, err := fd.Seek(s.Ctx, tc.offset, linux.SEEK_SET); err != nil { t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err) } @@ -441,7 +441,7 @@ func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.F t.Errorf("vfsfs.OpenAt(%v) failed: %v", absPath, err) continue } - defer child.DecRef() + defer child.DecRef(ctx) stat, err := child.Stat(ctx, vfs.StatOptions{}) if err != nil { t.Errorf("Stat(%v) failed: %v", absPath, err) @@ -476,7 +476,7 @@ func TestTree(t *testing.T) { if err != nil { t.Fatalf("failed to create test file: %v", err) } - defer file.DecRef() + defer file.DecRef(s.Ctx) var tasks []*kernel.Task for i := 0; i < 
5; i++ { @@ -501,5 +501,5 @@ func TestTree(t *testing.T) { t.Fatalf("vfsfs.OpenAt(/proc) failed: %v", err) } iterateDir(ctx, t, s, fd) - fd.DecRef() + fd.DecRef(ctx) } diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go index 242ba9b5d..6297e1df4 100644 --- a/pkg/sentry/fsimpl/signalfd/signalfd.go +++ b/pkg/sentry/fsimpl/signalfd/signalfd.go @@ -54,7 +54,7 @@ var _ vfs.FileDescriptionImpl = (*SignalFileDescription)(nil) // New creates a new signal fd. func New(vfsObj *vfs.VirtualFilesystem, target *kernel.Task, mask linux.SignalSet, flags uint32) (*vfs.FileDescription, error) { vd := vfsObj.NewAnonVirtualDentry("[signalfd]") - defer vd.DecRef() + defer vd.DecRef(target) sfd := &SignalFileDescription{ target: target, mask: mask, @@ -133,4 +133,4 @@ func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) { } // Release implements FileDescriptionImpl.Release() -func (sfd *SignalFileDescription) Release() {} +func (sfd *SignalFileDescription) Release(context.Context) {} diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index ee0828a15..c61818ff6 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -67,9 +67,9 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // PrependPath implements vfs.FilesystemImpl.PrependPath. 
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 01ce30a4d..f81b0c38f 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -87,9 +87,9 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // dir implements kernfs.Inode. diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go index 242d5fd12..9fd38b295 100644 --- a/pkg/sentry/fsimpl/sys/sys_test.go +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -59,7 +59,7 @@ func TestReadCPUFile(t *testing.T) { if err != nil { t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) } - defer fd.DecRef() + defer fd.DecRef(s.Ctx) content, err := s.ReadToEnd(fd) if err != nil { t.Fatalf("Read failed: %v", err) diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index e743e8114..1e57744e8 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -127,7 +127,7 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns return nil, err } m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) - m.SetExecutable(fsbridge.NewVFSFile(exe)) + m.SetExecutable(ctx, fsbridge.NewVFSFile(exe)) config := &kernel.TaskConfig{ Kernel: k, diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go index 0556af877..568132121 100644 --- a/pkg/sentry/fsimpl/testutil/testutil.go +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -97,8 +97,8 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System { // Destroy release resources associated with a test system. 
func (s *System) Destroy() { - s.Root.DecRef() - s.MntNs.DecRef() // Reference on MntNs passed to NewSystem. + s.Root.DecRef(s.Ctx) + s.MntNs.DecRef(s.Ctx) // Reference on MntNs passed to NewSystem. } // ReadToEnd reads the contents of fd until EOF to a string. @@ -149,7 +149,7 @@ func (s *System) ListDirents(pop *vfs.PathOperation) *DirentCollector { if err != nil { s.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } - defer fd.DecRef() + defer fd.DecRef(s.Ctx) collector := &DirentCollector{} if err := fd.IterDirents(s.Ctx, collector); err != nil { diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go index 2dc90d484..86beaa0a8 100644 --- a/pkg/sentry/fsimpl/timerfd/timerfd.go +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -47,9 +47,9 @@ var _ vfs.FileDescriptionImpl = (*TimerFileDescription)(nil) var _ ktime.TimerListener = (*TimerFileDescription)(nil) // New returns a new timer fd. -func New(vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) { +func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) { vd := vfsObj.NewAnonVirtualDentry("[timerfd]") - defer vd.DecRef() + defer vd.DecRef(ctx) tfd := &TimerFileDescription{} tfd.timer = ktime.NewTimer(clock, tfd) if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ @@ -129,7 +129,7 @@ func (tfd *TimerFileDescription) ResumeTimer() { } // Release implements FileDescriptionImpl.Release() -func (tfd *TimerFileDescription) Release() { +func (tfd *TimerFileDescription) Release(context.Context) { tfd.timer.Destroy() } diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index 2fb5c4d84..d263147c2 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -83,7 +83,7 @@ func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd 
*fs.Dirent } err = fn(root, d) - d.DecRef() + d.DecRef(ctx) return err } @@ -105,17 +105,17 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to create mount namespace: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') // Create nested directories with given depth. root := mntns.Root() - defer root.DecRef() + defer root.DecRef(ctx) d := root d.IncRef() - defer d.DecRef() + defer d.DecRef(ctx) for i := depth; i > 0; i-- { name := fmt.Sprintf("%d", i) if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { @@ -125,7 +125,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to directory %q: %v", name, err) } - d.DecRef() + d.DecRef(ctx) d = next filePathBuilder.WriteString(name) filePathBuilder.WriteByte('/') @@ -136,7 +136,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } - file.DecRef() + file.DecRef(ctx) filePathBuilder.WriteString(filename) filePath := filePathBuilder.String() @@ -176,7 +176,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { // Create VFS. vfsObj := vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { b.Fatalf("VFS init: %v", err) } vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ @@ -186,14 +186,14 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') // Create nested directories with given depth. 
root := mntns.Root() - defer root.DecRef() + defer root.DecRef(ctx) vd := root vd.IncRef() for i := depth; i > 0; i-- { @@ -212,7 +212,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to directory %q: %v", name, err) } - vd.DecRef() + vd.DecRef(ctx) vd = nextVD filePathBuilder.WriteString(name) filePathBuilder.WriteByte('/') @@ -228,12 +228,12 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, Mode: 0644, }) - vd.DecRef() + vd.DecRef(ctx) vd = vfs.VirtualDentry{} if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } - defer fd.DecRef() + defer fd.DecRef(ctx) filePathBuilder.WriteString(filename) filePath := filePathBuilder.String() @@ -278,14 +278,14 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to create mount namespace: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') // Create and mount the submount. 
root := mntns.Root() - defer root.DecRef() + defer root.DecRef(ctx) if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil { b.Fatalf("failed to create mount point: %v", err) } @@ -293,7 +293,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount point: %v", err) } - defer mountPoint.DecRef() + defer mountPoint.DecRef(ctx) submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) if err != nil { b.Fatalf("failed to create tmpfs submount: %v", err) @@ -309,7 +309,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount root: %v", err) } - defer d.DecRef() + defer d.DecRef(ctx) for i := depth; i > 0; i-- { name := fmt.Sprintf("%d", i) if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { @@ -319,7 +319,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to directory %q: %v", name, err) } - d.DecRef() + d.DecRef(ctx) d = next filePathBuilder.WriteString(name) filePathBuilder.WriteByte('/') @@ -330,7 +330,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } - file.DecRef() + file.DecRef(ctx) filePathBuilder.WriteString(filename) filePath := filePathBuilder.String() @@ -370,7 +370,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { // Create VFS. 
vfsObj := vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { b.Fatalf("VFS init: %v", err) } vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ @@ -380,14 +380,14 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') // Create the mount point. root := mntns.Root() - defer root.DecRef() + defer root.DecRef(ctx) pop := vfs.PathOperation{ Root: root, Start: root, @@ -403,7 +403,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount point: %v", err) } - defer mountPoint.DecRef() + defer mountPoint.DecRef(ctx) // Create and mount the submount. if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil { b.Fatalf("failed to mount tmpfs submount: %v", err) @@ -432,7 +432,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to directory %q: %v", name, err) } - vd.DecRef() + vd.DecRef(ctx) vd = nextVD filePathBuilder.WriteString(name) filePathBuilder.WriteByte('/') @@ -448,11 +448,11 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, Mode: 0644, }) - vd.DecRef() + vd.DecRef(ctx) if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } - fd.DecRef() + fd.DecRef(ctx) filePathBuilder.WriteString(filename) filePath := filePathBuilder.String() diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 0a1ad4765..78b4fc5be 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -95,7 +95,7 @@ type directoryFD struct { } // Release implements vfs.FileDescriptionImpl.Release. 
-func (fd *directoryFD) Release() { +func (fd *directoryFD) Release(ctx context.Context) { if fd.iter != nil { dir := fd.inode().impl.(*directory) dir.iterMu.Lock() @@ -110,7 +110,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fs := fd.filesystem() dir := fd.inode().impl.(*directory) - defer fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + defer fd.dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent) // fs.mu is required to read d.parent and dentry.name. fs.mu.RLock() diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index ef210a69b..fb77f95cc 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -40,7 +40,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { // stepLocked is loosely analogous to fs/namei.c:walk_component(). // // Preconditions: filesystem.mu must be locked. !rp.Done(). -func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { +func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { dir, ok := d.inode.impl.(*directory) if !ok { return nil, syserror.ENOTDIR @@ -55,13 +55,13 @@ afterSymlink: return d, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() return d, nil } - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, err } rp.Advance() @@ -74,7 +74,7 @@ afterSymlink: if !ok { return nil, syserror.ENOENT } - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err } if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { @@ -98,9 +98,9 @@ afterSymlink: // fs/namei.c:path_parentat(). 
// // Preconditions: filesystem.mu must be locked. !rp.Done(). -func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) { +func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) { for !rp.Final() { - next, err := stepLocked(rp, d) + next, err := stepLocked(ctx, rp, d) if err != nil { return nil, err } @@ -118,10 +118,10 @@ func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) { // resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). // // Preconditions: filesystem.mu must be locked. -func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) { +func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) { d := rp.Start().Impl().(*dentry) for !rp.Done() { - next, err := stepLocked(rp, d) + next, err := stepLocked(ctx, rp, d) if err != nil { return nil, err } @@ -141,10 +141,10 @@ func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) { // // Preconditions: !rp.Done(). For the final path component in rp, // !rp.ShouldFollowSymlink(). 
-func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { fs.mu.Lock() defer fs.mu.Unlock() - parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -182,7 +182,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if dir { ev |= linux.IN_ISDIR } - parentDir.inode.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + parentDir.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) parentDir.inode.touchCMtime() return nil } @@ -191,7 +191,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return err } @@ -202,7 +202,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } @@ -222,7 +222,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.mu.RUnlock() - dir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + dir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return nil, err } 
@@ -232,7 +232,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { if rp.Mount() != vd.Mount() { return syserror.EXDEV } @@ -251,7 +251,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EMLINK } i.incLinksLocked() - i.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */) + i.watches.Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */) parentDir.insertChildLocked(fs.newDentry(i), name) return nil }) @@ -259,7 +259,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { - return fs.doCreateAt(rp, true /* dir */, func(parentDir *directory, name string) error { + return fs.doCreateAt(ctx, rp, true /* dir */, func(parentDir *directory, name string) error { creds := rp.Credentials() if parentDir.inode.nlink == maxLinks { return syserror.EMLINK @@ -273,7 +273,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // MknodAt implements vfs.FilesystemImpl.MknodAt. 
func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { creds := rp.Credentials() var childInode *inode switch opts.Mode.FileType() { @@ -308,7 +308,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if opts.Flags&linux.O_CREAT == 0 { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } @@ -330,7 +330,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf return start.open(ctx, rp, &opts, false /* afterCreate */) } afterTrailingSymlink: - parentDir, err := walkParentDirLocked(rp, start) + parentDir, err := walkParentDirLocked(ctx, rp, start) if err != nil { return nil, err } @@ -368,7 +368,7 @@ afterTrailingSymlink: if err != nil { return nil, err } - parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) + parentDir.inode.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) parentDir.inode.touchCMtime() return fd, nil } @@ -376,7 +376,7 @@ afterTrailingSymlink: return nil, syserror.EEXIST } // Is the file mounted over? - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err } // Do we need to resolve a trailing symlink? 
@@ -445,7 +445,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return "", err } @@ -467,7 +467,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // Resolve newParent first to verify that it's on this Mount. fs.mu.Lock() defer fs.mu.Unlock() - newParentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + newParentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -555,7 +555,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) var replacedVFSD *vfs.Dentry if replaced != nil { replacedVFSD = &replaced.vfsd @@ -566,17 +566,17 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if replaced != nil { newParentDir.removeChildLocked(replaced) if replaced.inode.isDir() { - newParentDir.inode.decLinksLocked() // from replaced's ".." + newParentDir.inode.decLinksLocked(ctx) // from replaced's ".." 
} - replaced.inode.decLinksLocked() + replaced.inode.decLinksLocked(ctx) } oldParentDir.removeChildLocked(renamed) newParentDir.insertChildLocked(renamed, newName) - vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) + vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD) oldParentDir.inode.touchCMtime() if oldParentDir != newParentDir { if renamed.inode.isDir() { - oldParentDir.inode.decLinksLocked() + oldParentDir.inode.decLinksLocked(ctx) newParentDir.inode.incLinksLocked() } newParentDir.inode.touchCMtime() @@ -591,7 +591,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -626,17 +626,17 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error defer mnt.EndWrite() vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } parentDir.removeChildLocked(child) - parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) + parentDir.inode.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) // Remove links for child, child/., and child/.. 
- child.inode.decLinksLocked() - child.inode.decLinksLocked() - parentDir.inode.decLinksLocked() - vfsObj.CommitDeleteDentry(&child.vfsd) + child.inode.decLinksLocked(ctx) + child.inode.decLinksLocked(ctx) + parentDir.inode.decLinksLocked(ctx) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) parentDir.inode.touchCMtime() return nil } @@ -644,7 +644,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { fs.mu.RLock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err @@ -656,7 +656,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.mu.RUnlock() if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - d.InotifyWithParent(ev, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } @@ -665,7 +665,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return linux.Statx{}, err } @@ -678,7 +678,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { fs.mu.RLock() defer fs.mu.RUnlock() - if _, err := resolveLocked(rp); err != nil { + if _, err := resolveLocked(ctx, rp); err != nil { return linux.Statfs{}, err } statfs := linux.Statfs{ @@ -695,7 +695,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. 
func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { creds := rp.Credentials() child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target)) parentDir.insertChildLocked(child, name) @@ -707,7 +707,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -738,7 +738,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error defer mnt.EndWrite() vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } @@ -746,11 +746,11 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error // Generate inotify events. Note that this must take place before the link // count of the child is decremented, or else the watches may be dropped // before these events are added. 
- vfs.InotifyRemoveChild(&child.inode.watches, &parentDir.inode.watches, name) + vfs.InotifyRemoveChild(ctx, &child.inode.watches, &parentDir.inode.watches, name) parentDir.removeChildLocked(child) - child.inode.decLinksLocked() - vfsObj.CommitDeleteDentry(&child.vfsd) + child.inode.decLinksLocked(ctx) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) parentDir.inode.touchCMtime() return nil } @@ -759,7 +759,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } @@ -778,7 +778,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } @@ -789,7 +789,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return "", err } @@ -799,7 +799,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. 
func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { fs.mu.RLock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err @@ -810,14 +810,14 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt } fs.mu.RUnlock() - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err @@ -828,7 +828,7 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, } fs.mu.RUnlock() - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go index 1614f2c39..ec2701d8b 100644 --- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -32,7 +32,7 @@ const fileName = "mypipe" func TestSeparateFDs(t *testing.T) { ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() + defer root.DecRef(ctx) // Open the read side. This is done in a concurrently because opening // One end the pipe blocks until the other end is opened. 
@@ -55,13 +55,13 @@ func TestSeparateFDs(t *testing.T) { if err != nil { t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) } - defer wfd.DecRef() + defer wfd.DecRef(ctx) rfd, ok := <-rfdchan if !ok { t.Fatalf("failed to open pipe for reading %q", fileName) } - defer rfd.DecRef() + defer rfd.DecRef(ctx) const msg = "vamos azul" checkEmpty(ctx, t, rfd) @@ -71,7 +71,7 @@ func TestSeparateFDs(t *testing.T) { func TestNonblockingRead(t *testing.T) { ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() + defer root.DecRef(ctx) // Open the read side as nonblocking. pop := vfs.PathOperation{ @@ -85,7 +85,7 @@ func TestNonblockingRead(t *testing.T) { if err != nil { t.Fatalf("failed to open pipe for reading %q: %v", fileName, err) } - defer rfd.DecRef() + defer rfd.DecRef(ctx) // Open the write side. openOpts = vfs.OpenOptions{Flags: linux.O_WRONLY} @@ -93,7 +93,7 @@ func TestNonblockingRead(t *testing.T) { if err != nil { t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) } - defer wfd.DecRef() + defer wfd.DecRef(ctx) const msg = "geh blau" checkEmpty(ctx, t, rfd) @@ -103,7 +103,7 @@ func TestNonblockingRead(t *testing.T) { func TestNonblockingWriteError(t *testing.T) { ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() + defer root.DecRef(ctx) // Open the write side as nonblocking, which should return ENXIO. pop := vfs.PathOperation{ @@ -121,7 +121,7 @@ func TestNonblockingWriteError(t *testing.T) { func TestSingleFD(t *testing.T) { ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() + defer root.DecRef(ctx) // Open the pipe as readable and writable. pop := vfs.PathOperation{ @@ -135,7 +135,7 @@ func TestSingleFD(t *testing.T) { if err != nil { t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) } - defer fd.DecRef() + defer fd.DecRef(ctx) const msg = "forza blu" checkEmpty(ctx, t, fd) @@ -152,7 +152,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy // Create VFS. 
vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index abbaa5d60..0710b65db 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -270,7 +270,7 @@ type regularFileFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *regularFileFD) Release() { +func (fd *regularFileFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 2545d88e9..68e615e8b 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -185,7 +185,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt case linux.S_IFDIR: root = &fs.newDirectory(rootKUID, rootKGID, rootMode).dentry default: - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) } return &fs.vfsfs, &root.vfsd, nil @@ -197,7 +197,7 @@ func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *au } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } @@ -249,12 +249,12 @@ func (d *dentry) TryIncRef() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef() { - d.inode.decRef() +func (d *dentry) DecRef(ctx context.Context) { + d.inode.decRef(ctx) } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. 
-func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { +func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { if d.inode.isDir() { events |= linux.IN_ISDIR } @@ -266,9 +266,9 @@ func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { d.inode.fs.mu.RLock() // The ordering below is important, Linux always notifies the parent first. if d.parent != nil { - d.parent.inode.watches.Notify(d.name, events, cookie, et, deleted) + d.parent.inode.watches.Notify(ctx, d.name, events, cookie, et, deleted) } - d.inode.watches.Notify("", events, cookie, et, deleted) + d.inode.watches.Notify(ctx, "", events, cookie, et, deleted) d.inode.fs.mu.RUnlock() } @@ -278,7 +278,7 @@ func (d *dentry) Watches() *vfs.Watches { } // OnZeroWatches implements vfs.Dentry.OnZeroWatches. -func (d *dentry) OnZeroWatches() {} +func (d *dentry) OnZeroWatches(context.Context) {} // inode represents a filesystem object. type inode struct { @@ -359,12 +359,12 @@ func (i *inode) incLinksLocked() { // remove a reference on i as well. // // Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. -func (i *inode) decLinksLocked() { +func (i *inode) decLinksLocked(ctx context.Context) { if i.nlink == 0 { panic("tmpfs.inode.decLinksLocked() called with no existing links") } if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 { - i.decRef() + i.decRef(ctx) } } @@ -386,9 +386,9 @@ func (i *inode) tryIncRef() bool { } } -func (i *inode) decRef() { +func (i *inode) decRef(ctx context.Context) { if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { - i.watches.HandleDeletion() + i.watches.HandleDeletion(ctx) if regFile, ok := i.impl.(*regularFile); ok { // Release memory used by regFile to store data. 
Since regFile is // no longer usable, we don't need to grab any locks or update any @@ -701,7 +701,7 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - d.InotifyWithParent(ev, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } @@ -724,7 +724,7 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption } // Generate inotify events. - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } @@ -736,13 +736,13 @@ func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { } // Generate inotify events. - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } // NewMemfd creates a new tmpfs regular file and file description that can back // an anonymous fd created by memfd_create. 
-func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name string) (*vfs.FileDescription, error) { +func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) { fs, ok := mount.Filesystem().Impl().(*filesystem) if !ok { panic("NewMemfd() called with non-tmpfs mount") @@ -757,7 +757,7 @@ func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name s } d := fs.newDentry(inode) - defer d.DecRef() + defer d.DecRef(ctx) d.name = name // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go index a240fb276..6f3e3ae6f 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go @@ -34,7 +34,7 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr creds := auth.CredentialsFromContext(ctx) vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err) } @@ -47,8 +47,8 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr } root := mntns.Root() return vfsObj, root, func() { - root.DecRef() - mntns.DecRef() + root.DecRef(ctx) + mntns.DecRef(ctx) }, nil } diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index 920fe4329..52ed5cea2 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -17,6 +17,7 @@ package kernel import ( "syscall" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" @@ -31,7 +32,7 @@ type abstractEndpoint struct { } // WeakRefGone implements refs.WeakRefUser.WeakRefGone. 
-func (e *abstractEndpoint) WeakRefGone() { +func (e *abstractEndpoint) WeakRefGone(context.Context) { e.ns.mu.Lock() if e.ns.endpoints[e.name].ep == e.ep { delete(e.ns.endpoints, e.name) @@ -64,9 +65,9 @@ type boundEndpoint struct { } // Release implements transport.BoundEndpoint.Release. -func (e *boundEndpoint) Release() { - e.rc.DecRef() - e.BoundEndpoint.Release() +func (e *boundEndpoint) Release(ctx context.Context) { + e.rc.DecRef(ctx) + e.BoundEndpoint.Release(ctx) } // BoundEndpoint retrieves the endpoint bound to the given name. The return @@ -93,13 +94,13 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp // // When the last reference managed by rc is dropped, ep may be removed from the // namespace. -func (a *AbstractSocketNamespace) Bind(name string, ep transport.BoundEndpoint, rc refs.RefCounter) error { +func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, rc refs.RefCounter) error { a.mu.Lock() defer a.mu.Unlock() if ep, ok := a.endpoints[name]; ok { if rc := ep.wr.Get(); rc != nil { - rc.DecRef() + rc.DecRef(ctx) return syscall.EADDRINUSE } } diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 4c0f1e41f..15519f0df 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -76,8 +76,8 @@ type pollEntry struct { // WeakRefGone implements refs.WeakRefUser.WeakRefGone. // weakReferenceGone is called when the file in the weak reference is destroyed. // The poll entry is removed in response to this. -func (p *pollEntry) WeakRefGone() { - p.epoll.RemoveEntry(p.id) +func (p *pollEntry) WeakRefGone(ctx context.Context) { + p.epoll.RemoveEntry(ctx, p.id) } // EventPoll holds all the state associated with an event poll object, that is, @@ -144,14 +144,14 @@ func NewEventPoll(ctx context.Context) *fs.File { // name matches fs/eventpoll.c:epoll_create1. 
dirent := fs.NewDirent(ctx, anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]")) // Release the initial dirent reference after NewFile takes a reference. - defer dirent.DecRef() + defer dirent.DecRef(ctx) return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{ files: make(map[FileIdentifier]*pollEntry), }) } // Release implements fs.FileOperations.Release. -func (e *EventPoll) Release() { +func (e *EventPoll) Release(ctx context.Context) { // We need to take the lock now because files may be attempting to // remove entries in parallel if they get destroyed. e.mu.Lock() @@ -160,7 +160,7 @@ func (e *EventPoll) Release() { // Go through all entries and clean up. for _, entry := range e.files { entry.id.File.EventUnregister(&entry.waiter) - entry.file.Drop() + entry.file.Drop(ctx) } e.files = nil } @@ -423,7 +423,7 @@ func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter } // RemoveEntry a files from the collection of observed files. -func (e *EventPoll) RemoveEntry(id FileIdentifier) error { +func (e *EventPoll) RemoveEntry(ctx context.Context, id FileIdentifier) error { e.mu.Lock() defer e.mu.Unlock() @@ -445,7 +445,7 @@ func (e *EventPoll) RemoveEntry(id FileIdentifier) error { // Remove file from map, and drop weak reference. 
delete(e.files, id) - entry.file.Drop() + entry.file.Drop(ctx) return nil } diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go index 22630e9c5..55b505593 100644 --- a/pkg/sentry/kernel/epoll/epoll_test.go +++ b/pkg/sentry/kernel/epoll/epoll_test.go @@ -26,7 +26,8 @@ func TestFileDestroyed(t *testing.T) { f := filetest.NewTestFile(t) id := FileIdentifier{f, 12} - efile := NewEventPoll(contexttest.Context(t)) + ctx := contexttest.Context(t) + efile := NewEventPoll(ctx) e := efile.FileOperations.(*EventPoll) if err := e.AddEntry(id, 0, waiter.EventIn, [2]int32{}); err != nil { t.Fatalf("addEntry failed: %v", err) @@ -44,7 +45,7 @@ func TestFileDestroyed(t *testing.T) { } // Destroy the file. Check that we get no more events. - f.DecRef() + f.DecRef(ctx) evt = e.ReadEvents(1) if len(evt) != 0 { diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 87951adeb..bbf568dfc 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -70,7 +70,7 @@ func New(ctx context.Context, initVal uint64, semMode bool) *fs.File { // name matches fs/eventfd.c:eventfd_file_create. dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[eventfd]") // Release the initial dirent reference after NewFile takes a reference. - defer dirent.DecRef() + defer dirent.DecRef(ctx) return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{ val: initVal, semMode: semMode, @@ -106,7 +106,7 @@ func (e *EventOperations) HostFD() (int, error) { } // Release implements fs.FileOperations.Release. 
-func (e *EventOperations) Release() { +func (e *EventOperations) Release(context.Context) { e.mu.Lock() defer e.mu.Unlock() if e.hostfd >= 0 { diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 4b7d234a4..ce53af69b 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -98,7 +98,7 @@ type FDTable struct { func (f *FDTable) saveDescriptorTable() map[int32]descriptor { m := make(map[int32]descriptor) - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + f.forEach(context.Background(), func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { m[fd] = descriptor{ file: file, fileVFS2: fileVFS2, @@ -109,6 +109,7 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor { } func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { + ctx := context.Background() f.init() // Initialize table. for fd, d := range m { f.setAll(fd, d.file, d.fileVFS2, d.flags) @@ -118,9 +119,9 @@ func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { // reference taken by set above. switch { case d.file != nil: - d.file.DecRef() + d.file.DecRef(ctx) case d.fileVFS2 != nil: - d.fileVFS2.DecRef() + d.fileVFS2.DecRef(ctx) } } } @@ -144,14 +145,15 @@ func (f *FDTable) drop(file *fs.File) { d.InotifyEvent(ev, 0) // Drop the table reference. - file.DecRef() + file.DecRef(context.Background()) } // dropVFS2 drops the table reference. func (f *FDTable) dropVFS2(file *vfs.FileDescription) { // Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the // entire file. 
- err := file.UnlockPOSIX(context.Background(), f, 0, 0, linux.SEEK_SET) + ctx := context.Background() + err := file.UnlockPOSIX(ctx, f, 0, 0, linux.SEEK_SET) if err != nil && err != syserror.ENOLCK { panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) } @@ -161,10 +163,10 @@ func (f *FDTable) dropVFS2(file *vfs.FileDescription) { if file.IsWritable() { ev = linux.IN_CLOSE_WRITE } - file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(ctx, ev, 0, vfs.PathEvent) // Drop the table's reference. - file.DecRef() + file.DecRef(ctx) } // NewFDTable allocates a new FDTable that may be used by tasks in k. @@ -175,15 +177,15 @@ func (k *Kernel) NewFDTable() *FDTable { } // destroy removes all of the file descriptors from the map. -func (f *FDTable) destroy() { - f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool { +func (f *FDTable) destroy(ctx context.Context) { + f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool { return true }) } // DecRef implements RefCounter.DecRef with destructor f.destroy. -func (f *FDTable) DecRef() { - f.DecRefWithDestructor(f.destroy) +func (f *FDTable) DecRef(ctx context.Context) { + f.DecRefWithDestructor(ctx, f.destroy) } // Size returns the number of file descriptor slots currently allocated. @@ -195,7 +197,7 @@ func (f *FDTable) Size() int { // forEach iterates over all non-nil files in sorted order. // // It is the caller's responsibility to acquire an appropriate lock. -func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { +func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { // retries tracks the number of failed TryIncRef attempts for the same FD. retries := 0 fd := int32(0) @@ -214,7 +216,7 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes continue // Race caught. 
} fn(fd, file, nil, flags) - file.DecRef() + file.DecRef(ctx) case fileVFS2 != nil: if !fileVFS2.TryIncRef() { retries++ @@ -224,7 +226,7 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes continue // Race caught. } fn(fd, nil, fileVFS2, flags) - fileVFS2.DecRef() + fileVFS2.DecRef(ctx) } retries = 0 fd++ @@ -234,7 +236,8 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes // String is a stringer for FDTable. func (f *FDTable) String() string { var buf strings.Builder - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + ctx := context.Background() + f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { switch { case file != nil: n, _ := file.Dirent.FullName(nil /* root */) @@ -242,7 +245,7 @@ func (f *FDTable) String() string { case fileVFS2 != nil: vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem() - name, err := vfsObj.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) + name, err := vfsObj.PathnameWithDeleted(ctx, vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) if err != nil { fmt.Fprintf(&buf, "\n", err) return @@ -541,9 +544,9 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { // // Precondition: The caller must be running on the task goroutine, or Task.mu // must be locked. -func (f *FDTable) GetFDs() []int32 { +func (f *FDTable) GetFDs(ctx context.Context) []int32 { fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) - f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { + f.forEach(ctx, func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { fds = append(fds, fd) }) return fds @@ -552,9 +555,9 @@ func (f *FDTable) GetFDs() []int32 { // GetRefs returns a stable slice of references to all files and bumps the // reference count on each. The caller must use DecRef on each reference when // they're done using the slice. 
-func (f *FDTable) GetRefs() []*fs.File { +func (f *FDTable) GetRefs(ctx context.Context) []*fs.File { files := make([]*fs.File, 0, f.Size()) - f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + f.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { file.IncRef() // Acquire a reference for caller. files = append(files, file) }) @@ -564,9 +567,9 @@ func (f *FDTable) GetRefs() []*fs.File { // GetRefsVFS2 returns a stable slice of references to all files and bumps the // reference count on each. The caller must use DecRef on each reference when // they're done using the slice. -func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription { +func (f *FDTable) GetRefsVFS2(ctx context.Context) []*vfs.FileDescription { files := make([]*vfs.FileDescription, 0, f.Size()) - f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) { + f.forEach(ctx, func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) { file.IncRef() // Acquire a reference for caller. files = append(files, file) }) @@ -574,10 +577,10 @@ func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription { } // Fork returns an independent FDTable. -func (f *FDTable) Fork() *FDTable { +func (f *FDTable) Fork(ctx context.Context) *FDTable { clone := f.k.NewFDTable() - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { // The set function here will acquire an appropriate table // reference for the clone. We don't need anything else. switch { @@ -622,11 +625,11 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) { } // RemoveIf removes all FDs where cond is true. 
-func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) { +func (f *FDTable) RemoveIf(ctx context.Context, cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) { f.mu.Lock() defer f.mu.Unlock() - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { if cond(file, fileVFS2, flags) { f.set(fd, nil, FDFlags{}) // Clear from table. // Update current available position. diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go index 29f95a2c4..e3f30ba2a 100644 --- a/pkg/sentry/kernel/fd_table_test.go +++ b/pkg/sentry/kernel/fd_table_test.go @@ -154,7 +154,7 @@ func TestFDTable(t *testing.T) { if ref == nil { t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success") } - ref.DecRef() + ref.DecRef(ctx) if ref, _ := fdTable.Remove(1); ref != nil { t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") @@ -191,7 +191,7 @@ func BenchmarkFDLookupAndDecRef(b *testing.B) { b.StartTimer() // Benchmark. for i := 0; i < b.N; i++ { tf, _ := fdTable.Get(fds[i%len(fds)]) - tf.DecRef() + tf.DecRef(ctx) } }) } @@ -219,7 +219,7 @@ func BenchmarkFDLookupAndDecRefConcurrent(b *testing.B) { defer wg.Done() for i := 0; i < each; i++ { tf, _ := fdTable.Get(fds[i%len(fds)]) - tf.DecRef() + tf.DecRef(ctx) } }() } diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 47f78df9a..8f2d36d5a 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -17,6 +17,7 @@ package kernel import ( "fmt" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -89,28 +90,28 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext { // Note that there may still be calls to WorkingDirectory() or RootDirectory() // (that return nil). 
This is because valid references may still be held via // proc files or other mechanisms. -func (f *FSContext) destroy() { +func (f *FSContext) destroy(ctx context.Context) { // Hold f.mu so that we don't race with RootDirectory() and // WorkingDirectory(). f.mu.Lock() defer f.mu.Unlock() if VFS2Enabled { - f.rootVFS2.DecRef() + f.rootVFS2.DecRef(ctx) f.rootVFS2 = vfs.VirtualDentry{} - f.cwdVFS2.DecRef() + f.cwdVFS2.DecRef(ctx) f.cwdVFS2 = vfs.VirtualDentry{} } else { - f.root.DecRef() + f.root.DecRef(ctx) f.root = nil - f.cwd.DecRef() + f.cwd.DecRef(ctx) f.cwd = nil } } // DecRef implements RefCounter.DecRef with destructor f.destroy. -func (f *FSContext) DecRef() { - f.DecRefWithDestructor(f.destroy) +func (f *FSContext) DecRef(ctx context.Context) { + f.DecRefWithDestructor(ctx, f.destroy) } // Fork forks this FSContext. @@ -165,7 +166,7 @@ func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { // This will take an extra reference on the Dirent. // // This is not a valid call after destroy. -func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { +func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetWorkingDirectory called with nil dirent") } @@ -180,21 +181,21 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { old := f.cwd f.cwd = d d.IncRef() - old.DecRef() + old.DecRef(ctx) } // SetWorkingDirectoryVFS2 sets the current working directory. // This will take an extra reference on the VirtualDentry. // // This is not a valid call after destroy. -func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) { +func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDentry) { f.mu.Lock() defer f.mu.Unlock() old := f.cwdVFS2 f.cwdVFS2 = d d.IncRef() - old.DecRef() + old.DecRef(ctx) } // RootDirectory returns the current filesystem root. @@ -226,7 +227,7 @@ func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { // This will take an extra reference on the Dirent. 
// // This is not a valid call after free. -func (f *FSContext) SetRootDirectory(d *fs.Dirent) { +func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetRootDirectory called with nil dirent") } @@ -241,13 +242,13 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) { old := f.root f.root = d d.IncRef() - old.DecRef() + old.DecRef(ctx) } // SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd. // // This is not a valid call after free. -func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) { +func (f *FSContext) SetRootDirectoryVFS2(ctx context.Context, vd vfs.VirtualDentry) { if !vd.Ok() { panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry") } @@ -263,7 +264,7 @@ func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) { vd.IncRef() f.rootVFS2 = vd f.mu.Unlock() - old.DecRef() + old.DecRef(ctx) } // Umask returns the current umask. diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index c5021f2db..daa2dae76 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -51,6 +51,7 @@ go_test( srcs = ["futex_test.go"], library = ":futex", deps = [ + "//pkg/context", "//pkg/sync", "//pkg/usermem", ], diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index bcc1b29a8..e4dcc4d40 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -19,6 +19,7 @@ package futex import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -66,9 +67,9 @@ type Key struct { Offset uint64 } -func (k *Key) release() { +func (k *Key) release(t Target) { if k.MappingIdentity != nil { - k.MappingIdentity.DecRef() + k.MappingIdentity.DecRef(t) } k.Mappable = nil k.MappingIdentity = nil @@ -94,6 +95,8 @@ func (k *Key) matches(k2 *Key) bool { // Target abstracts 
memory accesses and keys. type Target interface { + context.Context + // SwapUint32 gives access to usermem.IO.SwapUint32. SwapUint32(addr usermem.Addr, new uint32) (uint32, error) @@ -296,7 +299,7 @@ func (b *bucket) wakeWaiterLocked(w *Waiter) { // bucket "to". // // Preconditions: b and to must be locked. -func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int { +func (b *bucket) requeueLocked(t Target, to *bucket, key, nkey *Key, n int) int { done := 0 for w := b.waiters.Front(); done < n && w != nil; { if !w.key.matches(key) { @@ -308,7 +311,7 @@ func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int { requeued := w w = w.Next() // Next iteration. b.waiters.Remove(requeued) - requeued.key.release() + requeued.key.release(t) requeued.key = nkey.clone() to.waiters.PushBack(requeued) requeued.bucket.Store(to) @@ -456,7 +459,7 @@ func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32 r := b.wakeLocked(&k, bitmask, n) b.mu.Unlock() - k.release() + k.release(t) return r, nil } @@ -465,12 +468,12 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch if err != nil { return 0, err } - defer k1.release() + defer k1.release(t) k2, err := getKey(t, naddr, private) if err != nil { return 0, err } - defer k2.release() + defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) defer b1.mu.Unlock() @@ -488,7 +491,7 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch done := b1.wakeLocked(&k1, ^uint32(0), nwake) // Requeue the number required. 
- b1.requeueLocked(b2, &k1, &k2, nreq) + b1.requeueLocked(t, b2, &k1, &k2, nreq) return done, nil } @@ -515,12 +518,12 @@ func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwak if err != nil { return 0, err } - defer k1.release() + defer k1.release(t) k2, err := getKey(t, addr2, private) if err != nil { return 0, err } - defer k2.release() + defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) defer b1.mu.Unlock() @@ -571,7 +574,7 @@ func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bo // Perform our atomic check. if err := check(t, addr, val); err != nil { b.mu.Unlock() - w.key.release() + w.key.release(t) return err } @@ -585,7 +588,7 @@ func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bo // WaitComplete must be called when a Waiter previously added by WaitPrepare is // no longer eligible to be woken. -func (m *Manager) WaitComplete(w *Waiter) { +func (m *Manager) WaitComplete(w *Waiter, t Target) { // Remove w from the bucket it's in. for { b := w.bucket.Load() @@ -617,7 +620,7 @@ func (m *Manager) WaitComplete(w *Waiter) { } // Release references held by the waiter. - w.key.release() + w.key.release(t) } // LockPI attempts to lock the futex following the Priority-inheritance futex @@ -648,13 +651,13 @@ func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, pri success, err := m.lockPILocked(w, t, addr, tid, b, try) if err != nil { - w.key.release() + w.key.release(t) b.mu.Unlock() return false, err } if success || try { // Release waiter if it's not going to be a wait. 
- w.key.release() + w.key.release(t) } b.mu.Unlock() return success, nil @@ -730,7 +733,7 @@ func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool err = m.unlockPILocked(t, addr, tid, b, &k) - k.release() + k.release(t) b.mu.Unlock() return err } diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index 7c5c7665b..d0128c548 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -22,6 +22,7 @@ import ( "testing" "unsafe" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) @@ -29,28 +30,33 @@ import ( // testData implements the Target interface, and allows us to // treat the address passed for futex operations as an index in // a byte slice for testing simplicity. -type testData []byte +type testData struct { + context.Context + data []byte +} const sizeofInt32 = 4 func newTestData(size uint) testData { - return make([]byte, size) + return testData{ + data: make([]byte, size), + } } func (t testData) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { - val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t[addr])), new) + val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), new) return val, nil } func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { - if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t[addr])), old, new) { + if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), old, new) { return old, nil } - return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil } func (t testData) LoadUint32(addr usermem.Addr) (uint32, error) { - return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil } func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) { @@ -83,7 
+89,7 @@ func TestFutexWake(t *testing.T) { // Start waiting for wakeup. w := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w) + defer m.WaitComplete(w, d) // Perform a wakeup. if n, err := m.Wake(d, 0, private, ^uint32(0), 1); err != nil || n != 1 { @@ -106,7 +112,7 @@ func TestFutexWakeBitmask(t *testing.T) { // Start waiting for wakeup. w := newPreparedTestWaiter(t, m, d, 0, private, 0, 0x0000ffff) - defer m.WaitComplete(w) + defer m.WaitComplete(w, d) // Perform a wakeup using the wrong bitmask. if n, err := m.Wake(d, 0, private, 0xffff0000, 1); err != nil || n != 0 { @@ -141,7 +147,7 @@ func TestFutexWakeTwo(t *testing.T) { var ws [3]*Waiter for i := range ws { ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(ws[i]) + defer m.WaitComplete(ws[i], d) } // Perform two wakeups. @@ -174,9 +180,9 @@ func TestFutexWakeUnrelated(t *testing.T) { // Start two waiters waiting for wakeup on different addresses. w1 := newPreparedTestWaiter(t, m, d, 0*sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 1*sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform two wakeups on the second address. if n, err := m.Wake(d, 1*sizeofInt32, private, ^uint32(0), 2); err != nil || n != 1 { @@ -216,9 +222,9 @@ func TestWakeOpFirstNonEmpty(t *testing.T) { // Add two waiters on address 0. w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform 10 wakeups on address 0. if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 0, 0); err != nil || n != 2 { @@ -244,9 +250,9 @@ func TestWakeOpSecondNonEmpty(t *testing.T) { // Add two waiters on address sizeofInt32. 
w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform 10 wakeups on address sizeofInt32 (contingent on // d.Op(0), which should succeed). @@ -273,9 +279,9 @@ func TestWakeOpSecondNonEmptyFailingOp(t *testing.T) { // Add two waiters on address sizeofInt32. w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform 10 wakeups on address sizeofInt32 (contingent on // d.Op(1), which should fail). @@ -302,15 +308,15 @@ func TestWakeOpAllNonEmpty(t *testing.T) { // Add two waiters on address 0. w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Add two waiters on address sizeofInt32. w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w3) + defer m.WaitComplete(w3, d) w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w4) + defer m.WaitComplete(w4, d) // Perform 10 wakeups on address 0 (unconditionally), and 10 // wakeups on address sizeofInt32 (contingent on d.Op(0), which @@ -344,15 +350,15 @@ func TestWakeOpAllNonEmptyFailingOp(t *testing.T) { // Add two waiters on address 0. w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Add two waiters on address sizeofInt32. 
w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w3) + defer m.WaitComplete(w3, d) w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w4) + defer m.WaitComplete(w4, d) // Perform 10 wakeups on address 0 (unconditionally), and 10 // wakeups on address sizeofInt32 (contingent on d.Op(1), which @@ -388,7 +394,7 @@ func TestWakeOpSameAddress(t *testing.T) { var ws [4]*Waiter for i := range ws { ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(ws[i]) + defer m.WaitComplete(ws[i], d) } // Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup @@ -422,7 +428,7 @@ func TestWakeOpSameAddressFailingOp(t *testing.T) { var ws [4]*Waiter for i := range ws { ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(ws[i]) + defer m.WaitComplete(ws[i], d) } // Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup @@ -472,7 +478,7 @@ func (t *testMutex) Lock() { for { // Attempt to grab the lock. if atomic.CompareAndSwapUint32( - (*uint32)(unsafe.Pointer(&t.d[t.a])), + (*uint32)(unsafe.Pointer(&t.d.data[t.a])), testMutexUnlocked, testMutexLocked) { // Lock held. @@ -490,7 +496,7 @@ func (t *testMutex) Lock() { panic("WaitPrepare returned unexpected error: " + err.Error()) } <-w.C - t.m.WaitComplete(w) + t.m.WaitComplete(w, t.d) } } @@ -498,7 +504,7 @@ func (t *testMutex) Lock() { // This will notify any waiters via the futex manager. func (t *testMutex) Unlock() { // Unlock. - atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d[t.a])), testMutexUnlocked) + atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d.data[t.a])), testMutexUnlocked) // Notify all waiters. 
t.m.Wake(t.d, t.a, true, ^uint32(0), math.MaxInt32) diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 15dae0f5b..316df249d 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -376,7 +376,8 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.netlinkPorts = port.New() if VFS2Enabled { - if err := k.vfs.Init(); err != nil { + ctx := k.SupervisorContext() + if err := k.vfs.Init(ctx); err != nil { return fmt.Errorf("failed to initialize VFS: %v", err) } @@ -384,19 +385,19 @@ func (k *Kernel) Init(args InitKernelArgs) error { if err != nil { return fmt.Errorf("failed to create pipefs filesystem: %v", err) } - defer pipeFilesystem.DecRef() + defer pipeFilesystem.DecRef(ctx) pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create pipefs mount: %v", err) } k.pipeMount = pipeMount - tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) + tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) if err != nil { return fmt.Errorf("failed to create tmpfs filesystem: %v", err) } - defer tmpfsFilesystem.DecRef() - defer tmpfsRoot.DecRef() + defer tmpfsFilesystem.DecRef(ctx) + defer tmpfsRoot.DecRef(ctx) shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create tmpfs mount: %v", err) @@ -407,7 +408,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { if err != nil { return fmt.Errorf("failed to create sockfs filesystem: %v", err) } - defer socketFilesystem.DecRef() + defer socketFilesystem.DecRef(ctx) socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create sockfs mount: %v", err) @@ -430,8 +431,8 @@ func (k *Kernel) SaveTo(w wire.Writer) 
error { defer k.extMu.Unlock() // Stop time. - k.pauseTimeLocked() - defer k.resumeTimeLocked() + k.pauseTimeLocked(ctx) + defer k.resumeTimeLocked(ctx) // Evict all evictable MemoryFile allocations. k.mf.StartEvictions() @@ -447,12 +448,12 @@ func (k *Kernel) SaveTo(w wire.Writer) error { // Remove all epoll waiter objects from underlying wait queues. // NOTE: for programs to resume execution in future snapshot scenarios, // we will need to re-establish these waiter objects after saving. - k.tasks.unregisterEpollWaiters() + k.tasks.unregisterEpollWaiters(ctx) // Clear the dirent cache before saving because Dirents must be Loaded in a // particular order (parents before children), and Loading dirents from a cache // breaks that order. - if err := k.flushMountSourceRefs(); err != nil { + if err := k.flushMountSourceRefs(ctx); err != nil { return err } @@ -505,7 +506,7 @@ func (k *Kernel) SaveTo(w wire.Writer) error { // flushMountSourceRefs flushes the MountSources for all mounted filesystems // and open FDs. -func (k *Kernel) flushMountSourceRefs() error { +func (k *Kernel) flushMountSourceRefs(ctx context.Context) error { // Flush all mount sources for currently mounted filesystems in each task. flushed := make(map[*fs.MountNamespace]struct{}) k.tasks.mu.RLock() @@ -521,7 +522,7 @@ func (k *Kernel) flushMountSourceRefs() error { // There may be some open FDs whose filesystems have been unmounted. We // must flush those as well. - return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { + return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error { file.Dirent.Inode.MountSource.FlushDirentRefs() return nil }) @@ -531,7 +532,7 @@ func (k *Kernel) flushMountSourceRefs() error { // each task. // // Precondition: Must be called with the kernel paused. 
-func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { +func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.FileDescription) error) (err error) { // TODO(gvisor.dev/issue/1663): Add save support for VFS2. if VFS2Enabled { return nil @@ -544,7 +545,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) if t.fdTable == nil { continue } - t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { if lastErr := f(file, fileVFS2); lastErr != nil && err == nil { err = lastErr } @@ -555,7 +556,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { // TODO(gvisor.dev/issue/1663): Add save support for VFS2. - return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { + return ts.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil } @@ -602,7 +603,7 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { return nil } -func (ts *TaskSet) unregisterEpollWaiters() { +func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) { // TODO(gvisor.dev/issue/1663): Add save support for VFS2. 
if VFS2Enabled { return @@ -623,7 +624,7 @@ func (ts *TaskSet) unregisterEpollWaiters() { if _, ok := processed[t.fdTable]; ok { continue } - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { if e, ok := file.FileOperations.(*epoll.EventPoll); ok { e.UnregisterEpollWaiters() } @@ -900,7 +901,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, root := args.MountNamespaceVFS2.Root() // The call to newFSContext below will take a reference on root, so we // don't need to hold this one. - defer root.DecRef() + defer root.DecRef(ctx) // Grab the working directory. wd := root // Default. @@ -918,7 +919,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } - defer wd.DecRef() + defer wd.DecRef(ctx) } opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd) fsContext = NewFSContextVFS2(root, wd, args.Umask) @@ -933,7 +934,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, root := mntns.Root() // The call to newFSContext below will take a reference on root, so we // don't need to hold this one. - defer root.DecRef() + defer root.DecRef(ctx) // Grab the working directory. remainingTraversals := args.MaxSymlinkTraversals @@ -944,7 +945,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } - defer wd.DecRef() + defer wd.DecRef(ctx) } opener = fsbridge.NewFSLookup(mntns, root, wd) fsContext = newFSContext(root, wd, args.Umask) @@ -1054,7 +1055,7 @@ func (k *Kernel) Start() error { // If k was created by LoadKernelFrom, timers were stopped during // Kernel.SaveTo and need to be resumed. 
If k was created by NewKernel, // this is a no-op. - k.resumeTimeLocked() + k.resumeTimeLocked(k.SupervisorContext()) // Start task goroutines. k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() @@ -1068,7 +1069,7 @@ func (k *Kernel) Start() error { // // Preconditions: Any task goroutines running in k must be stopped. k.extMu // must be locked. -func (k *Kernel) pauseTimeLocked() { +func (k *Kernel) pauseTimeLocked(ctx context.Context) { // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before // Kernel.Start(). if k.cpuClockTicker != nil { @@ -1090,7 +1091,7 @@ func (k *Kernel) pauseTimeLocked() { // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.PauseTimer() @@ -1112,7 +1113,7 @@ func (k *Kernel) pauseTimeLocked() { // // Preconditions: Any task goroutines running in k must be stopped. k.extMu // must be locked. -func (k *Kernel) resumeTimeLocked() { +func (k *Kernel) resumeTimeLocked(ctx context.Context) { if k.cpuClockTicker != nil { k.cpuClockTicker.Resume() } @@ -1126,7 +1127,7 @@ func (k *Kernel) resumeTimeLocked() { } } if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.ResumeTimer() @@ -1511,7 +1512,7 @@ type SocketEntry struct { } // WeakRefGone implements refs.WeakRefUser.WeakRefGone. 
-func (s *SocketEntry) WeakRefGone() { +func (s *SocketEntry) WeakRefGone(context.Context) { s.k.extMu.Lock() s.k.sockets.Remove(s) s.k.extMu.Unlock() @@ -1600,7 +1601,7 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return vfs.VirtualDentry{} } mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() - defer mntns.DecRef() + defer mntns.DecRef(ctx) // Root() takes a reference on the root dirent for us. return mntns.Root() case vfs.CtxMountNamespace: diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 4b688c627..6497dc4ba 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -93,7 +93,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() { if !waitFor(&i.mu, &i.wWakeup, ctx) { - r.DecRef() + r.DecRef(ctx) return nil, syserror.ErrInterrupted } } @@ -111,12 +111,12 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi // On a nonblocking, write-only open, the open fails with ENXIO if the // read side isn't open yet. if flags.NonBlocking { - w.DecRef() + w.DecRef(ctx) return nil, syserror.ENXIO } if !waitFor(&i.mu, &i.rWakeup, ctx) { - w.DecRef() + w.DecRef(ctx) return nil, syserror.ErrInterrupted } } diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index ab75a87ff..ce0db5583 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -167,7 +167,7 @@ func TestClosedReaderBlocksWriteOpen(t *testing.T) { f := NewInodeOperations(ctx, perms, newNamedPipe(t)) rFile, _ := testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil) - rFile.DecRef() + rFile.DecRef(ctx) wDone := make(chan struct{}) // This open for write should block because the reader is now gone. 
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 79645d7d2..297e8f28f 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -152,7 +152,7 @@ func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs. d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino)) // The p.Open calls below will each take a reference on the Dirent. We // must drop the one we already have. - defer d.DecRef() + defer d.DecRef(ctx) return p.Open(ctx, d, fs.FileFlags{Read: true}), p.Open(ctx, d, fs.FileFlags{Write: true}) } diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go index bda739dbe..fe97e9800 100644 --- a/pkg/sentry/kernel/pipe/pipe_test.go +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -27,8 +27,8 @@ import ( func TestPipeRW(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, 65536, 4096) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) msg := []byte("here's some bytes") wantN := int64(len(msg)) @@ -47,8 +47,8 @@ func TestPipeRW(t *testing.T) { func TestPipeReadBlock(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, 65536, 4096) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) n, err := r.Readv(ctx, usermem.BytesIOSequence(make([]byte, 1))) if n != 0 || err != syserror.ErrWouldBlock { @@ -62,8 +62,8 @@ func TestPipeWriteBlock(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, capacity, atomicIOBytes) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) msg := make([]byte, capacity+1) n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) @@ -77,8 +77,8 @@ func TestPipeWriteUntilEnd(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) msg 
:= []byte("here's some bytes") diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index aacf28da2..6d58b682f 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -33,7 +33,7 @@ import ( // the old fs architecture. // Release cleans up the pipe's state. -func (p *Pipe) Release() { +func (p *Pipe) Release(context.Context) { p.rClose() p.wClose() diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go index 7724b4452..ac18785c0 100644 --- a/pkg/sentry/kernel/pipe/reader.go +++ b/pkg/sentry/kernel/pipe/reader.go @@ -15,6 +15,7 @@ package pipe import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/waiter" ) @@ -29,7 +30,7 @@ type Reader struct { // Release implements fs.FileOperations.Release. // // This overrides ReaderWriter.Release. -func (r *Reader) Release() { +func (r *Reader) Release(context.Context) { r.Pipe.rClose() // Wake up writers. diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 45d4c5fc1..28f998e45 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -101,7 +101,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s // If this pipe is being opened as blocking and there's no // writer, we have to wait for a writer to open the other end. if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.EINTR } @@ -112,12 +112,12 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s // Non-blocking, write-only opens fail with ENXIO when the read // side isn't open yet. if statusFlags&linux.O_NONBLOCK != 0 { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.ENXIO } // Wait for a reader to open the other end. 
if !waitFor(&vp.mu, &vp.rWakeup, ctx) { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.EINTR } } @@ -169,7 +169,7 @@ type VFSPipeFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *VFSPipeFD) Release() { +func (fd *VFSPipeFD) Release(context.Context) { var event waiter.EventMask if fd.vfsfd.IsReadable() { fd.pipe.rClose() diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go index 5bc6aa931..ef4b70ca3 100644 --- a/pkg/sentry/kernel/pipe/writer.go +++ b/pkg/sentry/kernel/pipe/writer.go @@ -15,6 +15,7 @@ package pipe import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/waiter" ) @@ -29,7 +30,7 @@ type Writer struct { // Release implements fs.FileOperations.Release. // // This overrides ReaderWriter.Release. -func (w *Writer) Release() { +func (w *Writer) Release(context.Context) { w.Pipe.wClose() // Wake up readers. diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 0e19286de..5c4c622c2 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -16,6 +16,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/syserror" @@ -70,7 +71,7 @@ func (s *Session) incRef() { // // Precondition: callers must hold TaskSet.mu for writing. func (s *Session) decRef() { - s.refs.DecRefWithDestructor(func() { + s.refs.DecRefWithDestructor(nil, func(context.Context) { // Remove translations from the leader. for ns := s.leader.pidns; ns != nil; ns = ns.parent { id := ns.sids[s] @@ -162,7 +163,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { } alive := true - pg.refs.DecRefWithDestructor(func() { + pg.refs.DecRefWithDestructor(nil, func(context.Context) { alive = false // don't bother with handleOrphan. // Remove translations from the originator. 
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 55b4c2cdb..13ec7afe0 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -431,8 +431,8 @@ func (s *Shm) InodeID() uint64 { // DecRef overrides refs.RefCount.DecRef with a destructor. // // Precondition: Caller must not hold s.mu. -func (s *Shm) DecRef() { - s.DecRefWithDestructor(s.destroy) +func (s *Shm) DecRef(ctx context.Context) { + s.DecRefWithDestructor(ctx, s.destroy) } // Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm @@ -642,7 +642,7 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { return nil } -func (s *Shm) destroy() { +func (s *Shm) destroy(context.Context) { s.mfp.MemoryFile().DecRef(s.fr) s.registry.remove(s) } @@ -651,7 +651,7 @@ func (s *Shm) destroy() { // destroyed once it has no references. MarkDestroyed may be called multiple // times, and is safe to call after a segment has already been destroyed. See // shmctl(IPC_RMID). -func (s *Shm) MarkDestroyed() { +func (s *Shm) MarkDestroyed(ctx context.Context) { s.registry.dissociateKey(s) s.mu.Lock() @@ -663,7 +663,7 @@ func (s *Shm) MarkDestroyed() { // // N.B. This cannot be the final DecRef, as the caller also // holds a reference. - s.DecRef() + s.DecRef(ctx) return } } diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go index 8243bb93e..b07e1c1bd 100644 --- a/pkg/sentry/kernel/signalfd/signalfd.go +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -76,7 +76,7 @@ func New(ctx context.Context, mask linux.SignalSet) (*fs.File, error) { } // Release implements fs.FileOperations.Release. -func (s *SignalOperations) Release() {} +func (s *SignalOperations) Release(context.Context) {} // Mask returns the signal mask. 
func (s *SignalOperations) Mask() linux.SignalSet { diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index c4db05bd8..5aee699e7 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -730,17 +730,17 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock { func (t *Task) IsChrooted() bool { if VFS2Enabled { realRoot := t.mountNamespaceVFS2.Root() - defer realRoot.DecRef() + defer realRoot.DecRef(t) root := t.fsContext.RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) return root != realRoot } realRoot := t.tg.mounts.Root() - defer realRoot.DecRef() + defer realRoot.DecRef(t) root := t.fsContext.RootDirectory() if root != nil { - defer root.DecRef() + defer root.DecRef(t) } return root != realRoot } diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index e1ecca99e..fe6ba6041 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -237,7 +237,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { var fdTable *FDTable if opts.NewFiles { - fdTable = t.fdTable.Fork() + fdTable = t.fdTable.Fork(t) } else { fdTable = t.fdTable fdTable.IncRef() @@ -294,7 +294,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { nt, err := t.tg.pidns.owner.NewTask(cfg) if err != nil { if opts.NewThreadGroup { - tg.release() + tg.release(t) } return 0, nil, err } @@ -510,7 +510,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { var oldFDTable *FDTable if opts.NewFiles { oldFDTable = t.fdTable - t.fdTable = oldFDTable.Fork() + t.fdTable = oldFDTable.Fork(t) } var oldFSContext *FSContext if opts.NewFSContext { @@ -519,10 +519,10 @@ func (t *Task) Unshare(opts *SharingOptions) error { } t.mu.Unlock() if oldFDTable != nil { - oldFDTable.DecRef() + oldFDTable.DecRef(t) } if oldFSContext != nil { - oldFSContext.DecRef() + oldFSContext.DecRef(t) } return nil } diff --git a/pkg/sentry/kernel/task_exec.go 
b/pkg/sentry/kernel/task_exec.go index 7803b98d0..47c28b8ff 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -199,11 +199,11 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.Unlock() oldFDTable := t.fdTable - t.fdTable = t.fdTable.Fork() - oldFDTable.DecRef() + t.fdTable = t.fdTable.Fork(t) + oldFDTable.DecRef(t) // Remove FDs with the CloseOnExec flag set. - t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool { + t.fdTable.RemoveIf(t, func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool { return flags.CloseOnExec }) diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 231ac548a..c165d6cb1 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -269,12 +269,12 @@ func (*runExitMain) execute(t *Task) taskRunState { // Releasing the MM unblocks a blocked CLONE_VFORK parent. t.unstopVforkParent() - t.fsContext.DecRef() - t.fdTable.DecRef() + t.fsContext.DecRef(t) + t.fdTable.DecRef(t) t.mu.Lock() if t.mountNamespaceVFS2 != nil { - t.mountNamespaceVFS2.DecRef() + t.mountNamespaceVFS2.DecRef(t) t.mountNamespaceVFS2 = nil } t.mu.Unlock() @@ -282,7 +282,7 @@ func (*runExitMain) execute(t *Task) taskRunState { // If this is the last task to exit from the thread group, release the // thread group's resources. if lastExiter { - t.tg.release() + t.tg.release(t) } // Detach tracees. 
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index eeccaa197..ab86ceedc 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -203,6 +203,6 @@ func (t *Task) traceExecEvent(tc *TaskContext) { trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") return } - defer file.DecRef() + defer file.DecRef(t) trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t)) } diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 8485fb4b6..64c1e120a 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -102,10 +102,10 @@ func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { t, err := ts.newTask(cfg) if err != nil { cfg.TaskContext.release() - cfg.FSContext.DecRef() - cfg.FDTable.DecRef() + cfg.FSContext.DecRef(t) + cfg.FDTable.DecRef(t) if cfg.MountNamespaceVFS2 != nil { - cfg.MountNamespaceVFS2.DecRef() + cfg.MountNamespaceVFS2.DecRef(t) } return nil, err } diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 4dfd2c990..0b34c0099 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -308,7 +308,7 @@ func (tg *ThreadGroup) Limits() *limits.LimitSet { } // release releases the thread group's resources. -func (tg *ThreadGroup) release() { +func (tg *ThreadGroup) release(t *Task) { // Timers must be destroyed without holding the TaskSet or signal mutexes // since timers send signals with Timer.mu locked. 
tg.itimerRealTimer.Destroy() @@ -325,7 +325,7 @@ func (tg *ThreadGroup) release() { it.DestroyTimer() } if tg.mounts != nil { - tg.mounts.DecRef() + tg.mounts.DecRef(t) } } diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index ddeaff3db..20dd1cc21 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -281,7 +281,7 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr } defer func() { if mopts.MappingIdentity != nil { - mopts.MappingIdentity.DecRef() + mopts.MappingIdentity.DecRef(ctx) } }() if err := f.ConfigureMMap(ctx, &mopts); err != nil { @@ -663,7 +663,7 @@ func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err) return loadedELF{}, nil, err } - defer intFile.DecRef() + defer intFile.DecRef(ctx) interp, err = loadInterpreterELF(ctx, args.MemoryManager, intFile, bin) if err != nil { diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 986c7fb4d..8d6802ea3 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -154,7 +154,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context return loadedELF{}, nil, nil, nil, err } // Ensure file is release in case the code loops or errors out. - defer args.File.DecRef() + defer args.File.DecRef(ctx) } else { if err := checkIsRegularFile(ctx, args.File, args.Filename); err != nil { return loadedELF{}, nil, nil, nil, err @@ -223,7 +223,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V if err != nil { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux()) } - defer file.DecRef() + defer file.DecRef(ctx) // Load the VDSO. 
vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded) @@ -292,7 +292,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V m.SetEnvvStart(sl.EnvvStart) m.SetEnvvEnd(sl.EnvvEnd) m.SetAuxv(auxv) - m.SetExecutable(file) + m.SetExecutable(ctx, file) symbolValue, err := getSymbolValueFromVDSO("rt_sigreturn") if err != nil { diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index c188f6c29..59c92c7e8 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -238,7 +238,7 @@ type MappingIdentity interface { IncRef() // DecRef decrements the MappingIdentity's reference count. - DecRef() + DecRef(ctx context.Context) // MappedName returns the application-visible name shown in // /proc/[pid]/maps. diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 1999ec706..16fea53c4 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -258,8 +258,8 @@ func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) { } // DecRef implements refs.RefCounter.DecRef. 
-func (m *aioMappable) DecRef() { - m.AtomicRefCount.DecRefWithDestructor(func() { +func (m *aioMappable) DecRef(ctx context.Context) { + m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { m.mfp.MemoryFile().DecRef(m.fr) }) } @@ -367,7 +367,7 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint if err != nil { return 0, err } - defer m.DecRef() + defer m.DecRef(ctx) addr, err := mm.MMap(ctx, memmap.MMapOpts{ Length: aioRingBufferSize, MappingIdentity: m, diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index aac56679b..4d7773f8b 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -258,7 +258,7 @@ func (mm *MemoryManager) DecUsers(ctx context.Context) { mm.executable = nil mm.metadataMu.Unlock() if exe != nil { - exe.DecRef() + exe.DecRef(ctx) } mm.activeMu.Lock() diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go index 28e5057f7..0cfd60f6c 100644 --- a/pkg/sentry/mm/metadata.go +++ b/pkg/sentry/mm/metadata.go @@ -15,6 +15,7 @@ package mm import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/usermem" @@ -147,7 +148,7 @@ func (mm *MemoryManager) Executable() fsbridge.File { // SetExecutable sets the executable. // // This takes a reference on d. -func (mm *MemoryManager) SetExecutable(file fsbridge.File) { +func (mm *MemoryManager) SetExecutable(ctx context.Context, file fsbridge.File) { mm.metadataMu.Lock() // Grab a new reference. @@ -164,7 +165,7 @@ func (mm *MemoryManager) SetExecutable(file fsbridge.File) { // Do this without holding the lock, since it may wind up doing some // I/O to sync the dirent, etc. 
if orig != nil { - orig.DecRef() + orig.DecRef(ctx) } } diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 0e142fb11..4cdb52eb6 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -50,8 +50,8 @@ func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr memmap.F } // DecRef implements refs.RefCounter.DecRef. -func (m *SpecialMappable) DecRef() { - m.AtomicRefCount.DecRefWithDestructor(func() { +func (m *SpecialMappable) DecRef(ctx context.Context) { + m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { m.mfp.MemoryFile().DecRef(m.fr) }) } diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 3f496aa9f..e74d4e1c1 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -101,7 +101,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme if err != nil { return 0, err } - defer m.DecRef() + defer m.DecRef(ctx) opts.MappingIdentity = m opts.Mappable = m } @@ -1191,7 +1191,7 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length ui mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar)) mm.mappingMu.RUnlock() err := id.Msync(ctx, mr) - id.DecRef() + id.DecRef(ctx) if err != nil { return err } diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 16d8207e9..bd751d696 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -377,7 +377,7 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.canWriteMappableLocked()) } if vma.id != nil { - vma.id.DecRef() + vma.id.DecRef(ctx) } mm.usageAS -= uint64(vmaAR.Length()) if vma.isPrivateDataLocked() { @@ -446,7 +446,7 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa } if vma2.id != nil { - vma2.id.DecRef() + vma2.id.DecRef(context.Background()) } return vma1, true } diff --git 
a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 8b439a078..70ccf77a7 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -68,7 +68,7 @@ func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) { for _, fd := range fds { file := t.GetFile(fd) if file == nil { - files.Release() + files.Release(t) return nil, syserror.EBADF } files = append(files, file) @@ -100,9 +100,9 @@ func (fs *RightsFiles) Clone() transport.RightsControlMessage { } // Release implements transport.RightsControlMessage.Release. -func (fs *RightsFiles) Release() { +func (fs *RightsFiles) Release(ctx context.Context) { for _, f := range *fs { - f.DecRef() + f.DecRef(ctx) } *fs = nil } @@ -115,7 +115,7 @@ func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) ([]int32 fd, err := t.NewFDFrom(0, files[0], kernel.FDFlags{ CloseOnExec: cloexec, }) - files[0].DecRef() + files[0].DecRef(t) files = files[1:] if err != nil { t.Warningf("Error inserting FD: %v", err) diff --git a/pkg/sentry/socket/control/control_vfs2.go b/pkg/sentry/socket/control/control_vfs2.go index fd08179be..d9621968c 100644 --- a/pkg/sentry/socket/control/control_vfs2.go +++ b/pkg/sentry/socket/control/control_vfs2.go @@ -46,7 +46,7 @@ func NewSCMRightsVFS2(t *kernel.Task, fds []int32) (SCMRightsVFS2, error) { for _, fd := range fds { file := t.GetFileVFS2(fd) if file == nil { - files.Release() + files.Release(t) return nil, syserror.EBADF } files = append(files, file) @@ -78,9 +78,9 @@ func (fs *RightsFilesVFS2) Clone() transport.RightsControlMessage { } // Release implements transport.RightsControlMessage.Release. 
-func (fs *RightsFilesVFS2) Release() { +func (fs *RightsFilesVFS2) Release(ctx context.Context) { for _, f := range *fs { - f.DecRef() + f.DecRef(ctx) } *fs = nil } @@ -93,7 +93,7 @@ func rightsFDsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, max int) fd, err := t.NewFDFromVFS2(0, files[0], kernel.FDFlags{ CloseOnExec: cloexec, }) - files[0].DecRef() + files[0].DecRef(t) files = files[1:] if err != nil { t.Warningf("Error inserting FD: %v", err) diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 532a1ea5d..242e6bf76 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -100,12 +100,12 @@ func newSocketFile(ctx context.Context, family int, stype linux.SockType, protoc return nil, syserr.FromError(err) } dirent := socket.NewDirent(ctx, socketDevice) - defer dirent.DecRef() + defer dirent.DecRef(ctx) return fs.NewFile(ctx, dirent, fs.FileFlags{NonBlocking: nonblock, Read: true, Write: true, NonSeekable: true}, s), nil } // Release implements fs.FileOperations.Release. 
-func (s *socketOpsCommon) Release() { +func (s *socketOpsCommon) Release(context.Context) { fdnotifier.RemoveFD(int32(s.fd)) syscall.Close(s.fd) } @@ -269,7 +269,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, syscall.Close(fd) return 0, nil, 0, err } - defer f.DecRef() + defer f.DecRef(t) kfd, kerr = t.NewFDFromVFS2(0, f, kernel.FDFlags{ CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, @@ -281,7 +281,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, syscall.Close(fd) return 0, nil, 0, err } - defer f.DecRef() + defer f.DecRef(t) kfd, kerr = t.NewFDFrom(0, f, kernel.FDFlags{ CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index 0d45e5053..31e374833 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -97,7 +97,7 @@ func (*socketProvider) Socket(t *kernel.Task, stype linux.SockType, protocol int } d := socket.NewDirent(t, netlinkSocketDevice) - defer d.DecRef() + defer d.DecRef(t) return fs.NewFile(t, d, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, s), nil } diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 98ca7add0..68a9b9a96 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -140,14 +140,14 @@ func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socke // Bind the endpoint for good measure so we can connect to it. The // bound address will never be exposed. if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil { - ep.Close() + ep.Close(t) return nil, err } // Create a connection from which the kernel can write messages. 
connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t) if err != nil { - ep.Close() + ep.Close(t) return nil, err } @@ -164,9 +164,9 @@ func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socke } // Release implements fs.FileOperations.Release. -func (s *socketOpsCommon) Release() { - s.connection.Release() - s.ep.Close() +func (s *socketOpsCommon) Release(ctx context.Context) { + s.connection.Release(ctx) + s.ep.Close(ctx) if s.bound { s.ports.Release(s.protocol.Protocol(), s.portID) @@ -621,7 +621,7 @@ func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *sys if len(bufs) > 0 { // RecvMsg never receives the address, so we don't need to send // one. - _, notify, err := s.connection.Send(bufs, cms, tcpip.FullAddress{}) + _, notify, err := s.connection.Send(ctx, bufs, cms, tcpip.FullAddress{}) // If the buffer is full, we simply drop messages, just like // Linux. if err != nil && err != syserr.ErrWouldBlock { @@ -648,7 +648,7 @@ func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *sys // Add the dump_done_errno payload. m.Put(int64(0)) - _, notify, err := s.connection.Send([][]byte{m.Finalize()}, cms, tcpip.FullAddress{}) + _, notify, err := s.connection.Send(ctx, [][]byte{m.Finalize()}, cms, tcpip.FullAddress{}) if err != nil && err != syserr.ErrWouldBlock { return err } diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go index dbcd8b49a..a38d25da9 100644 --- a/pkg/sentry/socket/netlink/socket_vfs2.go +++ b/pkg/sentry/socket/netlink/socket_vfs2.go @@ -57,14 +57,14 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV // Bind the endpoint for good measure so we can connect to it. The // bound address will never be exposed. if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil { - ep.Close() + ep.Close(t) return nil, err } // Create a connection from which the kernel can write messages. 
connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t) if err != nil { - ep.Close() + ep.Close(t) return nil, err } diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 31a168f7e..e4846bc0b 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -330,7 +330,7 @@ func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue } dirent := socket.NewDirent(t, netstackDevice) - defer dirent.DecRef() + defer dirent.DecRef(t) return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, &SocketOperations{ socketOpsCommon: socketOpsCommon{ Queue: queue, @@ -479,7 +479,7 @@ func (s *socketOpsCommon) fetchReadView() *syserr.Error { } // Release implements fs.FileOperations.Release. -func (s *socketOpsCommon) Release() { +func (s *socketOpsCommon) Release(context.Context) { s.Endpoint.Close() } @@ -854,7 +854,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, if err != nil { return 0, nil, 0, err } - defer ns.DecRef() + defer ns.DecRef(t) if flags&linux.SOCK_NONBLOCK != 0 { flags := ns.Flags() diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index a9025b0ec..3335e7430 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -169,7 +169,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block if err != nil { return 0, nil, 0, err } - defer ns.DecRef() + defer ns.DecRef(t) if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil { return 0, nil, 0, syserr.FromError(err) diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index d112757fb..04b259d27 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -46,8 +46,8 @@ type ControlMessages struct { } // Release releases Unix domain 
socket credentials and rights. -func (c *ControlMessages) Release() { - c.Unix.Release() +func (c *ControlMessages) Release(ctx context.Context) { + c.Unix.Release(ctx) } // Socket is an interface combining fs.FileOperations and SocketOps, diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index a1e49cc57..c67b602f0 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -211,7 +211,7 @@ func (e *connectionedEndpoint) Listening() bool { // The socket will be a fresh state after a call to close and may be reused. // That is, close may be used to "unbind" or "disconnect" the socket in error // paths. -func (e *connectionedEndpoint) Close() { +func (e *connectionedEndpoint) Close(ctx context.Context) { e.Lock() var c ConnectedEndpoint var r Receiver @@ -233,7 +233,7 @@ func (e *connectionedEndpoint) Close() { case e.Listening(): close(e.acceptedChan) for n := range e.acceptedChan { - n.Close() + n.Close(ctx) } e.acceptedChan = nil e.path = "" @@ -241,11 +241,11 @@ func (e *connectionedEndpoint) Close() { e.Unlock() if c != nil { c.CloseNotify() - c.Release() + c.Release(ctx) } if r != nil { r.CloseNotify() - r.Release() + r.Release(ctx) } } @@ -340,7 +340,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn return nil default: // Busy; return ECONNREFUSED per spec. - ne.Close() + ne.Close(ctx) e.Unlock() ce.Unlock() return syserr.ErrConnectionRefused diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 4b06d63ac..70ee8f9b8 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -54,10 +54,10 @@ func (e *connectionlessEndpoint) isBound() bool { // Close puts the endpoint in a closed state and frees all resources associated // with it. 
-func (e *connectionlessEndpoint) Close() { +func (e *connectionlessEndpoint) Close(ctx context.Context) { e.Lock() if e.connected != nil { - e.connected.Release() + e.connected.Release(ctx) e.connected = nil } @@ -71,7 +71,7 @@ func (e *connectionlessEndpoint) Close() { e.Unlock() r.CloseNotify() - r.Release() + r.Release(ctx) } // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. @@ -108,10 +108,10 @@ func (e *connectionlessEndpoint) SendMsg(ctx context.Context, data [][]byte, c C if err != nil { return 0, syserr.ErrInvalidEndpointState } - defer connected.Release() + defer connected.Release(ctx) e.Lock() - n, notify, err := connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) + n, notify, err := connected.Send(ctx, data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) e.Unlock() if notify { @@ -135,7 +135,7 @@ func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoi e.Lock() if e.connected != nil { - e.connected.Release() + e.connected.Release(ctx) } e.connected = connected e.Unlock() diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index d8f3ad63d..ef6043e19 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -15,6 +15,7 @@ package transport import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" @@ -57,10 +58,10 @@ func (q *queue) Close() { // Both the read and write queues must be notified after resetting: // q.ReaderQueue.Notify(waiter.EventIn) // q.WriterQueue.Notify(waiter.EventOut) -func (q *queue) Reset() { +func (q *queue) Reset(ctx context.Context) { q.mu.Lock() for cur := q.dataList.Front(); cur != nil; cur = cur.Next() { - cur.Release() + cur.Release(ctx) } q.dataList.Reset() q.used = 0 @@ -68,8 +69,8 @@ func (q *queue) Reset() { } // DecRef implements RefCounter.DecRef with destructor q.Reset. 
-func (q *queue) DecRef() { - q.DecRefWithDestructor(q.Reset) +func (q *queue) DecRef(ctx context.Context) { + q.DecRefWithDestructor(ctx, q.Reset) // We don't need to notify after resetting because no one cares about // this queue after all references have been dropped. } @@ -111,7 +112,7 @@ func (q *queue) IsWritable() bool { // // If notify is true, ReaderQueue.Notify must be called: // q.ReaderQueue.Notify(waiter.EventIn) -func (q *queue) Enqueue(data [][]byte, c ControlMessages, from tcpip.FullAddress, discardEmpty bool, truncate bool) (l int64, notify bool, err *syserr.Error) { +func (q *queue) Enqueue(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress, discardEmpty bool, truncate bool) (l int64, notify bool, err *syserr.Error) { q.mu.Lock() if q.closed { @@ -124,7 +125,7 @@ func (q *queue) Enqueue(data [][]byte, c ControlMessages, from tcpip.FullAddress } if discardEmpty && l == 0 { q.mu.Unlock() - c.Release() + c.Release(ctx) return 0, false, nil } diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 2f1b127df..475d7177e 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -37,7 +37,7 @@ type RightsControlMessage interface { Clone() RightsControlMessage // Release releases any resources owned by the RightsControlMessage. - Release() + Release(ctx context.Context) } // A CredentialsControlMessage is a control message containing Unix credentials. @@ -74,9 +74,9 @@ func (c *ControlMessages) Clone() ControlMessages { } // Release releases both the credentials and the rights. -func (c *ControlMessages) Release() { +func (c *ControlMessages) Release(ctx context.Context) { if c.Rights != nil { - c.Rights.Release() + c.Rights.Release(ctx) } *c = ControlMessages{} } @@ -90,7 +90,7 @@ type Endpoint interface { // Close puts the endpoint in a closed state and frees all resources // associated with it. 
- Close() + Close(ctx context.Context) // RecvMsg reads data and a control message from the endpoint. This method // does not block if there is no data pending. @@ -252,7 +252,7 @@ type BoundEndpoint interface { // Release releases any resources held by the BoundEndpoint. It must be // called before dropping all references to a BoundEndpoint returned by a // function. - Release() + Release(ctx context.Context) } // message represents a message passed over a Unix domain socket. @@ -281,8 +281,8 @@ func (m *message) Length() int64 { } // Release releases any resources held by the message. -func (m *message) Release() { - m.Control.Release() +func (m *message) Release(ctx context.Context) { + m.Control.Release(ctx) } // Peek returns a copy of the message. @@ -304,7 +304,7 @@ type Receiver interface { // See Endpoint.RecvMsg for documentation on shared arguments. // // notify indicates if RecvNotify should be called. - Recv(data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error) + Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error) // RecvNotify notifies the Receiver of a successful Recv. This must not be // called while holding any endpoint locks. @@ -333,7 +333,7 @@ type Receiver interface { // Release releases any resources owned by the Receiver. It should be // called before droping all references to a Receiver. - Release() + Release(ctx context.Context) } // queueReceiver implements Receiver for datagram sockets. @@ -344,7 +344,7 @@ type queueReceiver struct { } // Recv implements Receiver.Recv. 
-func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { +func (q *queueReceiver) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { var m *message var notify bool var err *syserr.Error @@ -398,8 +398,8 @@ func (q *queueReceiver) RecvMaxQueueSize() int64 { } // Release implements Receiver.Release. -func (q *queueReceiver) Release() { - q.readQueue.DecRef() +func (q *queueReceiver) Release(ctx context.Context) { + q.readQueue.DecRef(ctx) } // streamQueueReceiver implements Receiver for stream sockets. @@ -456,7 +456,7 @@ func (q *streamQueueReceiver) RecvMaxQueueSize() int64 { } // Recv implements Receiver.Recv. -func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { +func (q *streamQueueReceiver) Recv(ctx context.Context, data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { q.mu.Lock() defer q.mu.Unlock() @@ -502,7 +502,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, var cmTruncated bool if c.Rights != nil && numRights == 0 { - c.Rights.Release() + c.Rights.Release(ctx) c.Rights = nil cmTruncated = true } @@ -557,7 +557,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, // Consume rights. if numRights == 0 { cmTruncated = true - q.control.Rights.Release() + q.control.Rights.Release(ctx) } else { c.Rights = q.control.Rights haveRights = true @@ -582,7 +582,7 @@ type ConnectedEndpoint interface { // // syserr.ErrWouldBlock can be returned along with a partial write if // the caller should block to send the rest of the data. 
- Send(data [][]byte, c ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error) + Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error) // SendNotify notifies the ConnectedEndpoint of a successful Send. This // must not be called while holding any endpoint locks. @@ -616,7 +616,7 @@ type ConnectedEndpoint interface { // Release releases any resources owned by the ConnectedEndpoint. It should // be called before droping all references to a ConnectedEndpoint. - Release() + Release(ctx context.Context) // CloseUnread sets the fact that this end is closed with unread data to // the peer socket. @@ -654,7 +654,7 @@ func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) } // Send implements ConnectedEndpoint.Send. -func (e *connectedEndpoint) Send(data [][]byte, c ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { +func (e *connectedEndpoint) Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { discardEmpty := false truncate := false if e.endpoint.Type() == linux.SOCK_STREAM { @@ -669,7 +669,7 @@ func (e *connectedEndpoint) Send(data [][]byte, c ControlMessages, from tcpip.Fu truncate = true } - return e.writeQueue.Enqueue(data, c, from, discardEmpty, truncate) + return e.writeQueue.Enqueue(ctx, data, c, from, discardEmpty, truncate) } // SendNotify implements ConnectedEndpoint.SendNotify. @@ -707,8 +707,8 @@ func (e *connectedEndpoint) SendMaxQueueSize() int64 { } // Release implements ConnectedEndpoint.Release. -func (e *connectedEndpoint) Release() { - e.writeQueue.DecRef() +func (e *connectedEndpoint) Release(ctx context.Context) { + e.writeQueue.DecRef(ctx) } // CloseUnread implements ConnectedEndpoint.CloseUnread. 
@@ -798,7 +798,7 @@ func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, n return 0, 0, ControlMessages{}, false, syserr.ErrNotConnected } - recvLen, msgLen, cms, cmt, a, notify, err := e.receiver.Recv(data, creds, numRights, peek) + recvLen, msgLen, cms, cmt, a, notify, err := e.receiver.Recv(ctx, data, creds, numRights, peek) e.Unlock() if err != nil { return 0, 0, ControlMessages{}, false, err @@ -827,7 +827,7 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess return 0, syserr.ErrAlreadyConnected } - n, notify, err := e.connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) + n, notify, err := e.connected.Send(ctx, data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) e.Unlock() if notify { @@ -1001,6 +1001,6 @@ func (e *baseEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { } // Release implements BoundEndpoint.Release. -func (*baseEndpoint) Release() { +func (*baseEndpoint) Release(context.Context) { // Binding a baseEndpoint doesn't take a reference. } diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 0482d33cf..2b8454edb 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -62,7 +62,7 @@ type SocketOperations struct { // New creates a new unix socket. func New(ctx context.Context, endpoint transport.Endpoint, stype linux.SockType) *fs.File { dirent := socket.NewDirent(ctx, unixSocketDevice) - defer dirent.DecRef() + defer dirent.DecRef(ctx) return NewWithDirent(ctx, dirent, endpoint, stype, fs.FileFlags{Read: true, Write: true, NonSeekable: true}) } @@ -97,17 +97,17 @@ type socketOpsCommon struct { } // DecRef implements RefCounter.DecRef. 
-func (s *socketOpsCommon) DecRef() { - s.DecRefWithDestructor(func() { - s.ep.Close() +func (s *socketOpsCommon) DecRef(ctx context.Context) { + s.DecRefWithDestructor(ctx, func(context.Context) { + s.ep.Close(ctx) }) } // Release implemements fs.FileOperations.Release. -func (s *socketOpsCommon) Release() { +func (s *socketOpsCommon) Release(ctx context.Context) { // Release only decrements a reference on s because s may be referenced in // the abstract socket namespace. - s.DecRef() + s.DecRef(ctx) } func (s *socketOpsCommon) isPacket() bool { @@ -234,7 +234,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } ns := New(t, ep, s.stype) - defer ns.DecRef() + defer ns.DecRef(t) if flags&linux.SOCK_NONBLOCK != 0 { flags := ns.Flags() @@ -284,7 +284,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if t.IsNetworkNamespaced() { return syserr.ErrInvalidEndpointState } - if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil { + if err := t.AbstractSockets().Bind(t, p[1:], bep, s); err != nil { // syserr.ErrPortInUse corresponds to EADDRINUSE. return syserr.ErrPortInUse } @@ -294,7 +294,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { var name string cwd := t.FSContext().WorkingDirectory() - defer cwd.DecRef() + defer cwd.DecRef(t) // Is there no slash at all? if !strings.Contains(p, "/") { @@ -302,7 +302,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { name = p } else { root := t.FSContext().RootDirectory() - defer root.DecRef() + defer root.DecRef(t) // Find the last path component, we know that something follows // that final slash, otherwise extractPath() would have failed. lastSlash := strings.LastIndex(p, "/") @@ -318,7 +318,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { // No path available. 
return syserr.ErrNoSuchFile } - defer d.DecRef() + defer d.DecRef(t) name = p[lastSlash+1:] } @@ -332,7 +332,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if err != nil { return syserr.ErrPortInUse } - childDir.DecRef() + childDir.DecRef(t) } return nil @@ -378,9 +378,9 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, FollowFinalSymlink: true, } ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop, &vfs.BoundEndpointOptions{path}) - root.DecRef() + root.DecRef(t) if relPath { - start.DecRef() + start.DecRef(t) } if e != nil { return nil, syserr.FromError(e) @@ -393,15 +393,15 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, cwd := t.FSContext().WorkingDirectory() remainingTraversals := uint(fs.DefaultTraversalLimit) d, e := t.MountNamespace().FindInode(t, root, cwd, path, &remainingTraversals) - cwd.DecRef() - root.DecRef() + cwd.DecRef(t) + root.DecRef(t) if e != nil { return nil, syserr.FromError(e) } // Extract the endpoint if one is there. ep := d.Inode.BoundEndpoint(path) - d.DecRef() + d.DecRef(t) if ep == nil { // No socket! return nil, syserr.ErrConnectionRefused @@ -415,7 +415,7 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool if err != nil { return err } - defer ep.Release() + defer ep.Release(t) // Connect the server endpoint. 
err = s.ep.Connect(t, ep) @@ -473,7 +473,7 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b if err != nil { return 0, err } - defer ep.Release() + defer ep.Release(t) w.To = ep if ep.Passcred() && w.Control.Credentials == nil { diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 05c16fcfe..dfa25241a 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -136,7 +136,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block if err != nil { return 0, nil, 0, err } - defer ns.DecRef() + defer ns.DecRef(t) if flags&linux.SOCK_NONBLOCK != 0 { ns.SetStatusFlags(t, t.Credentials(), linux.SOCK_NONBLOCK) @@ -183,19 +183,19 @@ func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if t.IsNetworkNamespaced() { return syserr.ErrInvalidEndpointState } - if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil { + if err := t.AbstractSockets().Bind(t, p[1:], bep, s); err != nil { // syserr.ErrPortInUse corresponds to EADDRINUSE. 
return syserr.ErrPortInUse } } else { path := fspath.Parse(p) root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) start := root relPath := !path.Absolute if relPath { start = t.FSContext().WorkingDirectoryVFS2() - defer start.DecRef() + defer start.DecRef(t) } pop := vfs.PathOperation{ Root: root, @@ -333,7 +333,7 @@ func (*providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) f, err := NewSockfsFile(t, ep, stype) if err != nil { - ep.Close() + ep.Close(t) return nil, err } return f, nil @@ -357,14 +357,14 @@ func (*providerVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (* ep1, ep2 := transport.NewPair(t, stype, t.Kernel()) s1, err := NewSockfsFile(t, ep1, stype) if err != nil { - ep1.Close() - ep2.Close() + ep1.Close(t) + ep2.Close(t) return nil, nil, err } s2, err := NewSockfsFile(t, ep2, stype) if err != nil { - s1.DecRef() - ep2.Close() + s1.DecRef(t) + ep2.Close(t) return nil, nil, err } diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 68ca537c8..87b239730 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -147,14 +147,14 @@ func fd(t *kernel.Task, fd int32) string { root := t.FSContext().RootDirectory() if root != nil { - defer root.DecRef() + defer root.DecRef(t) } if fd == linux.AT_FDCWD { wd := t.FSContext().WorkingDirectory() var name string if wd != nil { - defer wd.DecRef() + defer wd.DecRef(t) name, _ = wd.FullName(root) } else { name = "(unknown cwd)" @@ -167,7 +167,7 @@ func fd(t *kernel.Task, fd int32) string { // Cast FD to uint64 to avoid printing negative hex. 
return fmt.Sprintf("%#x (bad FD)", uint64(fd)) } - defer file.DecRef() + defer file.DecRef(t) name, _ := file.Dirent.FullName(root) return fmt.Sprintf("%#x %s", fd, name) @@ -175,12 +175,12 @@ func fd(t *kernel.Task, fd int32) string { func fdVFS2(t *kernel.Task, fd int32) string { root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) vfsObj := root.Mount().Filesystem().VirtualFilesystem() if fd == linux.AT_FDCWD { wd := t.FSContext().WorkingDirectoryVFS2() - defer wd.DecRef() + defer wd.DecRef(t) name, _ := vfsObj.PathnameWithDeleted(t, root, wd) return fmt.Sprintf("AT_FDCWD %s", name) @@ -191,7 +191,7 @@ func fdVFS2(t *kernel.Task, fd int32) string { // Cast FD to uint64 to avoid printing negative hex. return fmt.Sprintf("%#x (bad FD)", uint64(fd)) } - defer file.DecRef() + defer file.DecRef(t) name, _ := vfsObj.PathnameWithDeleted(t, root, file.VirtualDentry()) return fmt.Sprintf("%#x %s", fd, name) diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go index d9fb808c0..d23a0068a 100644 --- a/pkg/sentry/syscalls/epoll.go +++ b/pkg/sentry/syscalls/epoll.go @@ -28,7 +28,7 @@ import ( // CreateEpoll implements the epoll_create(2) linux syscall. func CreateEpoll(t *kernel.Task, closeOnExec bool) (int32, error) { file := epoll.NewEventPoll(t) - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: closeOnExec, @@ -47,14 +47,14 @@ func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask if epollfile == nil { return syserror.EBADF } - defer epollfile.DecRef() + defer epollfile.DecRef(t) // Get the target file id. file := t.GetFile(fd) if file == nil { return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the epollPoll operations. 
e, ok := epollfile.FileOperations.(*epoll.EventPoll) @@ -73,14 +73,14 @@ func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, m if epollfile == nil { return syserror.EBADF } - defer epollfile.DecRef() + defer epollfile.DecRef(t) // Get the target file id. file := t.GetFile(fd) if file == nil { return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) @@ -99,14 +99,14 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { if epollfile == nil { return syserror.EBADF } - defer epollfile.DecRef() + defer epollfile.DecRef(t) // Get the target file id. file := t.GetFile(fd) if file == nil { return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) @@ -115,7 +115,7 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { } // Try to remove the entry. - return e.RemoveEntry(epoll.FileIdentifier{file, fd}) + return e.RemoveEntry(t, epoll.FileIdentifier{file, fd}) } // WaitEpoll implements the epoll_wait(2) linux syscall. @@ -125,7 +125,7 @@ func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEve if epollfile == nil { return nil, syserror.EBADF } - defer epollfile.DecRef() + defer epollfile.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index ba2557c52..e9d64dec5 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -247,7 +247,7 @@ func getAIOCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *linu ev.Result = -int64(kernel.ExtractErrno(err, 0)) } - file.DecRef() + file.DecRef(ctx) // Queue the result for delivery. 
actx.FinishRequest(ev) @@ -257,7 +257,7 @@ func getAIOCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *linu // wake up. if eventFile != nil { eventFile.FileOperations.(*eventfd.EventOperations).Signal(1) - eventFile.DecRef() + eventFile.DecRef(ctx) } } } @@ -269,7 +269,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr user // File not found. return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Was there an eventFD? Extract it. var eventFile *fs.File @@ -279,7 +279,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr user // Bad FD. return syserror.EBADF } - defer eventFile.DecRef() + defer eventFile.DecRef(t) // Check that it is an eventfd. if _, ok := eventFile.FileOperations.(*eventfd.EventOperations); !ok { diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go index ed3413ca6..3b4f879e4 100644 --- a/pkg/sentry/syscalls/linux/sys_eventfd.go +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -37,7 +37,7 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc event.SetFlags(fs.SettableFileFlags{ NonBlocking: flags&linux.EFD_NONBLOCK != 0, }) - defer event.DecRef() + defer event.DecRef(t) fd, err := t.NewFDFrom(0, event, kernel.FDFlags{ CloseOnExec: flags&linux.EFD_CLOEXEC != 0, diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 8cf6401e7..1bc9b184e 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -40,7 +40,7 @@ func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, // Common case: we are accessing a file in the root. root := t.FSContext().RootDirectory() err := fn(root, root, name, linux.MaxSymlinkTraversals) - root.DecRef() + root.DecRef(t) return err } else if dir == "." 
&& dirFD == linux.AT_FDCWD { // Common case: we are accessing a file relative to the current @@ -48,8 +48,8 @@ func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, wd := t.FSContext().WorkingDirectory() root := t.FSContext().RootDirectory() err := fn(root, wd, name, linux.MaxSymlinkTraversals) - wd.DecRef() - root.DecRef() + wd.DecRef(t) + root.DecRef(t) return err } @@ -97,19 +97,19 @@ func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(ro } else { d, err = t.MountNamespace().FindLink(t, root, rel, path, &remainingTraversals) } - root.DecRef() + root.DecRef(t) if wd != nil { - wd.DecRef() + wd.DecRef(t) } if f != nil { - f.DecRef() + f.DecRef(t) } if err != nil { return err } err = fn(root, d, remainingTraversals) - d.DecRef() + d.DecRef(t) return err } @@ -186,7 +186,7 @@ func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uint if err != nil { return syserror.ConvertIntr(err, kernel.ERESTARTSYS) } - defer file.DecRef() + defer file.DecRef(t) // Success. newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{ @@ -242,7 +242,7 @@ func mknodAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode if err != nil { return err } - file.DecRef() + file.DecRef(t) return nil case linux.ModeNamedPipe: @@ -332,7 +332,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l if err != nil { break } - defer found.DecRef() + defer found.DecRef(t) // We found something (possibly a symlink). If the // O_EXCL flag was passed, then we can immediately @@ -357,7 +357,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l resolved, err = found.Inode.Getlink(t) if err == nil { // No more resolution necessary. 
- defer resolved.DecRef() + defer resolved.DecRef(t) break } if err != fs.ErrResolveViaReadlink { @@ -384,7 +384,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l if err != nil { break } - defer newParent.DecRef() + defer newParent.DecRef(t) // Repeat the process with the parent and name of the // symlink target. @@ -416,7 +416,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l if err != nil { return syserror.ConvertIntr(err, kernel.ERESTARTSYS) } - defer newFile.DecRef() + defer newFile.DecRef(t) case syserror.ENOENT: // File does not exist. Proceed with creation. @@ -432,7 +432,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l // No luck, bail. return err } - defer newFile.DecRef() + defer newFile.DecRef(t) found = newFile.Dirent default: return err @@ -596,7 +596,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Shared flags between file and socket. switch request { @@ -671,9 +671,9 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal addr := args[0].Pointer() size := args[1].SizeT() cwd := t.FSContext().WorkingDirectory() - defer cwd.DecRef() + defer cwd.DecRef(t) root := t.FSContext().RootDirectory() - defer root.DecRef() + defer root.DecRef(t) // Get our fullname from the root and preprend unreachable if the root was // unreachable from our current dirent this is the same behavior as on linux. 
@@ -722,7 +722,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return err } - t.FSContext().SetRootDirectory(d) + t.FSContext().SetRootDirectory(t, d) return nil }) } @@ -747,7 +747,7 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return err } - t.FSContext().SetWorkingDirectory(d) + t.FSContext().SetWorkingDirectory(t, d) return nil }) } @@ -760,7 +760,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Is it a directory? if !fs.IsDir(file.Dirent.Inode.StableAttr) { @@ -772,7 +772,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err } - t.FSContext().SetWorkingDirectory(file.Dirent) + t.FSContext().SetWorkingDirectory(t, file.Dirent) return 0, nil, nil } @@ -791,7 +791,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := file.Flush(t) return 0, nil, handleIOError(t, false /* partial */, err, syserror.EINTR, "close", file) @@ -805,7 +805,7 @@ func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{}) if err != nil { @@ -826,7 +826,7 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if oldFile == nil { return 0, nil, syserror.EBADF } - defer oldFile.DecRef() + defer oldFile.DecRef(t) return uintptr(newfd), nil, nil } @@ -850,7 +850,7 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if oldFile == nil { return 0, nil, syserror.EBADF } - defer oldFile.DecRef() + defer oldFile.DecRef(t) err := t.NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: 
flags&linux.O_CLOEXEC != 0}) if err != nil { @@ -925,7 +925,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: @@ -1132,7 +1132,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // If the FD refers to a pipe or FIFO, return error. if fs.IsPipe(file.Dirent.Inode.StableAttr) { @@ -1171,7 +1171,7 @@ func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode switch err { case nil: // The directory existed. - defer f.DecRef() + defer f.DecRef(t) return syserror.EEXIST case syserror.EACCES: // Permission denied while walking to the directory. @@ -1349,7 +1349,7 @@ func linkAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32 if target == nil { return syserror.EBADF } - defer target.DecRef() + defer target.DecRef(t) if err := mayLinkAt(t, target.Dirent.Inode); err != nil { return err } @@ -1602,7 +1602,7 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Reject truncation if the file flags do not permit this operation. // This is different from truncate(2) above. 
@@ -1730,7 +1730,7 @@ func chownAt(t *kernel.Task, fd int32, addr usermem.Addr, resolve, allowEmpty bo if file == nil { return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return chown(t, file.Dirent, uid, gid) } @@ -1768,7 +1768,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, chown(t, file.Dirent, uid, gid) } @@ -1833,7 +1833,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, chmod(t, file.Dirent, mode) } @@ -1893,10 +1893,10 @@ func utimes(t *kernel.Task, dirFD int32, addr usermem.Addr, ts fs.TimeSpec, reso if f == nil { return syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) root := t.FSContext().RootDirectory() - defer root.DecRef() + defer root.DecRef(t) return setTimestamp(root, f.Dirent, linux.MaxSymlinkTraversals) } @@ -2088,7 +2088,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) if offset < 0 || length <= 0 { return 0, nil, syserror.EINVAL @@ -2141,7 +2141,7 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // flock(2): EBADF fd is not an open file descriptor. 
return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) nonblocking := operation&linux.LOCK_NB != 0 operation &^= linux.LOCK_NB @@ -2224,8 +2224,8 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, err } - defer dirent.DecRef() - defer file.DecRef() + defer dirent.DecRef(t) + defer file.DecRef(t) newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: cloExec, diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index f04d78856..9d1b2edb1 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -73,7 +73,7 @@ func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, fo err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts)) } - t.Futex().WaitComplete(w) + t.Futex().WaitComplete(w, t) return 0, syserror.ConvertIntr(err, kernel.ERESTARTSYS) } @@ -95,7 +95,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add } remaining, err := t.BlockWithTimeout(w.C, !forever, duration) - t.Futex().WaitComplete(w) + t.Futex().WaitComplete(w, t) if err == nil { return 0, nil } @@ -148,7 +148,7 @@ func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr usermem.A timer.Destroy() } - t.Futex().WaitComplete(w) + t.Futex().WaitComplete(w, t) return syserror.ConvertIntr(err, kernel.ERESTARTSYS) } diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index b126fecc0..f5699e55d 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -68,7 +68,7 @@ func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dir if dir == nil { return 0, syserror.EBADF } - defer dir.DecRef() + defer dir.DecRef(t) w := &usermem.IOReadWriter{ Ctx: t, diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go index b2c7b3444..cf47bb9dd 
100644 --- a/pkg/sentry/syscalls/linux/sys_inotify.go +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -40,7 +40,7 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. NonBlocking: flags&linux.IN_NONBLOCK != 0, } n := fs.NewFile(t, dirent, fileFlags, fs.NewInotify(t)) - defer n.DecRef() + defer n.DecRef(t) fd, err := t.NewFDFrom(0, n, kernel.FDFlags{ CloseOnExec: flags&linux.IN_CLOEXEC != 0, @@ -71,7 +71,7 @@ func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) { ino, ok := file.FileOperations.(*fs.Inotify) if !ok { // Not an inotify fd. - file.DecRef() + file.DecRef(t) return nil, nil, syserror.EINVAL } @@ -98,7 +98,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { @@ -128,6 +128,6 @@ func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if err != nil { return 0, nil, err } - defer file.DecRef() - return 0, nil, ino.RmWatch(wd) + defer file.DecRef(t) + return 0, nil, ino.RmWatch(t, wd) } diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go index 3f7691eae..1c38f8f4f 100644 --- a/pkg/sentry/syscalls/linux/sys_lseek.go +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -33,7 +33,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) var sw fs.SeekWhence switch whence { diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 91694d374..72786b032 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -75,7 +75,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC } defer func() { if opts.MappingIdentity != nil { - 
opts.MappingIdentity.DecRef() + opts.MappingIdentity.DecRef(t) } }() @@ -85,7 +85,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) flags := file.Flags() // mmap unconditionally requires that the FD is readable. diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go index eb5ff48f5..bd0633564 100644 --- a/pkg/sentry/syscalls/linux/sys_mount.go +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -115,7 +115,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall }); err != nil { // Something went wrong. Drop our ref on rootInode before // returning the error. - rootInode.DecRef() + rootInode.DecRef(t) return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 43c510930..3149e4aad 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -34,10 +34,10 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize) r.SetFlags(linuxToFlags(flags).Settable()) - defer r.DecRef() + defer r.DecRef(t) w.SetFlags(linuxToFlags(flags).Settable()) - defer w.DecRef() + defer w.DecRef(t) fds, err := t.NewFDs(0, []*fs.File{r, w}, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, @@ -49,7 +49,7 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { if _, err := t.CopyOut(addr, fds); err != nil { for _, fd := range fds { if file, _ := t.FDTable().Remove(fd); file != nil { - file.DecRef() + file.DecRef(t) } } return 0, err diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index f0198141c..3435bdf77 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -70,7 +70,7 @@ func initReadiness(t 
*kernel.Task, pfd *linux.PollFD, state *pollState, ch chan } if ch == nil { - defer file.DecRef() + defer file.DecRef(t) } else { state.file = file state.waiter, _ = waiter.NewChannelEntry(ch) @@ -82,11 +82,11 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan } // releaseState releases all the pollState in "state". -func releaseState(state []pollState) { +func releaseState(t *kernel.Task, state []pollState) { for i := range state { if state[i].file != nil { state[i].file.EventUnregister(&state[i].waiter) - state[i].file.DecRef() + state[i].file.DecRef(t) } } } @@ -107,7 +107,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time. // result, we stop registering for events but still go through all files // to get their ready masks. state := make([]pollState, len(pfd)) - defer releaseState(state) + defer releaseState(t, state) n := uintptr(0) for i := range pfd { initReadiness(t, &pfd[i], &state[i], ch) @@ -266,7 +266,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add if file == nil { return 0, syserror.EBADF } - file.DecRef() + file.DecRef(t) var mask int16 if (rV & m) != 0 { diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index f92bf8096..64a725296 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -128,7 +128,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // They trying to set exe to a non-file? if !fs.IsFile(file.Dirent.Inode.StableAttr) { @@ -136,7 +136,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } // Set the underlying executable. 
- t.MemoryManager().SetExecutable(fsbridge.NewFSFile(file)) + t.MemoryManager().SetExecutable(t, fsbridge.NewFSFile(file)) case linux.PR_SET_MM_AUXV, linux.PR_SET_MM_START_CODE, diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index 071b4bacc..3bbc3fa4b 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -48,7 +48,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { @@ -84,7 +84,7 @@ func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { @@ -118,7 +118,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { @@ -164,7 +164,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { @@ -195,7 +195,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { @@ -244,7 +244,7 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. 
if offset < -1 { diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go index 4a8bc24a2..f0ae8fa8e 100644 --- a/pkg/sentry/syscalls/linux/sys_shm.go +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -39,7 +39,7 @@ func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - defer segment.DecRef() + defer segment.DecRef(t) return uintptr(segment.ID), nil, nil } @@ -66,7 +66,7 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, syserror.EINVAL } - defer segment.DecRef() + defer segment.DecRef(t) opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{ Execute: flag&linux.SHM_EXEC == linux.SHM_EXEC, @@ -108,7 +108,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, syserror.EINVAL } - defer segment.DecRef() + defer segment.DecRef(t) stat, err := segment.IPCStat(t) if err == nil { @@ -132,7 +132,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, syserror.EINVAL } - defer segment.DecRef() + defer segment.DecRef(t) switch cmd { case linux.IPC_SET: @@ -145,7 +145,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err case linux.IPC_RMID: - segment.MarkDestroyed() + segment.MarkDestroyed(t) return 0, nil, nil case linux.SHM_LOCK, linux.SHM_UNLOCK: diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index d2b0012ae..20cb1a5cb 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -536,7 +536,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Is this a signalfd? 
if s, ok := file.FileOperations.(*signalfd.SignalOperations); ok { @@ -553,7 +553,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) // Set appropriate flags. file.SetFlags(fs.SettableFileFlags{ diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 414fce8e3..fec1c1974 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -200,7 +200,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal s.SetFlags(fs.SettableFileFlags{ NonBlocking: stype&linux.SOCK_NONBLOCK != 0, }) - defer s.DecRef() + defer s.DecRef(t) fd, err := t.NewFDFrom(0, s, kernel.FDFlags{ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, @@ -235,8 +235,8 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } s1.SetFlags(fileFlags) s2.SetFlags(fileFlags) - defer s1.DecRef() - defer s2.DecRef() + defer s1.DecRef(t) + defer s2.DecRef(t) // Create the FDs for the sockets. fds, err := t.NewFDs(0, []*fs.File{s1, s2}, kernel.FDFlags{ @@ -250,7 +250,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if _, err := t.CopyOut(socks, fds); err != nil { for _, fd := range fds { if file, _ := t.FDTable().Remove(fd); file != nil { - file.DecRef() + file.DecRef(t) } } return 0, nil, err @@ -270,7 +270,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -301,7 +301,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. 
s, ok := file.FileOperations.(socket.Socket) @@ -360,7 +360,7 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -387,7 +387,7 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -416,7 +416,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -447,7 +447,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -529,7 +529,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -567,7 +567,7 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -595,7 +595,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. 
s, ok := file.FileOperations.(socket.Socket) @@ -628,7 +628,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -681,7 +681,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -775,7 +775,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i } if !cms.Unix.Empty() { mflags |= linux.MSG_CTRUNC - cms.Release() + cms.Release(t) } if int(msg.Flags) != mflags { @@ -795,7 +795,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } - defer cms.Release() + defer cms.Release(t) controlData := make([]byte, 0, msg.ControlLen) controlData = control.PackControlMessages(t, cms, controlData) @@ -851,7 +851,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -880,7 +880,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag } n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) - cm.Release() + cm.Release(t) if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } @@ -924,7 +924,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. 
s, ok := file.FileOperations.(socket.Socket) @@ -962,7 +962,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -1066,7 +1066,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file) if err != nil { - controlMessages.Release() + controlMessages.Release(t) } return uintptr(n), err } @@ -1084,7 +1084,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index 77c78889d..b8846a10a 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -101,7 +101,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) if !inFile.Flags().Read { return 0, nil, syserror.EBADF @@ -111,7 +111,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) if !outFile.Flags().Write { return 0, nil, syserror.EBADF @@ -192,13 +192,13 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer 
inFile.DecRef(t) // The operation is non-blocking if anything is non-blocking. // @@ -300,13 +300,13 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) // All files must be pipes. if !fs.IsPipe(inFile.Dirent.Inode.StableAttr) || !fs.IsPipe(outFile.Dirent.Inode.StableAttr) { diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 46ebf27a2..a5826f2dd 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -58,7 +58,7 @@ func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, fstat(t, file, statAddr) } @@ -100,7 +100,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, fstat(t, file, statAddr) } @@ -158,7 +158,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) uattr, err := file.UnstableAttr(t) if err != nil { return 0, nil, err @@ -249,7 +249,7 @@ func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, statfsImpl(t, file.Dirent, statfsAddr) } diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 5ad465ae3..f2c0e5069 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -39,7 +39,7 @@ func Syncfs(t *kernel.Task, 
args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Use "sync-the-world" for now, it's guaranteed that fd is at least // on the root filesystem. @@ -54,7 +54,7 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncAll) return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) @@ -70,7 +70,7 @@ func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncData) return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) @@ -103,7 +103,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // SYNC_FILE_RANGE_WAIT_BEFORE waits upon write-out of all pages in the // specified range that have already been submitted to the device diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 00915fdde..2d16e4933 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -117,7 +117,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user resolveFinal := flags&linux.AT_SYMLINK_NOFOLLOW == 0 root := t.FSContext().RootDirectory() - defer root.DecRef() + defer root.DecRef(t) var wd *fs.Dirent var executable fsbridge.File @@ -133,7 +133,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) closeOnExec = fdFlags.CloseOnExec if atEmptyPath && len(pathname) == 0 { @@ -155,7 +155,7 @@ func execveat(t 
*kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user } } if wd != nil { - defer wd.DecRef() + defer wd.DecRef(t) } // Load the new TaskContext. diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go index cf49b43db..34b03e4ee 100644 --- a/pkg/sentry/syscalls/linux/sys_timerfd.go +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -43,7 +43,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel return 0, nil, syserror.EINVAL } f := timerfd.NewFile(t, c) - defer f.DecRef() + defer f.DecRef(t) f.SetFlags(fs.SettableFileFlags{ NonBlocking: flags&linux.TFD_NONBLOCK != 0, }) @@ -73,7 +73,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) tf, ok := f.FileOperations.(*timerfd.TimerOperations) if !ok { @@ -107,7 +107,7 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) tf, ok := f.FileOperations.(*timerfd.TimerOperations) if !ok { diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index 6ec0de96e..485526e28 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -48,7 +48,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is writable. if !file.Flags().Write { @@ -85,7 +85,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. 
if offset < 0 || offset+int64(size) < 0 { @@ -131,7 +131,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is writable. if !file.Flags().Write { @@ -162,7 +162,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { @@ -215,7 +215,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go index c24946160..97474fd3c 100644 --- a/pkg/sentry/syscalls/linux/sys_xattr.go +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -49,7 +49,7 @@ func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) n, err := getXattr(t, f.Dirent, nameAddr, valueAddr, size) if err != nil { @@ -153,7 +153,7 @@ func FSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) return 0, nil, setXattr(t, f.Dirent, nameAddr, valueAddr, uint64(size), flags) } @@ -270,7 +270,7 @@ func FListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) n, err := listXattr(t, f.Dirent, listAddr, size) if err != nil { @@ -384,7 +384,7 @@ func FRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. 
if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) return 0, nil, removeXattr(t, f.Dirent, nameAddr) } diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go index e5cdefc50..399b4f60c 100644 --- a/pkg/sentry/syscalls/linux/vfs2/aio.go +++ b/pkg/sentry/syscalls/linux/vfs2/aio.go @@ -88,7 +88,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr user if fd == nil { return syserror.EBADF } - defer fd.DecRef() + defer fd.DecRef(t) // Was there an eventFD? Extract it. var eventFD *vfs.FileDescription @@ -97,7 +97,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr user if eventFD == nil { return syserror.EBADF } - defer eventFD.DecRef() + defer eventFD.DecRef(t) // Check that it is an eventfd. if _, ok := eventFD.Impl().(*eventfd.EventFileDescription); !ok { @@ -169,7 +169,7 @@ func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr use ev.Result = -int64(kernel.ExtractErrno(err, 0)) } - fd.DecRef() + fd.DecRef(ctx) // Queue the result for delivery. aioCtx.FinishRequest(ev) @@ -179,7 +179,7 @@ func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr use // wake up. if eventFD != nil { eventFD.Impl().(*eventfd.EventFileDescription).Signal(1) - eventFD.DecRef() + eventFD.DecRef(ctx) } } } diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go index 34c90ae3e..c62f03509 100644 --- a/pkg/sentry/syscalls/linux/vfs2/epoll.go +++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go @@ -37,11 +37,11 @@ func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. 
return 0, nil, syserror.EINVAL } - file, err := t.Kernel().VFS().NewEpollInstanceFD() + file, err := t.Kernel().VFS().NewEpollInstanceFD(t) if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0, @@ -62,11 +62,11 @@ func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, syserror.EINVAL } - file, err := t.Kernel().VFS().NewEpollInstanceFD() + file, err := t.Kernel().VFS().NewEpollInstanceFD(t) if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) if err != nil { @@ -86,7 +86,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if epfile == nil { return 0, nil, syserror.EBADF } - defer epfile.DecRef() + defer epfile.DecRef(t) ep, ok := epfile.Impl().(*vfs.EpollInstance) if !ok { return 0, nil, syserror.EINVAL @@ -95,7 +95,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) if epfile == file { return 0, nil, syserror.EINVAL } @@ -135,7 +135,7 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if epfile == nil { return 0, nil, syserror.EBADF } - defer epfile.DecRef() + defer epfile.DecRef(t) ep, ok := epfile.Impl().(*vfs.EpollInstance) if !ok { return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go index aff1a2070..807f909da 100644 --- a/pkg/sentry/syscalls/linux/vfs2/eventfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go @@ -38,11 +38,11 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc fileFlags |= linux.O_NONBLOCK } semMode := flags&linux.EFD_SEMAPHORE != 0 - eventfd, err := eventfd.New(vfsObj, initVal, semMode, fileFlags) 
+ eventfd, err := eventfd.New(t, vfsObj, initVal, semMode, fileFlags) if err != nil { return 0, nil, err } - defer eventfd.DecRef() + defer eventfd.DecRef(t) fd, err := t.NewFDFromVFS2(0, eventfd, kernel.FDFlags{ CloseOnExec: flags&linux.EFD_CLOEXEC != 0, diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go index aef0078a8..066ee0863 100644 --- a/pkg/sentry/syscalls/linux/vfs2/execve.go +++ b/pkg/sentry/syscalls/linux/vfs2/execve.go @@ -71,7 +71,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user } root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) var executable fsbridge.File closeOnExec := false if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute { @@ -90,7 +90,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user } start := dirfile.VirtualDentry() start.IncRef() - dirfile.DecRef() + dirfile.DecRef(t) closeOnExec = dirfileFlags.CloseOnExec file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{ Root: root, @@ -101,19 +101,19 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user Flags: linux.O_RDONLY, FileExec: true, }) - start.DecRef() + start.DecRef(t) if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) executable = fsbridge.NewVFSFile(file) } // Load the new TaskContext. 
mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change - defer mntns.DecRef() + defer mntns.DecRef(t) wd := t.FSContext().WorkingDirectoryVFS2() - defer wd.DecRef() + defer wd.DecRef(t) remainingTraversals := uint(linux.MaxSymlinkTraversals) loadArgs := loader.LoadArgs{ Opener: fsbridge.NewVFSLookup(mntns, root, wd), diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 67f191551..72ca916a0 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -38,7 +38,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := file.OnClose(t) return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file) @@ -52,7 +52,7 @@ func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) if err != nil { @@ -72,7 +72,7 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - file.DecRef() + file.DecRef(t) return uintptr(newfd), nil, nil } @@ -101,7 +101,7 @@ func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := t.NewFDAtVFS2(newfd, file, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, @@ -121,7 +121,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: @@ -332,7 +332,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil 
{ return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // If the FD refers to a pipe or FIFO, return error. if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe { diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go index b6d2ddd65..01e0f9010 100644 --- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -56,7 +56,7 @@ func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd i if err != nil { return err } - defer oldtpop.Release() + defer oldtpop.Release(t) newpath, err := copyInPath(t, newpathAddr) if err != nil { @@ -66,7 +66,7 @@ func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd i if err != nil { return err } - defer newtpop.Release() + defer newtpop.Release(t) return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop) } @@ -95,7 +95,7 @@ func mkdirat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint) error { if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{ Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()), }) @@ -127,7 +127,7 @@ func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode linux.FileMode if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) // "Zero file type is equivalent to type S_IFREG." 
- mknod(2) if mode.FileType() == 0 { @@ -174,7 +174,7 @@ func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mo if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{ Flags: flags | linux.O_LARGEFILE, @@ -183,7 +183,7 @@ func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mo if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, @@ -227,7 +227,7 @@ func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd if err != nil { return err } - defer oldtpop.Release() + defer oldtpop.Release(t) newpath, err := copyInPath(t, newpathAddr) if err != nil { @@ -237,7 +237,7 @@ func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd if err != nil { return err } - defer newtpop.Release() + defer newtpop.Release(t) return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{ Flags: flags, @@ -259,7 +259,7 @@ func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error { if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop) } @@ -278,7 +278,7 @@ func unlinkat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error { if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop) } @@ -329,6 +329,6 @@ func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpath if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target) } diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go index 
317409a18..a7d4d2a36 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fscontext.go +++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go @@ -31,8 +31,8 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal root := t.FSContext().RootDirectoryVFS2() wd := t.FSContext().WorkingDirectoryVFS2() s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd) - root.DecRef() - wd.DecRef() + root.DecRef(t) + wd.DecRef(t) if err != nil { return 0, nil, err } @@ -67,7 +67,7 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, @@ -75,8 +75,8 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, err } - t.FSContext().SetWorkingDirectoryVFS2(vd) - vd.DecRef() + t.FSContext().SetWorkingDirectoryVFS2(t, vd) + vd.DecRef(t) return 0, nil, nil } @@ -88,7 +88,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, @@ -96,8 +96,8 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - t.FSContext().SetWorkingDirectoryVFS2(vd) - vd.DecRef() + t.FSContext().SetWorkingDirectoryVFS2(t, vd) + vd.DecRef(t) return 0, nil, nil } @@ -117,7 +117,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, @@ -125,7 +125,7 @@ func Chroot(t *kernel.Task, args 
arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - t.FSContext().SetRootDirectoryVFS2(vd) - vd.DecRef() + t.FSContext().SetRootDirectoryVFS2(t, vd) + vd.DecRef(t) return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go index c7c7bf7ce..5517595b5 100644 --- a/pkg/sentry/syscalls/linux/vfs2/getdents.go +++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go @@ -44,7 +44,7 @@ func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (ui if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) cb := getGetdentsCallback(t, addr, size, isGetdents64) err := file.IterDirents(t, cb) diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go index 5d98134a5..11753d8e5 100644 --- a/pkg/sentry/syscalls/linux/vfs2/inotify.go +++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go @@ -35,7 +35,7 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. if err != nil { return 0, nil, err } - defer ino.DecRef() + defer ino.DecRef(t) fd, err := t.NewFDFromVFS2(0, ino, kernel.FDFlags{ CloseOnExec: flags&linux.IN_CLOEXEC != 0, @@ -66,7 +66,7 @@ func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, ino, ok := f.Impl().(*vfs.Inotify) if !ok { // Not an inotify fd. 
- f.DecRef() + f.DecRef(t) return nil, nil, syserror.EINVAL } @@ -96,7 +96,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern if err != nil { return 0, nil, err } - defer f.DecRef() + defer f.DecRef(t) path, err := copyInPath(t, addr) if err != nil { @@ -109,12 +109,12 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{}) if err != nil { return 0, nil, err } - defer d.DecRef() + defer d.DecRef(t) fd, err = ino.AddWatch(d.Dentry(), mask) if err != nil { @@ -132,6 +132,6 @@ func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if err != nil { return 0, nil, err } - defer f.DecRef() - return 0, nil, ino.RmWatch(wd) + defer f.DecRef(t) + return 0, nil, ino.RmWatch(t, wd) } diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go index fd6ab94b2..38778a388 100644 --- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -29,7 +29,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Handle ioctls that apply to all FDs. switch args[1].Int() { diff --git a/pkg/sentry/syscalls/linux/vfs2/lock.go b/pkg/sentry/syscalls/linux/vfs2/lock.go index bf19028c4..b910b5a74 100644 --- a/pkg/sentry/syscalls/linux/vfs2/lock.go +++ b/pkg/sentry/syscalls/linux/vfs2/lock.go @@ -32,7 +32,7 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // flock(2): EBADF fd is not an open file descriptor. 
return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) nonblocking := operation&linux.LOCK_NB != 0 operation &^= linux.LOCK_NB diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go index bbe248d17..519583e4e 100644 --- a/pkg/sentry/syscalls/linux/vfs2/memfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go @@ -47,7 +47,7 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } shmMount := t.Kernel().ShmMount() - file, err := tmpfs.NewMemfd(shmMount, t.Credentials(), allowSeals, memfdPrefix+name) + file, err := tmpfs.NewMemfd(t, t.Credentials(), shmMount, allowSeals, memfdPrefix+name) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go index 60a43f0a0..dc05c2994 100644 --- a/pkg/sentry/syscalls/linux/vfs2/mmap.go +++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go @@ -61,7 +61,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC } defer func() { if opts.MappingIdentity != nil { - opts.MappingIdentity.DecRef() + opts.MappingIdentity.DecRef(t) } }() @@ -71,7 +71,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // mmap unconditionally requires that the FD is readable. 
if !file.IsReadable() { diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go index ea337de7c..4bd5c7ca2 100644 --- a/pkg/sentry/syscalls/linux/vfs2/mount.go +++ b/pkg/sentry/syscalls/linux/vfs2/mount.go @@ -108,7 +108,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, err } - defer target.Release() + defer target.Release(t) return 0, nil, t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts) } @@ -140,7 +140,7 @@ func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) opts := vfs.UmountOptions{ Flags: uint32(flags), diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go index 97da6c647..90a511d9a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/path.go +++ b/pkg/sentry/syscalls/linux/vfs2/path.go @@ -42,7 +42,7 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA haveStartRef := false if !path.Absolute { if !path.HasComponents() && !bool(shouldAllowEmptyPath) { - root.DecRef() + root.DecRef(t) return taskPathOperation{}, syserror.ENOENT } if dirfd == linux.AT_FDCWD { @@ -51,13 +51,13 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { - root.DecRef() + root.DecRef(t) return taskPathOperation{}, syserror.EBADF } start = dirfile.VirtualDentry() start.IncRef() haveStartRef = true - dirfile.DecRef() + dirfile.DecRef(t) } } return taskPathOperation{ @@ -71,10 +71,10 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA }, nil } -func (tpop *taskPathOperation) Release() { - tpop.pop.Root.DecRef() +func (tpop *taskPathOperation) Release(t *kernel.Task) { + tpop.pop.Root.DecRef(t) if tpop.haveStartRef { - tpop.pop.Start.DecRef() + tpop.pop.Start.DecRef(t) 
tpop.haveStartRef = false } } diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go index 4a01e4209..9b4848d9e 100644 --- a/pkg/sentry/syscalls/linux/vfs2/pipe.go +++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go @@ -42,8 +42,8 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error { return syserror.EINVAL } r, w := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK)) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(t) + defer w.DecRef(t) fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{r, w}, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, @@ -54,7 +54,7 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error { if _, err := t.CopyOut(addr, fds); err != nil { for _, fd := range fds { if _, file := t.FDTable().Remove(fd); file != nil { - file.DecRef() + file.DecRef(t) } } return err diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go index ff1b25d7b..7b9d5e18a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/poll.go +++ b/pkg/sentry/syscalls/linux/vfs2/poll.go @@ -73,7 +73,7 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan } if ch == nil { - defer file.DecRef() + defer file.DecRef(t) } else { state.file = file state.waiter, _ = waiter.NewChannelEntry(ch) @@ -85,11 +85,11 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan } // releaseState releases all the pollState in "state". -func releaseState(state []pollState) { +func releaseState(t *kernel.Task, state []pollState) { for i := range state { if state[i].file != nil { state[i].file.EventUnregister(&state[i].waiter) - state[i].file.DecRef() + state[i].file.DecRef(t) } } } @@ -110,7 +110,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time. // result, we stop registering for events but still go through all files // to get their ready masks. 
state := make([]pollState, len(pfd)) - defer releaseState(state) + defer releaseState(t, state) n := uintptr(0) for i := range pfd { initReadiness(t, &pfd[i], &state[i], ch) @@ -269,7 +269,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add if file == nil { return 0, syserror.EBADF } - file.DecRef() + file.DecRef(t) var mask int16 if (rV & m) != 0 { diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go index cd25597a7..a905dae0a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/read_write.go +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -44,7 +44,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the size is legitimate. si := int(size) @@ -75,7 +75,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Get the destination of the read. 
dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ @@ -94,7 +94,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt n, err := file.Read(t, dst, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -102,7 +102,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -135,7 +135,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } @@ -151,7 +151,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { @@ -188,7 +188,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { @@ -226,7 +226,7 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. 
if offset < -1 { @@ -258,7 +258,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of n, err := file.PRead(t, dst, offset, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -266,7 +266,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -299,7 +299,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } @@ -314,7 +314,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the size is legitimate. si := int(size) @@ -345,7 +345,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Get the source of the write. 
src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ @@ -364,7 +364,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op n, err := file.Write(t, src, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -372,7 +372,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -405,7 +405,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } return total, err } @@ -421,7 +421,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { @@ -458,7 +458,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { @@ -496,7 +496,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. 
if offset < -1 { @@ -528,7 +528,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o n, err := file.PWrite(t, src, offset, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -536,7 +536,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -569,7 +569,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } @@ -601,7 +601,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) newoff, err := file.Seek(t, offset, whence) return uintptr(newoff), nil, err @@ -617,7 +617,7 @@ func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is readable. 
if !file.IsReadable() { diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index 25cdb7a55..5e6eb13ba 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -66,7 +66,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, file.SetStat(t, vfs.SetStatOptions{ Stat: linux.Statx{ @@ -151,7 +151,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) var opts vfs.SetStatOptions if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { @@ -197,7 +197,7 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) if !file.IsWritable() { return 0, nil, syserror.EINVAL @@ -224,7 +224,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) if !file.IsWritable() { return 0, nil, syserror.EBADF @@ -258,7 +258,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, err } - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) return 0, nil, nil } @@ -438,7 +438,7 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, op func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error { root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) start := root if 
!path.Absolute { if !path.HasComponents() && !bool(shouldAllowEmptyPath) { @@ -446,7 +446,7 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() - defer start.DecRef() + defer start.DecRef(t) } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { @@ -457,13 +457,13 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa // VirtualFilesystem.SetStatAt(), since the former may be able // to use opened file state to expedite the SetStat. err := dirfile.SetStat(t, *opts) - dirfile.DecRef() + dirfile.DecRef(t) return err } start = dirfile.VirtualDentry() start.IncRef() - defer start.DecRef() - dirfile.DecRef() + defer start.DecRef(t) + dirfile.DecRef(t) } } return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{ diff --git a/pkg/sentry/syscalls/linux/vfs2/signal.go b/pkg/sentry/syscalls/linux/vfs2/signal.go index 623992f6f..b89f34cdb 100644 --- a/pkg/sentry/syscalls/linux/vfs2/signal.go +++ b/pkg/sentry/syscalls/linux/vfs2/signal.go @@ -45,7 +45,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Is this a signalfd? if sfd, ok := file.Impl().(*signalfd.SignalFileDescription); ok { @@ -68,7 +68,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) // Create a new descriptor. 
fd, err = t.NewFDFromVFS2(0, file, kernel.FDFlags{ diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go index 8096a8f9c..4a68c64f3 100644 --- a/pkg/sentry/syscalls/linux/vfs2/socket.go +++ b/pkg/sentry/syscalls/linux/vfs2/socket.go @@ -196,7 +196,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if e != nil { return 0, nil, e.ToError() } - defer s.DecRef() + defer s.DecRef(t) if err := s.SetStatusFlags(t, t.Credentials(), uint32(stype&linux.SOCK_NONBLOCK)); err != nil { return 0, nil, err @@ -230,8 +230,8 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, e.ToError() } // Adding to the FD table will cause an extra reference to be acquired. - defer s1.DecRef() - defer s2.DecRef() + defer s1.DecRef(t) + defer s2.DecRef(t) nonblocking := uint32(stype & linux.SOCK_NONBLOCK) if err := s1.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil { @@ -253,7 +253,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if _, err := t.CopyOut(addr, fds); err != nil { for _, fd := range fds { if _, file := t.FDTable().Remove(fd); file != nil { - file.DecRef() + file.DecRef(t) } } return 0, nil, err @@ -273,7 +273,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -304,7 +304,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -363,7 +363,7 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. 
s, ok := file.Impl().(socket.SocketVFS2) @@ -390,7 +390,7 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -419,7 +419,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -450,7 +450,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -532,7 +532,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -570,7 +570,7 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -598,7 +598,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -631,7 +631,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. 
s, ok := file.Impl().(socket.SocketVFS2) @@ -684,7 +684,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -778,7 +778,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla } if !cms.Unix.Empty() { mflags |= linux.MSG_CTRUNC - cms.Release() + cms.Release(t) } if int(msg.Flags) != mflags { @@ -798,7 +798,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } - defer cms.Release() + defer cms.Release(t) controlData := make([]byte, 0, msg.ControlLen) controlData = control.PackControlMessages(t, cms, controlData) @@ -854,7 +854,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -883,7 +883,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag } n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) - cm.Release() + cm.Release(t) if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } @@ -927,7 +927,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -965,7 +965,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. 
s, ok := file.Impl().(socket.SocketVFS2) @@ -1069,7 +1069,7 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file) if err != nil { - controlMessages.Release() + controlMessages.Release(t) } return uintptr(n), err } @@ -1087,7 +1087,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go index 63ab11f8c..16f59fce9 100644 --- a/pkg/sentry/syscalls/linux/vfs2/splice.go +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -53,12 +53,12 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) outFile := t.GetFileVFS2(outFD) if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) // Check that both files support the required directionality. if !inFile.IsReadable() || !outFile.IsWritable() { @@ -175,7 +175,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // On Linux, inotify behavior is not very consistent with splice(2). We try // our best to emulate Linux for very basic calls to splice, where for some // reason, events are generated for output files, but not input files. 
- outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) return uintptr(n), nil, nil } @@ -203,12 +203,12 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) outFile := t.GetFileVFS2(outFD) if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) // Check that both files support the required directionality. if !inFile.IsReadable() || !outFile.IsWritable() { @@ -251,7 +251,7 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if n == 0 { return 0, nil, err } - outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) return uintptr(n), nil, nil } @@ -266,7 +266,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) if !inFile.IsReadable() { return 0, nil, syserror.EBADF } @@ -275,7 +275,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) if !outFile.IsWritable() { return 0, nil, syserror.EBADF } @@ -419,8 +419,8 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } - inFile.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) - outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + inFile.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) + outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) return uintptr(n), nil, nil } diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go 
index bb1d5cac4..0f5d5189c 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat.go @@ -65,7 +65,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags } root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) start := root if !path.Absolute { if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { @@ -73,7 +73,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() - defer start.DecRef() + defer start.DecRef(t) } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { @@ -85,7 +85,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags // former may be able to use opened file state to expedite the // Stat. statx, err := dirfile.Stat(t, opts) - dirfile.DecRef() + dirfile.DecRef(t) if err != nil { return err } @@ -96,8 +96,8 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags } start = dirfile.VirtualDentry() start.IncRef() - defer start.DecRef() - dirfile.DecRef() + defer start.DecRef(t) + dirfile.DecRef(t) } } @@ -132,7 +132,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) statx, err := file.Stat(t, vfs.StatOptions{ Mask: linux.STATX_BASIC_STATS, @@ -177,7 +177,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) start := root if !path.Absolute { if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { @@ -185,7 +185,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() - defer start.DecRef() + defer start.DecRef(t) } else { dirfile 
:= t.GetFileVFS2(dirfd) if dirfile == nil { @@ -197,7 +197,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // former may be able to use opened file state to expedite the // Stat. statx, err := dirfile.Stat(t, opts) - dirfile.DecRef() + dirfile.DecRef(t) if err != nil { return 0, nil, err } @@ -207,8 +207,8 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } start = dirfile.VirtualDentry() start.IncRef() - defer start.DecRef() - dirfile.DecRef() + defer start.DecRef(t) + dirfile.DecRef(t) } } @@ -282,7 +282,7 @@ func accessAt(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) err if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) // access(2) and faccessat(2) check permissions using real // UID/GID, not effective UID/GID. @@ -328,7 +328,7 @@ func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr usermem.Addr, siz if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop) if err != nil { @@ -358,7 +358,7 @@ func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) if err != nil { @@ -377,7 +377,7 @@ func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) if err != nil { diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go index 0d0ebf46a..a6491ac37 100644 --- a/pkg/sentry/syscalls/linux/vfs2/sync.go +++ b/pkg/sentry/syscalls/linux/vfs2/sync.go @@ -34,7 +34,7 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { 
return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, file.SyncFS(t) } @@ -47,7 +47,7 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, file.Sync(t) } @@ -77,7 +77,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // TODO(gvisor.dev/issue/1897): Currently, the only file syncing we support // is a full-file sync, i.e. fsync(2). As a result, there are severe diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go index 5ac79bc09..7a26890ef 100644 --- a/pkg/sentry/syscalls/linux/vfs2/timerfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go @@ -50,11 +50,11 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel return 0, nil, syserror.EINVAL } vfsObj := t.Kernel().VFS() - file, err := timerfd.New(vfsObj, clock, fileFlags) + file, err := timerfd.New(t, vfsObj, clock, fileFlags) if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.TFD_CLOEXEC != 0, }) @@ -79,7 +79,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { @@ -113,7 +113,7 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go index af455d5c1..ef99246ed 100644 --- 
a/pkg/sentry/syscalls/linux/vfs2/xattr.go +++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go @@ -49,7 +49,7 @@ func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSyml if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size)) if err != nil { @@ -72,7 +72,7 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) names, err := file.Listxattr(t, uint64(size)) if err != nil { @@ -109,7 +109,7 @@ func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { @@ -141,7 +141,7 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { @@ -188,7 +188,7 @@ func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { @@ -222,7 +222,7 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { @@ -262,7 +262,7 @@ func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSy if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { @@ -281,7 +281,7 @@ func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. 
if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index 641e3e502..5a0e3e6b5 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -82,7 +82,7 @@ type anonDentry struct { } // Release implements FilesystemImpl.Release. -func (fs *anonFilesystem) Release() { +func (fs *anonFilesystem) Release(ctx context.Context) { } // Sync implements FilesystemImpl.Sync. @@ -294,7 +294,7 @@ func (d *anonDentry) TryIncRef() bool { } // DecRef implements DentryImpl.DecRef. -func (d *anonDentry) DecRef() { +func (d *anonDentry) DecRef(ctx context.Context) { // no-op } @@ -303,7 +303,7 @@ func (d *anonDentry) DecRef() { // Although Linux technically supports inotify on pseudo filesystems (inotify // is implemented at the vfs layer), it is not particularly useful. It is left // unimplemented until someone actually needs it. -func (d *anonDentry) InotifyWithParent(events, cookie uint32, et EventType) {} +func (d *anonDentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) {} // Watches implements DentryImpl.Watches. func (d *anonDentry) Watches() *Watches { @@ -311,4 +311,4 @@ func (d *anonDentry) Watches() *Watches { } // OnZeroWatches implements Dentry.OnZeroWatches. -func (d *anonDentry) OnZeroWatches() {} +func (d *anonDentry) OnZeroWatches(context.Context) {} diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index cea3e6955..bc7ea93ea 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -17,6 +17,7 @@ package vfs import ( "sync/atomic" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -102,7 +103,7 @@ type DentryImpl interface { TryIncRef() bool // DecRef decrements the Dentry's reference count. 
- DecRef() + DecRef(ctx context.Context) // InotifyWithParent notifies all watches on the targets represented by this // dentry and its parent. The parent's watches are notified first, followed @@ -113,7 +114,7 @@ type DentryImpl interface { // // Note that the events may not actually propagate up to the user, depending // on the event masks. - InotifyWithParent(events, cookie uint32, et EventType) + InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) // Watches returns the set of inotify watches for the file corresponding to // the Dentry. Dentries that are hard links to the same underlying file @@ -135,7 +136,7 @@ type DentryImpl interface { // The caller does not need to hold a reference on the dentry. OnZeroWatches // may acquire inotify locks, so to prevent deadlock, no inotify locks should // be held by the caller. - OnZeroWatches() + OnZeroWatches(ctx context.Context) } // IncRef increments d's reference count. @@ -150,8 +151,8 @@ func (d *Dentry) TryIncRef() bool { } // DecRef decrements d's reference count. -func (d *Dentry) DecRef() { - d.impl.DecRef() +func (d *Dentry) DecRef(ctx context.Context) { + d.impl.DecRef(ctx) } // IsDead returns true if d has been deleted or invalidated by its owning @@ -168,8 +169,8 @@ func (d *Dentry) isMounted() bool { // InotifyWithParent notifies all watches on the targets represented by d and // its parent of events. -func (d *Dentry) InotifyWithParent(events, cookie uint32, et EventType) { - d.impl.InotifyWithParent(events, cookie, et) +func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) { + d.impl.InotifyWithParent(ctx, events, cookie, et) } // Watches returns the set of inotify watches associated with d. @@ -182,8 +183,8 @@ func (d *Dentry) Watches() *Watches { // OnZeroWatches performs cleanup tasks whenever the number of watches on a // dentry drops to zero. 
-func (d *Dentry) OnZeroWatches() { - d.impl.OnZeroWatches() +func (d *Dentry) OnZeroWatches(ctx context.Context) { + d.impl.OnZeroWatches(ctx) } // The following functions are exported so that filesystem implementations can @@ -214,11 +215,11 @@ func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) { // CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion // succeeds. -func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { +func (vfs *VirtualFilesystem) CommitDeleteDentry(ctx context.Context, d *Dentry) { d.dead = true d.mu.Unlock() if d.isMounted() { - vfs.forgetDeadMountpoint(d) + vfs.forgetDeadMountpoint(ctx, d) } } @@ -226,12 +227,12 @@ func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { // did for reasons outside of VFS' control (e.g. d represents the local state // of a file on a remote filesystem on which the file has already been // deleted). -func (vfs *VirtualFilesystem) InvalidateDentry(d *Dentry) { +func (vfs *VirtualFilesystem) InvalidateDentry(ctx context.Context, d *Dentry) { d.mu.Lock() d.dead = true d.mu.Unlock() if d.isMounted() { - vfs.forgetDeadMountpoint(d) + vfs.forgetDeadMountpoint(ctx, d) } } @@ -278,13 +279,13 @@ func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { // that was replaced by from. // // Preconditions: PrepareRenameDentry was previously called on from and to. -func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, to *Dentry) { +func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(ctx context.Context, from, to *Dentry) { from.mu.Unlock() if to != nil { to.dead = true to.mu.Unlock() if to.isMounted() { - vfs.forgetDeadMountpoint(to) + vfs.forgetDeadMountpoint(ctx, to) } } } @@ -303,7 +304,7 @@ func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { // // forgetDeadMountpoint is analogous to Linux's // fs/namespace.c:__detach_mounts(). 
-func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) { +func (vfs *VirtualFilesystem) forgetDeadMountpoint(ctx context.Context, d *Dentry) { var ( vdsToDecRef []VirtualDentry mountsToDecRef []*Mount @@ -316,9 +317,9 @@ func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) { vfs.mounts.seq.EndWrite() vfs.mountMu.Unlock() for _, vd := range vdsToDecRef { - vd.DecRef() + vd.DecRef(ctx) } for _, mnt := range mountsToDecRef { - mnt.DecRef() + mnt.DecRef(ctx) } } diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index 5b009b928..1b5af9f73 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -93,9 +93,9 @@ type epollInterest struct { // NewEpollInstanceFD returns a FileDescription representing a new epoll // instance. A reference is taken on the returned FileDescription. -func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) { +func (vfs *VirtualFilesystem) NewEpollInstanceFD(ctx context.Context) (*FileDescription, error) { vd := vfs.NewAnonVirtualDentry("[eventpoll]") - defer vd.DecRef() + defer vd.DecRef(ctx) ep := &EpollInstance{ interest: make(map[epollInterestKey]*epollInterest), } @@ -110,7 +110,7 @@ func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) { } // Release implements FileDescriptionImpl.Release. -func (ep *EpollInstance) Release() { +func (ep *EpollInstance) Release(ctx context.Context) { // Unregister all polled fds. ep.interestMu.Lock() defer ep.interestMu.Unlock() diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 93861fb4a..576ab3920 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -171,7 +171,7 @@ func (fd *FileDescription) TryIncRef() bool { } // DecRef decrements fd's reference count. 
-func (fd *FileDescription) DecRef() { +func (fd *FileDescription) DecRef(ctx context.Context) { if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { // Unregister fd from all epoll instances. fd.epollMu.Lock() @@ -196,11 +196,11 @@ func (fd *FileDescription) DecRef() { } // Release implementation resources. - fd.impl.Release() + fd.impl.Release(ctx) if fd.writable { fd.vd.mount.EndWrite() } - fd.vd.DecRef() + fd.vd.DecRef(ctx) fd.flagsMu.Lock() // TODO(gvisor.dev/issue/1663): We may need to unregister during save, as we do in VFS1. if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil { @@ -335,7 +335,7 @@ func (fd *FileDescription) Impl() FileDescriptionImpl { type FileDescriptionImpl interface { // Release is called when the associated FileDescription reaches zero // references. - Release() + Release(ctx context.Context) // OnClose is called when a file descriptor representing the // FileDescription is closed. Note that returning a non-nil error does not @@ -526,7 +526,7 @@ func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.St Start: fd.vd, }) stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return stat, err } return fd.impl.Stat(ctx, opts) @@ -541,7 +541,7 @@ func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) err Start: fd.vd, }) err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return err } return fd.impl.SetStat(ctx, opts) @@ -557,7 +557,7 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { Start: fd.vd, }) statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return statfs, err } return fd.impl.StatFS(ctx) @@ -674,7 +674,7 @@ func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string Start: fd.vd, }) names, err := 
fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return names, err } names, err := fd.impl.Listxattr(ctx, size) @@ -703,7 +703,7 @@ func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) Start: fd.vd, }) val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return val, err } return fd.impl.Getxattr(ctx, *opts) @@ -719,7 +719,7 @@ func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) Start: fd.vd, }) err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return err } return fd.impl.Setxattr(ctx, *opts) @@ -735,7 +735,7 @@ func (fd *FileDescription) Removexattr(ctx context.Context, name string) error { Start: fd.vd, }) err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return err } return fd.impl.Removexattr(ctx, name) @@ -752,7 +752,7 @@ func (fd *FileDescription) MappedName(ctx context.Context) string { vfsroot := RootFromContext(ctx) s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd) if vfsroot.Ok() { - vfsroot.DecRef() + vfsroot.DecRef(ctx) } return s } diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 3b7e1c273..1cd607c0a 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -80,9 +80,9 @@ type testFD struct { data DynamicBytesSource } -func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription { +func newTestFD(ctx context.Context, vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription { vd := vfsObj.NewAnonVirtualDentry("genCountFD") - defer vd.DecRef() + defer vd.DecRef(ctx) var fd testFD fd.vfsfd.Init(&fd, statusFlags, 
vd.Mount(), vd.Dentry(), &FileDescriptionOptions{}) fd.DynamicBytesFileDescriptionImpl.SetDataSource(data) @@ -90,7 +90,7 @@ func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesS } // Release implements FileDescriptionImpl.Release. -func (fd *testFD) Release() { +func (fd *testFD) Release(context.Context) { } // SetStatusFlags implements FileDescriptionImpl.SetStatusFlags. @@ -109,11 +109,11 @@ func TestGenCountFD(t *testing.T) { ctx := contexttest.Context(t) vfsObj := &VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } - fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{}) - defer fd.DecRef() + fd := newTestFD(ctx, vfsObj, linux.O_RDWR, &genCount{}) + defer fd.DecRef(ctx) // The first read causes Generate to be called to fill the FD's buffer. buf := make([]byte, 2) @@ -167,11 +167,11 @@ func TestWritable(t *testing.T) { ctx := contexttest.Context(t) vfsObj := &VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } - fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"}) - defer fd.DecRef() + fd := newTestFD(ctx, vfsObj, linux.O_RDWR, &storeData{data: "init"}) + defer fd.DecRef(ctx) buf := make([]byte, 10) ioseq := usermem.BytesIOSequence(buf) diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 6bb9ca180..df3758fd1 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -100,12 +100,12 @@ func (fs *Filesystem) TryIncRef() bool { } // DecRef decrements fs' reference count. 
-func (fs *Filesystem) DecRef() { +func (fs *Filesystem) DecRef(ctx context.Context) { if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 { fs.vfs.filesystemsMu.Lock() delete(fs.vfs.filesystems, fs) fs.vfs.filesystemsMu.Unlock() - fs.impl.Release() + fs.impl.Release(ctx) } else if refs < 0 { panic("Filesystem.decRef() called without holding a reference") } @@ -149,7 +149,7 @@ func (fs *Filesystem) DecRef() { type FilesystemImpl interface { // Release is called when the associated Filesystem reaches zero // references. - Release() + Release(ctx context.Context) // Sync "causes all pending modifications to filesystem metadata and cached // file data to be written to the underlying [filesystem]", as by syncfs(2). diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 167b731ac..aff220a61 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -100,7 +100,7 @@ func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) id := uniqueid.GlobalFromContext(ctx) vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id)) - defer vd.DecRef() + defer vd.DecRef(ctx) fd := &Inotify{ id: id, scratch: make([]byte, inotifyEventBaseSize), @@ -118,7 +118,7 @@ func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) // Release implements FileDescriptionImpl.Release. Release removes all // watches and frees all resources for an inotify instance. -func (i *Inotify) Release() { +func (i *Inotify) Release(ctx context.Context) { var ds []*Dentry // We need to hold i.mu to avoid a race with concurrent calls to @@ -144,7 +144,7 @@ func (i *Inotify) Release() { i.mu.Unlock() for _, d := range ds { - d.OnZeroWatches() + d.OnZeroWatches(ctx) } } @@ -350,7 +350,7 @@ func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) { // RmWatch looks up an inotify watch for the given 'wd' and configures the // target to stop sending events to this inotify instance. 
-func (i *Inotify) RmWatch(wd int32) error { +func (i *Inotify) RmWatch(ctx context.Context, wd int32) error { i.mu.Lock() // Find the watch we were asked to removed. @@ -374,7 +374,7 @@ func (i *Inotify) RmWatch(wd int32) error { i.mu.Unlock() if remaining == 0 { - w.target.OnZeroWatches() + w.target.OnZeroWatches(ctx) } // Generate the event for the removal. @@ -462,7 +462,7 @@ func (w *Watches) Remove(id uint64) { // Notify queues a new event with watches in this set. Watches with // IN_EXCL_UNLINK are skipped if the event is coming from a child that has been // unlinked. -func (w *Watches) Notify(name string, events, cookie uint32, et EventType, unlinked bool) { +func (w *Watches) Notify(ctx context.Context, name string, events, cookie uint32, et EventType, unlinked bool) { var hasExpired bool w.mu.RLock() for _, watch := range w.ws { @@ -476,13 +476,13 @@ func (w *Watches) Notify(name string, events, cookie uint32, et EventType, unlin w.mu.RUnlock() if hasExpired { - w.cleanupExpiredWatches() + w.cleanupExpiredWatches(ctx) } } // This function is relatively expensive and should only be called where there // are expired watches. -func (w *Watches) cleanupExpiredWatches() { +func (w *Watches) cleanupExpiredWatches(ctx context.Context) { // Because of lock ordering, we cannot acquire Inotify.mu for each watch // owner while holding w.mu. As a result, store expired watches locally // before removing. @@ -495,15 +495,15 @@ func (w *Watches) cleanupExpiredWatches() { } w.mu.RUnlock() for _, watch := range toRemove { - watch.owner.RmWatch(watch.wd) + watch.owner.RmWatch(ctx, watch.wd) } } // HandleDeletion is called when the watch target is destroyed. Clear the // watch set, detach watches from the inotify instances they belong to, and // generate the appropriate events. 
-func (w *Watches) HandleDeletion() { - w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */) +func (w *Watches) HandleDeletion(ctx context.Context) { + w.Notify(ctx, "", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */) // As in Watches.Notify, we can't hold w.mu while acquiring Inotify.mu for // the owner of each watch being deleted. Instead, atomically store the @@ -744,12 +744,12 @@ func InotifyEventFromStatMask(mask uint32) uint32 { // InotifyRemoveChild sends the appriopriate notifications to the watch sets of // the child being removed and its parent. Note that unlike most pairs of // parent/child notifications, the child is notified first in this case. -func InotifyRemoveChild(self, parent *Watches, name string) { +func InotifyRemoveChild(ctx context.Context, self, parent *Watches, name string) { if self != nil { - self.Notify("", linux.IN_ATTRIB, 0, InodeEvent, true /* unlinked */) + self.Notify(ctx, "", linux.IN_ATTRIB, 0, InodeEvent, true /* unlinked */) } if parent != nil { - parent.Notify(name, linux.IN_DELETE, 0, InodeEvent, true /* unlinked */) + parent.Notify(ctx, name, linux.IN_DELETE, 0, InodeEvent, true /* unlinked */) } } @@ -762,13 +762,13 @@ func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, } cookie := uniqueid.InotifyCookie(ctx) if oldParent != nil { - oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent, false /* unlinked */) + oldParent.Notify(ctx, oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent, false /* unlinked */) } if newParent != nil { - newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent, false /* unlinked */) + newParent.Notify(ctx, newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent, false /* unlinked */) } // Somewhat surprisingly, self move events do not have a cookie. 
if renamed != nil { - renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent, false /* unlinked */) + renamed.Notify(ctx, "", linux.IN_MOVE_SELF, 0, InodeEvent, false /* unlinked */) } } diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 32f901bd8..d1d29d0cd 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -200,8 +200,8 @@ func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth if err != nil { return nil, err } - defer root.DecRef() - defer fs.DecRef() + defer root.DecRef(ctx) + defer fs.DecRef(ctx) return vfs.NewDisconnectedMount(fs, root, opts) } @@ -221,7 +221,7 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr if vd.dentry.dead { vd.dentry.mu.Unlock() vfs.mountMu.Unlock() - vd.DecRef() + vd.DecRef(ctx) return syserror.ENOENT } // vd might have been mounted over between vfs.GetDentryAt() and @@ -243,7 +243,7 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr // This can't fail since we're holding vfs.mountMu. 
nextmnt.root.IncRef() vd.dentry.mu.Unlock() - vd.DecRef() + vd.DecRef(ctx) vd = VirtualDentry{ mount: nextmnt, dentry: nextmnt.root, @@ -268,7 +268,7 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia if err != nil { return err } - defer mnt.DecRef() + defer mnt.DecRef(ctx) if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil { return err } @@ -293,13 +293,13 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti if err != nil { return err } - defer vd.DecRef() + defer vd.DecRef(ctx) if vd.dentry != vd.mount.root { return syserror.EINVAL } vfs.mountMu.Lock() if mntns := MountNamespaceFromContext(ctx); mntns != nil { - defer mntns.DecRef() + defer mntns.DecRef(ctx) if mntns != vd.mount.ns { vfs.mountMu.Unlock() return syserror.EINVAL @@ -335,10 +335,10 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti vfs.mounts.seq.EndWrite() vfs.mountMu.Unlock() for _, vd := range vdsToDecRef { - vd.DecRef() + vd.DecRef(ctx) } for _, mnt := range mountsToDecRef { - mnt.DecRef() + mnt.DecRef(ctx) } return nil } @@ -479,7 +479,7 @@ func (mnt *Mount) IncRef() { } // DecRef decrements mnt's reference count. -func (mnt *Mount) DecRef() { +func (mnt *Mount) DecRef(ctx context.Context) { refs := atomic.AddInt64(&mnt.refs, -1) if refs&^math.MinInt64 == 0 { // mask out MSB var vd VirtualDentry @@ -490,10 +490,10 @@ func (mnt *Mount) DecRef() { mnt.vfs.mounts.seq.EndWrite() mnt.vfs.mountMu.Unlock() } - mnt.root.DecRef() - mnt.fs.DecRef() + mnt.root.DecRef(ctx) + mnt.fs.DecRef(ctx) if vd.Ok() { - vd.DecRef() + vd.DecRef(ctx) } } } @@ -506,7 +506,7 @@ func (mntns *MountNamespace) IncRef() { } // DecRef decrements mntns' reference count. 
-func (mntns *MountNamespace) DecRef() { +func (mntns *MountNamespace) DecRef(ctx context.Context) { vfs := mntns.root.fs.VirtualFilesystem() if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 { vfs.mountMu.Lock() @@ -517,10 +517,10 @@ func (mntns *MountNamespace) DecRef() { vfs.mounts.seq.EndWrite() vfs.mountMu.Unlock() for _, vd := range vdsToDecRef { - vd.DecRef() + vd.DecRef(ctx) } for _, mnt := range mountsToDecRef { - mnt.DecRef() + mnt.DecRef(ctx) } } else if refs < 0 { panic("MountNamespace.DecRef() called without holding a reference") @@ -534,7 +534,7 @@ func (mntns *MountNamespace) DecRef() { // getMountAt is analogous to Linux's fs/namei.c:follow_mount(). // // Preconditions: References are held on mnt and d. -func (vfs *VirtualFilesystem) getMountAt(mnt *Mount, d *Dentry) *Mount { +func (vfs *VirtualFilesystem) getMountAt(ctx context.Context, mnt *Mount, d *Dentry) *Mount { // The first mount is special-cased: // // - The caller is assumed to have checked d.isMounted() already. (This @@ -565,7 +565,7 @@ retryFirst: // Raced with umount. continue } - mnt.DecRef() + mnt.DecRef(ctx) mnt = next d = next.root } @@ -578,7 +578,7 @@ retryFirst: // // Preconditions: References are held on mnt and root. vfsroot is not (mnt, // mnt.root). -func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) VirtualDentry { +func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry { // The first mount is special-cased: // // - The caller must have already checked mnt against vfsroot. @@ -602,12 +602,12 @@ retryFirst: if !point.TryIncRef() { // Since Mount holds a reference on Mount.key.point, this can only // happen due to a racing change to Mount.key. 
- parent.DecRef() + parent.DecRef(ctx) goto retryFirst } if !vfs.mounts.seq.ReadOk(epoch) { - point.DecRef() - parent.DecRef() + point.DecRef(ctx) + parent.DecRef(ctx) goto retryFirst } mnt = parent @@ -635,16 +635,16 @@ retryFirst: if !point.TryIncRef() { // Since Mount holds a reference on Mount.key.point, this can // only happen due to a racing change to Mount.key. - parent.DecRef() + parent.DecRef(ctx) goto retryNotFirst } if !vfs.mounts.seq.ReadOk(epoch) { - point.DecRef() - parent.DecRef() + point.DecRef(ctx) + parent.DecRef(ctx) goto retryNotFirst } - d.DecRef() - mnt.DecRef() + d.DecRef(ctx) + mnt.DecRef(ctx) mnt = parent d = point } diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go index cd78d66bc..e4da15009 100644 --- a/pkg/sentry/vfs/pathname.go +++ b/pkg/sentry/vfs/pathname.go @@ -47,7 +47,7 @@ func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot, haveRef := false defer func() { if haveRef { - vd.DecRef() + vd.DecRef(ctx) } }() @@ -64,12 +64,12 @@ loop: // of FilesystemImpl.PrependPath() may return nil instead. 
break loop } - nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { break loop } if haveRef { - vd.DecRef() + vd.DecRef(ctx) } vd = nextVD haveRef = true @@ -101,7 +101,7 @@ func (vfs *VirtualFilesystem) PathnameReachable(ctx context.Context, vfsroot, vd haveRef := false defer func() { if haveRef { - vd.DecRef() + vd.DecRef(ctx) } }() loop: @@ -112,12 +112,12 @@ loop: if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { break loop } - nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { return "", nil } if haveRef { - vd.DecRef() + vd.DecRef(ctx) } vd = nextVD haveRef = true @@ -145,7 +145,7 @@ func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd haveRef := false defer func() { if haveRef { - vd.DecRef() + vd.DecRef(ctx) } }() unreachable := false @@ -157,13 +157,13 @@ loop: if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { break loop } - nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { unreachable = true break loop } if haveRef { - vd.DecRef() + vd.DecRef(ctx) } vd = nextVD haveRef = true diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 9d047ff88..3304372d9 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" @@ -136,31 +137,31 @@ func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *Pat return rp } -func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) { +func (vfs *VirtualFilesystem) putResolvingPath(ctx context.Context, rp *ResolvingPath) { rp.root = VirtualDentry{} - rp.decRefStartAndMount() 
+ rp.decRefStartAndMount(ctx) rp.mount = nil rp.start = nil - rp.releaseErrorState() + rp.releaseErrorState(ctx) resolvingPathPool.Put(rp) } -func (rp *ResolvingPath) decRefStartAndMount() { +func (rp *ResolvingPath) decRefStartAndMount(ctx context.Context) { if rp.flags&rpflagsHaveStartRef != 0 { - rp.start.DecRef() + rp.start.DecRef(ctx) } if rp.flags&rpflagsHaveMountRef != 0 { - rp.mount.DecRef() + rp.mount.DecRef(ctx) } } -func (rp *ResolvingPath) releaseErrorState() { +func (rp *ResolvingPath) releaseErrorState(ctx context.Context) { if rp.nextStart != nil { - rp.nextStart.DecRef() + rp.nextStart.DecRef(ctx) rp.nextStart = nil } if rp.nextMount != nil { - rp.nextMount.DecRef() + rp.nextMount.DecRef(ctx) rp.nextMount = nil } } @@ -236,13 +237,13 @@ func (rp *ResolvingPath) Advance() { // Restart resets the stream of path components represented by rp to its state // on entry to the current FilesystemImpl method. -func (rp *ResolvingPath) Restart() { +func (rp *ResolvingPath) Restart(ctx context.Context) { rp.pit = rp.origParts[rp.numOrigParts-1] rp.mustBeDir = rp.mustBeDirOrig rp.symlinks = rp.symlinksOrig rp.curPart = rp.numOrigParts - 1 copy(rp.parts[:], rp.origParts[:rp.numOrigParts]) - rp.releaseErrorState() + rp.releaseErrorState(ctx) } func (rp *ResolvingPath) relpathCommit() { @@ -260,13 +261,13 @@ func (rp *ResolvingPath) relpathCommit() { // Mount, CheckRoot returns (unspecified, non-nil error). Otherwise, path // resolution should resolve d's parent normally, and CheckRoot returns (false, // nil). -func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) { +func (rp *ResolvingPath) CheckRoot(ctx context.Context, d *Dentry) (bool, error) { if d == rp.root.dentry && rp.mount == rp.root.mount { // At contextual VFS root (due to e.g. chroot(2)). return true, nil } else if d == rp.mount.root { // At mount root ... - vd := rp.vfs.getMountpointAt(rp.mount, rp.root) + vd := rp.vfs.getMountpointAt(ctx, rp.mount, rp.root) if vd.Ok() { // ... 
of non-root mount. rp.nextMount = vd.mount @@ -283,11 +284,11 @@ func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) { // to d. If d is a mount point, such that path resolution should switch to // another Mount, CheckMount returns a non-nil error. Otherwise, CheckMount // returns nil. -func (rp *ResolvingPath) CheckMount(d *Dentry) error { +func (rp *ResolvingPath) CheckMount(ctx context.Context, d *Dentry) error { if !d.isMounted() { return nil } - if mnt := rp.vfs.getMountAt(rp.mount, d); mnt != nil { + if mnt := rp.vfs.getMountAt(ctx, rp.mount, d); mnt != nil { rp.nextMount = mnt return resolveMountPointError{} } @@ -389,11 +390,11 @@ func (rp *ResolvingPath) HandleJump(target VirtualDentry) error { return resolveMountRootOrJumpError{} } -func (rp *ResolvingPath) handleError(err error) bool { +func (rp *ResolvingPath) handleError(ctx context.Context, err error) bool { switch err.(type) { case resolveMountRootOrJumpError: // Switch to the new Mount. We hold references on the Mount and Dentry. - rp.decRefStartAndMount() + rp.decRefStartAndMount(ctx) rp.mount = rp.nextMount rp.start = rp.nextStart rp.flags |= rpflagsHaveMountRef | rpflagsHaveStartRef @@ -412,7 +413,7 @@ func (rp *ResolvingPath) handleError(err error) bool { case resolveMountPointError: // Switch to the new Mount. We hold a reference on the Mount, but // borrow the reference on the mount root from the Mount. - rp.decRefStartAndMount() + rp.decRefStartAndMount(ctx) rp.mount = rp.nextMount rp.start = rp.nextMount.root rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef @@ -423,12 +424,12 @@ func (rp *ResolvingPath) handleError(err error) bool { // path. rp.relpathCommit() // Restart path resolution on the new Mount. - rp.releaseErrorState() + rp.releaseErrorState(ctx) return true case resolveAbsSymlinkError: // Switch to the new Mount. References are borrowed from rp.root. 
- rp.decRefStartAndMount() + rp.decRefStartAndMount(ctx) rp.mount = rp.root.mount rp.start = rp.root.dentry rp.flags &^= rpflagsHaveMountRef | rpflagsHaveStartRef @@ -440,7 +441,7 @@ func (rp *ResolvingPath) handleError(err error) bool { // path, including the symlink target we just prepended. rp.relpathCommit() // Restart path resolution on the new Mount. - rp.releaseErrorState() + rp.releaseErrorState(ctx) return true default: diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 522e27475..9c2420683 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -122,7 +122,7 @@ type VirtualFilesystem struct { } // Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes. -func (vfs *VirtualFilesystem) Init() error { +func (vfs *VirtualFilesystem) Init(ctx context.Context) error { if vfs.mountpoints != nil { panic("VFS already initialized") } @@ -145,7 +145,7 @@ func (vfs *VirtualFilesystem) Init() error { devMinor: anonfsDevMinor, } anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs) - defer anonfs.vfsfs.DecRef() + defer anonfs.vfsfs.DecRef(ctx) anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{}) if err != nil { // We should not be passing any MountOptions that would cause @@ -192,11 +192,11 @@ func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credenti for { err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -214,11 +214,11 @@ func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Crede dentry: d, } rp.mount.IncRef() - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return vd, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return VirtualDentry{}, 
err } } @@ -236,7 +236,7 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au } rp.mount.IncRef() name := rp.Component() - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return parentVD, name, nil } if checkInvariants { @@ -244,8 +244,8 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return VirtualDentry{}, "", err } } @@ -260,14 +260,14 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential } if !newpop.Path.Begin.Ok() { - oldVD.DecRef() + oldVD.DecRef(ctx) if newpop.Path.Absolute { return syserror.EEXIST } return syserror.ENOENT } if newpop.FollowFinalSymlink { - oldVD.DecRef() + oldVD.DecRef(ctx) ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink") return syserror.EINVAL } @@ -276,8 +276,8 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential for { err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD) if err == nil { - vfs.putResolvingPath(rp) - oldVD.DecRef() + vfs.putResolvingPath(ctx, rp) + oldVD.DecRef(ctx) return nil } if checkInvariants { @@ -285,9 +285,9 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - oldVD.DecRef() + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) + oldVD.DecRef(ctx) return err } } @@ -313,7 +313,7 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia for { err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if 
checkInvariants { @@ -321,8 +321,8 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -346,7 +346,7 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia for { err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -354,8 +354,8 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -408,31 +408,31 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential for { fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) if opts.FileExec { if fd.Mount().Flags.NoExec { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.EACCES } // Only a regular file can be executed. 
stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE}) if err != nil { - fd.DecRef() + fd.DecRef(ctx) return nil, err } if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.EACCES } } - fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0, PathEvent) + fd.Dentry().InotifyWithParent(ctx, linux.IN_OPEN, 0, PathEvent) return fd, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return nil, err } } @@ -444,11 +444,11 @@ func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Creden for { target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return target, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return "", err } } @@ -472,19 +472,19 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti return err } if oldName == "." || oldName == ".." 
{ - oldParentVD.DecRef() + oldParentVD.DecRef(ctx) return syserror.EBUSY } if !newpop.Path.Begin.Ok() { - oldParentVD.DecRef() + oldParentVD.DecRef(ctx) if newpop.Path.Absolute { return syserror.EBUSY } return syserror.ENOENT } if newpop.FollowFinalSymlink { - oldParentVD.DecRef() + oldParentVD.DecRef(ctx) ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink") return syserror.EINVAL } @@ -497,8 +497,8 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti for { err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts) if err == nil { - vfs.putResolvingPath(rp) - oldParentVD.DecRef() + vfs.putResolvingPath(ctx, rp) + oldParentVD.DecRef(ctx) return nil } if checkInvariants { @@ -506,9 +506,9 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - oldParentVD.DecRef() + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) + oldParentVD.DecRef(ctx) return err } } @@ -531,7 +531,7 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia for { err := rp.mount.fs.impl.RmdirAt(ctx, rp) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -539,8 +539,8 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -552,11 +552,11 @@ func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credent for { err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil 
} - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -568,11 +568,11 @@ func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credential for { stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return stat, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return linux.Statx{}, err } } @@ -585,11 +585,11 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti for { statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return statfs, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return linux.Statfs{}, err } } @@ -612,7 +612,7 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent for { err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -620,8 +620,8 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -644,7 +644,7 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti for { err := rp.mount.fs.impl.UnlinkAt(ctx, rp) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -652,8 +652,8 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", 
rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -671,7 +671,7 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C for { bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return bep, nil } if checkInvariants { @@ -679,8 +679,8 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return nil, err } } @@ -693,7 +693,7 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede for { names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return names, nil } if err == syserror.ENOTSUP { @@ -701,11 +701,11 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede // fs/xattr.c:vfs_listxattr() falls back to allowing the security // subsystem to return security extended attributes, which by // default don't exist. 
- vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return nil, err } } @@ -718,11 +718,11 @@ func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Creden for { val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return val, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return "", err } } @@ -735,11 +735,11 @@ func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Creden for { err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -751,11 +751,11 @@ func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Cre for { err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -777,7 +777,7 @@ func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { if err := fs.impl.Sync(ctx); err != nil && retErr == nil { retErr = err } - fs.DecRef() + fs.DecRef(ctx) } return retErr } @@ -831,9 +831,9 @@ func (vd VirtualDentry) IncRef() { // DecRef decrements the reference counts on the Mount and Dentry represented // by vd. -func (vd VirtualDentry) DecRef() { - vd.dentry.DecRef() - vd.mount.DecRef() +func (vd VirtualDentry) DecRef(ctx context.Context) { + vd.dentry.DecRef(ctx) + vd.mount.DecRef(ctx) } // Mount returns the Mount associated with vd. 
It does not take a reference on diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD index e0db6cf54..6c137f693 100644 --- a/pkg/tcpip/link/tun/BUILD +++ b/pkg/tcpip/link/tun/BUILD @@ -12,6 +12,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/refs", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go index 04ae58e59..22b0a12bd 100644 --- a/pkg/tcpip/link/tun/device.go +++ b/pkg/tcpip/link/tun/device.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -64,14 +65,14 @@ func (d *Device) beforeSave() { } // Release implements fs.FileOperations.Release. -func (d *Device) Release() { +func (d *Device) Release(ctx context.Context) { d.mu.Lock() defer d.mu.Unlock() // Decrease refcount if there is an endpoint associated with this file. if d.endpoint != nil { d.endpoint.RemoveNotify(d.notifyHandle) - d.endpoint.DecRef() + d.endpoint.DecRef(ctx) d.endpoint = nil } } @@ -341,8 +342,8 @@ type tunEndpoint struct { } // DecRef decrements refcount of e, removes NIC if refcount goes to 0. 
-func (e *tunEndpoint) DecRef() { - e.DecRefWithDestructor(func() { +func (e *tunEndpoint) DecRef(ctx context.Context) { + e.DecRefWithDestructor(ctx, func(context.Context) { e.stack.RemoveNIC(e.nicID) }) } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 59639ba19..9dd5b0184 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -640,7 +640,7 @@ func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Confi func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error { root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) for _, m := range c.mounts { log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options) @@ -868,7 +868,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns if err != nil { return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err) } - defer dirent.DecRef() + defer dirent.DecRef(ctx) if err := mns.Mount(ctx, dirent, inode); err != nil { return fmt.Errorf("mount %q error: %v", m.Destination, err) } @@ -889,12 +889,12 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun if err != nil { return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err) } - defer target.DecRef() + defer target.DecRef(ctx) // Take a ref on the inode that is about to be (re)-mounted. source.root.IncRef() if err := mns.Mount(ctx, target, source.root); err != nil { - source.root.DecRef() + source.root.DecRef(ctx) return fmt.Errorf("bind mount %q error: %v", mount.Destination, err) } @@ -997,12 +997,12 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M switch err { case nil: // Found '/tmp' in filesystem, check if it's empty. 
- defer tmp.DecRef() + defer tmp.DecRef(ctx) f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true}) if err != nil { return err } - defer f.DecRef() + defer f.DecRef(ctx) serializer := &fs.CollectEntriesSerializer{} if err := f.Readdir(ctx, serializer); err != nil { return err diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 9cd9c5909..e0d077f5a 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -346,7 +346,7 @@ func New(args Args) (*Loader, error) { if err != nil { return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err) } - defer hostFilesystem.DecRef() + defer hostFilesystem.DecRef(k.SupervisorContext()) hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}) if err != nil { return nil, fmt.Errorf("failed to create hostfs mount: %v", err) @@ -755,7 +755,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn return nil, fmt.Errorf("creating process: %v", err) } // CreateProcess takes a reference on FDTable if successful. - info.procArgs.FDTable.DecRef() + info.procArgs.FDTable.DecRef(ctx) // Set the foreground process group on the TTY to the global init process // group, since that is what we are about to start running. @@ -890,22 +890,20 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { // Add the HOME environment variable if it is not already set. 
if kernel.VFS2Enabled { - defer args.MountNamespaceVFS2.DecRef() - root := args.MountNamespaceVFS2.Root() - defer root.DecRef() ctx := vfs.WithRoot(l.k.SupervisorContext(), root) + defer args.MountNamespaceVFS2.DecRef(ctx) + defer root.DecRef(ctx) envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv) if err != nil { return 0, err } args.Envv = envv } else { - defer args.MountNamespace.DecRef() - root := args.MountNamespace.Root() - defer root.DecRef() ctx := fs.WithRoot(l.k.SupervisorContext(), root) + defer args.MountNamespace.DecRef(ctx) + defer root.DecRef(ctx) envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) if err != nil { return 0, err @@ -1263,7 +1261,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F fdTable := k.NewFDTable() ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs) if err != nil { - fdTable.DecRef() + fdTable.DecRef(ctx) return nil, nil, nil, err } return fdTable, ttyFile, ttyFileVFS2, nil diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 8e6fe57e1..aa3fdf96c 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -450,13 +450,13 @@ func TestCreateMountNamespace(t *testing.T) { } root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) for _, p := range tc.expectedPaths { maxTraversals := uint(0) if d, err := mns.FindInode(ctx, root, root, p, &maxTraversals); err != nil { t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) } else { - d.DecRef() + d.DecRef(ctx) } } }) @@ -491,7 +491,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { } root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) for _, p := range tc.expectedPaths { target := &vfs.PathOperation{ Root: root, @@ -502,7 +502,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { if d, err := l.k.VFS().GetDentryAt(ctx, l.root.procArgs.Credentials, target, 
&vfs.GetDentryOptions{}); err != nil { t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) } else { - d.DecRef() + d.DecRef(ctx) } } }) diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index cfe2d36aa..252ca07e3 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -103,7 +103,7 @@ func registerFilesystems(k *kernel.Kernel) error { if err != nil { return fmt.Errorf("creating devtmpfs accessor: %w", err) } - defer a.Release() + defer a.Release(ctx) if err := a.UserspaceInit(ctx); err != nil { return fmt.Errorf("initializing userspace: %w", err) @@ -252,7 +252,7 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error { root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) target := &vfs.PathOperation{ Root: root, Start: root, @@ -387,7 +387,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds } root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) pop := vfs.PathOperation{ Root: root, Start: root, @@ -481,10 +481,10 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Co if err != nil { return err } - defer newMnt.DecRef() + defer newMnt.DecRef(ctx) root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil { return err } -- cgit v1.2.3 From 25798f214c6d1991916906ea8fca9e7029a8c423 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 3 Aug 2020 22:06:46 -0700 Subject: Add callbacks to support lazy loading/restoring thread states PiperOrigin-RevId: 324748508 --- pkg/sentry/kernel/kernel.go | 7 +++++ pkg/sentry/kernel/ptrace.go | 10 +++++++- pkg/sentry/kernel/task_clone.go | 4 +++ pkg/sentry/kernel/task_log.go | 43 +++++++++++++++++++++++++++++++ pkg/sentry/kernel/task_run.go | 2 +- 
pkg/sentry/kernel/task_signals.go | 10 ++++++-- pkg/sentry/kernel/task_stop.go | 16 ++++++++++++ pkg/sentry/memmap/memmap.go | 7 +++++ pkg/sentry/mm/lifecycle.go | 2 ++ pkg/sentry/mm/vma.go | 7 ++++- pkg/sentry/platform/kvm/BUILD | 1 + pkg/sentry/platform/kvm/address_space.go | 6 +++++ pkg/sentry/platform/kvm/context.go | 10 +++++++- pkg/sentry/platform/platform.go | 44 +++++++++++++++++++++++++++++++- pkg/sentry/platform/ptrace/BUILD | 1 + pkg/sentry/platform/ptrace/ptrace.go | 10 +++++++- pkg/sentry/platform/ptrace/subprocess.go | 6 +++++ pkg/sentry/state/state.go | 1 + 18 files changed, 179 insertions(+), 8 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 316df249d..1028d13c6 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -1263,6 +1263,13 @@ func (k *Kernel) Pause() { k.tasks.aioGoroutines.Wait() } +// ReceiveTaskStates receives full states for all tasks. +func (k *Kernel) ReceiveTaskStates() { + k.extMu.Lock() + k.tasks.PullFullState() + k.extMu.Unlock() +} + // Unpause ends the effect of a previous call to Pause. If Unpause is called // without a matching preceding call to Pause, Unpause may panic. 
func (k *Kernel) Unpause() { diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index e23e796ef..6c03d9041 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -1018,6 +1018,9 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if err != nil { return err } + + t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) + ar := ars.Head() n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{ Ctx: t, @@ -1044,10 +1047,14 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if err != nil { return err } + + mm := t.MemoryManager() + t.p.PullFullState(mm.AddressSpace(), t.Arch()) + ar := ars.Head() n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{ Ctx: t, - IO: t.MemoryManager(), + IO: mm, Addr: ar.Start, Opts: usermem.IOOpts{ AddressSpaceActive: true, @@ -1056,6 +1063,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if err != nil { return err } + t.p.FloatingPointStateChanged() ar.End -= usermem.Addr(n) return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index fe6ba6041..9d7a9128f 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -161,6 +161,10 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { return 0, nil, syserror.EINVAL } + // Pull task registers and FPU state, a cloned task will inherit the + // state of the current task. 
+ t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) + // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a // single clone(2) or unshare(2) call, the user namespace is guaranteed to // be created first, giving the child (clone(2)) or caller (unshare(2)) diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index ab86ceedc..d23cea802 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -27,6 +27,9 @@ const ( // maxStackDebugBytes is the maximum number of user stack bytes that may be // printed by debugDumpStack. maxStackDebugBytes = 1024 + // maxCodeDebugBytes is the maximum number of user code bytes that may be + // printed by debugDumpCode. + maxCodeDebugBytes = 128 ) // Infof logs an formatted info message by calling log.Infof. @@ -61,6 +64,7 @@ func (t *Task) IsLogging(level log.Level) bool { func (t *Task) DebugDumpState() { t.debugDumpRegisters() t.debugDumpStack() + t.debugDumpCode() if mm := t.MemoryManager(); mm != nil { t.Debugf("Mappings:\n%s", mm) } @@ -128,6 +132,45 @@ func (t *Task) debugDumpStack() { } } +// debugDumpCode logs user code contents at log level debug. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) debugDumpCode() { + if !t.IsLogging(log.Debug) { + return + } + m := t.MemoryManager() + if m == nil { + t.Debugf("Memory manager for task is gone, skipping application code dump.") + return + } + t.Debugf("Code:") + // Print code on both sides of the instruction register. + start := usermem.Addr(t.Arch().IP()) - maxCodeDebugBytes/2 + // Round addr down to a 16-byte boundary. + start &= ^usermem.Addr(15) + // Print 16 bytes per line, one byte at a time. 
+ for offset := uint64(0); offset < maxCodeDebugBytes; offset += 16 { + addr, ok := start.AddLength(offset) + if !ok { + break + } + var data [16]byte + n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{ + IgnorePermissions: true, + }) + // Print as much of the line as we can, even if an error was + // encountered. + if n > 0 { + t.Debugf("%x: % x", addr, data[:n]) + } + if err != nil { + t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err) + break + } + } +} + // trace definitions. // // Note that all region names are prefixed by ':' in order to ensure that they diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 7d4f44caf..abaf29216 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -260,7 +260,7 @@ func (app *runApp) execute(t *Task) taskRunState { region := trace.StartRegion(t.traceContext, runRegion) t.accountTaskGoroutineEnter(TaskGoroutineRunningApp) - info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU) + info, at, err := t.p.Switch(t, t.MemoryManager(), t.Arch(), t.rseqCPU) t.accountTaskGoroutineLeave(TaskGoroutineRunningApp) region.End() diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 79766cafe..2180fd27d 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -255,10 +255,11 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) } } + mm := t.MemoryManager() // Set up the signal handler. If we have a saved signal mask, the signal // handler should run with the current mask, but sigreturn should restore // the saved one. 
- st := &arch.Stack{t.Arch(), t.MemoryManager(), sp} + st := &arch.Stack{t.Arch(), mm, sp} mask := t.signalMask if t.haveSavedSignalMask { mask = t.savedSignalMask @@ -273,12 +274,13 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) // Please see the linux code as reference: // linux/arch/arm64/kernel/signal.c:setup_return() if act.Flags&linux.SA_RESTORER == 0 { - act.Restorer = t.MemoryManager().VDSOSigReturn() + act.Restorer = mm.VDSOSigReturn() } if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil { return err } + t.p.FloatingPointStateChanged() t.haveSavedSignalMask = false // Add our signal mask. @@ -310,6 +312,7 @@ func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) { // Restore our signal mask. SIGKILL and SIGSTOP should not be blocked. t.SetSignalMask(sigset &^ UnblockableSignals) + t.p.FloatingPointStateChanged() return ctrlResume, nil } @@ -636,6 +639,7 @@ func (t *Task) SetSavedSignalMask(mask linux.SignalSet) { // SignalStack returns the task-private signal stack. func (t *Task) SignalStack() arch.SignalStack { + t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) alt := t.signalStack if t.onSignalStack(alt) { alt.Flags |= arch.SignalStackFlagOnStack @@ -1050,6 +1054,8 @@ func (*runInterrupt) execute(t *Task) taskRunState { // Are there signals pending? if info := t.dequeueSignalLocked(t.signalMask); info != nil { + t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) + if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 { // Indicate that we've dequeued a stop signal before unlocking the // signal mutex; initiateGroupStop will check for races with diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go index 10c6e455c..296735d32 100644 --- a/pkg/sentry/kernel/task_stop.go +++ b/pkg/sentry/kernel/task_stop.go @@ -205,6 +205,22 @@ func (ts *TaskSet) BeginExternalStop() { } } +// PullFullState receives full states for all tasks. 
+func (ts *TaskSet) PullFullState() { + ts.mu.Lock() + defer ts.mu.Unlock() + if ts.Root == nil { + return + } + for t := range ts.Root.tids { + t.Activate() + if mm := t.MemoryManager(); mm != nil { + t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) + } + t.Deactivate() + } +} + // EndExternalStop indicates the end of an external stop started by a previous // call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task // goroutines to resume. diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 59c92c7e8..65d83096f 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -360,6 +360,13 @@ type MMapOpts struct { // // TODO(jamieliu): Replace entirely with MappingIdentity? Hint string + + // Force means to skip validation checks of Addr and Length. It can be + // used to create special mappings below mm.layout.MinAddr and + // mm.layout.MaxAddr. It has to be used with caution. + // + // If Force is true, Unmap and Fixed must be true. + Force bool } // File represents a host file that may be mapped into an platform.AddressSpace. diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index 4d7773f8b..09dbc06a4 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -57,6 +57,8 @@ func (mm *MemoryManager) SetMmapLayout(ac arch.Context, r *limits.LimitSet) (arc // Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or // clone() (without CLONE_VM). 
func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { + mm.AddressSpace().PreFork() + defer mm.AddressSpace().PostFork() mm.metadataMu.Lock() defer mm.metadataMu.Unlock() mm.mappingMu.RLock() diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index bd751d696..c4e1989ed 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -42,7 +42,12 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp Map32Bit: opts.Map32Bit, }) if err != nil { - return vmaIterator{}, usermem.AddrRange{}, err + // Can't force without opts.Unmap and opts.Fixed. + if opts.Force && opts.Unmap && opts.Fixed { + addr = opts.Addr + } else { + return vmaIterator{}, usermem.AddrRange{}, err + } } ar, _ := addr.ToRange(opts.Length) diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index b5d27a72a..3970dd81d 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -41,6 +41,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/atomicbitops", + "//pkg/context", "//pkg/cpuid", "//pkg/log", "//pkg/procid", diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index 98a3e539d..af5c5e191 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -248,3 +248,9 @@ func (as *addressSpace) Release() { // Drop all cached machine references. as.machine.dropPageTables(as.pageTables) } + +// PreFork implements platform.AddressSpace.PreFork. +func (as *addressSpace) PreFork() {} + +// PostFork implements platform.AddressSpace.PostFork. 
+func (as *addressSpace) PostFork() {} diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index 6507121ea..eb92721fb 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -15,6 +15,7 @@ package kvm import ( + pkgcontext "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" @@ -37,7 +38,8 @@ type context struct { } // Switch runs the provided context in the given address space. -func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*arch.SignalInfo, usermem.AccessType, error) { +func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac arch.Context, _ int32) (*arch.SignalInfo, usermem.AccessType, error) { + as := mm.AddressSpace() localAS := as.(*addressSpace) // Grab a vCPU. @@ -88,3 +90,9 @@ func (c *context) Interrupt() { // Release implements platform.Context.Release(). func (c *context) Release() {} + +// FloatingPointStateChanged implements platform.Context.FloatingPointStateChanged. +func (c *context) FloatingPointStateChanged() {} + +// PullFullState implements platform.Context.PullFullState. +func (c *context) PullFullState(as platform.AddressSpace, ac arch.Context) {} diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index 4b13eec30..3f99afdd1 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -22,6 +22,7 @@ import ( "os" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -114,6 +115,17 @@ func (NoCPUPreemptionDetection) PreemptAllCPUs() error { panic("This platform does not support CPU preemption detection") } +// MemoryManager represents an abstraction above the platform address space +// which manages memory mappings and their contents. 
+type MemoryManager interface { + //usermem.IO provides access to the contents of a virtual memory space. + usermem.IO + // MMap establishes a memory mapping. + MMap(ctx context.Context, opts memmap.MMapOpts) (usermem.Addr, error) + // AddressSpace returns the AddressSpace bound to mm. + AddressSpace() AddressSpace +} + // Context represents the execution context for a single thread. type Context interface { // Switch resumes execution of the thread specified by the arch.Context @@ -143,7 +155,30 @@ type Context interface { // concurrent call to Switch(). // // - ErrContextCPUPreempted: See the definition of that error for details. - Switch(as AddressSpace, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) + Switch(ctx context.Context, mm MemoryManager, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) + + // PullFullState() pulls a full state of the application thread. + // + // A platform can support lazy loading/restoring of a thread state + // which includes registers and a floating point state. + // + // For example, when the Sentry handles a system call, it may have only + // syscall arguments without other registers and a floating point + // state. And in this case, if the Sentry will need to construct a + // signal frame to call a signal handler, it will need to call + // PullFullState() to load all registers and FPU state. + // + // Preconditions: The caller must be running on the task goroutine. + PullFullState(as AddressSpace, ac arch.Context) + + // FloatingPointStateChanged forces restoring a full state of the application thread. + // + // A platform can support lazy loading/restoring of a thread state. + // This means that if the Sentry has not changed a thread state, + // the platform may not restore it. + // + // Preconditions: The caller must be running on the task goroutine. 
+ FloatingPointStateChanged() // Interrupt interrupts a concurrent call to Switch(), causing it to return // ErrContextInterrupt. @@ -218,6 +253,13 @@ type AddressSpace interface { // must be acquired via platform.NewAddressSpace(). Release() + // PreFork() is called before creating a copy of AddressSpace. This + // guarantees that this address space will be in a consistent state. + PreFork() + + // PostFork() is called after creating a copy of AddressSpace. + PostFork() + // AddressSpaceIO methods are supported iff the associated platform's // Platform.SupportsAddressSpaceIO() == true. AddressSpaces for which this // does not hold may panic if AddressSpaceIO methods are invoked. diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index 29fd23cc3..e04165fbf 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -24,6 +24,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/log", "//pkg/procid", "//pkg/safecopy", diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 08d055e05..45ff2bcc3 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -48,6 +48,7 @@ import ( "os" "gvisor.dev/gvisor/pkg/abi/linux" + pkgcontext "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" @@ -95,7 +96,8 @@ type context struct { } // Switch runs the provided context in the given address space. 
-func (c *context) Switch(as platform.AddressSpace, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) { +func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) { + as := mm.AddressSpace() s := as.(*subprocess) isSyscall := s.switchToApp(c, ac) @@ -180,6 +182,12 @@ func (c *context) Interrupt() { // Release implements platform.Context.Release(). func (c *context) Release() {} +// FloatingPointStateChanged implements platform.Context.FloatingPointStateChanged. +func (c *context) FloatingPointStateChanged() {} + +// PullFullState implements platform.Context.PullFullState. +func (c *context) PullFullState(as platform.AddressSpace, ac arch.Context) {} + // PTrace represents a collection of ptrace subprocesses. type PTrace struct { platform.MMapMinAddr diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index c990f3454..e1d54d8a2 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -662,3 +662,9 @@ func (s *subprocess) Unmap(addr usermem.Addr, length uint64) { panic(fmt.Sprintf("munmap(%x, %x)) failed: %v", addr, length, err)) } } + +// PreFork implements platform.AddressSpace.PreFork. +func (s *subprocess) PreFork() {} + +// PostFork implements platform.AddressSpace.PostFork. 
+func (s *subprocess) PostFork() {} diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go index 9eb626b76..a06c9b8ab 100644 --- a/pkg/sentry/state/state.go +++ b/pkg/sentry/state/state.go @@ -60,6 +60,7 @@ type SaveOpts struct { func (opts SaveOpts) Save(k *kernel.Kernel, w *watchdog.Watchdog) error { log.Infof("Sandbox save started, pausing all tasks.") k.Pause() + k.ReceiveTaskStates() defer k.Unpause() defer log.Infof("Tasks resumed after save.") -- cgit v1.2.3 From 13a8ae81b2361cd32f8e73d14ca5b9bca9569b1a Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 7 Aug 2020 22:47:55 -0700 Subject: Add context.FullStateChanged() It indicates that the Sentry has changed the state of the thread and next calls of PullFullState() has to do nothing. PiperOrigin-RevId: 325567415 --- pkg/sentry/kernel/ptrace.go | 2 +- pkg/sentry/kernel/task_exec.go | 1 + pkg/sentry/kernel/task_signals.go | 4 ++-- pkg/sentry/platform/kvm/context.go | 4 ++-- pkg/sentry/platform/platform.go | 14 ++++++++++---- pkg/sentry/platform/ptrace/ptrace.go | 4 ++-- 6 files changed, 18 insertions(+), 11 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 6c03d9041..619b0cb7c 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -1063,7 +1063,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if err != nil { return err } - t.p.FloatingPointStateChanged() + t.p.FullStateChanged() ar.End -= usermem.Addr(n) return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 47c28b8ff..5e4fb3e3a 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -226,6 +226,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tc = *r.tc t.mu.Unlock() t.unstopVforkParent() + t.p.FullStateChanged() // NOTE(b/30316266): All locks must be dropped prior to calling 
Activate. t.MemoryManager().Activate(t) diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 2180fd27d..cff2a8365 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -280,7 +280,7 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil { return err } - t.p.FloatingPointStateChanged() + t.p.FullStateChanged() t.haveSavedSignalMask = false // Add our signal mask. @@ -312,7 +312,7 @@ func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) { // Restore our signal mask. SIGKILL and SIGSTOP should not be blocked. t.SetSignalMask(sigset &^ UnblockableSignals) - t.p.FloatingPointStateChanged() + t.p.FullStateChanged() return ctrlResume, nil } diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index eb92721fb..6e6b76416 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -91,8 +91,8 @@ func (c *context) Interrupt() { // Release implements platform.Context.Release(). func (c *context) Release() {} -// FloatingPointStateChanged implements platform.Context.FloatingPointStateChanged. -func (c *context) FloatingPointStateChanged() {} +// FullStateChanged implements platform.Context.FullStateChanged. +func (c *context) FullStateChanged() {} // PullFullState implements platform.Context.PullFullState. func (c *context) PullFullState(as platform.AddressSpace, ac arch.Context) {} diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index 3f99afdd1..ba031516a 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -171,14 +171,20 @@ type Context interface { // Preconditions: The caller must be running on the task goroutine. PullFullState(as AddressSpace, ac arch.Context) - // FloatingPointStateChanged forces restoring a full state of the application thread. 
+ // FullStateChanged() indicates that a thread state has been changed by + // the Sentry. This happens in case of the rt_sigreturn, execve, etc. // - // A platform can support lazy loading/restoring of a thread state. - // This means that if the Sentry has not changed a thread state, + // First, it indicates that the Sentry has the full state of the thread + // and PullFullState() has to do nothing if it is called after + // FullStateChanged(). + // + // Second, it forces restoring the full state of the application + // thread. A platform can support lazy loading/restoring of a thread + // state. This means that if the Sentry has not changed a thread state, // the platform may not restore it. // // Preconditions: The caller must be running on the task goroutine. - FloatingPointStateChanged() + FullStateChanged() // Interrupt interrupts a concurrent call to Switch(), causing it to return // ErrContextInterrupt. diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 45ff2bcc3..b52d0fbd8 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -182,8 +182,8 @@ func (c *context) Interrupt() { // Release implements platform.Context.Release(). func (c *context) Release() {} -// FloatingPointStateChanged implements platform.Context.FloatingPointStateChanged. -func (c *context) FloatingPointStateChanged() {} +// FullStateChanged implements platform.Context.FullStateChanged. +func (c *context) FullStateChanged() {} // PullFullState implements platform.Context.PullFullState. func (c *context) PullFullState(as platform.AddressSpace, ac arch.Context) {} -- cgit v1.2.3 From 3bd066d5032c297e501f5c71be301ffa2fe9ed34 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Mon, 17 Aug 2020 11:40:08 -0700 Subject: Remove weak references from unix sockets. The abstract socket namespace no longer holds any references on sockets. Instead, TryIncRef() is used when a socket is being retrieved in BoundEndpoint(). 
Abstract sockets are now responsible for removing themselves from the namespace they are in, when they are destroyed. Updates #1486. PiperOrigin-RevId: 327064173 --- pkg/refs_vfs2/BUILD | 6 +- pkg/refs_vfs2/refs.go | 4 +- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/abstract_socket_namespace.go | 77 +++++++++++++++++--------- pkg/sentry/socket/unix/BUILD | 14 +++++ pkg/sentry/socket/unix/unix.go | 22 ++++++-- pkg/sentry/socket/unix/unix_vfs2.go | 6 +- 7 files changed, 91 insertions(+), 39 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/refs_vfs2/BUILD b/pkg/refs_vfs2/BUILD index 7f180c7bd..7b3e10683 100644 --- a/pkg/refs_vfs2/BUILD +++ b/pkg/refs_vfs2/BUILD @@ -19,10 +19,8 @@ go_template( ) go_library( - name = "refs", - srcs = [ - "refs.go", - ], + name = "refs_vfs2", + srcs = ["refs.go"], visibility = ["//pkg/sentry:internal"], deps = ["//pkg/context"], ) diff --git a/pkg/refs_vfs2/refs.go b/pkg/refs_vfs2/refs.go index ee01b17b0..99a074e96 100644 --- a/pkg/refs_vfs2/refs.go +++ b/pkg/refs_vfs2/refs.go @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package refs defines an interface for a reference-counted object. -package refs +// Package refs_vfs2 defines an interface for a reference-counted object. 
+package refs_vfs2 import ( "gvisor.dev/gvisor/pkg/context" diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index f6886a758..5416a310d 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -163,6 +163,7 @@ go_library( "//pkg/log", "//pkg/metric", "//pkg/refs", + "//pkg/refs_vfs2", "//pkg/safemem", "//pkg/secio", "//pkg/sentry/arch", diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index 52ed5cea2..1b9721534 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -15,29 +15,21 @@ package kernel import ( + "fmt" "syscall" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/refs_vfs2" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" ) // +stateify savable type abstractEndpoint struct { - ep transport.BoundEndpoint - wr *refs.WeakRef - name string - ns *AbstractSocketNamespace -} - -// WeakRefGone implements refs.WeakRefUser.WeakRefGone. -func (e *abstractEndpoint) WeakRefGone(context.Context) { - e.ns.mu.Lock() - if e.ns.endpoints[e.name].ep == e.ep { - delete(e.ns.endpoints, e.name) - } - e.ns.mu.Unlock() + ep transport.BoundEndpoint + socket refs_vfs2.RefCounter + name string + ns *AbstractSocketNamespace } // AbstractSocketNamespace is used to implement the Linux abstract socket functionality. @@ -46,7 +38,11 @@ func (e *abstractEndpoint) WeakRefGone(context.Context) { type AbstractSocketNamespace struct { mu sync.Mutex `state:"nosave"` - // Keeps mapping from name to endpoint. + // Keeps a mapping from name to endpoint. AbstractSocketNamespace does not hold + // any references on any sockets that it contains; when retrieving a socket, + // TryIncRef() must be called in case the socket is concurrently being + // destroyed. It is the responsibility of the socket to remove itself from the + // abstract socket namespace when it is destroyed. 
endpoints map[string]abstractEndpoint } @@ -58,15 +54,15 @@ func NewAbstractSocketNamespace() *AbstractSocketNamespace { } // A boundEndpoint wraps a transport.BoundEndpoint to maintain a reference on -// its backing object. +// its backing socket. type boundEndpoint struct { transport.BoundEndpoint - rc refs.RefCounter + socket refs_vfs2.RefCounter } // Release implements transport.BoundEndpoint.Release. func (e *boundEndpoint) Release(ctx context.Context) { - e.rc.DecRef(ctx) + e.socket.DecRef(ctx) e.BoundEndpoint.Release(ctx) } @@ -81,32 +77,59 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp return nil } - rc := ep.wr.Get() - if rc == nil { - delete(a.endpoints, name) + if !ep.socket.TryIncRef() { + // The socket has reached zero references and is being destroyed. return nil } - return &boundEndpoint{ep.ep, rc} + return &boundEndpoint{ep.ep, ep.socket} } // Bind binds the given socket. // -// When the last reference managed by rc is dropped, ep may be removed from the +// When the last reference managed by socket is dropped, ep may be removed from the // namespace. -func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, rc refs.RefCounter) error { +func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refs_vfs2.RefCounter) error { a.mu.Lock() defer a.mu.Unlock() + // Check if there is already a socket (which has not yet been destroyed) bound at name. if ep, ok := a.endpoints[name]; ok { - if rc := ep.wr.Get(); rc != nil { - rc.DecRef(ctx) + if ep.socket.TryIncRef() { + ep.socket.DecRef(ctx) return syscall.EADDRINUSE } } ae := abstractEndpoint{ep: ep, name: name, ns: a} - ae.wr = refs.NewWeakRef(rc, &ae) + ae.socket = socket a.endpoints[name] = ae return nil } + +// Remove removes the specified socket at name from the abstract socket +// namespace, if it has not yet been replaced. 
+func (a *AbstractSocketNamespace) Remove(name string, socket refs_vfs2.RefCounter) { + a.mu.Lock() + defer a.mu.Unlock() + + ep, ok := a.endpoints[name] + if !ok { + // We never delete a map entry apart from a socket's destructor (although the + // map entry may be overwritten). Therefore, a socket should exist, even if it + // may not be the one we expect. + panic(fmt.Sprintf("expected socket to exist at '%s' in abstract socket namespace", name)) + } + + // A Bind() operation may race with callers of Remove(), e.g. in the + // following case: + // socket1 reaches zero references and begins destruction + // a.Bind("foo", ep, socket2) replaces socket1 with socket2 + // socket1's destructor calls a.Remove("foo", socket1) + // + // Therefore, we need to check that the socket at name is what we expect + // before modifying the map. + if ep.socket == socket { + delete(a.endpoints, name) + } +} diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 061a689a9..cb953e4dc 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -1,12 +1,25 @@ load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) +go_template_instance( + name = "socket_refs", + out = "socket_refs.go", + package = "unix", + prefix = "socketOpsCommon", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "socketOpsCommon", + }, +) + go_library( name = "unix", srcs = [ "device.go", "io.go", + "socket_refs.go", "unix.go", "unix_vfs2.go", ], @@ -15,6 +28,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fspath", + "//pkg/log", "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 2b8454edb..b7e8e4325 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" 
"gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -80,7 +79,7 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty stype: stype, }, } - s.EnableLeakCheck("unix.SocketOperations") + s.EnableLeakCheck() return fs.NewFile(ctx, d, flags, &s) } @@ -89,17 +88,26 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty // // +stateify savable type socketOpsCommon struct { - refs.AtomicRefCount + socketOpsCommonRefs socket.SendReceiveTimeout ep transport.Endpoint stype linux.SockType + + // abstractName and abstractNamespace indicate the name and namespace of the + // socket if it is bound to an abstract socket namespace. Once the socket is + // bound, they cannot be modified. + abstractName string + abstractNamespace *kernel.AbstractSocketNamespace } // DecRef implements RefCounter.DecRef. func (s *socketOpsCommon) DecRef(ctx context.Context) { - s.DecRefWithDestructor(ctx, func(context.Context) { + s.socketOpsCommonRefs.DecRef(func() { s.ep.Close(ctx) + if s.abstractNamespace != nil { + s.abstractNamespace.Remove(s.abstractName, s) + } }) } @@ -284,10 +292,14 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if t.IsNetworkNamespaced() { return syserr.ErrInvalidEndpointState } - if err := t.AbstractSockets().Bind(t, p[1:], bep, s); err != nil { + asn := t.AbstractSockets() + name := p[1:] + if err := asn.Bind(t, name, bep, s); err != nil { // syserr.ErrPortInUse corresponds to EADDRINUSE. return syserr.ErrPortInUse } + s.abstractName = name + s.abstractNamespace = asn } else { // The parent and name. 
var d *fs.Dirent diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index dfa25241a..d066ef8ab 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -183,10 +183,14 @@ func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if t.IsNetworkNamespaced() { return syserr.ErrInvalidEndpointState } - if err := t.AbstractSockets().Bind(t, p[1:], bep, s); err != nil { + asn := t.AbstractSockets() + name := p[1:] + if err := asn.Bind(t, name, bep, s); err != nil { // syserr.ErrPortInUse corresponds to EADDRINUSE. return syserr.ErrPortInUse } + s.abstractName = name + s.abstractNamespace = asn } else { path := fspath.Parse(p) root := t.FSContext().RootDirectoryVFS2() -- cgit v1.2.3